提交 c396d611 authored 作者: sentient07's avatar sentient07

Added print_profile to GraphToGPU

上级 de536bd5
...@@ -3,14 +3,17 @@ import copy ...@@ -3,14 +3,17 @@ import copy
import numpy import numpy
import logging import logging
import pdb import pdb
import time
from six.moves import xrange from six.moves import xrange
import theano import theano
from theano.compat import OrderedDict
from theano import tensor, scalar, gof, config from theano import tensor, scalar, gof, config
from theano.compile import optdb from theano.compile import optdb
from theano.compile.ops import shape_i from theano.compile.ops import shape_i
from theano.gof import (local_optimizer, EquilibriumDB, TopoOptimizer, from theano.gof import (local_optimizer, EquilibriumDB, TopoOptimizer,
SequenceDB, Optimizer, DB, toolbox) SequenceDB, Optimizer, DB, toolbox, graph)
from gof.opt import ChangeTracker
from theano.gof.optdb import LocalGroupDB from theano.gof.optdb import LocalGroupDB
from theano.ifelse import IfElse from theano.ifelse import IfElse
...@@ -251,7 +254,7 @@ gpu_seqopt.register('InputToGpuArrayOptimizer', InputToGpuOptimizer(), ...@@ -251,7 +254,7 @@ gpu_seqopt.register('InputToGpuArrayOptimizer', InputToGpuOptimizer(),
0, 'fast_run', 'fast_compile', 'merge') 0, 'fast_run', 'fast_compile', 'merge')
class GraphToGPU(Optimizer): class GraphToGPU(Optimizer, NavigatorOptimizer):
""" """
Transfer the graph as a whole to GPU instead of transfering node by node. Transfer the graph as a whole to GPU instead of transfering node by node.
""" """
...@@ -264,8 +267,18 @@ class GraphToGPU(Optimizer): ...@@ -264,8 +267,18 @@ class GraphToGPU(Optimizer):
fgraph.attach_feature(toolbox.ReplaceValidate()) fgraph.attach_feature(toolbox.ReplaceValidate())
def apply(self, fgraph): def apply(self, fgraph):
change_tracker = ChangeTracker()
mapping = {} mapping = {}
global_process_count = {}
start_nb_nodes = len(fgraph.apply_nodes)
max_nb_nodes = len(fgraph.apply_nodes)
loop_timing = []
loop_process_count = []
local_opt_timing = []
io_toposort_timing = []
nb_nodes = []
node_created = {}
process_count = {}
# Building a new graph # Building a new graph
# Iterating through inputs of graph # Iterating through inputs of graph
for i in fgraph.inputs: for i in fgraph.inputs:
...@@ -277,8 +290,23 @@ class GraphToGPU(Optimizer): ...@@ -277,8 +290,23 @@ class GraphToGPU(Optimizer):
if isinstance(i, theano.Constant): if isinstance(i, theano.Constant):
mapping[i] = i mapping[i] = i
for lopt in (self.local_optimizers_all +
self.local_optimizers_map.get(type(node.op), []) +
self.local_optimizers_map.get(node.op, [])):
process_count.setdefault(copt, 0)
global_process_count.setdefault(opt, 0)
time_opts.setdefault(opt, 0)
node_created.setdefault(opt, 0)
topo_t0 = time.time()
q = deque(graph.io_toposort(fgraph.inputs, start_from))
io_toposort_timing.append(time.time() - topo_t0)
nb_nodes.append(len(q))
max_nb_nodes = max(max_nb_nodes, len(q))
for node in fgraph.toposort(): for node in fgraph.toposort():
t0 = time.time()
if isinstance(node.op, HostFromGpu): if isinstance(node.op, HostFromGpu):
mapping[node.outputs[0]] = node.inputs[0] mapping[node.outputs[0]] = node.inputs[0]
continue continue
...@@ -310,7 +338,14 @@ class GraphToGPU(Optimizer): ...@@ -310,7 +338,14 @@ class GraphToGPU(Optimizer):
for lopt in (self.local_optimizers_all + for lopt in (self.local_optimizers_all +
self.local_optimizers_map.get(type(node.op), []) + self.local_optimizers_map.get(type(node.op), []) +
self.local_optimizers_map.get(node.op, [])): self.local_optimizers_map.get(node.op, [])):
nb = change_tracker.nb_imported
process_count[lopt] += 1
global_process_count[lopt] += 1
t_opt = time.time()
lopt_change = self.process_node(fgraph, node, lopt)
time_opts[lopt] += time.time() - t_opt
node_created[lopt] += change_tracker.nb_imported - nb
if move_to_GPU: if move_to_GPU:
try: try:
new_ops = lopt.transform( new_ops = lopt.transform(
...@@ -322,6 +357,7 @@ class GraphToGPU(Optimizer): ...@@ -322,6 +357,7 @@ class GraphToGPU(Optimizer):
out_clients) out_clients)
if new_ops: if new_ops:
break break
local_opt_timing.append(float(time.time() - t0))
if not new_ops: if not new_ops:
newnode = node.clone_with_new_inputs([mapping.get(i) newnode = node.clone_with_new_inputs([mapping.get(i)
for i in node.inputs]) for i in node.inputs])
...@@ -344,6 +380,9 @@ class GraphToGPU(Optimizer): ...@@ -344,6 +380,9 @@ class GraphToGPU(Optimizer):
for new_o, old_o in zip(outputs, node.outputs): for new_o, old_o in zip(outputs, node.outputs):
mapping[old_o] = new_o mapping[old_o] = new_o
loop_process_count.append(process_count)
loop_timing.append(float(time.time() - t0))
new_nodes = [] new_nodes = []
for o in fgraph.outputs: for o in fgraph.outputs:
new_o = mapping[o] new_o = mapping[o]
...@@ -354,6 +393,143 @@ class GraphToGPU(Optimizer): ...@@ -354,6 +393,143 @@ class GraphToGPU(Optimizer):
new_nodes.append(new_o) new_nodes.append(new_o)
fgraph.replace_all_validate(zip(fgraph.outputs, new_nodes)) fgraph.replace_all_validate(zip(fgraph.outputs, new_nodes))
@staticmethod
def print_profile(stream, prof, level=0):
    """
    Write a human-readable summary of a GraphToGPU profile to `stream`.

    Parameters
    ----------
    stream
        File-like object; every line is emitted with
        ``print(..., file=stream)``.
    prof
        9-tuple laid out as ``(opt, loop_timing, loop_process_count,
        (start_nb_nodes, end_nb_nodes, max_nb_nodes), local_opt_timing,
        nb_nodes, time_opts, io_toposort_timing, node_created)``.
        ``opt`` must expose ``local_optimizers_all`` (an iterable of
        optimizers) and ``local_optimizers_map`` (a dict whose values
        are iterables of optimizers).  ``loop_process_count`` is a list
        of ``{optimizer: count}`` dicts, one per pass; ``time_opts`` and
        ``node_created`` are dicts keyed by optimizer.
    level : int
        The line prefix is repeated `level` times.

    """
    (opt, loop_timing, loop_process_count,
     (start_nb_nodes, end_nb_nodes, max_nb_nodes),
     local_opt_timing, nb_nodes, time_opts, io_toposort_timing,
     node_created) = prof

    blanc = (' ' * level)
    print(blanc, "GraphToGPUOptimizer", end=' ', file=stream)
    print(blanc, getattr(opt, "name",
                         getattr(opt, "__name__", "")), file=stream)
    print(blanc, " time %.3fs for %d passes" % (
        sum(loop_timing), len(loop_timing)), file=stream)
    print(blanc, " nb nodes (start, end, max) %d %d %d" % (
        start_nb_nodes, end_nb_nodes, max_nb_nodes), file=stream)
    print(blanc, " time io_toposort %.3fs" % sum(
        io_toposort_timing), file=stream)

    # An optimizer that never ran has no timing entry; count it as 0.
    s = sum(time_opts.get(o, 0) for o in opt.local_optimizers_all)
    print(blanc, " time in local optimizers %.3fs" % s, file=stream)

    # One summary line per pass, listing the five most applied optimizers.
    for i, pass_time in enumerate(loop_timing):
        pass_counts = loop_process_count[i]
        lopt = ""
        if pass_counts:
            d = sorted(pass_counts.items(), key=lambda a: a[1])[::-1]
            lopt = " ".join([str((str(k), v)) for k, v in d[:5]])
            if len(d) > 5:
                lopt += " ..."
        print(blanc, (' %2d - %.3fs %d (%.3fs in global opts, '
                      '%.3fs io_toposort) - %d nodes - %s' % (
                          i, pass_time,
                          sum(pass_counts.values()),
                          local_opt_timing[i],
                          io_toposort_timing[i], nb_nodes[i],
                          lopt)), file=stream)

    count_opt = []
    not_used = []
    not_used_time = 0

    # Register every known optimizer with a zero count so that the ones
    # that were never applied can be reported as "not used".  There is
    # no current node in a static method, so the whole map is walked
    # instead of looking up a single op.
    process_count = {}
    for o in opt.local_optimizers_all:
        process_count.setdefault(o, 0)
    for lopts in opt.local_optimizers_map.values():
        for o in lopts:
            process_count.setdefault(o, 0)

    for count in loop_process_count:
        for o, v in count.items():
            process_count[o] = process_count.get(o, 0) + v

    for o, count in process_count.items():
        t = time_opts.get(o, 0)
        if count > 0:
            count_opt.append((t, count, node_created.get(o, 0), o))
        else:
            not_used.append((t, o))
            not_used_time += t

    if count_opt:
        print(blanc,
              ' times - times applied - nb node created - name:',
              file=stream)
        # Break ties on the optimizer's string form: optimizer objects
        # are not orderable on Python 3.
        count_opt.sort(key=lambda c: (c[0], c[1], c[2], str(c[3])))
        for (t, count, n_created, o) in count_opt[::-1]:
            print(blanc, ' %.3fs - %d - %d - %s' % (
                t, count, n_created, o), file=stream)
        print(blanc, ' %.3fs - in %d optimization that where not used (display only those with a runtime > 0)' % (
            not_used_time, len(not_used)), file=stream)
        not_used.sort(key=lambda nu: (nu[0], str(nu[1])))
        for (t, o) in not_used[::-1]:
            if t > 0:
                # Skip optimizers with zero time; they probably were
                # never even tried.
                print(blanc + " ", ' %.3fs - %s' % (t, o), file=stream)
        print(file=stream)
@staticmethod
def merge_profile(prof1, prof2):
    """
    Combine two GraphToGPU profiles into a single profile tuple.

    Both arguments and the result use the 9-field layout that
    `print_profile` unpacks::

        (opt, loop_timing, loop_process_count,
         (start_nb_nodes, end_nb_nodes, max_nb_nodes),
         local_opt_timing, nb_nodes, time_opts, io_toposort_timing,
         node_created)

    Per-pass lists are added position by position (the longer list
    supplies the tail), dicts are merged by adding the values of shared
    keys, and neither input profile is modified.

    """
    local_optimizers = OrderedSet(prof1[0].local_optimizers_all).union(
        prof2[0].local_optimizers_all)

    def merge_dict(d1, d2):
        """
        merge 2 dicts by adding the values.
        """
        d = d1.copy()
        for k, v in d2.items():
            if k in d:
                # Rebind rather than `+=`: `d` is a shallow copy, so an
                # in-place add on a list value would also modify `d1`.
                d[k] = d[k] + v
            else:
                d[k] = v
        return d

    local_optimizers_map = merge_dict(prof1[0].local_optimizers_map,
                                      prof2[0].local_optimizers_map)
    new_opt = GraphToGPU(local_optimizers, local_optimizers_map)

    def merge_list(l1, l2):
        """
        merge 2 lists by adding the values found at the same index.
        """
        l = copy.copy(l1)
        for idx, nb in enumerate(l2):
            if idx < len(l):
                l[idx] += nb
            else:
                l.append(nb)
        return l

    loop_timing = merge_list(prof1[1], prof2[1])

    # Copy each per-pass dict so that prof1 is left untouched.
    loop_process_count = [count.copy() for count in prof1[2]]
    for process_count, other in zip(loop_process_count, prof2[2]):
        for process, count in other.items():
            process_count[process] = process_count.get(process, 0) + count
    loop_process_count.extend(
        count.copy() for count in prof2[2][len(loop_process_count):])

    # Field 3 is the (start, end, max) node-count triple; `max` picks
    # one of the two triples using ordinary tuple comparison.
    max_nb_nodes = max(prof1[3], prof2[3])

    local_opt_timing = merge_list(prof1[4], prof2[4])
    nb_nodes = merge_list(prof1[5], prof2[5])
    time_opts = merge_dict(prof1[6], prof2[6])
    io_toposort_timing = merge_list(prof1[7], prof2[7])
    assert len(loop_timing) == max(len(prof1[1]), len(prof2[1]))

    node_created = merge_dict(prof1[8], prof2[8])
    return (new_opt,
            loop_timing,
            loop_process_count,
            max_nb_nodes,
            local_opt_timing,
            nb_nodes,
            time_opts,
            io_toposort_timing,
            node_created)
@local_optimizer([GpuFromHost, GpuToGpu, HostFromGpu]) @local_optimizer([GpuFromHost, GpuToGpu, HostFromGpu])
def local_cut_gpu_transfers(node): def local_cut_gpu_transfers(node):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论