提交 089fe03d authored 作者: Roy Xue's avatar Roy Xue

Add comments and code style fix for the while file

上级 92306ee6
...@@ -61,9 +61,9 @@ AddConfigVar('profiling.min_memory_size', ...@@ -61,9 +61,9 @@ AddConfigVar('profiling.min_memory_size',
in_c_key=False) in_c_key=False)
AddConfigVar('profiling.min_peak_memory', AddConfigVar('profiling.min_peak_memory',
"""The min peak memory usage of the order""", """The min peak memory usage of the order""",
BoolParam(False), BoolParam(False),
in_c_key=False) in_c_key=False)
AddConfigVar('profiling.destination', AddConfigVar('profiling.destination',
""" """
...@@ -93,10 +93,10 @@ def _atexit_print_fn(): ...@@ -93,10 +93,10 @@ def _atexit_print_fn():
if not isinstance(ps, ScanProfileStats): if not isinstance(ps, ScanProfileStats):
to_sum.append(ps) to_sum.append(ps)
else: else:
#TODO print the name if there is one! # TODO print the name if there is one!
print 'Skipping empty Profile' print 'Skipping empty Profile'
if len(to_sum) > 1: if len(to_sum) > 1:
# Make a global profile # Make a global profile
cum = copy.copy(to_sum[0]) cum = copy.copy(to_sum[0])
msg = ("Sum of all(%d) printed profiles at exit excluding Scan op" msg = ("Sum of all(%d) printed profiles at exit excluding Scan op"
" profile." % len(to_sum)) " profile." % len(to_sum))
...@@ -107,7 +107,7 @@ def _atexit_print_fn(): ...@@ -107,7 +107,7 @@ def _atexit_print_fn():
"validate_time"]: "validate_time"]:
setattr(cum, attr, getattr(cum, attr) + getattr(ps, attr)) setattr(cum, attr, getattr(cum, attr) + getattr(ps, attr))
#merge dictonary # merge dictonary
for attr in ["apply_time", "apply_callcount", for attr in ["apply_time", "apply_callcount",
"apply_cimpl", "variable_shape", "variable_strides"]: "apply_cimpl", "variable_shape", "variable_strides"]:
cum_attr = getattr(cum, attr) cum_attr = getattr(cum, attr)
...@@ -128,10 +128,8 @@ def _atexit_print_fn(): ...@@ -128,10 +128,8 @@ def _atexit_print_fn():
n_apply_to_print=config.profiling.n_apply) n_apply_to_print=config.profiling.n_apply)
class ProfileStats(object): class ProfileStats(object):
""" """
Object to store runtime and memory profiling information for all of Object to store runtime and memory profiling information for all of
Theano's operations: compilation, optimization, execution. Theano's operations: compilation, optimization, execution.
...@@ -211,8 +209,8 @@ class ProfileStats(object): ...@@ -211,8 +209,8 @@ class ProfileStats(object):
names of the class vars declared in this class. names of the class vars declared in this class.
""" """
if (hasattr(theano, 'sandbox') and if (hasattr(theano, 'sandbox') and
hasattr(theano.sandbox, 'cuda') and hasattr(theano.sandbox, 'cuda') and
theano.sandbox.cuda.cuda_enabled): theano.sandbox.cuda.cuda_enabled):
if os.environ.get('CUDA_LAUNCH_BLOCKING', '0') != '1': if os.environ.get('CUDA_LAUNCH_BLOCKING', '0') != '1':
raise Exception( raise Exception(
"You are running the Theano profiler with CUDA enabled." "You are running the Theano profiler with CUDA enabled."
...@@ -332,7 +330,7 @@ class ProfileStats(object): ...@@ -332,7 +330,7 @@ class ProfileStats(object):
local_time = 0 local_time = 0
if local_time == 0: if local_time == 0:
print >> file, ('ProfileMode.summary_class: total time 0' print >> file, ('ProfileMode.summary_class: total time 0'
' (did you forget to enable counters?)') ' (did you forget to enable counters?)')
return return
class_time = self.class_time() class_time = self.class_time()
class_call = self.class_callcount() class_call = self.class_callcount()
...@@ -341,12 +339,12 @@ class ProfileStats(object): ...@@ -341,12 +339,12 @@ class ProfileStats(object):
if N is None: if N is None:
N = len(self.class_time) N = len(self.class_time)
otimes = [(t * 100 / local_time, otimes = [(t * 100 / local_time,
t, t,
clas, clas,
class_impl.get(clas, ' '), class_impl.get(clas, ' '),
class_call.get(clas, 0), class_call.get(clas, 0),
class_apply.get(clas, 0)) class_apply.get(clas, 0))
for clas, t in class_time.items()] for clas, t in class_time.items()]
otimes.sort() otimes.sort()
otimes.reverse() otimes.reverse()
tot = 0 tot = 0
...@@ -405,9 +403,9 @@ class ProfileStats(object): ...@@ -405,9 +403,9 @@ class ProfileStats(object):
# that it way more readeable that the previous output of the # that it way more readeable that the previous output of the
# profiler # profiler
print >>file, ' ... (remaining %i Classes account for %6.2f%%(%.2fs) of the runtime)'\ print >>file, ' ... (remaining %i Classes account for %6.2f%%(%.2fs) of the runtime)'\
% (max(0, len(otimes) - N), % (max(0, len(otimes) - N),
sum(f for f, t, a, ci, nb_call, nb_op in otimes[N:]), sum(f for f, t, a, ci, nb_call, nb_op in otimes[N:]),
sum(t for f, t, a, ci, nb_call, nb_op in otimes[N:])) sum(t for f, t, a, ci, nb_call, nb_op in otimes[N:]))
print >> file, '' print >> file, ''
def summary_ops(self, file=sys.stderr, N=None): def summary_ops(self, file=sys.stderr, N=None):
...@@ -417,19 +415,19 @@ class ProfileStats(object): ...@@ -417,19 +415,19 @@ class ProfileStats(object):
local_time = 0 local_time = 0
if local_time == 0: if local_time == 0:
print >> file, ('ProfileMode.summary_ops: total time 0' print >> file, ('ProfileMode.summary_ops: total time 0'
' (did you forget to enable counters?)') ' (did you forget to enable counters?)')
return return
op_time = self.op_time() op_time = self.op_time()
op_call = self.op_callcount() op_call = self.op_callcount()
op_apply = self.op_nodes() op_apply = self.op_nodes()
op_impl = self.op_impl() op_impl = self.op_impl()
otimes = [(t * 100 / local_time, otimes = [(t * 100 / local_time,
t, t,
op, op,
op_impl.get(op, ' '), op_impl.get(op, ' '),
op_call.get(op, 0), op_call.get(op, 0),
op_apply.get(op, 0)) op_apply.get(op, 0))
for op, t in op_time.items()] for op, t in op_time.items()]
otimes.sort() otimes.sort()
otimes.reverse() otimes.reverse()
tot = 0 tot = 0
...@@ -485,9 +483,9 @@ class ProfileStats(object): ...@@ -485,9 +483,9 @@ class ProfileStats(object):
# that it way more readeable that the previous output of the # that it way more readeable that the previous output of the
# profiler # profiler
print >>file, ' ... (remaining %i Ops account for %6.2f%%(%.2fs) of the runtime)'\ print >>file, ' ... (remaining %i Ops account for %6.2f%%(%.2fs) of the runtime)'\
% (max(0, len(otimes) - N), % (max(0, len(otimes) - N),
sum(f for f, t, a, ci, nb_call, nb_op in otimes[N:]), sum(f for f, t, a, ci, nb_call, nb_op in otimes[N:]),
sum(t for f, t, a, ci, nb_call, nb_op in otimes[N:])) sum(t for f, t, a, ci, nb_call, nb_op in otimes[N:]))
print >> file, '' print >> file, ''
def summary_nodes(self, file=sys.stderr, N=None): def summary_nodes(self, file=sys.stderr, N=None):
...@@ -497,7 +495,7 @@ class ProfileStats(object): ...@@ -497,7 +495,7 @@ class ProfileStats(object):
local_time = 0 local_time = 0
if local_time == 0: if local_time == 0:
print >> file, ('ProfileMode.summary_nodes: total time 0' print >> file, ('ProfileMode.summary_nodes: total time 0'
' (did you forget to enable counters?)') ' (did you forget to enable counters?)')
return return
print >> file, 'Apply' print >> file, 'Apply'
...@@ -572,15 +570,15 @@ class ProfileStats(object): ...@@ -572,15 +570,15 @@ class ProfileStats(object):
for var in a.inputs], for var in a.inputs],
[self.variable_shape[var] [self.variable_shape[var]
for var in a.outputs]) for var in a.outputs])
flops = '%8.1f' % (fl/1024./1024) flops = '%8.1f' % (fl / 1024. / 1024)
flops_s = '%10.1f' % (fl/1024./1024/1024/t) flops_s = '%10.1f' % (fl / 1024. / 1024 / 1024 / t)
else: else:
flops = " " flops = " "
flops_s = " " flops_s = " "
print >> file, format_str %(f, ftot, t, t / nb_call, nb_call, print >> file, format_str % (f, ftot, t, t / nb_call, nb_call,
nd_id, nd_id,
flops, flops_s, flops, flops_s,
str(a)[:maxlen]) str(a)[:maxlen])
if not config.profile_memory: if not config.profile_memory:
continue continue
for idx, var in enumerate(a.inputs): for idx, var in enumerate(a.inputs):
...@@ -597,12 +595,12 @@ class ProfileStats(object): ...@@ -597,12 +595,12 @@ class ProfileStats(object):
idx, dtype, sh, st) idx, dtype, sh, st)
# Same as before, this I've sacrificied some information making # Same as before, this I've sacrificied some information making
# the output more readable # the output more readable
#print >> file, ' %4.1f%% %5.1f%% %5.3fs %5.3fs %.2es %i %s'%( # print >> file, ' %4.1f%% %5.1f%% %5.3fs %5.3fs %.2es %i %s'%(
# f, ftot, t, tot, t/nb_call,nb_call, str(a)) # f, ftot, t, tot, t/nb_call,nb_call, str(a))
print >> file, ' ... (remaining %i Apply instances account for %.2f%%(%.2fs) of the runtime)'\ print >> file, ' ... (remaining %i Apply instances account for %.2f%%(%.2fs) of the runtime)'\
% (max(0, len(atimes) - N), % (max(0, len(atimes) - N),
sum(f for f, t, a, nd_id, nb_call in atimes[N:]), sum(f for f, t, a, nd_id, nb_call in atimes[N:]),
sum(t for f, t, a, nd_id, nb_call in atimes[N:])) sum(t for f, t, a, nd_id, nb_call in atimes[N:]))
print >> file, '' print >> file, ''
def summary_function(self, file): def summary_function(self, file):
...@@ -610,15 +608,15 @@ class ProfileStats(object): ...@@ -610,15 +608,15 @@ class ProfileStats(object):
print >> file, '==================' print >> file, '=================='
print >> file, ' Message: %s' % self.message print >> file, ' Message: %s' % self.message
print >> file, ' Time in %i calls to Function.__call__: %es' % ( print >> file, ' Time in %i calls to Function.__call__: %es' % (
self.fct_callcount, self.fct_call_time) self.fct_callcount, self.fct_call_time)
if self.fct_call_time > 0: if self.fct_call_time > 0:
print >> file, ' Time in Function.fn.__call__: %es (%.3f%%)' % ( print >> file, ' Time in Function.fn.__call__: %es (%.3f%%)' % (
self.vm_call_time, self.vm_call_time,
100 * self.vm_call_time / self.fct_call_time) 100 * self.vm_call_time / self.fct_call_time)
local_time = sum(self.apply_time.values()) local_time = sum(self.apply_time.values())
if local_time > 0: if local_time > 0:
print >> file, ' Time in thunks: %es (%.3f%%)' % ( print >> file, ' Time in thunks: %es (%.3f%%)' % (
local_time, 100*local_time / self.fct_call_time) local_time, 100 * local_time / self.fct_call_time)
print >> file, ' Total compile time: %es' % self.compile_time print >> file, ' Total compile time: %es' % self.compile_time
print >> file, ' Number of Apply nodes: %s' % len(self.apply_time) print >> file, ' Number of Apply nodes: %s' % len(self.apply_time)
print >> file, ' Theano Optimizer time: %es' % self.optimizer_time print >> file, ' Theano Optimizer time: %es' % self.optimizer_time
...@@ -656,7 +654,7 @@ class ProfileStats(object): ...@@ -656,7 +654,7 @@ class ProfileStats(object):
fct_shapes[node.fgraph][node].append(sh) fct_shapes[node.fgraph][node].append(sh)
node_mem[node] = sum_dense node_mem[node] = sum_dense
#Find the function that used the most of that statistic # Find the function that used the most of that statistic
max_sum_size = 0 max_sum_size = 0
max_node_memory_size = 0 max_node_memory_size = 0
max_running_max_memory_size = 0 max_running_max_memory_size = 0
...@@ -707,12 +705,13 @@ class ProfileStats(object): ...@@ -707,12 +705,13 @@ class ProfileStats(object):
# two data structure used to mimic Python gc # two data structure used to mimic Python gc
viewed_by = {} # {var1: [vars that view var1]} viewed_by = {} # {var1: [vars that view var1]}
# The len of the list is the value of python ref count. But we use a list, not just the ref count value. # The len of the list is the value of python ref count. But we use a list, not just the ref count value.
# This is more safe to help detect potential bug in the algo # This is more safe to help detect potential bug in the algo
for var in fgraph.variables: for var in fgraph.variables:
viewed_by[var] = [] viewed_by[var] = []
view_of = {} # {var1: original var viewed by var1} view_of = {} # {var1: original var viewed by var1}
# The orignal mean that we don't keep trac of all the intermediate relationship in the view. # The orignal mean that we don't keep trac of all the intermediate
# relationship in the view.
for node in order: for node in order:
for var in node.outputs: for var in node.outputs:
...@@ -738,12 +737,14 @@ class ProfileStats(object): ...@@ -738,12 +737,14 @@ class ProfileStats(object):
ins = None ins = None
if dmap and idx2 in dmap: if dmap and idx2 in dmap:
vidx = dmap[idx2] vidx = dmap[idx2]
assert len(vidx) == 1, "Here we only support the possibility to destroy one input" assert len(
vidx) == 1, "Here we only support the possibility to destroy one input"
ins = node.inputs[vidx[0]] ins = node.inputs[vidx[0]]
if vmap and idx2 in vmap: if vmap and idx2 in vmap:
assert ins is None assert ins is None
vidx = vmap[idx2] vidx = vmap[idx2]
assert len(vidx) == 1, "Here we only support the possibility to view one input" assert len(
vidx) == 1, "Here we only support the possibility to view one input"
ins = node.inputs[vidx[0]] ins = node.inputs[vidx[0]]
if ins is not None: if ins is not None:
# This is needed for destroy_map in case it # This is needed for destroy_map in case it
...@@ -768,17 +769,17 @@ class ProfileStats(object): ...@@ -768,17 +769,17 @@ class ProfileStats(object):
assert not (ins in view_of and viewed_by[ins]) assert not (ins in view_of and viewed_by[ins])
# we trac the original var, so this shouldn't happen # we trac the original var, so this shouldn't happen
if (dependencies[ins] and if (dependencies[ins] and
ins not in fgraph.outputs and ins not in fgraph.outputs and
ins.owner and ins.owner and
all([compute_map[v][0] for v in dependencies[ins]])): all([compute_map[v][0] for v in dependencies[ins]])):
if ins not in view_of and not viewed_by.get(ins, []): if ins not in view_of and not viewed_by.get(ins, []):
running_memory_size -= var_mem[ins] running_memory_size -= var_mem[ins]
elif ins in view_of: elif ins in view_of:
origin = view_of[ins] origin = view_of[ins]
viewed_by[origin].remove(ins) viewed_by[origin].remove(ins)
if (not viewed_by[origin] and if (not viewed_by[origin] and
origin not in fgraph.inputs and origin not in fgraph.inputs and
not isinstance(origin, theano.Constant)): not isinstance(origin, theano.Constant)):
running_memory_size -= var_mem[origin] running_memory_size -= var_mem[origin]
else: else:
# ins is viewed_by something else, so its # ins is viewed_by something else, so its
...@@ -859,12 +860,14 @@ class ProfileStats(object): ...@@ -859,12 +860,14 @@ class ProfileStats(object):
ins = None ins = None
if dmap and idx in dmap: if dmap and idx in dmap:
vidx = dmap[idx] vidx = dmap[idx]
assert len(vidx) == 1, "Here we only support the possibility to destroy one input" assert len(
vidx) == 1, "Here we only support the possibility to destroy one input"
ins = node.inputs[vidx[0]] ins = node.inputs[vidx[0]]
if vmap and idx in vmap: if vmap and idx in vmap:
assert ins is None assert ins is None
vidx = vmap[idx] vidx = vmap[idx]
assert len(vidx) == 1, "Here we only support the possibility to destroy one input" assert len(
vidx) == 1, "Here we only support the possibility to destroy one input"
ins = node.inputs[vidx[0]] ins = node.inputs[vidx[0]]
if ins is not None: if ins is not None:
# This is needed for destroy_map in case it # This is needed for destroy_map in case it
...@@ -889,11 +892,12 @@ class ProfileStats(object): ...@@ -889,11 +892,12 @@ class ProfileStats(object):
for ins in node.inputs: for ins in node.inputs:
assert not (ins in view_of and assert not (ins in view_of and
viewed_by[ins]) viewed_by[ins])
# We track of the original var, so this shouldn't happen # We track of the original var, so this shouldn't
# happen
if (dependencies[ins] and if (dependencies[ins] and
ins not in fgraph.outputs and ins not in fgraph.outputs and
ins.owner and ins.owner and
all([compute_map[v][0] for v in dependencies[ins]])): all([compute_map[v][0] for v in dependencies[ins]])):
if ins not in view_of and not viewed_by.get(ins, []): if ins not in view_of and not viewed_by.get(ins, []):
mem_freed += var_mem[ins] mem_freed += var_mem[ins]
elif ins in view_of: elif ins in view_of:
...@@ -901,8 +905,8 @@ class ProfileStats(object): ...@@ -901,8 +905,8 @@ class ProfileStats(object):
viewed_by[origin].remove(ins) viewed_by[origin].remove(ins)
viewedby_remove[origin].append(ins) viewedby_remove[origin].append(ins)
if (not viewed_by[origin] and if (not viewed_by[origin] and
origin not in fgraph.inputs and origin not in fgraph.inputs and
not isinstance(origin, theano.Constant)): not isinstance(origin, theano.Constant)):
mem_freed += var_mem[origin] mem_freed += var_mem[origin]
else: else:
# ins is viewed_by something else, so its # ins is viewed_by something else, so its
...@@ -913,9 +917,14 @@ class ProfileStats(object): ...@@ -913,9 +917,14 @@ class ProfileStats(object):
done_set.add(node) done_set.add(node)
frozen_set = frozenset(done_set) frozen_set = frozenset(done_set)
if done_dict.get(frozen_set, max_mem_count+1) > max_mem_count: if done_dict.get(frozen_set, max_mem_count + 1) > max_mem_count:
# check if frozen_set is in done_set
# no, add it to done_set
# yes, then compare the past mem and current mem
# bigger, update the value and continue
# smaller, stop this iteration, move to next node
done_dict[frozen_set] = max_mem_count done_dict[frozen_set] = max_mem_count
for var in node.outputs: for var in node.outputs:
for c, _ in var.clients: for c, _ in var.clients:
if c != "output": if c != "output":
...@@ -928,7 +937,8 @@ class ProfileStats(object): ...@@ -928,7 +937,8 @@ class ProfileStats(object):
if max_mem_count < mem_bound: if max_mem_count < mem_bound:
mem_bound = max_mem_count mem_bound = max_mem_count
else: else:
min_memory_generator(new_exec_nodes, viewed_by, view_of) min_memory_generator(
new_exec_nodes, viewed_by, view_of)
# Reset track variables # Reset track variables
done_set.remove(node) done_set.remove(node)
...@@ -949,7 +959,6 @@ class ProfileStats(object): ...@@ -949,7 +959,6 @@ class ProfileStats(object):
for k in viewof_change: for k in viewof_change:
del view_of[k] del view_of[k]
# two data structure used to mimic Python gc # two data structure used to mimic Python gc
viewed_by = {} # {var1: [vars that view var1]} viewed_by = {} # {var1: [vars that view var1]}
# The len of the list is the value of python ref count. But we use a list, not just the ref count value. # The len of the list is the value of python ref count. But we use a list, not just the ref count value.
...@@ -957,7 +966,8 @@ class ProfileStats(object): ...@@ -957,7 +966,8 @@ class ProfileStats(object):
for var in fgraph.variables: for var in fgraph.variables:
viewed_by[var] = [] viewed_by[var] = []
view_of = {} # {var1: original var viewed by var1} view_of = {} # {var1: original var viewed by var1}
# The orignal mean that we don't keep trac of all the intermediate relationship in the view. # The orignal mean that we don't keep trac of all the intermediate
# relationship in the view.
min_memory_generator(executable_nodes, viewed_by, view_of) min_memory_generator(executable_nodes, viewed_by, view_of)
...@@ -1027,11 +1037,11 @@ class ProfileStats(object): ...@@ -1027,11 +1037,11 @@ class ProfileStats(object):
# round(max_sum_size / 1024)) # round(max_sum_size / 1024))
print >> file, " Max if no gc (allow_gc=False): %dKB (%dKB)" % (int(round( print >> file, " Max if no gc (allow_gc=False): %dKB (%dKB)" % (int(round(
new_max_node_memory_size / 1024.)), int(round( new_max_node_memory_size / 1024.)), int(round(
max_node_memory_size / 1024.))) max_node_memory_size / 1024.)))
print >> file, " Max if linker=cvm(default): %dKB (%dKB)" % (int(round( print >> file, " Max if linker=cvm(default): %dKB (%dKB)" % (int(round(
new_max_running_max_memory_size / 1024.)), int(round( new_max_running_max_memory_size / 1024.)), int(round(
max_running_max_memory_size / 1024.))) max_running_max_memory_size / 1024.)))
if min_max_peak: if min_max_peak:
print >> file, " Minimum peak from all valid apply node order is %dKB(took %.3fs to compute)" % (int(round( print >> file, " Minimum peak from all valid apply node order is %dKB(took %.3fs to compute)" % (int(round(
min_max_peak / 1024.)), min_peak_time) min_max_peak / 1024.)), min_peak_time)
...@@ -1039,7 +1049,8 @@ class ProfileStats(object): ...@@ -1039,7 +1049,8 @@ class ProfileStats(object):
round(new_max_node_memory_saved_by_view / 1024.)), int( round(new_max_node_memory_saved_by_view / 1024.)), int(
round(max_node_memory_saved_by_view / 1024.))) round(max_node_memory_saved_by_view / 1024.)))
print >> file, " Memory saved if inplace ops are used: %dKB (%dKB)" % \ print >> file, " Memory saved if inplace ops are used: %dKB (%dKB)" % \
(int(round(new_max_node_memory_saved_by_inplace / 1024.)), int(round(max_node_memory_saved_by_inplace / 1024.))) (int(round(new_max_node_memory_saved_by_inplace / 1024.)),
int(round(max_node_memory_saved_by_inplace / 1024.)))
print >> file, " Memory saved if gc is enabled: %dKB (%dKB)" % (int( print >> file, " Memory saved if gc is enabled: %dKB (%dKB)" % (int(
round(new_max_node_memory_size - new_max_running_max_memory_size) / 1024.), int( round(new_max_node_memory_size - new_max_running_max_memory_size) / 1024.), int(
round(max_node_memory_size - max_running_max_memory_size) / 1024.)) round(max_node_memory_size - max_running_max_memory_size) / 1024.))
...@@ -1096,8 +1107,8 @@ class ProfileStats(object): ...@@ -1096,8 +1107,8 @@ class ProfileStats(object):
print >> file, ( print >> file, (
' ... (remaining %i Apply account for %4dB/%dB (%s) of the' ' ... (remaining %i Apply account for %4dB/%dB (%s) of the'
' Apply with dense outputs sizes)') % (max(0, len(node_mem) - N), ' Apply with dense outputs sizes)') % (max(0, len(node_mem) - N),
sum_remaining, sum_remaining,
size_sum_dense, p size_sum_dense, p
) )
print >> file, '' print >> file, ''
if N == 0: if N == 0:
...@@ -1112,7 +1123,6 @@ class ProfileStats(object): ...@@ -1112,7 +1123,6 @@ class ProfileStats(object):
" emitted in those cases.") " emitted in those cases.")
print >> file, '' print >> file, ''
def summary(self, file=sys.stderr, n_ops_to_print=20, def summary(self, file=sys.stderr, n_ops_to_print=20,
n_apply_to_print=20): n_apply_to_print=20):
self.summary_function(file) self.summary_function(file)
...@@ -1133,11 +1143,9 @@ class ProfileStats(object): ...@@ -1133,11 +1143,9 @@ class ProfileStats(object):
self.optimizer_profile[1]) self.optimizer_profile[1])
if 0: # old code still to be ported from ProfileMode
if 0: # old code still to be ported from ProfileMode
def long_print(self, file=sys.stderr, fct_name=None, message=None, def long_print(self, file=sys.stderr, fct_name=None, message=None,
n_apply_to_print=15, n_ops_to_print=20, print_apply=False): n_apply_to_print=15, n_ops_to_print=20, print_apply=False):
""" """
Print a readable summary of the stats. Print a readable summary of the stats.
...@@ -1159,7 +1167,7 @@ if 0: # old code still to be ported from ProfileMode ...@@ -1159,7 +1167,7 @@ if 0: # old code still to be ported from ProfileMode
sop_time = {} sop_time = {}
sop_call = {} sop_call = {}
sop_op = {} sop_op = {}
#map each op class to Bool. True iff all applies were done in c. # map each op class to Bool. True iff all applies were done in c.
sop_c = {} sop_c = {}
for a, t in op_time.items(): for a, t in op_time.items():
typ = type(a) typ = type(a)
...@@ -1186,13 +1194,13 @@ if 0: # old code still to be ported from ProfileMode ...@@ -1186,13 +1194,13 @@ if 0: # old code still to be ported from ProfileMode
msg = '*' msg = '*'
else: else:
msg = ' ' msg = ' '
print ' %4.1f%% %5.1f%% %5.3fs %5.3fs %.2es %s %5d %2d %s' % (f, ftot, t, tot, t/nb_call, msg, nb_call, nb_op, a) print ' %4.1f%% %5.1f%% %5.3fs %5.3fs %.2es %s %5d %2d %s' % (f, ftot, t, tot, t / nb_call, msg, nb_call, nb_op, a)
print ' ... (remaining %i Ops account for %.2f%%(%.2fs) of the runtime)'\ print ' ... (remaining %i Ops account for %.2f%%(%.2fs) of the runtime)'\
% (max(0, len(sotimes) - n_ops_to_print), % (max(0, len(sotimes) - n_ops_to_print),
sum(f for f, t, a, ci, nb_call, nb_op in sum(f for f, t, a, ci, nb_call, nb_op in
sotimes[n_ops_to_print:]), sotimes[n_ops_to_print:]),
sum(t for f, t, a, ci, nb_call, nb_op in sum(t for f, t, a, ci, nb_call, nb_op in
sotimes[n_ops_to_print:])) sotimes[n_ops_to_print:]))
total_time = time.time() - import_time total_time = time.time() - import_time
total_fct_time = sum(fct_call_time.values()) total_fct_time = sum(fct_call_time.values())
...@@ -1202,13 +1210,13 @@ if 0: # old code still to be ported from ProfileMode ...@@ -1202,13 +1210,13 @@ if 0: # old code still to be ported from ProfileMode
print 'Theano fct summary: <% total fct time> <total time> <time per call> <nb call> <fct name>' print 'Theano fct summary: <% total fct time> <total time> <time per call> <nb call> <fct name>'
for key in fct_call.keys(): for key in fct_call.keys():
if fct_call[key] > 0: if fct_call[key] > 0:
print ' %4.1f%% %.3fs %.2es %d %s'%( print ' %4.1f%% %.3fs %.2es %d %s' % (
fct_call_time[key] / total_fct_time * 100, fct_call_time[key] / total_fct_time * 100,
fct_call_time[key], fct_call_time[key],
fct_call_time[key] / fct_call[key], fct_call_time[key] / fct_call[key],
fct_call[key], key.name) fct_call[key], key.name)
else: else:
print ' NOT CALLED',key.name print ' NOT CALLED', key.name
if total_fct_time > 0: if total_fct_time > 0:
time_pr_in_fct = local_time / total_fct_time * 100 time_pr_in_fct = local_time / total_fct_time * 100
...@@ -1227,8 +1235,8 @@ if 0: # old code still to be ported from ProfileMode ...@@ -1227,8 +1235,8 @@ if 0: # old code still to be ported from ProfileMode
print (' Theano Op time (included in fct call, Time spent ' print (' Theano Op time (included in fct call, Time spent '
'running thunks) %.3fs %.1f%%(of total) %.1f%%(of fct call)' % 'running thunks) %.3fs %.1f%%(of total) %.1f%%(of fct call)' %
(local_time, local_time / total_time * 100, time_pr_in_fct)) (local_time, local_time / total_time * 100, time_pr_in_fct))
print 'Other time since import %.3fs %.1f%%'%(other_time,other_time/total_time*100) print 'Other time since import %.3fs %.1f%%' % (other_time, other_time / total_time * 100)
print '%i Theano fct call, %.3fs per call'%(total_fct_call, time_per_call) print '%i Theano fct call, %.3fs per call' % (total_fct_call, time_per_call)
print print
print "List of apply that don't have float64 as input but have float64 in outputs. Usefull to know if we forgot some cast when using floatX=float32 or gpu code." print "List of apply that don't have float64 as input but have float64 in outputs. Usefull to know if we forgot some cast when using floatX=float32 or gpu code."
...@@ -1236,7 +1244,7 @@ if 0: # old code still to be ported from ProfileMode ...@@ -1236,7 +1244,7 @@ if 0: # old code still to be ported from ProfileMode
for fct in fct_call.keys(): for fct in fct_call.keys():
for idx, node in enumerate(fct.maker.fgraph.toposort()): for idx, node in enumerate(fct.maker.fgraph.toposort()):
if any(hasattr(i, 'dtype') and i.dtype == 'float64' for i in node.outputs) and not any(hasattr(i, 'dtype') and i.dtype == 'float64' for i in node.inputs): if any(hasattr(i, 'dtype') and i.dtype == 'float64' for i in node.outputs) and not any(hasattr(i, 'dtype') and i.dtype == 'float64' for i in node.inputs):
print str(node), idx, fct.name, str([getattr(i,'dtype',None) for i in node.inputs]),str([getattr(i,'dtype',None) for i in node.outputs]) print str(node), idx, fct.name, str([getattr(i, 'dtype', None) for i in node.inputs]), str([getattr(i, 'dtype', None) for i in node.outputs])
if any([x[2].__name__.startswith("Gpu") for x in sotimes]): if any([x[2].__name__.startswith("Gpu") for x in sotimes]):
cpu = [] cpu = []
...@@ -1254,8 +1262,8 @@ if 0: # old code still to be ported from ProfileMode ...@@ -1254,8 +1262,8 @@ if 0: # old code still to be ported from ProfileMode
sum_trans = sum(so[1] for so in trans) sum_trans = sum(so[1] for so in trans)
print print
print "Spent %.3fs(%.3f%%) in cpu Op, %.3fs(%.3f%%) in gpu Op and %.3fs(%.3f%%) transfert Op"%( print "Spent %.3fs(%.3f%%) in cpu Op, %.3fs(%.3f%%) in gpu Op and %.3fs(%.3f%%) transfert Op" % (
sum_cpu, sum_cpu/local_time*100, sum_gpu, sum_gpu/local_time*100, sum_trans, sum_trans/local_time*100) sum_cpu, sum_cpu / local_time * 100, sum_gpu, sum_gpu / local_time * 100, sum_trans, sum_trans / local_time * 100)
print "Theano function input that are float64" print "Theano function input that are float64"
print "<fct name> <input name> <input type> <str input>" print "<fct name> <input name> <input type> <str input>"
...@@ -1270,20 +1278,25 @@ if 0: # old code still to be ported from ProfileMode ...@@ -1270,20 +1278,25 @@ if 0: # old code still to be ported from ProfileMode
from theano.tensor.raw_random import RandomFunction from theano.tensor.raw_random import RandomFunction
import theano import theano
import theano.scalar as scal import theano.scalar as scal
scalar_op_amdlibm_no_speed_up = [scal.LT, scal.GT, scal.LE, scal.GE, scal.EQ, scal.NEQ, scal.InRange, scal.Switch, scal.OR, scal.XOR, scal.AND, scal.Invert, scal.Maximum, scal.Minimum, scal.Add, scal.Mul, scal.Sub, scal.TrueDiv, scal.IntDiv, scal.Clip, scal.First, scal.Second, scal.Identity, scal.Cast, scal.Sgn, scal.Neg, scal.Inv, scal.Sqr ] scalar_op_amdlibm_no_speed_up = [scal.LT, scal.GT, scal.LE, scal.GE, scal.EQ, scal.NEQ, scal.InRange, scal.Switch, scal.OR, scal.XOR, scal.AND, scal.Invert, scal.Maximum,
scalar_op_amdlibm_speed_up = [scal.Mod, scal.Pow, scal.Ceil, scal.Floor, scal.RoundHalfToEven, scal.RoundHalfAwayFromZero, scal.Log, scal.Log2, scal.Log10, scal.Log1p, scal.Exp, scal.Sqrt, scal.Abs, scal.Cos, scal.Sin, scal.Tan, scal.Tanh, scal.Cosh, scal.Sinh, T.nnet.sigm.ScalarSigmoid, T.nnet.sigm.ScalarSoftplus ]#Abs, Mod in float{32,64} only scal.Minimum, scal.Add, scal.Mul, scal.Sub, scal.TrueDiv, scal.IntDiv, scal.Clip, scal.First, scal.Second, scal.Identity, scal.Cast, scal.Sgn, scal.Neg, scal.Inv, scal.Sqr]
scalar_op_amdlibm_speed_up = [scal.Mod, scal.Pow, scal.Ceil, scal.Floor, scal.RoundHalfToEven, scal.RoundHalfAwayFromZero, scal.Log, scal.Log2, scal.Log10, scal.Log1p, scal.Exp,
scal.Sqrt, scal.Abs, scal.Cos, scal.Sin, scal.Tan, scal.Tanh, scal.Cosh, scal.Sinh, T.nnet.sigm.ScalarSigmoid, T.nnet.sigm.ScalarSoftplus] # Abs, Mod in float{32,64} only
def get_scalar_ops(s): def get_scalar_ops(s):
if isinstance(s, theano.scalar.Composite): if isinstance(s, theano.scalar.Composite):
l = [] l = []
for node in s.fgraph.toposort(): for node in s.fgraph.toposort():
l+=get_scalar_ops(node.op) l += get_scalar_ops(node.op)
return l return l
else: return [s] else:
return [s]
def list_scalar_op(op): def list_scalar_op(op):
if isinstance(op.scalar_op, theano.scalar.Composite): if isinstance(op.scalar_op, theano.scalar.Composite):
return get_scalar_ops(op.scalar_op) return get_scalar_ops(op.scalar_op)
else: return [op.scalar_op] else:
return [op.scalar_op]
def amdlibm_speed_up(op): def amdlibm_speed_up(op):
if not isinstance(op, T.Elemwise): if not isinstance(op, T.Elemwise):
...@@ -1296,6 +1309,7 @@ if 0: # old code still to be ported from ProfileMode ...@@ -1296,6 +1309,7 @@ if 0: # old code still to be ported from ProfileMode
elif s_op.__class__ not in scalar_op_amdlibm_no_speed_up: elif s_op.__class__ not in scalar_op_amdlibm_no_speed_up:
print "We don't know if amdlibm will accelerate this scalar op.", s_op print "We don't know if amdlibm will accelerate this scalar op.", s_op
return False return False
def exp_float32_op(op): def exp_float32_op(op):
if not isinstance(op, T.Elemwise): if not isinstance(op, T.Elemwise):
return False return False
...@@ -1303,30 +1317,30 @@ if 0: # old code still to be ported from ProfileMode ...@@ -1303,30 +1317,30 @@ if 0: # old code still to be ported from ProfileMode
l = list_scalar_op(op) l = list_scalar_op(op)
return any([s_op.__class__ in [scal.Exp] for s_op in l]) return any([s_op.__class__ in [scal.Exp] for s_op in l])
#tip 1 # tip 1
if config.floatX=='float64': if config.floatX == 'float64':
print " - Try the Theano flag floatX=float32" print " - Try the Theano flag floatX=float32"
#tip 2 # tip 2
if not config.lib.amdlibm and any([amdlibm_speed_up(a.op) for i,a in apply_time]): if not config.lib.amdlibm and any([amdlibm_speed_up(a.op) for i, a in apply_time]):
print " - Try installing amdlibm and set the Theano flag lib.amdlibm=True. This speed up only some Elemwise operation." print " - Try installing amdlibm and set the Theano flag lib.amdlibm=True. This speed up only some Elemwise operation."
#tip 3 # tip 3
if not config.lib.amdlibm and any([exp_float32_op(a.op) and a.inputs[0].dtype=='float32' for i,a in apply_time]): if not config.lib.amdlibm and any([exp_float32_op(a.op) and a.inputs[0].dtype == 'float32' for i, a in apply_time]):
print " - With the default gcc libm, exp in float32 is slower than in float64! Try Theano flags floatX=float64 or install amdlibm and set the theano flags lib.amdlibm=True" print " - With the default gcc libm, exp in float32 is slower than in float64! Try Theano flags floatX=float64 or install amdlibm and set the theano flags lib.amdlibm=True"
#tip 4 # tip 4
for a, t in apply_time.iteritems(): for a, t in apply_time.iteritems():
node = a node = a
if (isinstance(node.op, T.Dot) and if (isinstance(node.op, T.Dot) and
all([len(i.type.broadcastable) == 2 for i in node.inputs])): all([len(i.type.broadcastable) == 2 for i in node.inputs])):
print (" - You have a dot operation that was not optimized " print (" - You have a dot operation that was not optimized "
"to dot22 that is faster. Make sure the inputs are " "to dot22 that is faster. Make sure the inputs are "
"float32 or float64 and are the same for both inputs. " "float32 or float64 and are the same for both inputs. "
"Currently they are: %s" % "Currently they are: %s" %
[i.type for i in node.inputs]) [i.type for i in node.inputs])
#tip 5 # tip 5
for a, t in apply_time.iteritems(): for a, t in apply_time.iteritems():
node = a node = a
if isinstance(node.op, RandomFunction): if isinstance(node.op, RandomFunction):
...@@ -1378,15 +1392,15 @@ if 0: # old code still to be ported from ProfileMode ...@@ -1378,15 +1392,15 @@ if 0: # old code still to be ported from ProfileMode
outputs_size = self.outputs_size outputs_size = self.outputs_size
self.print_summary_("print_summary", self.print_summary_("print_summary",
None, None,
None, None,
None, None,
apply_time, apply_time,
op_cimpl, op_cimpl,
message, message,
outputs_size, outputs_size,
n_apply_to_print, n_apply_to_print,
n_ops_to_print) n_ops_to_print)
def print_diff_summary(self, other, n_apply_to_print=15, def print_diff_summary(self, other, n_apply_to_print=15,
n_ops_to_print=20): n_ops_to_print=20):
...@@ -1414,7 +1428,7 @@ if 0: # old code still to be ported from ProfileMode ...@@ -1414,7 +1428,7 @@ if 0: # old code still to be ported from ProfileMode
tb = b_time.pop(a, 0) tb = b_time.pop(a, 0)
r[a] += ta - tb r[a] += ta - tb
#they are missing in a # they are missing in a
for a, t in b_time.items(): for a, t in b_time.items():
r.setdefault(a, 0) r.setdefault(a, 0)
r[a] += t r[a] += t
...@@ -1429,12 +1443,10 @@ if 0: # old code still to be ported from ProfileMode ...@@ -1429,12 +1443,10 @@ if 0: # old code still to be ported from ProfileMode
outputs_size = diff_dict(self.outputs_size, other.outputs_size) outputs_size = diff_dict(self.outputs_size, other.outputs_size)
self.print_summary_( self.print_summary_(
"print_diff_summary", compile_time, fct_call_time, fct_call, "print_diff_summary", compile_time, fct_call_time, fct_call,
apply_time, op_cimpl, message, outputs_size, apply_time, op_cimpl, message, outputs_size,
n_apply_to_print=n_apply_to_print, n_apply_to_print=n_apply_to_print,
n_ops_to_print=n_ops_to_print, print_apply=False) n_ops_to_print=n_ops_to_print, print_apply=False)
class ScanProfileStats(ProfileStats): class ScanProfileStats(ProfileStats):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论