提交 91a31814 authored 作者: lamblin's avatar lamblin

Merge pull request #1277 from nouiz/memory_prof

Memory profiler in the new profiler
......@@ -106,6 +106,38 @@ default values.
*Default:* ``id(self)``
.. method:: get_shape_info(obj)
Optional. Only needed to profile the memory of this Type of object
Return the information needed to compute the memory size of obj.
The memory size is only the data, so this excludes the container.
For an ndarray, this is the data, but not the ndarray object and
other data structures such as shape and strides.
get_shape_info() and get_size() work in tandem for the memory profiler.
get_shape_info() is called during the execution of the function.
So it is better that it is not too slow.
get_size() will be called with the output of this function
when printing the memory profile.
:param obj: The object that this Type represents during execution
:return: Python object that self.get_size() understands
.. method:: get_size(shape_info)
Number of bytes taken by the object represented by shape_info
Optional. Only needed to profile the memory of this Type of object
:param shape_info: the output of the call to get_shape_info()
:return: the number of bytes taken by the object described in
shape_info.
"""
For each method, the *default* is what ``Type`` defines
for you. So, if you create an instance of ``Type`` or an
instance of a subclass of ``Type``, you
......
......@@ -273,6 +273,15 @@ import theano and print the config variable, as in:
Do the vm/cvm linkers profile the execution of Theano functions?
.. attribute:: profile_memory
Bool value: either True or False
Default False
Do the vm/cvm linkers profile the memory of Theano functions and print it?
It only works when profile=True.
.. attribute:: profile_optimizer
Bool value: either True or False
......@@ -280,6 +289,26 @@ import theano and print the config variable, as in:
Default False
Do the vm/cvm linkers profile the optimization phase when compiling a Theano function?
It only works when profile=True.
.. attribute:: profiling.n_apply
Positive int value, default: 20.
The number of apply nodes to print in the profiler output
.. attribute:: profiling.n_ops
Positive int value, default: 20.
The number of ops to print in the profiler output
.. attribute:: profiling.min_memory_size
Positive int value, default: 1024.
For the memory profile, do not print apply nodes if the size
of their outputs (in bytes) is lower than this.
.. attribute:: config.lib.amdlibm
......
......@@ -60,7 +60,7 @@ from theano.compile import \
FunctionMaker, function, OpFromGraph, \
Component, External, Member, Method, \
Composite, ComponentList, ComponentDict, Module, \
ProfileMode, \
ProfileMode, ProfileStats, \
Param, shared
from theano.misc.safe_asarray import _asarray
......
......@@ -16,10 +16,12 @@ from theano.compile.debugmode import DebugMode
from theano.compile.monitormode import MonitorMode
from theano.compile.profiling import ProfileStats, ScanProfileStats
from theano.compile.profilemode import ProfileMode
from theano.compile.sharedvalue import shared, shared_constructor, SharedVariable
from theano.compile.sharedvalue import (shared, shared_constructor,
SharedVariable)
from theano.compile.pfunc import pfunc, Param, rebuild_collect_shared
from theano.compile.function import function
import atexit
import copy
import os
import time
import theano
from theano.gof.link import WrapLinker
from theano.compile.mode import (Mode, register_mode,
predefined_modes, predefined_linkers,
......@@ -42,6 +44,18 @@ class Profile_Maker(FunctionMaker):
def create(self, input_storage=None, trustme=False):
ret = super(Profile_Maker, self).create(input_storage, trustme)
if (hasattr(theano, 'sandbox') and
hasattr(theano.sandbox, 'cuda') and
theano.sandbox.cuda.cuda_enabled):
if os.environ.get('CUDA_LAUNCH_BLOCKING', '0') != '1':
raise Exception(
"You are running Theano profiler with CUDA enabled."
" Theano GPU ops execution are asynchron by default."
" So by default, the profile is useless."
" You must use set the environment variable"
" CUDA_LAUNCH_BLOCKING to 1 to tell the CUDA drvier to"
" synchonize the execution to get meaning full profile.")
# create a function-specific storage container for profiling info
profile = ProfileStats(atexit_print=False)
self.mode.profile_stats[ret] = profile
......@@ -50,7 +64,6 @@ class Profile_Maker(FunctionMaker):
#initialize the timers
for i, node in enumerate(ret.maker.fgraph.toposort()):
profile.apply_time[node] = 0.0
profile.outputs_size[node] = [0.0] * len(node.outputs)
# a thunk_group is a list of the thunks from each linker
# corresponding to the i'th position in the toposort.
......@@ -70,13 +83,13 @@ class Profile_Maker(FunctionMaker):
def new_fn():
self.mode.apply_time = self.mode.profile_stats[ret].apply_time
self.mode.outputs_size = self.mode.profile_stats[ret].outputs_size
self.mode.variable_shape = self.mode.profile_stats[ret].variable_shape
ret_fn()
# delete the old apply_time variable
# because it doesn't mean the same thing anymore.
# This prevents old code from looking like it still works.
del self.mode.apply_time
del self.mode.outputs_size
del self.mode.variable_shape
ret.fn = new_fn
......@@ -165,34 +178,10 @@ class ProfileMode(Mode):
t0 = time.time()
th()
dt = time.time() - t0
size = []
for o in th.outputs:
if not hasattr(o[0], 'size'):
#if the output type don't have a size attribute, set -1
#to signify we can't evaluate it.
#This happen at least for mtrand.RandomState type(in numpy)
size.append(-1)
continue
s = o[0].size
#can't use o[0].dtype.itemsize as dtype is a str for
#CudaNdarray
dtype = str(o[0].dtype)
dtype2 = dtype[-2:]
if dtype2 == '32':
s *= 4
elif dtype2 == '64':
s *= 8
elif dtype2 == '16':
s *= 2
elif dtype[-1] == '8':
s *= 1
elif dtype[-3:] == '128':
s *= 16
else:
raise Exception("Can't determine the memory size of dtype",
o[0].dtype)
size.append(s)
self.outputs_size[node] = size
for var, data in zip(node.outputs, th.outputs):
sh = getattr(data[0], 'shape', 'input no shape')
self.variable_shape[var] = sh
self.apply_time[node] += max(dt, 1e-14)
self.provided_linker = linker
......@@ -247,15 +236,14 @@ class ProfileMode(Mode):
print i, n
apply_cimpl = {}
outputs_size = {}
for fn, ps in self.profile_stats.items():
apply_cimpl.update(ps.apply_cimpl)
message = self.message
outputs_size = {}
variable_shape = {}
for fn, ps in self.profile_stats.items():
outputs_size.update(ps.outputs_size)
variable_shape.update(ps.variable_shape)
other_time = dict(
linker_time=sum(
......@@ -265,7 +253,7 @@ class ProfileMode(Mode):
self.print_summary_("print_summary",
compile_time, fct_call_time, fct_call,
apply_time, apply_cimpl, message, outputs_size,
apply_time, apply_cimpl, message, variable_shape,
self.local_time, other_time,
**kwargs)
......@@ -300,19 +288,19 @@ class ProfileMode(Mode):
apply_time = diff_dict(self.apply_time, other.apply_time)
apply_cimpl = self.apply_cimpl and other.apply_cimpl
message = self.message
outputs_size = diff_dict(self.outputs_size, other.outputs_size)
variable_shape = diff_dict(self.variable_shape, other.variable_shape)
other_time = {'linker_time': self.linker_time - other.linker_time,
'optimizer_time': self.optimizer_time -
other.optimizer_time}
self.print_summary_("print_diff_summary", compile_time,
fct_call_time, fct_call,
apply_time, apply_cimpl, message, outputs_size,
apply_time, apply_cimpl, message, variable_shape,
print_apply=False, other_time=other_time,
**kwargs)
@staticmethod
def print_summary_(fct_name, compile_time, fct_call_time, fct_call,
apply_time, apply_cimpl, message, outputs_size,
apply_time, apply_cimpl, message, variable_shape,
local_time, other_time,
n_apply_to_print=config.ProfileMode.n_apply_to_print,
n_ops_to_print=config.ProfileMode.n_ops_to_print,
......@@ -329,6 +317,10 @@ class ProfileMode(Mode):
whose outputs memory size is lower then that.
"""
print "ProfileMode is deprecated! Use the new profiler."
print " The Theano flags to enable it ise: profile=True"
print " The Theano flags for the memory profile to it is: profile_memory=True"
total_time = time.time() - import_time
total_fct_time = sum(fct_call_time.values())
total_fct_call = sum(fct_call.values())
......@@ -502,96 +494,17 @@ class ProfileMode(Mode):
print '(*) Op is running a c implementation'
for printer in profiler_printers:
printer(fct_name, compile_time, fct_call_time, fct_call,
apply_time, apply_cimpl, message, outputs_size,
apply_time, apply_cimpl, message, variable_shape,
other_time)
if not outputs_size:
if not variable_shape:
print """\nProfile of Theano intermediate memory disabled.
To enabled, put the Theano flag ProfileMode.profile_memory to True."""
else:
fct_memory={}#fgraph->dict(node->(outputs size))
var_mem = {}
for node, val in outputs_size.items():
fct_memory.setdefault(node.fgraph, {})
fct_memory[node.fgraph][node]=val
for out,v in zip(node.outputs,val):
var_mem[out]=v
print
print "Profile of Theano functions memory:"
print "(This check only the output of each apply node. It don't check the temporary memory used by the op in the apply node.)"
nb_skipped = 0
for fgraph, nodes_mem in fct_memory.iteritems():
size_sum=sum([sum(val) for key,val in nodes_mem.iteritems()])
if size_sum < min_memory_size:
nb_skipped += 1
continue
print "Theano fct:", [fct for fct in fct_call.keys() if fct.maker.fgraph is fgraph][0].name
print " Max without gc, inplace and view (KB)",size_sum/1024
node_memory_size = 0
node_memory_saved_by_view = 0
node_memory_saved_by_inplace = 0
running_memory_size = 0
running_max_memory_size = 0
post_thunk_old_storage = []
items = nodes_mem.items()
items.sort(key=lambda a: a[1])
items.reverse()
order = self.linker.schedule(fgraph)
computed, last_user = gof.link.gc_helper(order)
for node in order:
post_thunk_old_storage.append([ input_idx
for input_idx,input in enumerate(node.inputs)
if (input in computed) and (input not in fgraph.outputs) and node == last_user[input]])
for node,val in items[:n_apply_to_print]:
dmap = getattr(node.op,'destroy_map',None)
vmap = getattr(node.op,'view_map',None)
for idx,v in enumerate(val):
if dmap and idx in dmap:#TODO check the op returned a view
node_memory_saved_by_inplace += v
elif vmap and idx in vmap:#TODO check the op returned a view
node_memory_saved_by_view += v
else:
node_memory_size += v
running_memory_size += v
if running_memory_size > running_max_memory_size:
running_max_memory_size = running_memory_size
old_storage = post_thunk_old_storage[order.index(node)]
for old_s in old_storage:
running_memory_size -= var_mem[node.inputs[old_s]]
pass
pass
print " Max FAST_RUN_NO_GC (KB)", node_memory_size/1024
print " Max FAST_RUN (KB)", running_max_memory_size/1024
print " Memory saved by view (KB)", node_memory_saved_by_view/1024
print " Memory saved by inplace (KB)", node_memory_saved_by_inplace/1024
print " Memory saved by GC (KB)", (node_memory_size-running_max_memory_size)/1024
n_apply_to_print+=10#TODO remove this line
print " <Sum apply outputs (bytes)> <Apply outputs memory size(bytes)> <created/inplace/view> <Apply node>"
print " <created/inplace/view> is taked from the op declaration, not the op exeuction. Use DebugMode to have warning about inplace/view declaration being respected."
n_apply_printed = 0
for key,val in items[:n_apply_to_print]:
if sum(val) < min_memory_size:
break
code = ['c']*len(node.outputs)
for out,inp in getattr(key.op,'destroy_map',{}).iteritems():
code[out] = "i"
for out,inp in getattr(key.op,'view_map',{}).iteritems():
code[out] = "v"
print ' %9dB %s %s %s' % (sum(val), str(val), ' '.join(code), key)
n_apply_printed += 1
print ' ... (remaining %i Apply account for %.2f%%(%d bytes) of the total intermediate memory used)'\
%(max(0, len(nodes_mem)-n_apply_printed),
sum(sum(val) for key, val in items[n_apply_printed:])/float(size_sum),
sum(sum(val) for key, val in items[n_apply_printed:]))
if nb_skipped > 0:
print ' We skipped %d theano function(s). Each of them used less then %dB(theano flags ProfileMode.min_memory_size) of total intermediate memory size'%(nb_skipped, min_memory_size)
print """
The memory profile in ProfileMode is removed!
Use the new profiler. Use the Theano flags
profile=True,profile_memory=True to enable it."""
print
print """Here are tips to potentially make your code run faster
......
......@@ -16,13 +16,15 @@ __contact__ = "theano-dev <theano-dev@googlegroups.com>"
__docformat__ = "restructuredtext en"
import atexit
import copy
import os
import sys
import time
import numpy
import theano
from theano.configparser import AddConfigVar, BoolParam
from theano.configparser import AddConfigVar, BoolParam, IntParam
import_time = time.time()
config = theano.config
......@@ -32,7 +34,23 @@ _atexit_print_file = sys.stderr
AddConfigVar('profiling.time_thunks',
"""Time individual thunks when profiling""",
BoolParam(True))
BoolParam(True))
AddConfigVar('profiling.n_apply',
"Number of apply instances to print by default",
IntParam(20, lambda i: i > 0),
in_c_key=False)
AddConfigVar('profiling.n_ops',
"Number of ops to print by default",
IntParam(20, lambda i: i > 0),
in_c_key=False)
AddConfigVar('profiling.min_memory_size',
"""For the memory profile, do not print apply nodes if the size
of their outputs (in bytes) is lower then this threshold""",
IntParam(1024, lambda i: i >= 0),
in_c_key=False)
def _atexit_print_fn():
......@@ -41,7 +59,9 @@ def _atexit_print_fn():
printed = 0
for ps in _atexit_print_list:
if ps.fct_callcount or ps.compile_time > 0:
ps.summary(file=_atexit_print_file)
ps.summary(file=_atexit_print_file,
n_ops_to_print=config.profiling.n_ops,
n_apply_to_print=config.profiling.n_apply)
printed += 1
else:
print 'Skipping empty Profile'
......@@ -59,7 +79,7 @@ def _atexit_print_fn():
#merge dictonary
for attr in ["apply_time", "apply_callcount",
"apply_cimpl", "outputs_size"]:
"apply_cimpl", "variable_shape", "variable_strides"]:
cum_attr = getattr(cum, attr)
for key, val in getattr(ps, attr).iteritems():
assert key not in cum_attr
......@@ -73,7 +93,9 @@ def _atexit_print_fn():
else:
cum.optimizer_profile = None
cum.summary(file=_atexit_print_file)
cum.summary(file=_atexit_print_file,
n_ops_to_print=config.profiling.n_ops,
n_apply_to_print=config.profiling.n_apply)
atexit.register(_atexit_print_fn)
......@@ -125,8 +147,12 @@ class ProfileStats(object):
# pretty string to print in summary, to identify this output
#
outputs_size = None
# node -> size of allocated output
variable_shape = {}
# Variable -> shapes
#
variable_strides = {}
# Variable -> strides
#
optimizer_time = 0.0
......@@ -154,11 +180,24 @@ class ProfileStats(object):
**kwargs - misc initializers. These should (but need not) match the
names of the class vars declared in this class.
"""
if (hasattr(theano, 'sandbox') and
hasattr(theano.sandbox, 'cuda') and
theano.sandbox.cuda.cuda_enabled):
if os.environ.get('CUDA_LAUNCH_BLOCKING', '0') != '1':
raise Exception(
"You are running Theano profiler with CUDA enabled."
" Theano GPU ops execution are asynchron by default."
" So by default, the profile is useless."
" You must use set the environment variable"
" CUDA_LAUNCH_BLOCKING to 1 to tell the CUDA drvier to"
" synchonize the execution to get meaning full profile.")
self.apply_callcount = {}
self.output_size = {}
self.apply_time = {}
self.apply_cimpl = {}
self.outputs_size = {}
self.variable_shape = {}
self.variable_strides = {}
if flag_time_thunks is None:
self.flag_time_thunks = config.profiling.time_thunks
else:
......@@ -242,7 +281,7 @@ class ProfileStats(object):
return rval
def op_impl(self):
"""dict op -> total number of nodes"""
"""dict op -> 'C' or 'Py' depending how the op is implemented"""
# timing is stored by node, we compute timing by Op on demand
rval = {}
for node in self.apply_callcount:
......@@ -520,6 +559,20 @@ class ProfileStats(object):
print >> file, format_str %(f, ftot, t, t / nb_call, nb_call,
nd_id,
str(a)[:maxlen])
if not config.profile_memory:
continue
for idx, var in enumerate(a.inputs):
sh = self.variable_shape.get(var, 'no shape')
st = self.variable_strides.get(var, 'no strides')
dtype = getattr(var, 'dtype', 'no dtype')
print " input %d: dtype=%s, shape=%s, strides=%s " % (
idx, dtype, sh, st)
for idx, var in enumerate(a.outputs):
sh = self.variable_shape.get(var, 'no shape')
st = self.variable_strides.get(var, 'no strides')
dtype = getattr(var, 'dtype', 'no dtype')
print " output %d: dtype=%s, shape=%s, strides=%s " % (
idx, dtype, sh, st)
# Same as before, this I've sacrificied some information making
# the output more readable
#print >> file, ' %4.1f%% %5.1f%% %5.3fs %5.3fs %.2es %i %s'%(
......@@ -555,21 +608,212 @@ class ProfileStats(object):
# The validation time is a subset of optimizer_time
assert self.validate_time < self.optimizer_time
def summary_memory(self, file, N=None):
fct_memory = {} # fgraph->dict(node->[outputs size])
fct_shapes = {} # fgraph->dict(node->[outputs shapes]))
var_mem = {} # varible->size in bytes; don't include input variables
node_mem = {} # node->total outputs size (only dense outputs)
for node in self.apply_callcount.keys():
fct_memory.setdefault(node.fgraph, {})
fct_memory[node.fgraph].setdefault(node, [])
fct_shapes.setdefault(node.fgraph, {})
fct_shapes[node.fgraph].setdefault(node, [])
sum_dense = 0
for out in node.outputs:
sh = self.variable_shape[out]
if hasattr(out.type, 'get_size'):
v = out.type.get_size(sh)
sum_dense += v
else:
v = "Unknown"
var_mem[out] = v
fct_memory[node.fgraph][node].append(v)
fct_shapes[node.fgraph][node].append(sh)
node_mem[node] = sum_dense
#Find the function that used the most of that statistic
max_sum_size = 0
max_node_memory_size = 0
max_running_max_memory_size = 0
max_node_memory_saved_by_view = 0
max_node_memory_saved_by_inplace = 0
for fgraph, nodes_mem in fct_memory.iteritems():
# Sum of the size of all variables in bytes
sum_size = sum([sum([v for v in val if not isinstance(v, str)])
for key, val in nodes_mem.iteritems()])
# Sum of the size of all variables that actually allocate
# memory (excluding views, and inplace);
node_memory_size = 0
# The sum of memory saved by returning view instead of new
# allocation
node_memory_saved_by_view = 0
# The sum of memory saved by reusing the input instead of
# new allocation
node_memory_saved_by_inplace = 0
# The memory allocated after the current apply node
running_memory_size = 0
# The maximum of running_memory_size during the function
running_max_memory_size = 0
order = fgraph.toposort()
# A list of intermediate variable that are not need
# after the execution of the corresponding node.
# It mean that after executing the node,
# the corresponding variable can be gc.
post_thunk_old_storage = []
computed, last_user = theano.gof.link.gc_helper(order)
for node in order:
post_thunk_old_storage.append([
input_idx
for input_idx, input in enumerate(node.inputs)
if (input in computed) and
(input not in fgraph.outputs) and
node == last_user[input]])
for node in order:
val = nodes_mem[node]
dmap = getattr(node.op, 'destroy_map', None)
vmap = getattr(node.op, 'view_map', None)
for idx, v in enumerate(val):
# TODO check the op returned a view
if dmap and idx in dmap:
node_memory_saved_by_inplace += v
# TODO check the op returned a view
elif vmap and idx in vmap:
node_memory_saved_by_view += v
elif not isinstance(v, str):
node_memory_size += v
running_memory_size += v
if running_memory_size > running_max_memory_size:
running_max_memory_size = running_memory_size
old_storage = post_thunk_old_storage[order.index(node)]
for old_s in old_storage:
old_v = var_mem[node.inputs[old_s]]
if not isinstance(old_v, str):
running_memory_size -= old_v
# Store the max of some stats by any function in this profile.
max_sum_size = max(max_sum_size, sum_size)
max_node_memory_size = max(max_node_memory_size, node_memory_size)
max_running_max_memory_size = max(max_running_max_memory_size,
running_max_memory_size)
max_node_memory_saved_by_view = max(max_node_memory_saved_by_view,
node_memory_saved_by_view)
max_node_memory_saved_by_inplace = max(
max_node_memory_saved_by_inplace, node_memory_saved_by_inplace)
del fgraph, nodes_mem, post_thunk_old_storage, node
if len(fct_memory) > 1:
print >> file, ("Memory Profile "
"(the max between all function in that profile)")
else:
print >> file, "Memory Profile"
print >> file, "(Sparse variables are ignored)"
print >> file, "---"
# print >> file, " Max if no gc, inplace and view: %dKB" % int(
# round(max_sum_size / 1024))
print >> file, " Max if linker=cvm (default): unknow"
print >> file, " Max if no gc (allow_gc=False): %dKB" % int(round(
max_node_memory_size / 1024.))
print >> file, " Max if linker=c|py: %dKB" % int(round(
max_running_max_memory_size / 1024.))
# print >> file, " Memory saved if view are used: %dKB" % int(round(
# max_node_memory_saved_by_view / 1024.))
# print >> file, " Memory saved if inplace op are used: %dKB" % int(
# round(max_node_memory_saved_by_inplace / 1024.))
print >> file, " Memory saved if gc is enabled (linker=c|py): %dKB" % int(
round(max_node_memory_size - max_running_max_memory_size) / 1024.)
if (hasattr(theano, 'sandbox') and
hasattr(theano.sandbox, 'cuda') and
hasattr(theano.sandbox.cuda, 'cuda_ndarray') and
hasattr(theano.sandbox.cuda.cuda_ndarray.cuda_ndarray,
'theano_allocated')):
_, gpu_max = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray.theano_allocated()
print >> file, (" Max Memory allocated on the GPU"
"(for all functions): %dKB" %
int(round(gpu_max / 1024.)))
print >> file, ""
if len(fct_memory) > 1:
print >> file, (
" This list is based on all functions in the profile")
print >> file, (" <Sum apply outputs (bytes)>"
" <Apply outputs shape>"
" <created/inplace/view>"
" <Apply node>")
print >> file, ""
items = node_mem.items()
items.sort(key=lambda a: a[1])
items.reverse()
for idx, (node, node_outputs_size) in enumerate(items[:N]):
code = ['c'] * len(node.outputs)
for out, inp in getattr(node.op, 'destroy_map', {}).iteritems():
code[out] = "i"
for out, inp in getattr(node.op, 'view_map', {}).iteritems():
code[out] = "v"
shapes = str(fct_shapes[node.fgraph][node])
if all([hasattr(out.type, 'get_size')
for out in node.outputs]):
size = "%9dB" % node_outputs_size
if node_outputs_size < config.profiling.min_memory_size:
N = idx
break
else:
size = "%10s" % "Unknown"
print >> file, ' %s %s %s %s' % (size,
shapes,
' '.join(code), node)
sum_remaining = sum(size for _, size in items[N:])
size_sum_dense = sum(node_mem.values())
if size_sum_dense == 0:
p = "0%"
else:
p = "(%.2f%%)" % (float(sum_remaining) / size_sum_dense * 100)
print >> file, (
' ... (remaining %i Apply account for %4dB/%dB (%s) of the'
' Apply with dense outputs sizes)') % (max(0, len(node_mem) - N),
sum_remaining,
size_sum_dense, p
)
print >> file, ''
if N == 0:
print >> file, (' All Apply node have outputs size that take'
' less then %dB.' %
config.profiling.min_memory_size)
print >> file, (
" <created/inplace/view> is taked from the op declaration.")
print >> file, (" Apply nodes marked 'inplace' or 'view' may"
" actually allocate memory, this is not reported"
" here. If you use DebugMode, warnings will be"
" emitted in those cases.")
print >> file, ''
def summary(self, file=sys.stderr, n_ops_to_print=20,
n_applies_to_print=20):
n_apply_to_print=20):
self.summary_function(file)
local_time = sum(self.apply_time.values())
if local_time > 0:
self.summary_class(file, n_ops_to_print)
self.summary_ops(file, n_ops_to_print)
self.summary_nodes(file, n_applies_to_print)
self.summary_nodes(file, n_apply_to_print)
elif self.fct_callcount > 0:
print >> file, (" No node time accumulated "
print >> file, (" No execution time accumulated "
"(hint: try config profiling.time_thunks=1)")
if config.profile_memory:
self.summary_memory(file, n_apply_to_print)
if self.optimizer_profile:
print "Optimizer Profile"
print "-----------------"
self.optimizer_profile[0].print_profile(file, self.optimizer_profile[1])
self.optimizer_profile[0].print_profile(file,
self.optimizer_profile[1])
if 0: # old code still to be ported from ProfileMode
......@@ -701,80 +945,6 @@ if 0: # old code still to be ported from ProfileMode
if hasattr(i.type, 'dtype') and i.type.dtype == 'float64':
print fct.name, i.name, i.type, i
if outputs_size:
fct_memory={}#fgraph->dict(node->(outputs size))
var_mem = {}
for node,val in outputs_size.items():
fct_memory.setdefault(node.fgraph,{})
fct_memory[node.fgraph][node]=val
for out,v in zip(node.outputs,val):
var_mem[out]=v
print
print "Profile of Theano functions memory:"
for fgraph, nodes_mem in fct_memory.iteritems():
print "Theano fct:", [fct for fct in fct_call.keys() if fct.maker.fgraph is fgraph][0].name
size_sum=sum([sum(val) for key,val in nodes_mem.iteritems()])
print " Max without gc, inplace and view (KB)",size_sum/1024
node_memory_size = 0
node_memory_saved_by_view = 0
node_memory_saved_by_inplace = 0
running_memory_size = 0
running_max_memory_size = 0
post_thunk_old_storage = []
items = nodes_mem.items()
items.sort(key=lambda a: a[1])
items.reverse()
order = fgraph.toposort()
computed, last_user = gc_helper(order)
for node in order:
post_thunk_old_storage.append([ input_idx
for input_idx,input in enumerate(node.inputs)
if (input in computed) and (input not in fgraph.outputs) and node == last_user[input]])
for node,val in items[:n_apply_to_print]:
dmap = getattr(node.op,'destroy_map',None)
vmap = getattr(node.op,'view_map',None)
for idx,v in enumerate(val):
if dmap and idx in dmap:#TODO check the op returned a view
node_memory_saved_by_inplace += v
elif vmap and idx in vmap:#TODO check the op returned a view
node_memory_saved_by_view += v
else:
node_memory_size += v
running_memory_size += v
if running_memory_size > running_max_memory_size:
running_max_memory_size = running_memory_size
old_storage = post_thunk_old_storage[order.index(node)]
for old_s in old_storage:
running_memory_size -= var_mem[node.inputs[old_s]]
pass
pass
print " Max FAST_RUN_NO_GC (KB)", node_memory_size/1024
print " Max FAST_RUN (KB)", running_max_memory_size/1024
print " Memory saved by view (KB)", node_memory_saved_by_view/1024
print " Memory saved by inplace (KB)", node_memory_saved_by_inplace/1024
print " Memory saved by GC (KB)", (node_memory_size-running_max_memory_size)/1024
n_apply_to_print+=10#TODO remove this line
print " <Sum apply outputs (bytes)> <Apply outputs memory size(bytes)> <created/inplace/view> <Apply node>"
print " <created/inplace/view> is taked from the op declaration, not the op exeuction. Use DebugMode to have warning about inplace/view declaration being respected."
for key,val in items[:n_apply_to_print]:
code = ['c']*len(node.outputs)
for out,inp in getattr(key.op,'destroy_map',{}).iteritems():
code[out] = "i"
for out,inp in getattr(key.op,'view_map',{}).iteritems():
code[out] = "v"
print ' %9dB %s %s %s' % (sum(val), str(val), ' '.join(code), key)
print ' ... (remaining %i Apply account for %.2f%%(%.2fs) of the runtime)'\
%(max(0, len(nodes_mem)-n_ops_to_print),
sum(sum(val) for key, val in items[n_ops_to_print:]),
sum(sum(val) for key, val in items[n_ops_to_print:])/size_sum)
print
print "Here are tips to potentially make your code run faster (if you think of new ones, suggest them on the mailing list). Test them first as they are not guaranteed to always provide a speedup."
from theano import tensor as T
......
......@@ -343,6 +343,15 @@ class PureType(object):
"""
return self.values_eq(a, b)
# def get_shape_info(self, obj):
"""
Optional function. See TensorType().get_shape_info for definition
"""
# def get_size(self, shape_info):
"""
Optional function. See TensorType().get_size for definition
"""
_nothing = """
"""
......
......@@ -5,6 +5,7 @@ VM was a better name at some point.
"""
import link
import logging
import os
import sys
import time
import warnings
......@@ -23,6 +24,9 @@ AddConfigVar('profile',
AddConfigVar('profile_optimizer',
"If VM should collect optimizer profile information",
BoolParam(False))
AddConfigVar('profile_memory',
"If VM should collect memory profile information and print it",
BoolParam(False))
def filter_vm_lazy(val):
......@@ -130,6 +134,10 @@ class VM(object):
profile.apply_cimpl[node] = hasattr(thunk, 'cthunk')
if hasattr(self, 'variable_shape'):
profile.variable_shape = self.variable_shape.copy()
profile.variable_strides = self.variable_strides.copy()
# clear the timer info out of the buffers
for i in xrange(len(self.call_times)):
self.call_times[i] = 0.0
......@@ -244,8 +252,8 @@ class Stack(VM):
self.base_apply_stack = [o.owner for o in fgraph.outputs if o.owner]
self.outputs = fgraph.outputs
self.storage_map = storage_map
self.apply_time = {}
self.outputs_size = {}
self.variable_shape = {} # Variable -> shape
self.variable_strides = {} # Variable -> strides
self.compute_map = compute_map
self.node_idx = node_idx = {}
self.callback = callback
......@@ -254,8 +262,7 @@ class Stack(VM):
for i, node in enumerate(self.nodes):
node_idx[node] = i
self.apply_time[node] = 0
self.outputs_size[node] = []
# XXX: inconsistent style - why modify node here rather
# than track destroy_dependencies with dictionary like
# storage_map?
......@@ -280,10 +287,6 @@ class Stack(VM):
if self.allow_gc and self.dependencies is None:
raise ValueError("Must set dependencies when using GC")
if config.profile:
self.memory_size_map = {"nt8": 1, "t16": 2, "t32": 4,
"t64": 8, "128": 16}
atexit.register(self.atexit_print_all)
def run_thunk_of_node(self, node):
"""Run the thunk corresponding to Apply instance `node`
......@@ -319,7 +322,24 @@ class Stack(VM):
# apply_stack contains nodes
apply_stack = list(self.base_apply_stack)
last_apply_stack_len = -1
ls = []
#This record all function inputs/shared varibles and constants
for var, data in self.storage_map.iteritems():
if data[0] is None:
continue
if hasattr(var.type, 'get_shape_info'):
sh = var.type.get_shape_info(data[0])
else:
sh = 'input no shape'
self.variable_shape[var] = sh
st = getattr(data[0], 'strides', 'input no strides')
if getattr(data[0], 'flags', False) and data[0].flags.c_contiguous:
st = 'c'
elif (hasattr(data[0], 'is_c_contiguous') and
data[0].is_c_contiguous()):
st = "c"
self.variable_strides[var] = st
while apply_stack:
# Make sure something happened last time round. This is
# just a safety check to make sure the op is written
......@@ -356,25 +376,27 @@ class Stack(VM):
_, dt = self.run_thunk_of_node(current_apply)
del _
if config.profile:
self.apply_time[current_apply] += dt
current_idx = self.node_idx[current_apply]
self.call_counts[current_idx] += 1
self.call_times[current_idx] += dt
## Computing the memory footprint of the the op
# ?? What about inplace .. if the op is inplace
# you don't actually ask for more memory!
size = []
for (idx, o) in enumerate(
thunks[self.node_idx[
current_apply]].outputs):
if not hasattr(o[0], 'size'):
size.append(-1)
continue
s = o[0].size
dtype = str(o[0].dtype)
dtype2 = dtype[-3:]
# KeyError here: couldn't determine
# the dtype memory size
s *= self.memory_size_map[dtype2]
size.append(s)
self.outputs_size[current_apply] = size
var = self.nodes[current_idx].outputs[idx]
if hasattr(var.type, 'get_shape_info'):
sh = var.type.get_shape_info(o[0])
else:
sh = 'input no shape'
self.variable_shape[var] = sh
st = getattr(o[0], 'strides',
'input no strides')
if (getattr(o[0], 'flags', False) and
o[0].flags.c_contiguous):
st = 'c'
self.variable_strides[var] = st
except Exception:
raise_with_op(current_apply)
for o in current_apply.outputs:
......@@ -429,7 +451,9 @@ class Stack(VM):
try:
requires, dt = self.run_thunk_of_node(current_apply)
self.apply_time[current_apply] += dt
current_idx = self.node_idx[current_apply]
self.call_counts[current_idx] += 1
self.call_times[current_idx] += dt
except Exception:
raise_with_op(current_apply)
......@@ -444,20 +468,22 @@ class Stack(VM):
apply_stack.append(current_apply.inputs[r].owner)
else:
if config.profile:
size = []
for (idx, o) in enumerate(thunks[
self.node_idx[current_apply]].outputs):
if not hasattr(o[0], 'size'):
size.append(-1)
continue
s = o[0].size
dtype = str(o[0].dtype)
dtype2 = dtype[-2:]
# KeyError here: couldn't determine the
# dtype memory size
s *= self.memory_size_map[dtype2]
size.append(s)
self.outputs_size[current_apply] = size
var = self.nodes[
self.node_idx[current_apply]].outputs[idx]
if hasattr(var.type, 'get_shape_info'):
sh = var.type.get_shape_info(o[0])
else:
sh = 'input no shape'
self.variable_shape[var] = sh
st = getattr(o[0], 'strides', 'input no strides')
if (getattr(o[0], 'flags', False) and
o[0].flags.c_contiguous):
st = 'c'
self.variable_strides[var] = st
if self.allow_gc:
for i in current_apply.inputs:
if (dependencies[i] and i.owner and
......@@ -550,6 +576,19 @@ class VM_Linker(link.LocalLinker):
:returns: self if fgraph is the first FunctionGraph that has ever been
associated to self, else, a new VM_Linker associated to fgraph.
"""
if (config.profile and
hasattr(theano, 'sandbox') and
hasattr(theano.sandbox, 'cuda') and
theano.sandbox.cuda.cuda_enabled):
if os.environ.get('CUDA_LAUNCH_BLOCKING', '0') != '1':
raise Exception(
"You are running Theano profiler with CUDA enabled."
" Theano GPU ops execution are asynchron by default."
" So by default, the profile is useless."
" You must use set the environment variable"
" CUDA_LAUNCH_BLOCKING to 1 to tell the CUDA drvier to"
" synchonize the execution to get meaning full profile.")
if no_recycling is None:
no_recycling = []
if self.fgraph is not None and self.fgraph is not fgraph:
......@@ -606,7 +645,6 @@ class VM_Linker(link.LocalLinker):
# XXX if k has no clients... what is it doing in the computation?
if k.owner and k.clients:
ls = []
is_output = 0
for cl in k.clients:
if cl[0] is not 'output':
ls += cl[0].outputs
......@@ -623,9 +661,12 @@ class VM_Linker(link.LocalLinker):
pre_call_clear = [storage_map[v] for v in self.no_recycling]
if self.callback is not None:
if self.use_cloop:
logger.warn('CLoop does not support callback, using Stack VM.')
if self.callback is not None or (config.profile and config.profile_memory):
if self.use_cloop and self.callback is not None:
logger.warn('CVM does not support callback, using Stack VM.')
if self.use_cloop and config.profile_memory:
warnings.warn(
'CVM does not support memory profile, using Stack VM.')
deps = None
if self.allow_gc:
deps = self.compute_gc_dependencies(storage_map)
......
......@@ -68,12 +68,13 @@ def execute(execute=True, verbose=True, M=2000, N=2000, K=2000,
order=order))
c = theano.shared(numpy.ones((M, K), dtype=theano.config.floatX,
order=order))
f = theano.function([], updates=[(c, 0.4 * c + .8 * T.dot(a, b))],
mode=theano.compile.ProfileMode())
f = theano.function([], updates=[(c, 0.4 * c + .8 * T.dot(a, b))])
if any([x.op.__class__.__name__ == 'Gemm' for x in
f.maker.fgraph.toposort()]):
c_impl = f.profile.apply_cimpl.values()
c_impl = [hasattr(thunk, 'cthunk')
for node, thunk in zip(f.fn.nodes, f.fn.thunks)
if node.op.__class__.__name__ == "Gemm"]
assert len(c_impl) == 1
if c_impl[0]:
impl = 'CPU (with direct Theano binding to blas)'
......
......@@ -44,8 +44,11 @@ static PyObject *CudaNdarray_get_shape(CudaNdarray *self, void *closure);
*
*/
int _outstanding_mallocs[] = {0,0};
#if COMPUTE_GPU_MEM_USED
int _allocated_size = 0;
int _max_allocated_size = 0;
const int TABLE_SIZE = 10000;
struct table_struct{
void* ptr;
......@@ -82,8 +85,15 @@ void * device_malloc(size_t size, int verbose)
"Error allocating %li bytes of device memory (%s).", (long)size, cudaGetErrorString(err));
return NULL;
}
_outstanding_mallocs[0] += (rval != NULL);
#if COMPUTE_GPU_MEM_USED
if (rval != NULL){
// Can it happen that cudaMalloc return cudaSuccess, but return a NULL ptr?
// Could this be what happen if size is 0?
_outstanding_mallocs[0] += 1;
#if COMPUTE_GPU_MEM_USED
_allocated_size += size;
_max_allocated_size = std::max(_max_allocated_size, _allocated_size);
for(int i=0;i<TABLE_SIZE;i++){
if(NULL==_alloc_size_table[i].ptr){
_alloc_size_table[i].ptr=rval;
......@@ -91,8 +101,8 @@ void * device_malloc(size_t size, int verbose)
break;
}
}
_allocated_size += size;
#endif
#endif
}
//fprintf(stderr,
//"allocated %li bytes of device memory (%s). new total bytes allocated: %d. ptr: %p\n",
//(long)size, cudaGetErrorString(err),_allocated_size,rval);
......@@ -2507,6 +2517,7 @@ CudaNdarray_synchronize(PyObject* _unused, PyObject* dummy)
Py_INCREF(Py_None);
return Py_None;
}
#if COMPUTE_GPU_MEM_USED
/*
* Return the size in bytes that Theano currently have allocated on the gpu.
......@@ -2514,7 +2525,13 @@ CudaNdarray_synchronize(PyObject* _unused, PyObject* dummy)
PyObject *
GetTheanoAllocInfo(PyObject* _unused, PyObject* dummy)
{
    /* Return a 2-tuple (bytes currently allocated on the GPU by Theano,
     * peak bytes allocated).
     *
     * Py_BuildValue creates both ints and the tuple in one step and,
     * unlike the previous hand-rolled PyLong_FromLong/PyTuple_New
     * sequence, does not leak the intermediate PyLongs when an
     * allocation fails: on error it returns NULL with the exception
     * already set, which the caller propagates. */
    return Py_BuildValue("(ll)", (long)_allocated_size,
                         (long)_max_allocated_size);
}
#endif
......@@ -2529,6 +2546,11 @@ static PyGetSetDef CudaNdarray_getset[] = {
(setter)CudaNdarray_set_strides,
"data pointer strides (in elements)",
NULL},
{"strides",
(getter)CudaNdarray_get_strides,
(setter)CudaNdarray_set_strides,
"data pointer strides (in elements)",
NULL},
//gpudata is needed to allow calling pycuda fct with CudaNdarray input.
{"gpudata",
(getter)CudaNdarray_get_dev_data,
......
......@@ -417,6 +417,15 @@ class CudaNdarrayType(Type):
def c_compile_args(self):
return []
def get_shape_info(self, obj):
    """Memory-profiler hook: describe ``obj`` for later sizing.

    The returned shape tuple is what :meth:`get_size` consumes when the
    memory profile is printed, so only the cheap ``shape`` lookup is
    done here, during function execution.
    """
    return obj.shape
def get_size(self, shape_info):
    """Number of data bytes for an array whose shape is ``shape_info``.

    :param shape_info: the shape tuple produced by get_shape_info()
    :return: element count times the dtype's itemsize; an empty shape
        tuple denotes a 0-d array (scalar), i.e. one element.
    """
    itemsize = numpy.dtype(self.dtype).itemsize
    if not shape_info:  # () -> 0-d array: a single element
        return itemsize
    return numpy.prod(shape_info) * itemsize
theano.compile.ops.expandable_types += (CudaNdarrayType,)
# Register C code for ViewOp on CudaNdarrayType
......
......@@ -423,6 +423,12 @@ class Scalar(Type):
return (4,) # explicit T given in specialization of operator=
# lines. This makes it compile with open64
def get_shape_info(self, obj):
    """Memory-profiler hook: a scalar's "shape info" is its itemsize.

    The itemsize already IS the number of bytes, so get_size() can
    return it unchanged.
    """
    return obj.itemsize
def get_size(self, shape_info):
    """Number of bytes taken by the scalar described by ``shape_info``.

    get_shape_info() already returns the itemsize in bytes, so it is
    passed through as-is.
    """
    return shape_info
# Register C code for ViewOp on Scalars.
theano.compile.register_view_op_c_code(
Scalar,
......@@ -460,6 +466,9 @@ class _scalar_py_operators:
# variables and Tensor variables
ndim = 0
dtype = property(lambda self: self.type.dtype)
""" The dtype of this scalar. """
#UNARY
def __abs__(self):
return abs_(self)
......
......@@ -147,6 +147,17 @@ class SparseType(gof.Type):
def is_valid_value(self, a):
return scipy.sparse.issparse(a) and (a.format == self.format)
def get_shape_info(self, obj):
    """Memory-profiler hook: describe a sparse matrix for later sizing.

    Returns a 5-tuple (shape, #data elements, #indices, #indptr, nnz)
    that get_size() turns into a byte count when the memory profile is
    printed.
    """
    sp = self.filter(obj)
    # get_size() hard-codes int32 for the index arrays; fail loudly if
    # the matrix uses another index dtype so the size is not misreported.
    assert sp.indices.dtype == 'int32'
    assert sp.indptr.dtype == 'int32'
    return (sp.shape, sp.data.size, sp.indices.size, sp.indptr.size, sp.nnz)
def get_size(self, shape_info):
    """Number of bytes for the sparse matrix described by ``shape_info``.

    :param shape_info: the 5-tuple produced by get_shape_info()
    :return: bytes of the data array plus bytes of the two int32 index
        arrays (indices and indptr); the container itself is excluded.
    """
    data_bytes = shape_info[1] * numpy.dtype(self.dtype).itemsize
    index_bytes = (shape_info[2] + shape_info[3]) * numpy.dtype('int32').itemsize
    return data_bytes + index_bytes
# Register SparseType's C code for ViewOp.
theano.compile.register_view_op_c_code(
SparseType,
......
......@@ -1198,6 +1198,38 @@ class TensorType(Type):
"""
return numpy.zeros(shape, dtype=self.dtype)
def get_shape_info(self, obj):
    """Return the information needed to compute the memory size of obj.

    The memory size covers only the data, not the container: for an
    ndarray this means the data buffer, excluding the ndarray object
    itself and bookkeeping such as shape and strides.

    get_shape_info() and get_size() work in tandem for the memory
    profiler.  get_shape_info() runs during the execution of the
    function, so it should be fast; get_size() is called with this
    function's output only when the memory profile is printed.

    :param obj: the object that this Type represents during execution
    :return: a Python object that self.get_size() understands
    """
    return obj.shape
def get_size(self, shape_info):
    """Number of bytes taken by the object represented by shape_info.

    :param shape_info: the output of the call to get_shape_info()
    :return: the number of bytes taken by the object described in
        shape_info.
    """
    itemsize = numpy.dtype(self.dtype).itemsize
    if not shape_info:  # () -> 0-d array: a single element
        return itemsize
    return numpy.prod(shape_info) * itemsize
theano.compile.ops.expandable_types += (TensorType,)
# Register TensorType C code for ViewOp.
......@@ -5158,8 +5190,8 @@ def batched_dot(x, y):
iterating over the first dimension using scan.
Returns a tensor of size e.g. if it is 3D: (dim1, dim3, dim4)
Example:
>>> first = T.tensor3('first')
>>> second = T.tensor3('second')
>>> first = tensor.tensor3('first')
>>> second = tensor.tensor3('second')
>>> result = batched_dot(first, second)
:note: This is a subset of numpy.einsum, but we do not provide it for now.
But numpy einsum is slower than dot or tensordot:
......
......@@ -56,6 +56,25 @@ class RandomStateType(gof.Type):
return False
return True
def get_shape_info(self, obj):
    """Memory-profiler hook.

    A RandomState has constant size regardless of the object, so no
    per-object information is collected (see get_size()).
    """
    return None
def get_size(self, shape_info):
    """Number of bytes taken by a RandomState object.

    The size is constant (the Mersenne-Twister state), so shape_info is
    ignored; a freshly created RandomState is measured instead.  The
    state tuple is (str id, uint32 key array, int pos, int has_gauss,
    float cached_gaussian).
    """
    total = 0
    for part in numpy.random.RandomState().get_state():
        if isinstance(part, str):
            total += len(part)
        elif isinstance(part, numpy.ndarray):
            total += part.size * part.itemsize
        elif isinstance(part, int):
            total += numpy.dtype("int").itemsize
        elif isinstance(part, float):
            total += numpy.dtype("float").itemsize
        else:
            raise NotImplementedError()
    return total
# Register RandomStateType's C code for ViewOp.
theano.compile.register_view_op_c_code(
RandomStateType,
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论