提交 91a31814 authored 作者: lamblin's avatar lamblin

Merge pull request #1277 from nouiz/memory_prof

Memory profiler in the new profiler
...@@ -106,6 +106,38 @@ default values. ...@@ -106,6 +106,38 @@ default values.
*Default:* ``id(self)`` *Default:* ``id(self)``
.. method:: get_shape_info(obj)
Optional. Only needed to profile the memory of this Type of object
Return the information needed to compute the memory size of obj.
The memory size is only the data, so this exclude the container.
For an ndarray, this is the data, but not the ndarray object and
others data structures as shape and strides.
get_shape_info() and get_size() work in tendem for the memory profiler.
get_shape_info() is called during the execution of the function.
So it is better that it is not too slow.
get_size() will be called with the output of this function
when printing the memory profile.
:param obj: The object that this Type represent during execution
:return: Python object that self.get_size() understand
.. method:: get_size(shape_info)
Number of bytes taken by the object represented by shape_info
Optional. Only needed to profile the memory of this Type of object
:param shape_info: the output of the call to get_shape_info()
:return: the number of bytes taken by the object described in
shape_info.
"""
For each method, the *default* is what ``Type`` defines For each method, the *default* is what ``Type`` defines
for you. So, if you create an instance of ``Type`` or an for you. So, if you create an instance of ``Type`` or an
instance of a subclass of ``Type``, you instance of a subclass of ``Type``, you
......
...@@ -273,6 +273,15 @@ import theano and print the config variable, as in: ...@@ -273,6 +273,15 @@ import theano and print the config variable, as in:
Do the vm/cvm linkers profile the execution of Theano functions? Do the vm/cvm linkers profile the execution of Theano functions?
.. attribute:: profile_memory
Bool value: either True or False
Default False
Do the vm/cvm linkers profile the memory of Theano functions get printed?
It only work when profile=True.
.. attribute:: profile_optimizer .. attribute:: profile_optimizer
Bool value: either True or False Bool value: either True or False
...@@ -280,6 +289,26 @@ import theano and print the config variable, as in: ...@@ -280,6 +289,26 @@ import theano and print the config variable, as in:
Default False Default False
Do the vm/cvm linkers profile the optimization phase when compiling a Theano function? Do the vm/cvm linkers profile the optimization phase when compiling a Theano function?
It only work when profile=True.
.. attribute:: profiling.n_apply
Positive int value, default: 20.
The number of apply node to print in the profiler output
.. attribute:: profiling.n_ops
Positive int value, default: 20.
The number of ops to print in the profiler output
.. attribute:: profiling.min_memory_size
Positive int value, default: 1024.
For the memory profile, do not print apply nodes if the size
of their outputs (in bytes) is lower then this.
.. attribute:: config.lib.amdlibm .. attribute:: config.lib.amdlibm
......
...@@ -60,7 +60,7 @@ from theano.compile import \ ...@@ -60,7 +60,7 @@ from theano.compile import \
FunctionMaker, function, OpFromGraph, \ FunctionMaker, function, OpFromGraph, \
Component, External, Member, Method, \ Component, External, Member, Method, \
Composite, ComponentList, ComponentDict, Module, \ Composite, ComponentList, ComponentDict, Module, \
ProfileMode, \ ProfileMode, ProfileStats, \
Param, shared Param, shared
from theano.misc.safe_asarray import _asarray from theano.misc.safe_asarray import _asarray
......
...@@ -16,10 +16,12 @@ from theano.compile.debugmode import DebugMode ...@@ -16,10 +16,12 @@ from theano.compile.debugmode import DebugMode
from theano.compile.monitormode import MonitorMode from theano.compile.monitormode import MonitorMode
from theano.compile.profiling import ProfileStats, ScanProfileStats
from theano.compile.profilemode import ProfileMode from theano.compile.profilemode import ProfileMode
from theano.compile.sharedvalue import shared, shared_constructor, SharedVariable from theano.compile.sharedvalue import (shared, shared_constructor,
SharedVariable)
from theano.compile.pfunc import pfunc, Param, rebuild_collect_shared from theano.compile.pfunc import pfunc, Param, rebuild_collect_shared
from theano.compile.function import function from theano.compile.function import function
import atexit import atexit
import copy import copy
import os
import time import time
import theano
from theano.gof.link import WrapLinker from theano.gof.link import WrapLinker
from theano.compile.mode import (Mode, register_mode, from theano.compile.mode import (Mode, register_mode,
predefined_modes, predefined_linkers, predefined_modes, predefined_linkers,
...@@ -42,6 +44,18 @@ class Profile_Maker(FunctionMaker): ...@@ -42,6 +44,18 @@ class Profile_Maker(FunctionMaker):
def create(self, input_storage=None, trustme=False): def create(self, input_storage=None, trustme=False):
ret = super(Profile_Maker, self).create(input_storage, trustme) ret = super(Profile_Maker, self).create(input_storage, trustme)
if (hasattr(theano, 'sandbox') and
hasattr(theano.sandbox, 'cuda') and
theano.sandbox.cuda.cuda_enabled):
if os.environ.get('CUDA_LAUNCH_BLOCKING', '0') != '1':
raise Exception(
"You are running Theano profiler with CUDA enabled."
" Theano GPU ops execution are asynchron by default."
" So by default, the profile is useless."
" You must use set the environment variable"
" CUDA_LAUNCH_BLOCKING to 1 to tell the CUDA drvier to"
" synchonize the execution to get meaning full profile.")
# create a function-specific storage container for profiling info # create a function-specific storage container for profiling info
profile = ProfileStats(atexit_print=False) profile = ProfileStats(atexit_print=False)
self.mode.profile_stats[ret] = profile self.mode.profile_stats[ret] = profile
...@@ -50,7 +64,6 @@ class Profile_Maker(FunctionMaker): ...@@ -50,7 +64,6 @@ class Profile_Maker(FunctionMaker):
#initialize the timers #initialize the timers
for i, node in enumerate(ret.maker.fgraph.toposort()): for i, node in enumerate(ret.maker.fgraph.toposort()):
profile.apply_time[node] = 0.0 profile.apply_time[node] = 0.0
profile.outputs_size[node] = [0.0] * len(node.outputs)
# a thunk_group is a list of the thunks from each linker # a thunk_group is a list of the thunks from each linker
# corresponding to the i'th position in the toposort. # corresponding to the i'th position in the toposort.
...@@ -70,13 +83,13 @@ class Profile_Maker(FunctionMaker): ...@@ -70,13 +83,13 @@ class Profile_Maker(FunctionMaker):
def new_fn(): def new_fn():
self.mode.apply_time = self.mode.profile_stats[ret].apply_time self.mode.apply_time = self.mode.profile_stats[ret].apply_time
self.mode.outputs_size = self.mode.profile_stats[ret].outputs_size self.mode.variable_shape = self.mode.profile_stats[ret].variable_shape
ret_fn() ret_fn()
# delete the old apply_time variable # delete the old apply_time variable
# because it doesn't mean the same thing anymore. # because it doesn't mean the same thing anymore.
# This prevents old code from looking like it still works. # This prevents old code from looking like it still works.
del self.mode.apply_time del self.mode.apply_time
del self.mode.outputs_size del self.mode.variable_shape
ret.fn = new_fn ret.fn = new_fn
...@@ -165,34 +178,10 @@ class ProfileMode(Mode): ...@@ -165,34 +178,10 @@ class ProfileMode(Mode):
t0 = time.time() t0 = time.time()
th() th()
dt = time.time() - t0 dt = time.time() - t0
size = [] for var, data in zip(node.outputs, th.outputs):
for o in th.outputs: sh = getattr(data[0], 'shape', 'input no shape')
if not hasattr(o[0], 'size'): self.variable_shape[var] = sh
#if the output type don't have a size attribute, set -1
#to signify we can't evaluate it.
#This happen at least for mtrand.RandomState type(in numpy)
size.append(-1)
continue
s = o[0].size
#can't use o[0].dtype.itemsize as dtype is a str for
#CudaNdarray
dtype = str(o[0].dtype)
dtype2 = dtype[-2:]
if dtype2 == '32':
s *= 4
elif dtype2 == '64':
s *= 8
elif dtype2 == '16':
s *= 2
elif dtype[-1] == '8':
s *= 1
elif dtype[-3:] == '128':
s *= 16
else:
raise Exception("Can't determine the memory size of dtype",
o[0].dtype)
size.append(s)
self.outputs_size[node] = size
self.apply_time[node] += max(dt, 1e-14) self.apply_time[node] += max(dt, 1e-14)
self.provided_linker = linker self.provided_linker = linker
...@@ -247,15 +236,14 @@ class ProfileMode(Mode): ...@@ -247,15 +236,14 @@ class ProfileMode(Mode):
print i, n print i, n
apply_cimpl = {} apply_cimpl = {}
outputs_size = {}
for fn, ps in self.profile_stats.items(): for fn, ps in self.profile_stats.items():
apply_cimpl.update(ps.apply_cimpl) apply_cimpl.update(ps.apply_cimpl)
message = self.message message = self.message
outputs_size = {} variable_shape = {}
for fn, ps in self.profile_stats.items(): for fn, ps in self.profile_stats.items():
outputs_size.update(ps.outputs_size) variable_shape.update(ps.variable_shape)
other_time = dict( other_time = dict(
linker_time=sum( linker_time=sum(
...@@ -265,7 +253,7 @@ class ProfileMode(Mode): ...@@ -265,7 +253,7 @@ class ProfileMode(Mode):
self.print_summary_("print_summary", self.print_summary_("print_summary",
compile_time, fct_call_time, fct_call, compile_time, fct_call_time, fct_call,
apply_time, apply_cimpl, message, outputs_size, apply_time, apply_cimpl, message, variable_shape,
self.local_time, other_time, self.local_time, other_time,
**kwargs) **kwargs)
...@@ -300,19 +288,19 @@ class ProfileMode(Mode): ...@@ -300,19 +288,19 @@ class ProfileMode(Mode):
apply_time = diff_dict(self.apply_time, other.apply_time) apply_time = diff_dict(self.apply_time, other.apply_time)
apply_cimpl = self.apply_cimpl and other.apply_cimpl apply_cimpl = self.apply_cimpl and other.apply_cimpl
message = self.message message = self.message
outputs_size = diff_dict(self.outputs_size, other.outputs_size) variable_shape = diff_dict(self.variable_shape, other.variable_shape)
other_time = {'linker_time': self.linker_time - other.linker_time, other_time = {'linker_time': self.linker_time - other.linker_time,
'optimizer_time': self.optimizer_time - 'optimizer_time': self.optimizer_time -
other.optimizer_time} other.optimizer_time}
self.print_summary_("print_diff_summary", compile_time, self.print_summary_("print_diff_summary", compile_time,
fct_call_time, fct_call, fct_call_time, fct_call,
apply_time, apply_cimpl, message, outputs_size, apply_time, apply_cimpl, message, variable_shape,
print_apply=False, other_time=other_time, print_apply=False, other_time=other_time,
**kwargs) **kwargs)
@staticmethod @staticmethod
def print_summary_(fct_name, compile_time, fct_call_time, fct_call, def print_summary_(fct_name, compile_time, fct_call_time, fct_call,
apply_time, apply_cimpl, message, outputs_size, apply_time, apply_cimpl, message, variable_shape,
local_time, other_time, local_time, other_time,
n_apply_to_print=config.ProfileMode.n_apply_to_print, n_apply_to_print=config.ProfileMode.n_apply_to_print,
n_ops_to_print=config.ProfileMode.n_ops_to_print, n_ops_to_print=config.ProfileMode.n_ops_to_print,
...@@ -329,6 +317,10 @@ class ProfileMode(Mode): ...@@ -329,6 +317,10 @@ class ProfileMode(Mode):
whose outputs memory size is lower then that. whose outputs memory size is lower then that.
""" """
print "ProfileMode is deprecated! Use the new profiler."
print " The Theano flags to enable it ise: profile=True"
print " The Theano flags for the memory profile to it is: profile_memory=True"
total_time = time.time() - import_time total_time = time.time() - import_time
total_fct_time = sum(fct_call_time.values()) total_fct_time = sum(fct_call_time.values())
total_fct_call = sum(fct_call.values()) total_fct_call = sum(fct_call.values())
...@@ -502,96 +494,17 @@ class ProfileMode(Mode): ...@@ -502,96 +494,17 @@ class ProfileMode(Mode):
print '(*) Op is running a c implementation' print '(*) Op is running a c implementation'
for printer in profiler_printers: for printer in profiler_printers:
printer(fct_name, compile_time, fct_call_time, fct_call, printer(fct_name, compile_time, fct_call_time, fct_call,
apply_time, apply_cimpl, message, outputs_size, apply_time, apply_cimpl, message, variable_shape,
other_time) other_time)
if not outputs_size: if not variable_shape:
print """\nProfile of Theano intermediate memory disabled. print """\nProfile of Theano intermediate memory disabled.
To enabled, put the Theano flag ProfileMode.profile_memory to True.""" To enabled, put the Theano flag ProfileMode.profile_memory to True."""
else: else:
fct_memory={}#fgraph->dict(node->(outputs size)) print """
var_mem = {} The memory profile in ProfileMode is removed!
for node, val in outputs_size.items(): Use the new profiler. Use the Theano flags
fct_memory.setdefault(node.fgraph, {}) profile=True,profile_memory=True to enable it."""
fct_memory[node.fgraph][node]=val
for out,v in zip(node.outputs,val):
var_mem[out]=v
print
print "Profile of Theano functions memory:"
print "(This check only the output of each apply node. It don't check the temporary memory used by the op in the apply node.)"
nb_skipped = 0
for fgraph, nodes_mem in fct_memory.iteritems():
size_sum=sum([sum(val) for key,val in nodes_mem.iteritems()])
if size_sum < min_memory_size:
nb_skipped += 1
continue
print "Theano fct:", [fct for fct in fct_call.keys() if fct.maker.fgraph is fgraph][0].name
print " Max without gc, inplace and view (KB)",size_sum/1024
node_memory_size = 0
node_memory_saved_by_view = 0
node_memory_saved_by_inplace = 0
running_memory_size = 0
running_max_memory_size = 0
post_thunk_old_storage = []
items = nodes_mem.items()
items.sort(key=lambda a: a[1])
items.reverse()
order = self.linker.schedule(fgraph)
computed, last_user = gof.link.gc_helper(order)
for node in order:
post_thunk_old_storage.append([ input_idx
for input_idx,input in enumerate(node.inputs)
if (input in computed) and (input not in fgraph.outputs) and node == last_user[input]])
for node,val in items[:n_apply_to_print]:
dmap = getattr(node.op,'destroy_map',None)
vmap = getattr(node.op,'view_map',None)
for idx,v in enumerate(val):
if dmap and idx in dmap:#TODO check the op returned a view
node_memory_saved_by_inplace += v
elif vmap and idx in vmap:#TODO check the op returned a view
node_memory_saved_by_view += v
else:
node_memory_size += v
running_memory_size += v
if running_memory_size > running_max_memory_size:
running_max_memory_size = running_memory_size
old_storage = post_thunk_old_storage[order.index(node)]
for old_s in old_storage:
running_memory_size -= var_mem[node.inputs[old_s]]
pass
pass
print " Max FAST_RUN_NO_GC (KB)", node_memory_size/1024
print " Max FAST_RUN (KB)", running_max_memory_size/1024
print " Memory saved by view (KB)", node_memory_saved_by_view/1024
print " Memory saved by inplace (KB)", node_memory_saved_by_inplace/1024
print " Memory saved by GC (KB)", (node_memory_size-running_max_memory_size)/1024
n_apply_to_print+=10#TODO remove this line
print " <Sum apply outputs (bytes)> <Apply outputs memory size(bytes)> <created/inplace/view> <Apply node>"
print " <created/inplace/view> is taked from the op declaration, not the op exeuction. Use DebugMode to have warning about inplace/view declaration being respected."
n_apply_printed = 0
for key,val in items[:n_apply_to_print]:
if sum(val) < min_memory_size:
break
code = ['c']*len(node.outputs)
for out,inp in getattr(key.op,'destroy_map',{}).iteritems():
code[out] = "i"
for out,inp in getattr(key.op,'view_map',{}).iteritems():
code[out] = "v"
print ' %9dB %s %s %s' % (sum(val), str(val), ' '.join(code), key)
n_apply_printed += 1
print ' ... (remaining %i Apply account for %.2f%%(%d bytes) of the total intermediate memory used)'\
%(max(0, len(nodes_mem)-n_apply_printed),
sum(sum(val) for key, val in items[n_apply_printed:])/float(size_sum),
sum(sum(val) for key, val in items[n_apply_printed:]))
if nb_skipped > 0:
print ' We skipped %d theano function(s). Each of them used less then %dB(theano flags ProfileMode.min_memory_size) of total intermediate memory size'%(nb_skipped, min_memory_size)
print print
print """Here are tips to potentially make your code run faster print """Here are tips to potentially make your code run faster
......
...@@ -16,13 +16,15 @@ __contact__ = "theano-dev <theano-dev@googlegroups.com>" ...@@ -16,13 +16,15 @@ __contact__ = "theano-dev <theano-dev@googlegroups.com>"
__docformat__ = "restructuredtext en" __docformat__ = "restructuredtext en"
import atexit import atexit
import copy import copy
import os
import sys import sys
import time import time
import numpy import numpy
import theano import theano
from theano.configparser import AddConfigVar, BoolParam from theano.configparser import AddConfigVar, BoolParam, IntParam
import_time = time.time() import_time = time.time()
config = theano.config config = theano.config
...@@ -32,7 +34,23 @@ _atexit_print_file = sys.stderr ...@@ -32,7 +34,23 @@ _atexit_print_file = sys.stderr
AddConfigVar('profiling.time_thunks', AddConfigVar('profiling.time_thunks',
"""Time individual thunks when profiling""", """Time individual thunks when profiling""",
BoolParam(True)) BoolParam(True))
AddConfigVar('profiling.n_apply',
"Number of apply instances to print by default",
IntParam(20, lambda i: i > 0),
in_c_key=False)
AddConfigVar('profiling.n_ops',
"Number of ops to print by default",
IntParam(20, lambda i: i > 0),
in_c_key=False)
AddConfigVar('profiling.min_memory_size',
"""For the memory profile, do not print apply nodes if the size
of their outputs (in bytes) is lower then this threshold""",
IntParam(1024, lambda i: i >= 0),
in_c_key=False)
def _atexit_print_fn(): def _atexit_print_fn():
...@@ -41,7 +59,9 @@ def _atexit_print_fn(): ...@@ -41,7 +59,9 @@ def _atexit_print_fn():
printed = 0 printed = 0
for ps in _atexit_print_list: for ps in _atexit_print_list:
if ps.fct_callcount or ps.compile_time > 0: if ps.fct_callcount or ps.compile_time > 0:
ps.summary(file=_atexit_print_file) ps.summary(file=_atexit_print_file,
n_ops_to_print=config.profiling.n_ops,
n_apply_to_print=config.profiling.n_apply)
printed += 1 printed += 1
else: else:
print 'Skipping empty Profile' print 'Skipping empty Profile'
...@@ -59,7 +79,7 @@ def _atexit_print_fn(): ...@@ -59,7 +79,7 @@ def _atexit_print_fn():
#merge dictonary #merge dictonary
for attr in ["apply_time", "apply_callcount", for attr in ["apply_time", "apply_callcount",
"apply_cimpl", "outputs_size"]: "apply_cimpl", "variable_shape", "variable_strides"]:
cum_attr = getattr(cum, attr) cum_attr = getattr(cum, attr)
for key, val in getattr(ps, attr).iteritems(): for key, val in getattr(ps, attr).iteritems():
assert key not in cum_attr assert key not in cum_attr
...@@ -73,7 +93,9 @@ def _atexit_print_fn(): ...@@ -73,7 +93,9 @@ def _atexit_print_fn():
else: else:
cum.optimizer_profile = None cum.optimizer_profile = None
cum.summary(file=_atexit_print_file) cum.summary(file=_atexit_print_file,
n_ops_to_print=config.profiling.n_ops,
n_apply_to_print=config.profiling.n_apply)
atexit.register(_atexit_print_fn) atexit.register(_atexit_print_fn)
...@@ -125,8 +147,12 @@ class ProfileStats(object): ...@@ -125,8 +147,12 @@ class ProfileStats(object):
# pretty string to print in summary, to identify this output # pretty string to print in summary, to identify this output
# #
outputs_size = None variable_shape = {}
# node -> size of allocated output # Variable -> shapes
#
variable_strides = {}
# Variable -> strides
# #
optimizer_time = 0.0 optimizer_time = 0.0
...@@ -154,11 +180,24 @@ class ProfileStats(object): ...@@ -154,11 +180,24 @@ class ProfileStats(object):
**kwargs - misc initializers. These should (but need not) match the **kwargs - misc initializers. These should (but need not) match the
names of the class vars declared in this class. names of the class vars declared in this class.
""" """
if (hasattr(theano, 'sandbox') and
hasattr(theano.sandbox, 'cuda') and
theano.sandbox.cuda.cuda_enabled):
if os.environ.get('CUDA_LAUNCH_BLOCKING', '0') != '1':
raise Exception(
"You are running Theano profiler with CUDA enabled."
" Theano GPU ops execution are asynchron by default."
" So by default, the profile is useless."
" You must use set the environment variable"
" CUDA_LAUNCH_BLOCKING to 1 to tell the CUDA drvier to"
" synchonize the execution to get meaning full profile.")
self.apply_callcount = {} self.apply_callcount = {}
self.output_size = {} self.output_size = {}
self.apply_time = {} self.apply_time = {}
self.apply_cimpl = {} self.apply_cimpl = {}
self.outputs_size = {} self.variable_shape = {}
self.variable_strides = {}
if flag_time_thunks is None: if flag_time_thunks is None:
self.flag_time_thunks = config.profiling.time_thunks self.flag_time_thunks = config.profiling.time_thunks
else: else:
...@@ -242,7 +281,7 @@ class ProfileStats(object): ...@@ -242,7 +281,7 @@ class ProfileStats(object):
return rval return rval
def op_impl(self): def op_impl(self):
"""dict op -> total number of nodes""" """dict op -> 'C' or 'Py' depending how the op is implemented"""
# timing is stored by node, we compute timing by Op on demand # timing is stored by node, we compute timing by Op on demand
rval = {} rval = {}
for node in self.apply_callcount: for node in self.apply_callcount:
...@@ -520,6 +559,20 @@ class ProfileStats(object): ...@@ -520,6 +559,20 @@ class ProfileStats(object):
print >> file, format_str %(f, ftot, t, t / nb_call, nb_call, print >> file, format_str %(f, ftot, t, t / nb_call, nb_call,
nd_id, nd_id,
str(a)[:maxlen]) str(a)[:maxlen])
if not config.profile_memory:
continue
for idx, var in enumerate(a.inputs):
sh = self.variable_shape.get(var, 'no shape')
st = self.variable_strides.get(var, 'no strides')
dtype = getattr(var, 'dtype', 'no dtype')
print " input %d: dtype=%s, shape=%s, strides=%s " % (
idx, dtype, sh, st)
for idx, var in enumerate(a.outputs):
sh = self.variable_shape.get(var, 'no shape')
st = self.variable_strides.get(var, 'no strides')
dtype = getattr(var, 'dtype', 'no dtype')
print " output %d: dtype=%s, shape=%s, strides=%s " % (
idx, dtype, sh, st)
# Same as before, this I've sacrificied some information making # Same as before, this I've sacrificied some information making
# the output more readable # the output more readable
#print >> file, ' %4.1f%% %5.1f%% %5.3fs %5.3fs %.2es %i %s'%( #print >> file, ' %4.1f%% %5.1f%% %5.3fs %5.3fs %.2es %i %s'%(
...@@ -555,21 +608,212 @@ class ProfileStats(object): ...@@ -555,21 +608,212 @@ class ProfileStats(object):
# The validation time is a subset of optimizer_time # The validation time is a subset of optimizer_time
assert self.validate_time < self.optimizer_time assert self.validate_time < self.optimizer_time
def summary_memory(self, file, N=None):
fct_memory = {} # fgraph->dict(node->[outputs size])
fct_shapes = {} # fgraph->dict(node->[outputs shapes]))
var_mem = {} # varible->size in bytes; don't include input variables
node_mem = {} # node->total outputs size (only dense outputs)
for node in self.apply_callcount.keys():
fct_memory.setdefault(node.fgraph, {})
fct_memory[node.fgraph].setdefault(node, [])
fct_shapes.setdefault(node.fgraph, {})
fct_shapes[node.fgraph].setdefault(node, [])
sum_dense = 0
for out in node.outputs:
sh = self.variable_shape[out]
if hasattr(out.type, 'get_size'):
v = out.type.get_size(sh)
sum_dense += v
else:
v = "Unknown"
var_mem[out] = v
fct_memory[node.fgraph][node].append(v)
fct_shapes[node.fgraph][node].append(sh)
node_mem[node] = sum_dense
#Find the function that used the most of that statistic
max_sum_size = 0
max_node_memory_size = 0
max_running_max_memory_size = 0
max_node_memory_saved_by_view = 0
max_node_memory_saved_by_inplace = 0
for fgraph, nodes_mem in fct_memory.iteritems():
# Sum of the size of all variables in bytes
sum_size = sum([sum([v for v in val if not isinstance(v, str)])
for key, val in nodes_mem.iteritems()])
# Sum of the size of all variables that actually allocate
# memory (excluding views, and inplace);
node_memory_size = 0
# The sum of memory saved by returning view instead of new
# allocation
node_memory_saved_by_view = 0
# The sum of memory saved by reusing the input instead of
# new allocation
node_memory_saved_by_inplace = 0
# The memory allocated after the current apply node
running_memory_size = 0
# The maximum of running_memory_size during the function
running_max_memory_size = 0
order = fgraph.toposort()
# A list of intermediate variable that are not need
# after the execution of the corresponding node.
# It mean that after executing the node,
# the corresponding variable can be gc.
post_thunk_old_storage = []
computed, last_user = theano.gof.link.gc_helper(order)
for node in order:
post_thunk_old_storage.append([
input_idx
for input_idx, input in enumerate(node.inputs)
if (input in computed) and
(input not in fgraph.outputs) and
node == last_user[input]])
for node in order:
val = nodes_mem[node]
dmap = getattr(node.op, 'destroy_map', None)
vmap = getattr(node.op, 'view_map', None)
for idx, v in enumerate(val):
# TODO check the op returned a view
if dmap and idx in dmap:
node_memory_saved_by_inplace += v
# TODO check the op returned a view
elif vmap and idx in vmap:
node_memory_saved_by_view += v
elif not isinstance(v, str):
node_memory_size += v
running_memory_size += v
if running_memory_size > running_max_memory_size:
running_max_memory_size = running_memory_size
old_storage = post_thunk_old_storage[order.index(node)]
for old_s in old_storage:
old_v = var_mem[node.inputs[old_s]]
if not isinstance(old_v, str):
running_memory_size -= old_v
# Store the max of some stats by any function in this profile.
max_sum_size = max(max_sum_size, sum_size)
max_node_memory_size = max(max_node_memory_size, node_memory_size)
max_running_max_memory_size = max(max_running_max_memory_size,
running_max_memory_size)
max_node_memory_saved_by_view = max(max_node_memory_saved_by_view,
node_memory_saved_by_view)
max_node_memory_saved_by_inplace = max(
max_node_memory_saved_by_inplace, node_memory_saved_by_inplace)
del fgraph, nodes_mem, post_thunk_old_storage, node
if len(fct_memory) > 1:
print >> file, ("Memory Profile "
"(the max between all function in that profile)")
else:
print >> file, "Memory Profile"
print >> file, "(Sparse variables are ignored)"
print >> file, "---"
# print >> file, " Max if no gc, inplace and view: %dKB" % int(
# round(max_sum_size / 1024))
print >> file, " Max if linker=cvm (default): unknow"
print >> file, " Max if no gc (allow_gc=False): %dKB" % int(round(
max_node_memory_size / 1024.))
print >> file, " Max if linker=c|py: %dKB" % int(round(
max_running_max_memory_size / 1024.))
# print >> file, " Memory saved if view are used: %dKB" % int(round(
# max_node_memory_saved_by_view / 1024.))
# print >> file, " Memory saved if inplace op are used: %dKB" % int(
# round(max_node_memory_saved_by_inplace / 1024.))
print >> file, " Memory saved if gc is enabled (linker=c|py): %dKB" % int(
round(max_node_memory_size - max_running_max_memory_size) / 1024.)
if (hasattr(theano, 'sandbox') and
hasattr(theano.sandbox, 'cuda') and
hasattr(theano.sandbox.cuda, 'cuda_ndarray') and
hasattr(theano.sandbox.cuda.cuda_ndarray.cuda_ndarray,
'theano_allocated')):
_, gpu_max = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray.theano_allocated()
print >> file, (" Max Memory allocated on the GPU"
"(for all functions): %dKB" %
int(round(gpu_max / 1024.)))
print >> file, ""
if len(fct_memory) > 1:
print >> file, (
" This list is based on all functions in the profile")
print >> file, (" <Sum apply outputs (bytes)>"
" <Apply outputs shape>"
" <created/inplace/view>"
" <Apply node>")
print >> file, ""
items = node_mem.items()
items.sort(key=lambda a: a[1])
items.reverse()
for idx, (node, node_outputs_size) in enumerate(items[:N]):
code = ['c'] * len(node.outputs)
for out, inp in getattr(node.op, 'destroy_map', {}).iteritems():
code[out] = "i"
for out, inp in getattr(node.op, 'view_map', {}).iteritems():
code[out] = "v"
shapes = str(fct_shapes[node.fgraph][node])
if all([hasattr(out.type, 'get_size')
for out in node.outputs]):
size = "%9dB" % node_outputs_size
if node_outputs_size < config.profiling.min_memory_size:
N = idx
break
else:
size = "%10s" % "Unknown"
print >> file, ' %s %s %s %s' % (size,
shapes,
' '.join(code), node)
sum_remaining = sum(size for _, size in items[N:])
size_sum_dense = sum(node_mem.values())
if size_sum_dense == 0:
p = "0%"
else:
p = "(%.2f%%)" % (float(sum_remaining) / size_sum_dense * 100)
print >> file, (
' ... (remaining %i Apply account for %4dB/%dB (%s) of the'
' Apply with dense outputs sizes)') % (max(0, len(node_mem) - N),
sum_remaining,
size_sum_dense, p
)
print >> file, ''
if N == 0:
print >> file, (' All Apply node have outputs size that take'
' less then %dB.' %
config.profiling.min_memory_size)
print >> file, (
" <created/inplace/view> is taked from the op declaration.")
print >> file, (" Apply nodes marked 'inplace' or 'view' may"
" actually allocate memory, this is not reported"
" here. If you use DebugMode, warnings will be"
" emitted in those cases.")
print >> file, ''
def summary(self, file=sys.stderr, n_ops_to_print=20, def summary(self, file=sys.stderr, n_ops_to_print=20,
n_applies_to_print=20): n_apply_to_print=20):
self.summary_function(file) self.summary_function(file)
local_time = sum(self.apply_time.values()) local_time = sum(self.apply_time.values())
if local_time > 0: if local_time > 0:
self.summary_class(file, n_ops_to_print) self.summary_class(file, n_ops_to_print)
self.summary_ops(file, n_ops_to_print) self.summary_ops(file, n_ops_to_print)
self.summary_nodes(file, n_applies_to_print) self.summary_nodes(file, n_apply_to_print)
elif self.fct_callcount > 0: elif self.fct_callcount > 0:
print >> file, (" No node time accumulated " print >> file, (" No execution time accumulated "
"(hint: try config profiling.time_thunks=1)") "(hint: try config profiling.time_thunks=1)")
if config.profile_memory:
self.summary_memory(file, n_apply_to_print)
if self.optimizer_profile: if self.optimizer_profile:
print "Optimizer Profile" print "Optimizer Profile"
print "-----------------" print "-----------------"
self.optimizer_profile[0].print_profile(file, self.optimizer_profile[1]) self.optimizer_profile[0].print_profile(file,
self.optimizer_profile[1])
if 0: # old code still to be ported from ProfileMode if 0: # old code still to be ported from ProfileMode
...@@ -701,80 +945,6 @@ if 0: # old code still to be ported from ProfileMode ...@@ -701,80 +945,6 @@ if 0: # old code still to be ported from ProfileMode
if hasattr(i.type, 'dtype') and i.type.dtype == 'float64': if hasattr(i.type, 'dtype') and i.type.dtype == 'float64':
print fct.name, i.name, i.type, i print fct.name, i.name, i.type, i
if outputs_size:
fct_memory={}#fgraph->dict(node->(outputs size))
var_mem = {}
for node,val in outputs_size.items():
fct_memory.setdefault(node.fgraph,{})
fct_memory[node.fgraph][node]=val
for out,v in zip(node.outputs,val):
var_mem[out]=v
print
print "Profile of Theano functions memory:"
for fgraph, nodes_mem in fct_memory.iteritems():
print "Theano fct:", [fct for fct in fct_call.keys() if fct.maker.fgraph is fgraph][0].name
size_sum=sum([sum(val) for key,val in nodes_mem.iteritems()])
print " Max without gc, inplace and view (KB)",size_sum/1024
node_memory_size = 0
node_memory_saved_by_view = 0
node_memory_saved_by_inplace = 0
running_memory_size = 0
running_max_memory_size = 0
post_thunk_old_storage = []
items = nodes_mem.items()
items.sort(key=lambda a: a[1])
items.reverse()
order = fgraph.toposort()
computed, last_user = gc_helper(order)
for node in order:
post_thunk_old_storage.append([ input_idx
for input_idx,input in enumerate(node.inputs)
if (input in computed) and (input not in fgraph.outputs) and node == last_user[input]])
for node,val in items[:n_apply_to_print]:
dmap = getattr(node.op,'destroy_map',None)
vmap = getattr(node.op,'view_map',None)
for idx,v in enumerate(val):
if dmap and idx in dmap:#TODO check the op returned a view
node_memory_saved_by_inplace += v
elif vmap and idx in vmap:#TODO check the op returned a view
node_memory_saved_by_view += v
else:
node_memory_size += v
running_memory_size += v
if running_memory_size > running_max_memory_size:
running_max_memory_size = running_memory_size
old_storage = post_thunk_old_storage[order.index(node)]
for old_s in old_storage:
running_memory_size -= var_mem[node.inputs[old_s]]
pass
pass
print " Max FAST_RUN_NO_GC (KB)", node_memory_size/1024
print " Max FAST_RUN (KB)", running_max_memory_size/1024
print " Memory saved by view (KB)", node_memory_saved_by_view/1024
print " Memory saved by inplace (KB)", node_memory_saved_by_inplace/1024
print " Memory saved by GC (KB)", (node_memory_size-running_max_memory_size)/1024
n_apply_to_print+=10#TODO remove this line
print " <Sum apply outputs (bytes)> <Apply outputs memory size(bytes)> <created/inplace/view> <Apply node>"
print " <created/inplace/view> is taked from the op declaration, not the op exeuction. Use DebugMode to have warning about inplace/view declaration being respected."
for key,val in items[:n_apply_to_print]:
code = ['c']*len(node.outputs)
for out,inp in getattr(key.op,'destroy_map',{}).iteritems():
code[out] = "i"
for out,inp in getattr(key.op,'view_map',{}).iteritems():
code[out] = "v"
print ' %9dB %s %s %s' % (sum(val), str(val), ' '.join(code), key)
print ' ... (remaining %i Apply account for %.2f%%(%.2fs) of the runtime)'\
%(max(0, len(nodes_mem)-n_ops_to_print),
sum(sum(val) for key, val in items[n_ops_to_print:]),
sum(sum(val) for key, val in items[n_ops_to_print:])/size_sum)
print print
print "Here are tips to potentially make your code run faster (if you think of new ones, suggest them on the mailing list). Test them first as they are not guaranteed to always provide a speedup." print "Here are tips to potentially make your code run faster (if you think of new ones, suggest them on the mailing list). Test them first as they are not guaranteed to always provide a speedup."
from theano import tensor as T from theano import tensor as T
......
...@@ -343,6 +343,15 @@ class PureType(object): ...@@ -343,6 +343,15 @@ class PureType(object):
""" """
return self.values_eq(a, b) return self.values_eq(a, b)
# def get_shape_info(self, obj):
"""
Optional function. See TensorType().get_shape_info for definition
"""
# def get_size(self, shape_info):
"""
Optional function. See TensorType().get_size for definition
"""
_nothing = """ _nothing = """
""" """
......
...@@ -5,6 +5,7 @@ VM was a better name at some point. ...@@ -5,6 +5,7 @@ VM was a better name at some point.
""" """
import link import link
import logging import logging
import os
import sys import sys
import time import time
import warnings import warnings
...@@ -23,6 +24,9 @@ AddConfigVar('profile', ...@@ -23,6 +24,9 @@ AddConfigVar('profile',
AddConfigVar('profile_optimizer', AddConfigVar('profile_optimizer',
"If VM should collect optimizer profile information", "If VM should collect optimizer profile information",
BoolParam(False)) BoolParam(False))
AddConfigVar('profile_memory',
"If VM should collect memory profile information and print it",
BoolParam(False))
def filter_vm_lazy(val): def filter_vm_lazy(val):
...@@ -130,6 +134,10 @@ class VM(object): ...@@ -130,6 +134,10 @@ class VM(object):
profile.apply_cimpl[node] = hasattr(thunk, 'cthunk') profile.apply_cimpl[node] = hasattr(thunk, 'cthunk')
if hasattr(self, 'variable_shape'):
profile.variable_shape = self.variable_shape.copy()
profile.variable_strides = self.variable_strides.copy()
# clear the timer info out of the buffers # clear the timer info out of the buffers
for i in xrange(len(self.call_times)): for i in xrange(len(self.call_times)):
self.call_times[i] = 0.0 self.call_times[i] = 0.0
...@@ -244,8 +252,8 @@ class Stack(VM): ...@@ -244,8 +252,8 @@ class Stack(VM):
self.base_apply_stack = [o.owner for o in fgraph.outputs if o.owner] self.base_apply_stack = [o.owner for o in fgraph.outputs if o.owner]
self.outputs = fgraph.outputs self.outputs = fgraph.outputs
self.storage_map = storage_map self.storage_map = storage_map
self.apply_time = {} self.variable_shape = {} # Variable -> shape
self.outputs_size = {} self.variable_strides = {} # Variable -> strides
self.compute_map = compute_map self.compute_map = compute_map
self.node_idx = node_idx = {} self.node_idx = node_idx = {}
self.callback = callback self.callback = callback
...@@ -254,8 +262,7 @@ class Stack(VM): ...@@ -254,8 +262,7 @@ class Stack(VM):
for i, node in enumerate(self.nodes): for i, node in enumerate(self.nodes):
node_idx[node] = i node_idx[node] = i
self.apply_time[node] = 0
self.outputs_size[node] = []
# XXX: inconsistent style - why modify node here rather # XXX: inconsistent style - why modify node here rather
# than track destroy_dependencies with dictionary like # than track destroy_dependencies with dictionary like
# storage_map? # storage_map?
...@@ -280,10 +287,6 @@ class Stack(VM): ...@@ -280,10 +287,6 @@ class Stack(VM):
if self.allow_gc and self.dependencies is None: if self.allow_gc and self.dependencies is None:
raise ValueError("Must set dependencies when using GC") raise ValueError("Must set dependencies when using GC")
if config.profile:
self.memory_size_map = {"nt8": 1, "t16": 2, "t32": 4,
"t64": 8, "128": 16}
atexit.register(self.atexit_print_all)
def run_thunk_of_node(self, node): def run_thunk_of_node(self, node):
"""Run the thunk corresponding to Apply instance `node` """Run the thunk corresponding to Apply instance `node`
...@@ -319,7 +322,24 @@ class Stack(VM): ...@@ -319,7 +322,24 @@ class Stack(VM):
# apply_stack contains nodes # apply_stack contains nodes
apply_stack = list(self.base_apply_stack) apply_stack = list(self.base_apply_stack)
last_apply_stack_len = -1 last_apply_stack_len = -1
ls = []
#This record all function inputs/shared varibles and constants
for var, data in self.storage_map.iteritems():
if data[0] is None:
continue
if hasattr(var.type, 'get_shape_info'):
sh = var.type.get_shape_info(data[0])
else:
sh = 'input no shape'
self.variable_shape[var] = sh
st = getattr(data[0], 'strides', 'input no strides')
if getattr(data[0], 'flags', False) and data[0].flags.c_contiguous:
st = 'c'
elif (hasattr(data[0], 'is_c_contiguous') and
data[0].is_c_contiguous()):
st = "c"
self.variable_strides[var] = st
while apply_stack: while apply_stack:
# Make sure something happened last time round. This is # Make sure something happened last time round. This is
# just a safety check to make sure the op is written # just a safety check to make sure the op is written
...@@ -356,25 +376,27 @@ class Stack(VM): ...@@ -356,25 +376,27 @@ class Stack(VM):
_, dt = self.run_thunk_of_node(current_apply) _, dt = self.run_thunk_of_node(current_apply)
del _ del _
if config.profile: if config.profile:
self.apply_time[current_apply] += dt current_idx = self.node_idx[current_apply]
self.call_counts[current_idx] += 1
self.call_times[current_idx] += dt
## Computing the memory footprint of the the op ## Computing the memory footprint of the the op
# ?? What about inplace .. if the op is inplace # ?? What about inplace .. if the op is inplace
# you don't actually ask for more memory! # you don't actually ask for more memory!
size = []
for (idx, o) in enumerate( for (idx, o) in enumerate(
thunks[self.node_idx[ thunks[self.node_idx[
current_apply]].outputs): current_apply]].outputs):
if not hasattr(o[0], 'size'): var = self.nodes[current_idx].outputs[idx]
size.append(-1) if hasattr(var.type, 'get_shape_info'):
continue sh = var.type.get_shape_info(o[0])
s = o[0].size else:
dtype = str(o[0].dtype) sh = 'input no shape'
dtype2 = dtype[-3:] self.variable_shape[var] = sh
# KeyError here: couldn't determine st = getattr(o[0], 'strides',
# the dtype memory size 'input no strides')
s *= self.memory_size_map[dtype2] if (getattr(o[0], 'flags', False) and
size.append(s) o[0].flags.c_contiguous):
self.outputs_size[current_apply] = size st = 'c'
self.variable_strides[var] = st
except Exception: except Exception:
raise_with_op(current_apply) raise_with_op(current_apply)
for o in current_apply.outputs: for o in current_apply.outputs:
...@@ -429,7 +451,9 @@ class Stack(VM): ...@@ -429,7 +451,9 @@ class Stack(VM):
try: try:
requires, dt = self.run_thunk_of_node(current_apply) requires, dt = self.run_thunk_of_node(current_apply)
self.apply_time[current_apply] += dt current_idx = self.node_idx[current_apply]
self.call_counts[current_idx] += 1
self.call_times[current_idx] += dt
except Exception: except Exception:
raise_with_op(current_apply) raise_with_op(current_apply)
...@@ -444,20 +468,22 @@ class Stack(VM): ...@@ -444,20 +468,22 @@ class Stack(VM):
apply_stack.append(current_apply.inputs[r].owner) apply_stack.append(current_apply.inputs[r].owner)
else: else:
if config.profile: if config.profile:
size = []
for (idx, o) in enumerate(thunks[ for (idx, o) in enumerate(thunks[
self.node_idx[current_apply]].outputs): self.node_idx[current_apply]].outputs):
if not hasattr(o[0], 'size'): var = self.nodes[
size.append(-1) self.node_idx[current_apply]].outputs[idx]
continue
s = o[0].size if hasattr(var.type, 'get_shape_info'):
dtype = str(o[0].dtype) sh = var.type.get_shape_info(o[0])
dtype2 = dtype[-2:] else:
# KeyError here: couldn't determine the sh = 'input no shape'
# dtype memory size self.variable_shape[var] = sh
s *= self.memory_size_map[dtype2] st = getattr(o[0], 'strides', 'input no strides')
size.append(s) if (getattr(o[0], 'flags', False) and
self.outputs_size[current_apply] = size o[0].flags.c_contiguous):
st = 'c'
self.variable_strides[var] = st
if self.allow_gc: if self.allow_gc:
for i in current_apply.inputs: for i in current_apply.inputs:
if (dependencies[i] and i.owner and if (dependencies[i] and i.owner and
...@@ -550,6 +576,19 @@ class VM_Linker(link.LocalLinker): ...@@ -550,6 +576,19 @@ class VM_Linker(link.LocalLinker):
:returns: self if fgraph is the first FunctionGraph that has ever been :returns: self if fgraph is the first FunctionGraph that has ever been
associated to self, else, a new VM_Linker associated to fgraph. associated to self, else, a new VM_Linker associated to fgraph.
""" """
if (config.profile and
hasattr(theano, 'sandbox') and
hasattr(theano.sandbox, 'cuda') and
theano.sandbox.cuda.cuda_enabled):
if os.environ.get('CUDA_LAUNCH_BLOCKING', '0') != '1':
raise Exception(
"You are running Theano profiler with CUDA enabled."
" Theano GPU ops execution are asynchron by default."
" So by default, the profile is useless."
" You must use set the environment variable"
" CUDA_LAUNCH_BLOCKING to 1 to tell the CUDA drvier to"
" synchonize the execution to get meaning full profile.")
if no_recycling is None: if no_recycling is None:
no_recycling = [] no_recycling = []
if self.fgraph is not None and self.fgraph is not fgraph: if self.fgraph is not None and self.fgraph is not fgraph:
...@@ -606,7 +645,6 @@ class VM_Linker(link.LocalLinker): ...@@ -606,7 +645,6 @@ class VM_Linker(link.LocalLinker):
# XXX if k has no clients... what is it doing in the computation? # XXX if k has no clients... what is it doing in the computation?
if k.owner and k.clients: if k.owner and k.clients:
ls = [] ls = []
is_output = 0
for cl in k.clients: for cl in k.clients:
if cl[0] is not 'output': if cl[0] is not 'output':
ls += cl[0].outputs ls += cl[0].outputs
...@@ -623,9 +661,12 @@ class VM_Linker(link.LocalLinker): ...@@ -623,9 +661,12 @@ class VM_Linker(link.LocalLinker):
pre_call_clear = [storage_map[v] for v in self.no_recycling] pre_call_clear = [storage_map[v] for v in self.no_recycling]
if self.callback is not None: if self.callback is not None or (config.profile and config.profile_memory):
if self.use_cloop: if self.use_cloop and self.callback is not None:
logger.warn('CLoop does not support callback, using Stack VM.') logger.warn('CVM does not support callback, using Stack VM.')
if self.use_cloop and config.profile_memory:
warnings.warn(
'CVM does not support memory profile, using Stack VM.')
deps = None deps = None
if self.allow_gc: if self.allow_gc:
deps = self.compute_gc_dependencies(storage_map) deps = self.compute_gc_dependencies(storage_map)
......
...@@ -68,12 +68,13 @@ def execute(execute=True, verbose=True, M=2000, N=2000, K=2000, ...@@ -68,12 +68,13 @@ def execute(execute=True, verbose=True, M=2000, N=2000, K=2000,
order=order)) order=order))
c = theano.shared(numpy.ones((M, K), dtype=theano.config.floatX, c = theano.shared(numpy.ones((M, K), dtype=theano.config.floatX,
order=order)) order=order))
f = theano.function([], updates=[(c, 0.4 * c + .8 * T.dot(a, b))], f = theano.function([], updates=[(c, 0.4 * c + .8 * T.dot(a, b))])
mode=theano.compile.ProfileMode())
if any([x.op.__class__.__name__ == 'Gemm' for x in if any([x.op.__class__.__name__ == 'Gemm' for x in
f.maker.fgraph.toposort()]): f.maker.fgraph.toposort()]):
c_impl = f.profile.apply_cimpl.values() c_impl = [hasattr(thunk, 'cthunk')
for node, thunk in zip(f.fn.nodes, f.fn.thunks)
if node.op.__class__.__name__ == "Gemm"]
assert len(c_impl) == 1 assert len(c_impl) == 1
if c_impl[0]: if c_impl[0]:
impl = 'CPU (with direct Theano binding to blas)' impl = 'CPU (with direct Theano binding to blas)'
......
...@@ -44,8 +44,11 @@ static PyObject *CudaNdarray_get_shape(CudaNdarray *self, void *closure); ...@@ -44,8 +44,11 @@ static PyObject *CudaNdarray_get_shape(CudaNdarray *self, void *closure);
* *
*/ */
int _outstanding_mallocs[] = {0,0}; int _outstanding_mallocs[] = {0,0};
#if COMPUTE_GPU_MEM_USED #if COMPUTE_GPU_MEM_USED
int _allocated_size = 0; int _allocated_size = 0;
int _max_allocated_size = 0;
const int TABLE_SIZE = 10000; const int TABLE_SIZE = 10000;
struct table_struct{ struct table_struct{
void* ptr; void* ptr;
...@@ -82,8 +85,15 @@ void * device_malloc(size_t size, int verbose) ...@@ -82,8 +85,15 @@ void * device_malloc(size_t size, int verbose)
"Error allocating %li bytes of device memory (%s).", (long)size, cudaGetErrorString(err)); "Error allocating %li bytes of device memory (%s).", (long)size, cudaGetErrorString(err));
return NULL; return NULL;
} }
_outstanding_mallocs[0] += (rval != NULL); if (rval != NULL){
#if COMPUTE_GPU_MEM_USED // Can it happen that cudaMalloc return cudaSuccess, but return a NULL ptr?
// Could this be what happen if size is 0?
_outstanding_mallocs[0] += 1;
#if COMPUTE_GPU_MEM_USED
_allocated_size += size;
_max_allocated_size = std::max(_max_allocated_size, _allocated_size);
for(int i=0;i<TABLE_SIZE;i++){ for(int i=0;i<TABLE_SIZE;i++){
if(NULL==_alloc_size_table[i].ptr){ if(NULL==_alloc_size_table[i].ptr){
_alloc_size_table[i].ptr=rval; _alloc_size_table[i].ptr=rval;
...@@ -91,8 +101,8 @@ void * device_malloc(size_t size, int verbose) ...@@ -91,8 +101,8 @@ void * device_malloc(size_t size, int verbose)
break; break;
} }
} }
_allocated_size += size; #endif
#endif }
//fprintf(stderr, //fprintf(stderr,
//"allocated %li bytes of device memory (%s). new total bytes allocated: %d. ptr: %p\n", //"allocated %li bytes of device memory (%s). new total bytes allocated: %d. ptr: %p\n",
//(long)size, cudaGetErrorString(err),_allocated_size,rval); //(long)size, cudaGetErrorString(err),_allocated_size,rval);
...@@ -2507,6 +2517,7 @@ CudaNdarray_synchronize(PyObject* _unused, PyObject* dummy) ...@@ -2507,6 +2517,7 @@ CudaNdarray_synchronize(PyObject* _unused, PyObject* dummy)
Py_INCREF(Py_None); Py_INCREF(Py_None);
return Py_None; return Py_None;
} }
#if COMPUTE_GPU_MEM_USED #if COMPUTE_GPU_MEM_USED
/* /*
* Return the size in bytes that Theano currently have allocated on the gpu. * Return the size in bytes that Theano currently have allocated on the gpu.
...@@ -2514,7 +2525,13 @@ CudaNdarray_synchronize(PyObject* _unused, PyObject* dummy) ...@@ -2514,7 +2525,13 @@ CudaNdarray_synchronize(PyObject* _unused, PyObject* dummy)
PyObject * PyObject *
GetTheanoAllocInfo(PyObject* _unused, PyObject* dummy) GetTheanoAllocInfo(PyObject* _unused, PyObject* dummy)
{ {
return PyLong_FromLong(_allocated_size); PyObject* a = PyLong_FromLong(_allocated_size);
PyObject* b = PyLong_FromLong(_max_allocated_size);
PyObject* tuple = PyTuple_New(2);
PyTuple_SetItem(tuple, 0, a);
PyTuple_SetItem(tuple, 1, b);
return tuple;
} }
#endif #endif
...@@ -2529,6 +2546,11 @@ static PyGetSetDef CudaNdarray_getset[] = { ...@@ -2529,6 +2546,11 @@ static PyGetSetDef CudaNdarray_getset[] = {
(setter)CudaNdarray_set_strides, (setter)CudaNdarray_set_strides,
"data pointer strides (in elements)", "data pointer strides (in elements)",
NULL}, NULL},
{"strides",
(getter)CudaNdarray_get_strides,
(setter)CudaNdarray_set_strides,
"data pointer strides (in elements)",
NULL},
//gpudata is needed to allow calling pycuda fct with CudaNdarray input. //gpudata is needed to allow calling pycuda fct with CudaNdarray input.
{"gpudata", {"gpudata",
(getter)CudaNdarray_get_dev_data, (getter)CudaNdarray_get_dev_data,
......
...@@ -417,6 +417,15 @@ class CudaNdarrayType(Type): ...@@ -417,6 +417,15 @@ class CudaNdarrayType(Type):
def c_compile_args(self): def c_compile_args(self):
return [] return []
def get_shape_info(self, obj):
return obj.shape
def get_size(self, shape_info):
if shape_info:
return numpy.prod(shape_info) * numpy.dtype(self.dtype).itemsize
else: # a scalar
return numpy.dtype(self.dtype).itemsize
theano.compile.ops.expandable_types += (CudaNdarrayType,) theano.compile.ops.expandable_types += (CudaNdarrayType,)
# Register C code for ViewOp on CudaNdarrayType # Register C code for ViewOp on CudaNdarrayType
......
...@@ -423,6 +423,12 @@ class Scalar(Type): ...@@ -423,6 +423,12 @@ class Scalar(Type):
return (4,) # explicit T given in specialization of operator= return (4,) # explicit T given in specialization of operator=
# lines. This makes it compile with open64 # lines. This makes it compile with open64
def get_shape_info(self, obj):
return obj.itemsize
def get_size(self, shape_info):
return shape_info
# Register C code for ViewOp on Scalars. # Register C code for ViewOp on Scalars.
theano.compile.register_view_op_c_code( theano.compile.register_view_op_c_code(
Scalar, Scalar,
...@@ -460,6 +466,9 @@ class _scalar_py_operators: ...@@ -460,6 +466,9 @@ class _scalar_py_operators:
# variables and Tensor variables # variables and Tensor variables
ndim = 0 ndim = 0
dtype = property(lambda self: self.type.dtype)
""" The dtype of this scalar. """
#UNARY #UNARY
def __abs__(self): def __abs__(self):
return abs_(self) return abs_(self)
......
...@@ -147,6 +147,17 @@ class SparseType(gof.Type): ...@@ -147,6 +147,17 @@ class SparseType(gof.Type):
def is_valid_value(self, a): def is_valid_value(self, a):
return scipy.sparse.issparse(a) and (a.format == self.format) return scipy.sparse.issparse(a) and (a.format == self.format)
def get_shape_info(self, obj):
obj = self.filter(obj)
assert obj.indices.dtype == 'int32'
assert obj.indptr.dtype == 'int32'
return (obj.shape, obj.data.size,
obj.indices.size, obj.indptr.size, obj.nnz)
def get_size(self, shape_info):
return (shape_info[1] * numpy.dtype(self.dtype).itemsize +
(shape_info[2] + shape_info[3]) * numpy.dtype('int32').itemsize)
# Register SparseType's C code for ViewOp. # Register SparseType's C code for ViewOp.
theano.compile.register_view_op_c_code( theano.compile.register_view_op_c_code(
SparseType, SparseType,
......
...@@ -1198,6 +1198,38 @@ class TensorType(Type): ...@@ -1198,6 +1198,38 @@ class TensorType(Type):
""" """
return numpy.zeros(shape, dtype=self.dtype) return numpy.zeros(shape, dtype=self.dtype)
def get_shape_info(self, obj):
"""Return the information needed to compute the memory size of obj.
The memory size is only the data, so this exclude the container.
For an ndarray, this is the data, but not the ndarray object and
others data structures as shape and strides.
get_shape_info() and get_size() work in tendem for the memory profiler.
get_shape_info() is called during the execution of the function.
So it is better that it is not too slow.
get_size() will be called with the output of this function
when printing the memory profile.
:param obj: The object that this Type represent during execution
:return: Python object that self.get_size() understand
"""
return obj.shape
def get_size(self, shape_info):
""" Number of bytes taken by the object represented by shape_info
:param shape_info: the output of the call to get_shape_info()
:return: the number of bytes taken by the object described in
shape_info.
"""
if shape_info:
return numpy.prod(shape_info) * numpy.dtype(self.dtype).itemsize
else: # a scalar
return numpy.dtype(self.dtype).itemsize
theano.compile.ops.expandable_types += (TensorType,) theano.compile.ops.expandable_types += (TensorType,)
# Register TensorType C code for ViewOp. # Register TensorType C code for ViewOp.
...@@ -5158,8 +5190,8 @@ def batched_dot(x, y): ...@@ -5158,8 +5190,8 @@ def batched_dot(x, y):
iterating over the first dimension using scan. iterating over the first dimension using scan.
Returns a tensor of size e.g. if it is 3D: (dim1, dim3, dim4) Returns a tensor of size e.g. if it is 3D: (dim1, dim3, dim4)
Example: Example:
>>> first = T.tensor3('first') >>> first = tensor.tensor3('first')
>>> second = T.tensor3('second') >>> second = tensor.tensor3('second')
>>> result = batched_dot(first, second) >>> result = batched_dot(first, second)
:note: This is a subset of numpy.einsum, but we do not provide it for now. :note: This is a subset of numpy.einsum, but we do not provide it for now.
But numpy einsum is slower than dot or tensordot: But numpy einsum is slower than dot or tensordot:
......
...@@ -56,6 +56,25 @@ class RandomStateType(gof.Type): ...@@ -56,6 +56,25 @@ class RandomStateType(gof.Type):
return False return False
return True return True
def get_shape_info(self, obj):
return None
def get_size(self, shape_info):
# The size is the data, that have constant size.
state = numpy.random.RandomState().get_state()
size = 0
for elem in state:
if isinstance(elem, str):
size += len(elem)
elif isinstance(elem, numpy.ndarray):
size += elem.size * elem.itemsize
elif isinstance(elem, int):
size += numpy.dtype("int").itemsize
elif isinstance(elem, float):
size += numpy.dtype("float").itemsize
else:
raise NotImplementedError()
return size
# Register RandomStateType's C code for ViewOp. # Register RandomStateType's C code for ViewOp.
theano.compile.register_view_op_c_code( theano.compile.register_view_op_c_code(
RandomStateType, RandomStateType,
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论