Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
feff4f12
提交
feff4f12
authored
8月 27, 2014
作者:
Frédéric Bastien
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #1934 from RoyXue/GSoC2014_part2
Compute minimum peak
上级
7af47dd8
023876a0
隐藏空白字符变更
内嵌
并排
正在显示
4 个修改的文件
包含
330 行增加
和
47 行删除
+330
-47
config.txt
doc/library/config.txt
+9
-0
profiling.py
theano/compile/profiling.py
+287
-37
test_profiling.py
theano/compile/tests/test_profiling.py
+31
-10
vm.py
theano/gof/vm.py
+3
-0
没有找到文件。
doc/library/config.txt
浏览文件 @
feff4f12
...
...
@@ -320,6 +320,15 @@ import theano and print the config variable, as in:
For the memory profile, do not print Apply nodes if the size
of their outputs (in bytes) is lower than this.
.. attribute:: profiling.min_peak_memory
Bool value: either True or False
Default False
Does the memory profile print the minimum peak memory usage?
This only takes effect when profile=True and profile_memory=True.
.. attribute:: config.lib.amdlibm
Bool value: either True or False
...
...
theano/compile/profiling.py
浏览文件 @
feff4f12
...
...
@@ -19,10 +19,12 @@ import copy
import
os
import
sys
import
time
from
theano.compat.python2x
import
defaultdict
import
numpy
import
theano
from
theano.gof
import
graph
from
theano.configparser
import
AddConfigVar
,
BoolParam
,
IntParam
...
...
@@ -54,6 +56,11 @@ AddConfigVar('profiling.min_memory_size',
IntParam
(
1024
,
lambda
i
:
i
>=
0
),
in_c_key
=
False
)
AddConfigVar
(
'profiling.min_peak_memory'
,
"""The min peak memory usage of the order"""
,
BoolParam
(
False
),
in_c_key
=
False
)
def
_atexit_print_fn
():
"""Print ProfileStat objects in _atexit_print_list to _atexit_print_file
...
...
@@ -641,7 +648,10 @@ class ProfileStats(object):
new_max_node_memory_saved_by_view
=
0
new_max_node_memory_saved_by_inplace
=
0
def
count_running_memory
(
order
,
thunk_old_storage
,
nodes_mem
):
# track min peak memory usage
min_max_peak
=
0
def
count_running_memory
(
order
,
fgraph
,
nodes_mem
):
"""
Calculate memory with specific node order
Return a list including the following values
...
...
@@ -658,88 +668,320 @@ class ProfileStats(object):
5. node_memory_saved_by_inplace
The sum of memory saved by reusing the input instead of
new allocation
"""
node_memory_size
=
0
running_memory_size
=
0
running_max_memory_size
=
0
node_memory_saved_by_view
=
0
node_memory_saved_by_inplace
=
0
# This take only the inputs/outputs dependencies.
dependencies
=
fgraph
.
profile
.
dependencies
# Initial compute_map which is used to check if a node is valid
compute_map
=
defaultdict
(
lambda
:
[
0
])
for
var
in
fgraph
.
inputs
:
compute_map
[
var
][
0
]
=
1
# two data structure used to mimic Python gc
viewed_by
=
{}
# {var1: [vars that view var1]}
# The len of the list is the value of python ref count. But we use a list, not just the ref count value.
# This is more safe to help detect potential bug in the algo
for
var
in
fgraph
.
variables
:
viewed_by
[
var
]
=
[]
view_of
=
{}
# {var1: original var viewed by var1}
# "Original" means that we don't keep track of all the intermediate relationships in the view chain.
for
node
in
order
:
val
=
nodes_mem
[
node
]
for
var
in
node
.
outputs
:
compute_map
[
var
][
0
]
=
1
idx
=
0
dmap
=
getattr
(
node
.
op
,
'destroy_map'
,
None
)
vmap
=
getattr
(
node
.
op
,
'view_map'
,
None
)
val
=
nodes_mem
[
node
]
for
idx
,
v
in
enumerate
(
val
)
:
for
v
in
val
:
# TODO check the op returned a view
if
dmap
and
idx
in
dmap
:
node_memory_saved_by_inplace
+=
v
# TODO check the op returned a view
elif
vmap
and
idx
in
vmap
:
node_memory_saved_by_view
+=
v
elif
not
isinstance
(
v
,
str
):
node_memory_size
+=
v
running_memory_size
+=
v
if
running_memory_size
>
running_max_memory_size
:
running_max_memory_size
=
running_memory_size
old_storage
=
thunk_old_storage
[
order
.
index
(
node
)]
for
old_s
in
old_storage
:
old_v
=
var_mem
[
node
.
inputs
[
old_s
]]
if
not
isinstance
(
old_v
,
str
):
running_memory_size
-=
old_v
return
[
node_memory_size
,
running_memory_size
,
running_max_memory_size
,
node_memory_saved_by_inplace
,
node_memory_saved_by_view
]
idx
+=
1
# Update the Python emulating dicts and add the memory
# allocated by the node
idx2
=
0
for
out
in
node
.
outputs
:
ins
=
None
if
dmap
and
idx2
in
dmap
:
vidx
=
dmap
[
idx2
]
assert
len
(
vidx
)
==
1
,
"Here we only support the possibility to destroy one input"
ins
=
node
.
inputs
[
vidx
[
0
]]
if
vmap
and
idx2
in
vmap
:
assert
ins
is
None
vidx
=
vmap
[
idx2
]
assert
len
(
vidx
)
==
1
,
"Here we only support the possibility to view one input"
ins
=
node
.
inputs
[
vidx
[
0
]]
if
ins
is
not
None
:
# This is needed for destroy_map in case it
# return a partial view that is destroyed. So
# the output could be different then the
# input.
assert
isinstance
(
ins
,
theano
.
Variable
)
# We keep track of views only against the original variable
origin
=
view_of
.
get
(
ins
,
ins
)
view_of
[
out
]
=
origin
viewed_by
[
origin
]
.
append
(
out
)
else
:
running_memory_size
+=
var_mem
[
out
]
node_memory_size
+=
var_mem
[
out
]
idx2
+=
1
running_max_memory_size
=
max
(
running_max_memory_size
,
running_memory_size
)
# Mimic the combination of Theano and Python gc
for
ins
in
node
.
inputs
:
assert
not
(
ins
in
view_of
and
viewed_by
[
ins
])
# We track the original var, so this shouldn't happen
if
(
dependencies
[
ins
]
and
ins
not
in
fgraph
.
outputs
and
ins
.
owner
and
all
([
compute_map
[
v
][
0
]
for
v
in
dependencies
[
ins
]])):
if
ins
not
in
view_of
and
not
viewed_by
.
get
(
ins
,
[]):
running_memory_size
-=
var_mem
[
ins
]
elif
ins
in
view_of
:
origin
=
view_of
[
ins
]
viewed_by
[
origin
]
.
remove
(
ins
)
if
(
not
viewed_by
[
origin
]
and
origin
not
in
fgraph
.
inputs
and
not
isinstance
(
origin
,
theano
.
Constant
)):
running_memory_size
-=
var_mem
[
origin
]
else
:
# ins is viewed_by something else, so its
# memory isn't freed
pass
return
[
node_memory_size
,
running_memory_size
,
running_max_memory_size
,
node_memory_saved_by_inplace
,
node_memory_saved_by_view
]
def count_minimum_peak(node_list, fgraph, nodes_mem):
    """
    Search every valid execution order of the graph's Apply nodes and
    return the smallest peak memory usage found among them.

    The search is a depth-first enumeration: at each step one currently
    executable node is "run", the memory of its freshly allocated
    outputs is added, the memory of inputs that can no longer be used is
    subtracted, and the search recurses on the nodes that became
    executable. Branches are abandoned once the running peak exceeds the
    best peak found so far.

    :param node_list: accepted for interface compatibility; not used by
        the search itself.
    :param fgraph: function graph; this code reads its ``inputs``,
        ``outputs``, ``variables`` and ``profile.dependencies``.
    :param nodes_mem: accepted for interface compatibility; not used by
        the search itself. Variable sizes are read from ``var_mem``,
        which is taken from the enclosing scope.

    :return: the minimum peak (``mem_bound``). It starts at ``numpy.inf``
        and is only lowered when a complete order is generated, so it
        stays ``numpy.inf`` if no node is initially executable.
    """
    # NOTE(review): these counters are shared with min_memory_generator
    # through module-level globals (the file targets Python 2, which has
    # no ``nonlocal``). They are overwritten on every call.
    global mem_count, mem_bound, max_mem_count
    mem_count = 0
    max_mem_count = 0
    mem_bound = numpy.inf

    # This takes only the inputs/outputs dependencies.
    dependencies = fgraph.profile.dependencies

    # Initial compute_map, used to check whether a node is executable.
    # Each entry is a one-element list: [1] if computed, [0] otherwise.
    compute_map = defaultdict(lambda: [0])
    for var in fgraph.inputs:
        compute_map[var][0] = 1

    def check_node_state(node):
        """
        Check if an Apply node is valid (all of its inputs are computed).

        Constant inputs are marked as computed as a side effect.

        :param node: Apply Node
        :return: True if every input and every destroy dependency of
            ``node`` is marked as computed in ``compute_map``.
        """
        inputs = node.inputs
        deps = inputs + node.destroy_dependencies
        # TODO: Move at compute_map creation to speed things up.
        for inp in inputs:
            if isinstance(inp, graph.Constant):
                compute_map[inp][0] = 1
        return all(compute_map[v][0] for v in deps)

    # Initial executable_nodes: clients of the graph inputs whose
    # dependencies are all already available.
    executable_nodes = set()
    for var in fgraph.inputs:
        for c, _ in var.clients:
            if c != "output" and check_node_state(c):
                executable_nodes.add(c)

    def min_memory_generator(executable_nodes, viewed_by, view_of):
        """
        Generate all valid node orders reachable from ``executable_nodes``
        and track the memory peak of each one.

        Yields each complete order as a list of Apply nodes. The peak is
        not yielded; it is recorded in the shared ``mem_bound`` counter.

        :param executable_nodes: Set of executable nodes
        :param viewed_by: {var: [vars that are a view of var]}
        :param view_of: {var: original var viewed by var}
        """
        global mem_count, mem_bound, max_mem_count

        for node in executable_nodes:
            new_exec_nodes = executable_nodes.copy()
            new_exec_nodes.remove(node)

            # Cut this path: the peak so far is already worse than the
            # best complete order found.
            if max_mem_count > mem_bound:
                continue

            view_of_temp = view_of.copy()
            # We don't want a shallow copy, but we don't want
            # a deep copy. So this does a "middle" copy, where
            # we copy the dict and the lists, but not the vars.
            viewed_by_temp = {}
            for k, v in viewed_by.iteritems():
                viewed_by_temp[k] = list(v)

            for var in node.outputs:
                compute_map[var][0] = 1

            mem_created = 0
            mem_freed = 0
            # Saved so the running peak can be restored on backtrack.
            max_storage = max_mem_count

            dmap = getattr(node.op, 'destroy_map', None)
            vmap = getattr(node.op, 'view_map', None)

            # Update the Python emulating dicts and add the
            # memory allocated by the node.
            for idx, out in enumerate(node.outputs):
                ins = None
                if dmap and idx in dmap:
                    vidx = dmap[idx]
                    assert len(vidx) == 1, (
                        "Here we only support the possibility to"
                        " destroy one input")
                    ins = node.inputs[vidx[0]]
                if vmap and idx in vmap:
                    assert ins is None
                    vidx = vmap[idx]
                    # Message fixed: this branch is about view_map.
                    assert len(vidx) == 1, (
                        "Here we only support the possibility to"
                        " view one input")
                    ins = node.inputs[vidx[0]]
                if ins is not None:
                    # This is needed for destroy_map in case it
                    # returns a partial view that is destroyed. So
                    # the output could be different than the
                    # input.
                    assert isinstance(ins, theano.Variable)
                    # We keep track of views only against the original.
                    origin = view_of_temp.get(ins, ins)
                    view_of_temp[out] = origin
                    viewed_by_temp[origin].append(out)
                else:
                    mem_created += var_mem[out]

            mem_count += mem_created
            max_mem_count = max(max_mem_count, mem_count)

            # Mimic the combination of Theano and Python gc.
            for ins in node.inputs:
                # We track the original var, so this shouldn't happen.
                assert not (ins in view_of_temp and
                            viewed_by_temp[ins])
                if (dependencies[ins] and
                        ins not in fgraph.outputs and
                        ins.owner and
                        all([compute_map[v][0]
                             for v in dependencies[ins]])):
                    if (ins not in view_of_temp and
                            not viewed_by_temp.get(ins, [])):
                        mem_freed += var_mem[ins]
                    elif ins in view_of_temp:
                        origin = view_of_temp[ins]
                        viewed_by_temp[origin].remove(ins)
                        if (not viewed_by_temp[origin] and
                                origin not in fgraph.inputs and
                                not isinstance(origin, theano.Constant)):
                            mem_freed += var_mem[origin]
                # Otherwise ins is still needed or is viewed by
                # something else, so its memory isn't freed.

            mem_count -= mem_freed

            for var in node.outputs:
                for c, _ in var.clients:
                    if c != "output" and check_node_state(c):
                        new_exec_nodes.add(c)

            if not new_exec_nodes:
                # A complete order has been built.
                yield [node]
                # Check and update mem_bound
                if max_mem_count < mem_bound:
                    mem_bound = max_mem_count
            else:
                for p in min_memory_generator(new_exec_nodes,
                                              viewed_by_temp,
                                              view_of_temp):
                    yield [node] + p

            # Backtrack: reset the tracking variables so the next
            # candidate node starts from the same state.
            mem_count -= mem_created
            max_mem_count = max_storage
            mem_count += mem_freed
            for var in node.outputs:
                compute_map[var][0] = 0

    # Two data structures used to mimic Python gc.
    # viewed_by: {var1: [vars that view var1]}
    # The len of the list is the value of the Python ref count. But we
    # use a list, not just the ref count value. This is safer and helps
    # detect potential bugs in the algorithm.
    viewed_by = {}
    for var in fgraph.variables:
        viewed_by[var] = []
    # view_of: {var1: original var viewed by var1}
    # "Original" means that we don't keep track of all the intermediate
    # relationships in the view chain.
    view_of = {}

    # Loop over all valid orders; the min peak is stored in mem_bound.
    for order in min_memory_generator(executable_nodes,
                                      viewed_by, view_of):
        continue

    return mem_bound
for
fgraph
,
nodes_mem
in
fct_memory
.
iteritems
():
# Sum of the size of all variables in bytes
sum_size
=
sum
([
sum
([
v
for
v
in
val
if
not
isinstance
(
v
,
str
)])
for
key
,
val
in
nodes_mem
.
iteritems
()])
for
key
,
val
in
nodes_mem
.
iteritems
()])
order
=
fgraph
.
toposort
()
# A list of intermediate variable that are not need
# after the execution of the corresponding node.
# It mean that after executing the node,
# the corresponding variable can be gc.
post_thunk_old_storage
=
[]
computed
,
last_user
=
theano
.
gof
.
link
.
gc_helper
(
order
)
for
node
in
order
:
post_thunk_old_storage
.
append
([
input_idx
for
input_idx
,
input
in
enumerate
(
node
.
inputs
)
if
(
input
in
computed
)
and
(
input
not
in
fgraph
.
outputs
)
and
node
==
last_user
[
input
]])
old_running_memory
=
count_running_memory
(
order
,
post_thunk_old_storage
,
nodes_mem
)
old_running_memory
=
count_running_memory
(
order
,
fgraph
,
nodes_mem
)
new_order
=
fgraph
.
profile
.
node_executed_order
# A list of new executed node order
new_storage
=
fgraph
.
profile
.
node_cleared_order
# A list of variables that get freed
new_running_memory
=
count_running_memory
(
new_order
,
new_storage
,
nodes_mem
)
new_running_memory
=
count_running_memory
(
new_order
,
fgraph
,
nodes_mem
)
# Store the max of some stats by any function in this profile.
max_sum_size
=
max
(
max_sum_size
,
sum_size
)
max_node_memory_size
=
max
(
max_node_memory_size
,
old_running_memory
[
0
])
max_node_memory_size
=
max
(
max_node_memory_size
,
old_running_memory
[
0
])
max_running_max_memory_size
=
max
(
max_running_max_memory_size
,
old_running_memory
[
2
])
old_running_memory
[
2
])
max_node_memory_saved_by_view
=
max
(
max_node_memory_saved_by_view
,
old_running_memory
[
4
])
max_node_memory_saved_by_inplace
=
max
(
max_node_memory_saved_by_inplace
,
old_running_memory
[
3
])
# Store max of some stats with new order
new_max_node_memory_size
=
max
(
new_max_node_memory_size
,
new_running_memory
[
0
])
new_max_node_memory_size
=
max
(
new_max_node_memory_size
,
new_running_memory
[
0
])
new_max_running_max_memory_size
=
max
(
new_max_running_max_memory_size
,
new_running_memory
[
2
])
new_running_memory
[
2
])
new_max_node_memory_saved_by_view
=
max
(
new_max_node_memory_saved_by_view
,
new_running_memory
[
4
])
new_running_memory
[
4
])
new_max_node_memory_saved_by_inplace
=
max
(
new_max_node_memory_saved_by_inplace
,
new_running_memory
[
3
])
del
fgraph
,
nodes_mem
,
post_thunk_old_storage
,
node
# Config: whether print min memory peak
if
config
.
profiling
.
min_peak_memory
:
node_list
=
fgraph
.
apply_nodes
min_peak
=
count_minimum_peak
(
node_list
,
fgraph
,
nodes_mem
)
min_max_peak
=
max
(
min_max_peak
,
min_peak
)
del
fgraph
,
nodes_mem
if
len
(
fct_memory
)
>
1
:
print
>>
file
,
(
"Memory Profile "
...
...
@@ -760,6 +1002,9 @@ class ProfileStats(object):
print
>>
file
,
" Max if linker=cvm(default):
%
dKB (
%
dKB)"
%
(
int
(
round
(
new_max_running_max_memory_size
/
1024.
)),
int
(
round
(
max_running_max_memory_size
/
1024.
)))
if
min_max_peak
:
print
>>
file
,
" Minimum peak from all valid apply node order is
%
dKB"
%
int
(
round
(
min_max_peak
/
1024.
))
print
>>
file
,
" Memory saved if views are used:
%
dKB (
%
dKB)"
%
(
int
(
round
(
new_max_node_memory_saved_by_view
/
1024.
)),
int
(
round
(
max_node_memory_saved_by_view
/
1024.
)))
...
...
@@ -837,6 +1082,7 @@ class ProfileStats(object):
" emitted in those cases."
)
print
>>
file
,
''
def
summary
(
self
,
file
=
sys
.
stderr
,
n_ops_to_print
=
20
,
n_apply_to_print
=
20
):
self
.
summary_function
(
file
)
...
...
@@ -857,6 +1103,8 @@ class ProfileStats(object):
self
.
optimizer_profile
[
1
])
if
0
:
# old code still to be ported from ProfileMode
def
long_print
(
self
,
file
=
sys
.
stderr
,
fct_name
=
None
,
message
=
None
,
n_apply_to_print
=
15
,
n_ops_to_print
=
20
,
print_apply
=
False
):
...
...
@@ -1157,6 +1405,8 @@ if 0: # old code still to be ported from ProfileMode
n_ops_to_print
=
n_ops_to_print
,
print_apply
=
False
)
class
ScanProfileStats
(
ProfileStats
):
callcount
=
0.0
nbsteps
=
0.0
...
...
theano/compile/tests/test_profiling.py
浏览文件 @
feff4f12
...
...
@@ -2,36 +2,57 @@
Test of memory profiling
"""
import
StringIO
import
numpy
import
theano
import
theano.tensor
as
T
import
StringIO
def
test_profiling
():
old1
=
theano
.
config
.
profile
old2
=
theano
.
config
.
profile_memory
config1
=
theano
.
config
.
profile
config2
=
theano
.
config
.
profile_memory
config3
=
theano
.
config
.
profiling
.
min_peak_memory
try
:
theano
.
config
.
profile
=
True
theano
.
config
.
profile_memory
=
True
theano
.
config
.
profiling
.
min_peak_memory
=
True
x
=
[
T
.
dvector
(
"val
%
i"
%
i
)
for
i
in
range
(
3
)]
z
=
[]
z
+=
[
T
.
outer
(
x
[
i
],
x
[
i
+
1
])
.
sum
(
axis
=
1
)
for
i
in
range
(
len
(
x
)
-
1
)]
z
+=
[
x
[
i
]
+
x
[
i
+
1
]
for
i
in
range
(
len
(
x
)
-
1
)]
x
=
T
.
dvector
(
"x"
)
y
=
T
.
dvector
(
"y"
)
z
=
x
+
y
p
=
theano
.
ProfileStats
(
False
)
if
theano
.
config
.
mode
in
[
"DebugMode"
,
"DEBUG_MODE"
]:
m
=
"FAST_RUN"
else
:
m
=
None
f
=
theano
.
function
([
x
,
y
],
z
,
profile
=
p
,
name
=
"test_profiling"
,
f
=
theano
.
function
(
x
,
z
,
profile
=
p
,
name
=
"test_profiling"
,
mode
=
m
)
output
=
f
([
1
,
2
,
3
,
4
],
[
1
,
1
,
1
,
1
])
inp
=
[
numpy
.
arange
(
1024
)
+
1
for
i
in
range
(
len
(
x
))]
output
=
f
(
*
inp
)
buf
=
StringIO
.
StringIO
()
f
.
profile
.
summary
(
buf
)
# regression testing for future algo speed up
the_string
=
buf
.
getvalue
()
lines1
=
[
l
for
l
in
the_string
.
split
(
"
\n
"
)
if
"Max if linker"
in
l
]
lines2
=
[
l
for
l
in
the_string
.
split
(
"
\n
"
)
if
"Minimum peak"
in
l
]
assert
"Max if linker=cvm(default): 8224KB (16408KB)"
in
the_string
,
(
lines1
,
lines2
)
assert
"Minimum peak from all valid apply node order is 8208KB"
in
the_string
,
(
lines1
,
lines2
)
finally
:
theano
.
config
.
profile
=
old1
theano
.
config
.
profile_memory
=
old2
theano
.
config
.
profile
=
config1
theano
.
config
.
profile_memory
=
config2
theano
.
config
.
profiling
.
min_peak_memory
=
config3
if
__name__
==
'__main__'
:
...
...
theano/gof/vm.py
浏览文件 @
feff4f12
...
...
@@ -149,6 +149,9 @@ class VM(object):
if
hasattr
(
self
,
'node_cleared_order'
):
profile
.
node_cleared_order
=
self
.
node_cleared_order
[:]
if
hasattr
(
self
,
'dependencies'
):
profile
.
dependencies
=
self
.
dependencies
.
copy
()
# clear the timer info out of the buffers
for
i
in
xrange
(
len
(
self
.
call_times
)):
self
.
call_times
[
i
]
=
0.0
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论