Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
0fa004e1
提交
0fa004e1
authored
12月 29, 2020
作者:
Michael Osthege
提交者:
Brandon T. Willard
1月 02, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Replace custom locking mechanism with filelock
Closes #203
上级
a2f9752f
隐藏空白字符变更
内嵌
并排
正在显示
4 个修改的文件
包含
45 行增加
和
347 行删除
+45
-347
theano_cache.py
bin/theano_cache.py
+1
-1
requirements.txt
requirements.txt
+1
-0
setup.py
setup.py
+1
-1
compilelock.py
theano/compile/compilelock.py
+42
-345
没有找到文件。
bin/theano_cache.py
浏览文件 @
0fa004e1
...
...
@@ -87,7 +87,7 @@ def main():
cache
=
get_module_cache
(
init_args
=
dict
(
do_refresh
=
False
))
cache
.
clear_old
()
elif
sys
.
argv
[
1
]
==
"unlock"
:
theano
.
compile
.
compilelock
.
force_unlock
()
theano
.
compile
.
compilelock
.
force_unlock
(
config
.
compiledir
)
print
(
"Lock successfully removed!"
)
elif
sys
.
argv
[
1
]
==
"purge"
:
theano
.
compile
.
compiledir
.
compiledir_purge
()
...
...
requirements.txt
浏览文件 @
0fa004e1
-e ./
filelock
flake8
==3.8.4
pep8
pyflakes
...
...
setup.py
浏览文件 @
0fa004e1
...
...
@@ -57,7 +57,7 @@ if __name__ == "__main__":
license
=
LICENSE
,
platforms
=
PLATFORMS
,
packages
=
find_packages
(
exclude
=
[
"tests.*"
]),
install_requires
=
[
"numpy>=1.9.1"
,
"scipy>=0.14"
],
install_requires
=
[
"numpy>=1.9.1"
,
"scipy>=0.14"
,
"filelock"
],
package_data
=
{
""
:
[
"*.txt"
,
...
...
theano/compile/compilelock.py
浏览文件 @
0fa004e1
...
...
@@ -2,14 +2,12 @@
Locking mechanism to ensure no two compilations occur simultaneously
in the same compilation directory (which can cause crashes).
"""
import
atexit
import
logging
import
os
import
socket
import
t
ime
import
threading
import
t
yping
from
contextlib
import
contextmanager
import
numpy
as
np
import
filelock
from
theano.configdefaults
import
config
...
...
@@ -20,362 +18,61 @@ __all__ = [
]
_random
=
np
.
random
.
RandomState
([
2015
,
8
,
2
])
local_mem
=
threading
.
local
()
local_mem
.
_locks
:
typing
.
Dict
[
str
,
bool
]
=
{}
_logger
=
logging
.
getLogger
(
"theano.compile.compilelock"
)
# If the user provided a logging level, we don't want to override it.
if
_logger
.
level
==
logging
.
NOTSET
:
# INFO will show the "Refreshing lock" messages
_logger
.
setLevel
(
logging
.
INFO
)
def force_unlock():
    """
    Delete the compilation lock if someone else has it.

    The lock is requested through `get_lock` with a zero timeout and
    near-zero waiting times, and is handed back immediately afterwards
    through `release_lock`.
    """
    # Same keyword arguments, in the same order, as a direct call would pass.
    acquire_options = {"min_wait": 0, "max_wait": 0.001, "timeout": 0}
    get_lock(**acquire_options)
    release_lock()
@contextmanager
def lock_ctx(lock_dir=None, **kw):
    """
    Context manager holding the compilation lock for the duration of a block.

    Parameters
    ----------
    lock_dir
        Forwarded unchanged to `get_lock` as its ``lock_dir`` argument.
    kw
        Additional keyword arguments forwarded unchanged to `get_lock`.

    Notes
    -----
    The lock is released with `release_lock` when the block exits, whether
    it finishes normally or raises; an exception raised inside the block is
    propagated to the caller after the release.
    """
    get_lock(lock_dir=lock_dir, **kw)
    try:
        yield
    finally:
        # Without the ``finally`` an exception in the ``with`` body would skip
        # the release and leave the lock (and its counter) held.
        release_lock()
# We define this name with an underscore so that python shutdown
# deletes this before non-underscore names (like os). We need to do
# it this way to avoid errors on shutdown.
def
_get_lock
(
lock_dir
=
None
,
**
kw
):
"""
Obtain lock on compilation directory.
def
force_unlock
(
lock_dir
:
os
.
PathLike
):
"""Forces the release of the lock on a specific directory.
Parameters
----------
kw
Additional arguments to be forwarded to the `lock` function when
acquiring the lock.
Notes
-----
We can lock only on 1 directory at a time.
"""
if
lock_dir
is
None
:
lock_dir
=
os
.
path
.
join
(
config
.
compiledir
,
"lock_dir"
)
if
not
hasattr
(
get_lock
,
"n_lock"
):
# Initialization.
get_lock
.
n_lock
=
0
get_lock
.
lock_dir
=
lock_dir
get_lock
.
unlocker
=
Unlocker
(
get_lock
.
lock_dir
)
else
:
if
lock_dir
!=
get_lock
.
lock_dir
:
# Compilation directory has changed.
# First ensure all old locks were released.
assert
get_lock
.
n_lock
==
0
# Update members for new compilation directory.
get_lock
.
lock_dir
=
lock_dir
get_lock
.
unlocker
=
Unlocker
(
get_lock
.
lock_dir
)
# Only really try to acquire the lock if we do not have it already.
if
get_lock
.
n_lock
==
0
:
lock
(
get_lock
.
lock_dir
,
**
kw
)
atexit
.
register
(
Unlocker
.
unlock
,
get_lock
.
unlocker
)
# Store time at which the lock was set.
get_lock
.
start_time
=
time
.
time
()
else
:
# Check whether we need to 'refresh' the lock. We do this
# every 'config.compile__timeout / 2' seconds to ensure
# no one else tries to override our lock after their
# 'config.compile__timeout' timeout period.
if
get_lock
.
start_time
is
None
:
# This should not happen. So if this happen, clean up
# the lock state and raise an error.
while
get_lock
.
n_lock
>
0
:
release_lock
()
raise
Exception
(
"For some unknow reason, the lock was already "
"taken, but no start time was registered."
)
now
=
time
.
time
()
if
now
-
get_lock
.
start_time
>
config
.
compile__timeout
/
2
:
lockpath
=
os
.
path
.
join
(
get_lock
.
lock_dir
,
"lock"
)
_logger
.
info
(
f
"Refreshing lock {lockpath}"
)
refresh_lock
(
lockpath
)
get_lock
.
start_time
=
now
get_lock
.
n_lock
+=
1
get_lock
=
_get_lock
def
release_lock
():
lock_dir : os.PathLike
Path to a directory that was locked with `lock_ctx`.
"""
Release lock on compilation directory.
"""
get_lock
.
n_lock
-=
1
assert
get_lock
.
n_lock
>=
0
# Only really release lock once all lock requests have ended.
if
get_lock
.
n_lock
==
0
:
get_lock
.
start_time
=
None
get_lock
.
unlocker
.
unlock
(
force
=
False
)
fl
=
filelock
.
FileLock
(
os
.
path
.
join
(
lock_dir
,
".lock"
))
fl
.
release
(
force
=
True
)
# This is because None is a valid input for timeout
notset
=
object
()
dir_key
=
f
"{lock_dir}-{os.getpid()}"
if
dir_key
in
local_mem
.
_locks
:
del
local_mem
.
_locks
[
dir_key
]
def
lock
(
tmp_dir
,
timeout
=
notset
,
min_wait
=
None
,
max_wait
=
None
,
verbosity
=
1
):
"""
Obtain lock access by creating a given temporary directory (whose base will
be created if needed, but will not be deleted after the lock is removed).
If access is refused by the same lock owner during more than 'timeout'
seconds, then the current lock is overridden. If timeout is None, then no
timeout is performed.
The lock is performed by creating a 'lock' file in 'tmp_dir' that contains
a unique id identifying the owner of the lock (the process id, followed by
a random string).
When there is already a lock, the process sleeps for a random amount of
time between min_wait and max_wait seconds before trying again.
If 'verbosity' is >= 1, then a message will be displayed when we need to
wait for the lock. If it is set to a value >1, then this message will be
displayed each time we re-check for the presence of the lock. Otherwise it
is displayed only when we notice the lock's owner has changed.
@contextmanager
def
lock_ctx
(
lock_dir
:
os
.
PathLike
=
None
,
*
,
timeout
:
typing
.
Optional
[
float
]
=
-
1
):
"""Context manager that wraps around FileLock and SoftFileLock from filelock package.
Parameters
----------
tmp_dir : str
Lock directory that will be created when acquiring the lock.
timeout : int or None
Time (in seconds) to wait before replacing an existing lock (default
config 'compile__timeout').
min_wait: int
Minimum time (in seconds) to wait before trying again to get the lock
(default config 'compile__wait').
max_wait: int
Maximum time (in seconds) to wait before trying again to get the lock
(default 2 * min_wait).
verbosity : int
Amount of feedback displayed to screen (default 1).
lock_dir : str
A directory for which to acquire the lock.
Defaults to the config.compiledir.
timeout : float
Timeout in seconds for waiting in lock acquisition.
Defaults to config.compile__timeout.
"""
if
min_wait
is
None
:
min_wait
=
config
.
compile__wait
if
max_wait
is
None
:
max_wait
=
min_wait
*
2
if
timeout
is
notset
:
if
lock_dir
is
None
:
lock_dir
=
config
.
compiledir
if
timeout
==
-
1
:
timeout
=
config
.
compile__timeout
# Create base of lock directory if required.
base_lock
=
os
.
path
.
dirname
(
tmp_dir
)
if
not
os
.
path
.
isdir
(
base_lock
):
try
:
os
.
makedirs
(
base_lock
)
except
OSError
:
# Someone else was probably trying to create it at the same time.
# We wait two seconds just to make sure the following assert does
# not fail on some NFS systems.
time
.
sleep
(
2
)
assert
os
.
path
.
isdir
(
base_lock
)
elif
timeout
is
not
None
or
timeout
<=
0
:
raise
ValueError
(
f
"Timeout parameter must be None or positive. Got {timeout}."
)
# Variable initialization.
lock_file
=
os
.
path
.
join
(
tmp_dir
,
"lock"
)
hostname
=
socket
.
gethostname
()
my_pid
=
os
.
getpid
()
no_display
=
verbosity
==
0
# locks are kept in a dictionary to account for changing compiledirs
dir_key
=
f
"{lock_dir}-{os.getpid()}"
nb_error
=
0
# The number of time we sleep when their is no errors.
# Used to don't display it the first time to display it less frequently.
# And so don't get as much email about this!
nb_wait
=
0
# Acquire lock.
while
True
:
if
dir_key
not
in
local_mem
.
_locks
:
local_mem
.
_locks
[
dir_key
]
=
True
fl
=
filelock
.
FileLock
(
os
.
path
.
join
(
lock_dir
,
".lock"
))
fl
.
acquire
(
timeout
=
timeout
)
try
:
last_owner
=
"no_owner"
time_start
=
time
.
time
()
other_dead
=
False
while
os
.
path
.
isdir
(
tmp_dir
):
try
:
with
open
(
lock_file
)
as
f
:
read_owner
=
f
.
readlines
()[
0
]
.
strip
()
# The try is transition code for old locks.
# It may be removed when people have upgraded.
try
:
other_host
=
read_owner
.
split
(
"_"
)[
2
]
except
IndexError
:
other_host
=
()
# make sure it isn't equal to any host
if
other_host
==
hostname
:
try
:
# Just check if the other process still exist.
os
.
kill
(
int
(
read_owner
.
split
(
"_"
)[
0
]),
0
)
except
OSError
:
other_dead
=
True
except
AttributeError
:
pass
# os.kill does not exist on windows
except
Exception
:
read_owner
=
"failure"
if
other_dead
:
if
not
no_display
:
msg
=
f
"process '{read_owner.split('_')[0]}'"
_logger
.
warning
(
f
"Overriding existing lock by dead {msg} "
f
"(I am process '{my_pid}')"
,
)
get_lock
.
unlocker
.
unlock
(
force
=
True
)
continue
if
last_owner
==
read_owner
:
if
timeout
is
not
None
and
time
.
time
()
-
time_start
>=
timeout
:
# Timeout exceeded or locking process dead.
if
not
no_display
:
if
read_owner
==
"failure"
:
msg
=
"unknown process"
else
:
msg
=
f
"process '{read_owner.split('_')[0]}'"
_logger
.
warning
(
f
"Overriding existing lock by {msg} "
f
"(I am process '{my_pid}')"
,
)
get_lock
.
unlocker
.
unlock
(
force
=
True
)
continue
else
:
last_owner
=
read_owner
time_start
=
time
.
time
()
no_display
=
verbosity
==
0
if
not
no_display
and
nb_wait
>
0
:
if
read_owner
==
"failure"
:
msg
=
"unknown process"
else
:
msg
=
f
"process '{read_owner.split('_')[0]}'"
_logger
.
info
(
f
"Waiting for existing lock by {msg} (I am "
f
"process '{my_pid}')"
,
)
_logger
.
info
(
f
"To manually release the lock, delete {tmp_dir}"
)
if
verbosity
<=
1
:
no_display
=
True
nb_wait
+=
1
time
.
sleep
(
_random
.
uniform
(
min_wait
,
max_wait
))
try
:
os
.
mkdir
(
tmp_dir
)
except
FileExistsError
:
# Error while creating the directory: someone else
# must have tried at the exact same time.
nb_error
+=
1
if
nb_error
<
10
:
continue
else
:
raise
# Safety check: the directory should be here.
assert
os
.
path
.
isdir
(
tmp_dir
)
# Write own id into lock file.
unique_id
=
refresh_lock
(
lock_file
)
# Verify we are really the lock owner (this should not be needed,
# but better be safe than sorry).
with
open
(
lock_file
)
as
f
:
owner
=
f
.
readlines
()[
0
]
.
strip
()
if
owner
!=
unique_id
:
# Too bad, try again.
continue
else
:
# We got the lock, hoorray!
return
except
Exception
as
e
:
# If something wrong happened, we try again.
_logger
.
warning
(
f
"Something wrong happened: {type(e)} {e}"
)
nb_error
+=
1
if
nb_error
>
10
:
raise
time
.
sleep
(
_random
.
uniform
(
min_wait
,
max_wait
))
continue
def refresh_lock(lock_file):
    """
    'Refresh' an existing lock by overwriting its owner file with a new id.

    The id has the form ``<pid>_<ten random digits>_<hostname>`` and is
    written to `lock_file` followed by a newline.

    Parameters
    ----------
    lock_file
        Path of the file that stores the lock owner's unique id.

    Returns
    -------
    str
        The newly generated unique id.

    Raises
    ------
    Exception
        Whatever error occurred while writing `lock_file`; it is re-raised
        after every lock currently counted by `get_lock.n_lock` has been
        released.
    """
    # Gather the three components in the same order as they are formatted,
    # so the sequence of calls (including the random draws) is unchanged.
    pid = os.getpid()
    digits = "".join(str(_random.randint(0, 9)) for _ in range(10))
    host = socket.gethostname()
    unique_id = "{}_{}_{}".format(pid, digits, host)

    try:
        with open(lock_file, "w") as handle:
            handle.write(f"{unique_id}\n")
    except Exception:
        # In some strange cases this happens. To prevent every test from
        # failing, release the lock first, but keep the original exception
        # so that only one test fails.
        while get_lock.n_lock > 0:
            release_lock()
        _logger.warning(
            "Refreshing lock failed, we release the"
            " lock before raising again the exception"
        )
        raise

    return unique_id
class Unlocker:
    """
    Class wrapper around the release mechanism of a lock directory.

    An instance remembers the lock directory and `unlock` removes the
    ``lock`` file inside it and then the directory itself. The class does
    not define ``__del__``; releasing at program exit relies on `unlock`
    being called or registered (e.g. with ``atexit``) by the locking code.

    Parameters
    ----------
    tmp_dir
        Path of the lock directory handled by this instance.
    """

    def __init__(self, tmp_dir):
        self.tmp_dir = tmp_dir

    def unlock(self, force=False):
        """
        Remove current lock.

        This function does not crash if it is unable to properly
        delete the lock file and directory. The reason is that it
        should be allowed for multiple jobs running in parallel to
        unlock the same directory at the same time (e.g. when reaching
        their timeout limit).

        Parameters
        ----------
        force : bool
            If False (default), the lock is only removed when the owner id
            stored in the lock file names this process (pid) on this host.
            If True, the lock is removed without checking the owner.
        """
        lock_file = os.path.join(self.tmp_dir, "lock")

        if not force:
            # Check that nobody else took over our lock in the meantime.
            try:
                with open(lock_file) as f:
                    owner = f.readlines()[0].strip()
                    # The id is "<pid>_<random digits>_<hostname>". Only the
                    # hostname may itself contain underscores, so split at
                    # most twice; an unbounded split would fail to unpack
                    # for such hosts and silently skip the ownership check.
                    pid, _, hname = owner.split("_", 2)
                    if pid != str(os.getpid()) or hname != socket.gethostname():
                        return
            except Exception:
                # Missing, empty or malformed lock file: fall through and
                # attempt the clean-up anyway (best effort).
                pass

        # If any error occurs below, we assume this is because someone else
        # tried to unlock this directory at the same time.
        # The two removals are deliberately kept in separate try/except
        # blocks: even if removing the file fails (e.g. because it does not
        # exist), we still want to try and remove the directory.
        try:
            os.remove(lock_file)
        except Exception:
            pass
        try:
            os.rmdir(self.tmp_dir)
        except Exception:
            pass
yield
finally
:
if
fl
.
is_locked
:
fl
.
release
()
if
dir_key
in
local_mem
.
_locks
:
del
local_mem
.
_locks
[
dir_key
]
else
:
yield
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论