Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
f1cb3da8
提交
f1cb3da8
authored
5月 15, 2015
作者:
--global
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Move gpu tests from T_Scan to ScanGpuTests
上级
c3f94f29
显示空白字符变更
内嵌
并排
正在显示
1 个修改的文件
包含
429 行增加
和
400 行删除
+429
-400
test_scan.py
theano/scan_module/tests/test_scan.py
+429
-400
没有找到文件。
theano/scan_module/tests/test_scan.py
浏览文件 @
f1cb3da8
...
@@ -432,207 +432,6 @@ class T_Scan(unittest.TestCase):
...
@@ -432,207 +432,6 @@ class T_Scan(unittest.TestCase):
theano_values
=
f2
(
v_u
,
v_x0
,
W_in
,
W
)
theano_values
=
f2
(
v_u
,
v_x0
,
W_in
,
W
)
utt
.
assert_allclose
(
theano_values
,
v_out
)
utt
.
assert_allclose
(
theano_values
,
v_out
)
# as test_one_sequence_one_output_weights, but on the gpu
# This first version test the first case in the optimizer to the gpu.
def
test_one_sequence_one_output_weights_gpu1
(
self
):
from
theano.sandbox
import
cuda
if
not
cuda
.
cuda_available
:
raise
SkipTest
(
'Optional package cuda disabled'
)
def
f_rnn
(
u_t
,
x_tm1
,
W_in
,
W
):
return
u_t
*
W_in
+
x_tm1
*
W
u
=
theano
.
tensor
.
fvector
(
'u'
)
x0
=
theano
.
tensor
.
fscalar
(
'x0'
)
W_in
=
theano
.
tensor
.
fscalar
(
'win'
)
W
=
theano
.
tensor
.
fscalar
(
'w'
)
# The following line is needed to have the first case being used
# Otherwise, it is the second that is tested.
mode
=
mode_with_gpu
.
excluding
(
'InputToGpuOptimizer'
)
output
,
updates
=
theano
.
scan
(
f_rnn
,
u
,
x0
,
[
W_in
,
W
],
n_steps
=
None
,
truncate_gradient
=-
1
,
go_backwards
=
False
,
mode
=
mode
)
output
=
theano
.
sandbox
.
cuda
.
gpu_from_host
(
output
)
f2
=
theano
.
function
([
u
,
x0
,
W_in
,
W
],
output
,
updates
=
updates
,
allow_input_downcast
=
True
,
mode
=
mode_with_gpu
)
# get random initial values
rng
=
numpy
.
random
.
RandomState
(
utt
.
fetch_seed
())
v_u
=
rng
.
uniform
(
size
=
(
4
,),
low
=-
5.
,
high
=
5.
)
v_x0
=
rng
.
uniform
()
W
=
rng
.
uniform
()
W_in
=
rng
.
uniform
()
v_u
=
numpy
.
asarray
(
v_u
,
dtype
=
'float32'
)
v_x0
=
numpy
.
asarray
(
v_x0
,
dtype
=
'float32'
)
W
=
numpy
.
asarray
(
W
,
dtype
=
'float32'
)
W_in
=
numpy
.
asarray
(
W_in
,
dtype
=
'float32'
)
# compute the output in numpy
v_out
=
numpy
.
zeros
((
4
,))
v_out
[
0
]
=
v_u
[
0
]
*
W_in
+
v_x0
*
W
for
step
in
xrange
(
1
,
4
):
v_out
[
step
]
=
v_u
[
step
]
*
W_in
+
v_out
[
step
-
1
]
*
W
theano_values
=
f2
(
v_u
,
v_x0
,
W_in
,
W
)
utt
.
assert_allclose
(
theano_values
,
v_out
)
# TO DEL
topo
=
f2
.
maker
.
fgraph
.
toposort
()
scan_node
=
[
node
for
node
in
topo
if
isinstance
(
node
.
op
,
theano
.
scan_module
.
scan_op
.
Scan
)]
assert
len
(
scan_node
)
==
1
scan_node
=
scan_node
[
0
]
topo
=
f2
.
maker
.
fgraph
.
toposort
()
assert
sum
([
isinstance
(
node
.
op
,
theano
.
sandbox
.
cuda
.
HostFromGpu
)
for
node
in
topo
])
==
0
assert
sum
([
isinstance
(
node
.
op
,
theano
.
sandbox
.
cuda
.
GpuFromHost
)
for
node
in
topo
])
==
4
scan_node
=
[
node
for
node
in
topo
if
isinstance
(
node
.
op
,
theano
.
scan_module
.
scan_op
.
Scan
)]
assert
len
(
scan_node
)
==
1
scan_node
=
scan_node
[
0
]
scan_node_topo
=
scan_node
.
op
.
fn
.
maker
.
fgraph
.
toposort
()
# check that there is no gpu transfer in the inner loop.
assert
any
([
isinstance
(
node
.
op
,
theano
.
sandbox
.
cuda
.
GpuElemwise
)
for
node
in
scan_node_topo
])
assert
not
any
([
isinstance
(
node
.
op
,
theano
.
sandbox
.
cuda
.
HostFromGpu
)
for
node
in
scan_node_topo
])
assert
not
any
([
isinstance
(
node
.
op
,
theano
.
sandbox
.
cuda
.
GpuFromHost
)
for
node
in
scan_node_topo
])
# This second version test the second case in the optimizer to the gpu.
def
test_one_sequence_one_output_weights_gpu2
(
self
):
from
theano.sandbox
import
cuda
if
not
cuda
.
cuda_available
:
raise
SkipTest
(
'Optional package cuda disabled'
)
def
f_rnn
(
u_t
,
x_tm1
,
W_in
,
W
):
return
u_t
*
W_in
+
x_tm1
*
W
u
=
theano
.
tensor
.
fvector
(
'u'
)
x0
=
theano
.
tensor
.
fscalar
(
'x0'
)
W_in
=
theano
.
tensor
.
fscalar
(
'win'
)
W
=
theano
.
tensor
.
fscalar
(
'w'
)
output
,
updates
=
theano
.
scan
(
f_rnn
,
u
,
x0
,
[
W_in
,
W
],
n_steps
=
None
,
truncate_gradient
=-
1
,
go_backwards
=
False
,
mode
=
mode_with_gpu
)
f2
=
theano
.
function
([
u
,
x0
,
W_in
,
W
],
output
,
updates
=
updates
,
allow_input_downcast
=
True
,
mode
=
mode_with_gpu
)
# get random initial values
rng
=
numpy
.
random
.
RandomState
(
utt
.
fetch_seed
())
v_u
=
rng
.
uniform
(
size
=
(
4
,),
low
=-
5.
,
high
=
5.
)
v_x0
=
rng
.
uniform
()
W
=
rng
.
uniform
()
W_in
=
rng
.
uniform
()
# compute the output in numpy
v_out
=
numpy
.
zeros
((
4
,))
v_out
[
0
]
=
v_u
[
0
]
*
W_in
+
v_x0
*
W
for
step
in
xrange
(
1
,
4
):
v_out
[
step
]
=
v_u
[
step
]
*
W_in
+
v_out
[
step
-
1
]
*
W
theano_values
=
f2
(
v_u
,
v_x0
,
W_in
,
W
)
utt
.
assert_allclose
(
theano_values
,
v_out
)
topo
=
f2
.
maker
.
fgraph
.
toposort
()
assert
sum
([
isinstance
(
node
.
op
,
theano
.
sandbox
.
cuda
.
HostFromGpu
)
for
node
in
topo
])
==
1
assert
sum
([
isinstance
(
node
.
op
,
theano
.
sandbox
.
cuda
.
GpuFromHost
)
for
node
in
topo
])
==
4
scan_node
=
[
node
for
node
in
topo
if
isinstance
(
node
.
op
,
theano
.
scan_module
.
scan_op
.
Scan
)]
assert
len
(
scan_node
)
==
1
scan_node
=
scan_node
[
0
]
scan_node_topo
=
scan_node
.
op
.
fn
.
maker
.
fgraph
.
toposort
()
# check that there is no gpu transfer in the inner loop.
assert
any
([
isinstance
(
node
.
op
,
theano
.
sandbox
.
cuda
.
GpuElemwise
)
for
node
in
scan_node_topo
])
assert
not
any
([
isinstance
(
node
.
op
,
theano
.
sandbox
.
cuda
.
HostFromGpu
)
for
node
in
scan_node_topo
])
assert
not
any
([
isinstance
(
node
.
op
,
theano
.
sandbox
.
cuda
.
GpuFromHost
)
for
node
in
scan_node_topo
])
# This third test checks that scan can deal with a mixture of dtypes as
# outputs when is running on GPU
def
test_gpu3_mixture_dtype_outputs
(
self
):
from
theano.sandbox
import
cuda
if
not
cuda
.
cuda_available
:
raise
SkipTest
(
'Optional package cuda disabled'
)
def
f_rnn
(
u_t
,
x_tm1
,
W_in
,
W
):
return
(
u_t
*
W_in
+
x_tm1
*
W
,
tensor
.
cast
(
u_t
+
x_tm1
,
'int64'
))
u
=
theano
.
tensor
.
fvector
(
'u'
)
x0
=
theano
.
tensor
.
fscalar
(
'x0'
)
W_in
=
theano
.
tensor
.
fscalar
(
'win'
)
W
=
theano
.
tensor
.
fscalar
(
'w'
)
output
,
updates
=
theano
.
scan
(
f_rnn
,
u
,
[
x0
,
None
],
[
W_in
,
W
],
n_steps
=
None
,
truncate_gradient
=-
1
,
go_backwards
=
False
,
mode
=
mode_with_gpu
)
f2
=
theano
.
function
([
u
,
x0
,
W_in
,
W
],
output
,
updates
=
updates
,
allow_input_downcast
=
True
,
mode
=
mode_with_gpu
)
# get random initial values
rng
=
numpy
.
random
.
RandomState
(
utt
.
fetch_seed
())
v_u
=
rng
.
uniform
(
size
=
(
4
,),
low
=-
5.
,
high
=
5.
)
v_x0
=
rng
.
uniform
()
W
=
rng
.
uniform
()
W_in
=
rng
.
uniform
()
# compute the output in numpy
v_out1
=
numpy
.
zeros
((
4
,))
v_out2
=
numpy
.
zeros
((
4
,),
dtype
=
'int64'
)
v_out1
[
0
]
=
v_u
[
0
]
*
W_in
+
v_x0
*
W
v_out2
[
0
]
=
v_u
[
0
]
+
v_x0
for
step
in
xrange
(
1
,
4
):
v_out1
[
step
]
=
v_u
[
step
]
*
W_in
+
v_out1
[
step
-
1
]
*
W
v_out2
[
step
]
=
numpy
.
int64
(
v_u
[
step
]
+
v_out1
[
step
-
1
])
theano_out1
,
theano_out2
=
f2
(
v_u
,
v_x0
,
W_in
,
W
)
utt
.
assert_allclose
(
theano_out1
,
v_out1
)
utt
.
assert_allclose
(
theano_out2
,
v_out2
)
topo
=
f2
.
maker
.
fgraph
.
toposort
()
scan_node
=
[
node
for
node
in
topo
if
isinstance
(
node
.
op
,
theano
.
scan_module
.
scan_op
.
Scan
)]
assert
len
(
scan_node
)
==
1
scan_node
=
scan_node
[
0
]
assert
scan_node
.
op
.
gpu
# simple rnn, one input, one state, weights for each; input/state
# simple rnn, one input, one state, weights for each; input/state
# are vectors, weights are scalars; using shared variables
# are vectors, weights are scalars; using shared variables
def
test_one_sequence_one_output_weights_shared
(
self
):
def
test_one_sequence_one_output_weights_shared
(
self
):
...
@@ -1339,88 +1138,6 @@ class T_Scan(unittest.TestCase):
...
@@ -1339,88 +1138,6 @@ class T_Scan(unittest.TestCase):
theano_v
=
my_f
()
theano_v
=
my_f
()
utt
.
assert_allclose
(
theano_v
,
numpy_v
[
5
:,
:])
utt
.
assert_allclose
(
theano_v
,
numpy_v
[
5
:,
:])
def
test_inconsistent_inner_fct
(
self
):
# Test that scan can detect inconsistencies in the inner graph and
# raises an appropriate exception.
# This test has not been extensively tested for Python 3 so it should
# be skipped if python version is >=3
version
=
sys
.
version_info
if
version
>=
(
3
,):
raise
SkipTest
(
"This test relies on a pickled file produced with "
"Python 2. The current python version "
"(
%
i.
%
i.
%
i.
%
i) is >= 3 so the test will be "
"skipped."
%
(
version
.
major
,
version
.
minor
,
version
.
micro
,
version
.
serial
))
# The pickled scan op used in this test requires the use of a gpu
from
theano.sandbox
import
cuda
if
not
cuda
.
cuda_available
:
raise
SkipTest
(
'Optional package cuda disabled'
)
# When unpickled, the scan op should perform validation on its inner
# graph, detect the inconsistencies and raise a TypeError
folder
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
path
=
os
.
path
.
join
(
folder
,
"inconsistent_scan.pkl"
)
assert_raises
(
TypeError
,
cPickle
.
load
,
open
(
path
,
"r"
))
def
test_consistent_inner_fct
(
self
):
# Test that scan does not falsely detect inconsistencies in a valid
# inner graph
# The pickled scan op used in this test requires the use of a gpu
from
theano.sandbox
import
cuda
if
not
cuda
.
cuda_available
:
raise
SkipTest
(
'Optional package cuda disabled'
)
rs
=
theano
.
sandbox
.
rng_mrg
.
MRG_RandomStreams
(
use_cuda
=
True
)
output
,
_
=
theano
.
scan
(
lambda
:
rs
.
uniform
((
3
,),
dtype
=
"float32"
),
n_steps
=
3
)
cPickle
.
loads
(
cPickle
.
dumps
(
output
))
# Also ensure that, after compilation, the Scan has been moved
# on the gpu
fct
=
theano
.
function
([],
output
,
mode
=
mode_with_gpu
)
scan_nodes
=
self
.
scan_nodes_from_fct
(
fct
)
assert
len
(
scan_nodes
)
==
1
assert
(
scan_nodes
[
0
]
.
op
.
info
.
get
(
'gpu'
,
False
)
or
scan_nodes
[
0
]
.
op
.
info
.
get
(
'gpua'
,
False
))
def
test_cuda_gibbs_chain
(
self
):
from
theano.sandbox
import
cuda
if
not
cuda
.
cuda_available
:
raise
SkipTest
(
'Optional package cuda disabled'
)
rng
=
numpy
.
random
.
RandomState
(
utt
.
fetch_seed
())
v_vsample
=
numpy
.
array
(
rng
.
binomial
(
1
,
.
5
,
size
=
(
3
,
20
),),
dtype
=
'float32'
)
vsample
=
theano
.
shared
(
v_vsample
)
trng
=
theano
.
sandbox
.
rng_mrg
.
MRG_RandomStreams
(
utt
.
fetch_seed
())
def
f
(
vsample_tm1
):
return
trng
.
binomial
(
vsample_tm1
.
shape
,
n
=
1
,
p
=
0.3
,
dtype
=
'float32'
)
*
vsample_tm1
theano_vsamples
,
updates
=
theano
.
scan
(
f
,
[],
vsample
,
[],
n_steps
=
10
,
truncate_gradient
=-
1
,
go_backwards
=
False
,
mode
=
mode_with_gpu
)
my_f
=
theano
.
function
([],
theano_vsamples
[
-
1
],
updates
=
updates
,
allow_input_downcast
=
True
,
mode
=
mode_with_gpu
)
# I leave this to tested by debugmode, this test was anyway more of
# doest the graph compile kind of test
t_result
=
my_f
()
def
test_gibbs_chain
(
self
):
def
test_gibbs_chain
(
self
):
rng
=
numpy
.
random
.
RandomState
(
utt
.
fetch_seed
())
rng
=
numpy
.
random
.
RandomState
(
utt
.
fetch_seed
())
v_W
=
numpy
.
array
(
rng
.
rand
(
20
,
30
)
-
.
5
,
dtype
=
'float32'
)
v_W
=
numpy
.
array
(
rng
.
rand
(
20
,
30
)
-
.
5
,
dtype
=
'float32'
)
...
@@ -2880,86 +2597,7 @@ class T_Scan(unittest.TestCase):
...
@@ -2880,86 +2597,7 @@ class T_Scan(unittest.TestCase):
f2_vals
=
f2
(
x_val
)
f2_vals
=
f2
(
x_val
)
utt
.
assert_allclose
(
f_vals
,
f2_vals
)
utt
.
assert_allclose
(
f_vals
,
f2_vals
)
def
test_gpu_memory_usage
(
self
):
def
test_reduce_memory_consumption
(
self
):
# This test validates that the memory usage of the defined theano
# function is reasonnable when executed on the GPU. It checks for
# a bug in which one of scan's optimization was not applied which
# made the scan node compute large and unnecessary outputs which
# brought memory usage on the GPU to ~12G.
# The test must be performed on the GPU
from
theano.sandbox
import
cuda
if
not
cuda
.
cuda_available
:
raise
SkipTest
(
'Optional package cuda disabled'
)
# Dimensionality of input and output data (not one-hot coded)
n_in
=
100
n_out
=
100
# Number of neurons in hidden layer
n_hid
=
4000
# Number of minibatches
mb_size
=
2
# Time steps in minibatch
mb_length
=
200
# Define input variables
xin
=
tensor
.
ftensor3
(
name
=
'xin'
)
yout
=
tensor
.
ftensor3
(
name
=
'yout'
)
# Initialize the network parameters
floatX
=
theano
.
config
.
floatX
U
=
theano
.
shared
(
numpy
.
zeros
((
n_in
,
n_hid
),
dtype
=
"float32"
),
name
=
'W_xin_to_l1'
)
V
=
theano
.
shared
(
numpy
.
zeros
((
n_hid
,
n_hid
),
dtype
=
"float32"
),
name
=
'W_l1_to_l1'
)
W
=
theano
.
shared
(
numpy
.
zeros
((
n_hid
,
n_out
),
dtype
=
"float32"
),
name
=
'W_l1_to_l2'
)
nparams
=
[
U
,
V
,
W
]
# Build the forward pass
l1_base
=
tensor
.
dot
(
xin
,
U
)
def
scan_l
(
baseline
,
last_step
):
return
baseline
+
tensor
.
dot
(
last_step
,
V
)
zero_output
=
tensor
.
alloc
(
numpy
.
asarray
(
0.
,
dtype
=
"float32"
),
mb_size
,
n_hid
)
l1_out
,
_
=
theano
.
scan
(
scan_l
,
sequences
=
[
l1_base
],
outputs_info
=
[
zero_output
],
mode
=
mode_with_gpu
)
l2_out
=
tensor
.
dot
(
l1_out
,
W
)
# Compute the cost and take the gradient wrt params
cost
=
tensor
.
sum
((
l2_out
-
yout
)
**
2
)
grads
=
tensor
.
grad
(
cost
,
nparams
)
updates
=
zip
(
nparams
,
[
n
-
g
for
n
,
g
in
zip
(
nparams
,
grads
)])
# Compile the theano function
feval_backprop
=
theano
.
function
([
xin
,
yout
],
cost
,
updates
=
updates
,
mode
=
mode_with_gpu
)
# Validate that the PushOutScanOutput optimization has been applied
# by checking the number of outputs of the grad Scan node in the
# compiled function.
nodes
=
feval_backprop
.
maker
.
fgraph
.
toposort
()
scan_nodes
=
[
n
for
n
in
nodes
if
isinstance
(
n
.
op
,
theano
.
scan_module
.
scan_op
.
Scan
)]
# The grad scan is always the 2nd one according to toposort. If the
# optimization has been applied, it has 2 outputs, otherwise 3.
grad_scan_node
=
scan_nodes
[
1
]
assert
len
(
grad_scan_node
.
outputs
)
==
2
# Call the theano function to ensure the absence of a memory error
feval_backprop
(
numpy
.
zeros
((
mb_length
,
mb_size
,
n_in
),
dtype
=
"float32"
),
numpy
.
zeros
((
mb_length
,
mb_size
,
n_out
),
dtype
=
"float32"
))
def
test_reduce_memory_consumption
(
self
):
x
=
theano
.
shared
(
numpy
.
asarray
(
x
=
theano
.
shared
(
numpy
.
asarray
(
numpy
.
random
.
uniform
(
size
=
(
10
,)),
dtype
=
theano
.
config
.
floatX
))
numpy
.
random
.
uniform
(
size
=
(
10
,)),
dtype
=
theano
.
config
.
floatX
))
...
@@ -3949,43 +3587,6 @@ class T_Scan(unittest.TestCase):
...
@@ -3949,43 +3587,6 @@ class T_Scan(unittest.TestCase):
f
=
theano
.
function
([
seq
],
results
[
1
],
updates
=
updates
)
f
=
theano
.
function
([
seq
],
results
[
1
],
updates
=
updates
)
assert
numpy
.
all
(
exp_out
==
f
(
inp
))
assert
numpy
.
all
(
exp_out
==
f
(
inp
))
def
test_memory_reuse_gpudimshuffle
(
self
):
# Test the memory pre-allocation feature in scan when one output is
# the result of a GpuDimshuffle (because an optimization in
# GpuDimshuffle can cause issues with the memory pre-allocation
# where it falsely thinks that a pre-allocated memory region has
# been used when it hasn't).
from
theano.sandbox
import
cuda
if
not
cuda
.
cuda_available
:
raise
SkipTest
(
'Optional package cuda disabled'
)
def
inner_fn
(
seq1
,
recurrent_out
):
temp
=
seq1
+
recurrent_out
.
sum
()
output1
=
temp
.
dimshuffle
(
1
,
0
)
output2
=
temp
.
sum
()
+
recurrent_out
return
output1
,
output2
input1
=
theano
.
tensor
.
ftensor3
()
init
=
theano
.
tensor
.
ftensor3
()
outputs_info
=
[
None
,
init
]
out
,
_
=
theano
.
scan
(
inner_fn
,
sequences
=
[
input1
],
outputs_info
=
outputs_info
,
mode
=
mode_with_gpu
)
out1
=
out
[
0
]
.
flatten
()
out2
=
out
[
1
]
.
flatten
()
fct
=
theano
.
function
([
input1
,
init
],
[
out1
,
out2
],
mode
=
mode_with_gpu
)
output
=
fct
(
numpy
.
ones
((
2
,
1
,
1
),
dtype
=
"float32"
),
numpy
.
ones
((
1
,
1
,
1
),
dtype
=
"float32"
))
expected_output
=
(
numpy
.
array
([
2
,
4
],
dtype
=
"float32"
),
numpy
.
array
([
3
,
7
],
dtype
=
"float32"
))
utt
.
assert_allclose
(
output
,
expected_output
)
def
test_memory_reuse_with_outputs_as_inputs
(
self
):
def
test_memory_reuse_with_outputs_as_inputs
(
self
):
# Test the memory pre-allocation feature in scan for the following
# Test the memory pre-allocation feature in scan for the following
# cases :
# cases :
...
@@ -4418,6 +4019,434 @@ class T_Scan(unittest.TestCase):
...
@@ -4418,6 +4019,434 @@ class T_Scan(unittest.TestCase):
result_strict
=
f_strict
(
x0
)
result_strict
=
f_strict
(
x0
)
class
ScanGpuTests
:
""" This class defines a number of tests for Scan on GPU as well as a few
helper functions for these tests. The GPU tests defined in this class are
independant of the GPU backend used. Because of this, a class inheriting
from ScanGpuTests should define the following attributes and methods to
make the tests run on a specific backend :
- self.gpu_backend : Reference to the backend module
- self.mode_with_opt : Compilation mode to force usage of the gpu backend
- self.is_scan_on_gpu(node) : Method to determine is a scan node has been
moved to run on a gpu under the specific
backend. Returns a boolean.
"""
# as test_one_sequence_one_output_weights, but on the gpu
# This first version test the first case in the optimizer to the gpu.
def
test_one_sequence_one_output_weights_gpu1
(
self
):
def
f_rnn
(
u_t
,
x_tm1
,
W_in
,
W
):
return
u_t
*
W_in
+
x_tm1
*
W
u
=
theano
.
tensor
.
fvector
(
'u'
)
x0
=
theano
.
tensor
.
fscalar
(
'x0'
)
W_in
=
theano
.
tensor
.
fscalar
(
'win'
)
W
=
theano
.
tensor
.
fscalar
(
'w'
)
# The following line is needed to have the first case being used
# Otherwise, it is the second that is tested.
mode
=
self
.
mode_with_gpu
.
excluding
(
'InputToGpuOptimizer'
)
output
,
updates
=
theano
.
scan
(
f_rnn
,
u
,
x0
,
[
W_in
,
W
],
n_steps
=
None
,
truncate_gradient
=-
1
,
go_backwards
=
False
,
mode
=
mode
)
output
=
self
.
gpu_backend
.
gpu_from_host
(
output
)
f2
=
theano
.
function
([
u
,
x0
,
W_in
,
W
],
output
,
updates
=
updates
,
allow_input_downcast
=
True
,
mode
=
self
.
mode_with_gpu
)
# get random initial values
rng
=
numpy
.
random
.
RandomState
(
utt
.
fetch_seed
())
v_u
=
rng
.
uniform
(
size
=
(
4
,),
low
=-
5.
,
high
=
5.
)
v_x0
=
rng
.
uniform
()
W
=
rng
.
uniform
()
W_in
=
rng
.
uniform
()
v_u
=
numpy
.
asarray
(
v_u
,
dtype
=
'float32'
)
v_x0
=
numpy
.
asarray
(
v_x0
,
dtype
=
'float32'
)
W
=
numpy
.
asarray
(
W
,
dtype
=
'float32'
)
W_in
=
numpy
.
asarray
(
W_in
,
dtype
=
'float32'
)
# compute the output in numpy
v_out
=
numpy
.
zeros
((
4
,))
v_out
[
0
]
=
v_u
[
0
]
*
W_in
+
v_x0
*
W
for
step
in
xrange
(
1
,
4
):
v_out
[
step
]
=
v_u
[
step
]
*
W_in
+
v_out
[
step
-
1
]
*
W
theano_values
=
f2
(
v_u
,
v_x0
,
W_in
,
W
)
utt
.
assert_allclose
(
theano_values
,
v_out
)
# TO DEL
topo
=
f2
.
maker
.
fgraph
.
toposort
()
scan_node
=
[
node
for
node
in
topo
if
isinstance
(
node
.
op
,
theano
.
scan_module
.
scan_op
.
Scan
)]
assert
len
(
scan_node
)
==
1
scan_node
=
scan_node
[
0
]
topo
=
f2
.
maker
.
fgraph
.
toposort
()
assert
sum
([
isinstance
(
node
.
op
,
self
.
gpu_backend
.
HostFromGpu
)
for
node
in
topo
])
==
0
assert
sum
([
isinstance
(
node
.
op
,
self
.
gpu_backend
.
GpuFromHost
)
for
node
in
topo
])
==
4
scan_node
=
[
node
for
node
in
topo
if
isinstance
(
node
.
op
,
theano
.
scan_module
.
scan_op
.
Scan
)]
assert
len
(
scan_node
)
==
1
scan_node
=
scan_node
[
0
]
scan_node_topo
=
scan_node
.
op
.
fn
.
maker
.
fgraph
.
toposort
()
# check that there is no gpu transfer in the inner loop.
assert
any
([
isinstance
(
node
.
op
,
self
.
gpu_backend
.
GpuElemwise
)
for
node
in
scan_node_topo
])
assert
not
any
([
isinstance
(
node
.
op
,
self
.
gpu_backend
.
HostFromGpu
)
for
node
in
scan_node_topo
])
assert
not
any
([
isinstance
(
node
.
op
,
self
.
gpu_backend
.
GpuFromHost
)
for
node
in
scan_node_topo
])
# This second version test the second case in the optimizer to the gpu.
def
test_one_sequence_one_output_weights_gpu2
(
self
):
def
f_rnn
(
u_t
,
x_tm1
,
W_in
,
W
):
return
u_t
*
W_in
+
x_tm1
*
W
u
=
theano
.
tensor
.
fvector
(
'u'
)
x0
=
theano
.
tensor
.
fscalar
(
'x0'
)
W_in
=
theano
.
tensor
.
fscalar
(
'win'
)
W
=
theano
.
tensor
.
fscalar
(
'w'
)
output
,
updates
=
theano
.
scan
(
f_rnn
,
u
,
x0
,
[
W_in
,
W
],
n_steps
=
None
,
truncate_gradient
=-
1
,
go_backwards
=
False
,
mode
=
self
.
mode_with_gpu
)
f2
=
theano
.
function
([
u
,
x0
,
W_in
,
W
],
output
,
updates
=
updates
,
allow_input_downcast
=
True
,
mode
=
self
.
mode_with_gpu
)
# get random initial values
rng
=
numpy
.
random
.
RandomState
(
utt
.
fetch_seed
())
v_u
=
rng
.
uniform
(
size
=
(
4
,),
low
=-
5.
,
high
=
5.
)
v_x0
=
rng
.
uniform
()
W
=
rng
.
uniform
()
W_in
=
rng
.
uniform
()
# compute the output in numpy
v_out
=
numpy
.
zeros
((
4
,))
v_out
[
0
]
=
v_u
[
0
]
*
W_in
+
v_x0
*
W
for
step
in
xrange
(
1
,
4
):
v_out
[
step
]
=
v_u
[
step
]
*
W_in
+
v_out
[
step
-
1
]
*
W
theano_values
=
f2
(
v_u
,
v_x0
,
W_in
,
W
)
utt
.
assert_allclose
(
theano_values
,
v_out
)
topo
=
f2
.
maker
.
fgraph
.
toposort
()
assert
sum
([
isinstance
(
node
.
op
,
self
.
gpu_backend
.
HostFromGpu
)
for
node
in
topo
])
==
1
assert
sum
([
isinstance
(
node
.
op
,
self
.
gpu_backend
.
GpuFromHost
)
for
node
in
topo
])
==
4
scan_node
=
[
node
for
node
in
topo
if
isinstance
(
node
.
op
,
theano
.
scan_module
.
scan_op
.
Scan
)]
assert
len
(
scan_node
)
==
1
scan_node
=
scan_node
[
0
]
scan_node_topo
=
scan_node
.
op
.
fn
.
maker
.
fgraph
.
toposort
()
# check that there is no gpu transfer in the inner loop.
assert
any
([
isinstance
(
node
.
op
,
self
.
gpu_backend
.
GpuElemwise
)
for
node
in
scan_node_topo
])
assert
not
any
([
isinstance
(
node
.
op
,
self
.
gpu_backend
.
HostFromGpu
)
for
node
in
scan_node_topo
])
assert
not
any
([
isinstance
(
node
.
op
,
self
.
gpu_backend
.
GpuFromHost
)
for
node
in
scan_node_topo
])
# This third test checks that scan can deal with a mixture of dtypes as
# outputs when is running on GPU
def
test_gpu3_mixture_dtype_outputs
(
self
):
def
f_rnn
(
u_t
,
x_tm1
,
W_in
,
W
):
return
(
u_t
*
W_in
+
x_tm1
*
W
,
tensor
.
cast
(
u_t
+
x_tm1
,
'int64'
))
u
=
theano
.
tensor
.
fvector
(
'u'
)
x0
=
theano
.
tensor
.
fscalar
(
'x0'
)
W_in
=
theano
.
tensor
.
fscalar
(
'win'
)
W
=
theano
.
tensor
.
fscalar
(
'w'
)
output
,
updates
=
theano
.
scan
(
f_rnn
,
u
,
[
x0
,
None
],
[
W_in
,
W
],
n_steps
=
None
,
truncate_gradient
=-
1
,
go_backwards
=
False
,
mode
=
self
.
mode_with_gpu
)
f2
=
theano
.
function
([
u
,
x0
,
W_in
,
W
],
output
,
updates
=
updates
,
allow_input_downcast
=
True
,
mode
=
self
.
mode_with_gpu
)
# get random initial values
rng
=
numpy
.
random
.
RandomState
(
utt
.
fetch_seed
())
v_u
=
rng
.
uniform
(
size
=
(
4
,),
low
=-
5.
,
high
=
5.
)
v_x0
=
rng
.
uniform
()
W
=
rng
.
uniform
()
W_in
=
rng
.
uniform
()
# compute the output in numpy
v_out1
=
numpy
.
zeros
((
4
,))
v_out2
=
numpy
.
zeros
((
4
,),
dtype
=
'int64'
)
v_out1
[
0
]
=
v_u
[
0
]
*
W_in
+
v_x0
*
W
v_out2
[
0
]
=
v_u
[
0
]
+
v_x0
for
step
in
xrange
(
1
,
4
):
v_out1
[
step
]
=
v_u
[
step
]
*
W_in
+
v_out1
[
step
-
1
]
*
W
v_out2
[
step
]
=
numpy
.
int64
(
v_u
[
step
]
+
v_out1
[
step
-
1
])
theano_out1
,
theano_out2
=
f2
(
v_u
,
v_x0
,
W_in
,
W
)
utt
.
assert_allclose
(
theano_out1
,
v_out1
)
utt
.
assert_allclose
(
theano_out2
,
v_out2
)
topo
=
f2
.
maker
.
fgraph
.
toposort
()
scan_node
=
[
node
for
node
in
topo
if
isinstance
(
node
.
op
,
theano
.
scan_module
.
scan_op
.
Scan
)]
assert
len
(
scan_node
)
==
1
scan_node
=
scan_node
[
0
]
assert
self
.
is_scan_on_gpu
(
scan_node
)
def
test_gibbs_chain
(
self
):
rng
=
numpy
.
random
.
RandomState
(
utt
.
fetch_seed
())
v_vsample
=
numpy
.
array
(
rng
.
binomial
(
1
,
.
5
,
size
=
(
3
,
20
),),
dtype
=
'float32'
)
vsample
=
theano
.
shared
(
v_vsample
)
trng
=
theano
.
sandbox
.
rng_mrg
.
MRG_RandomStreams
(
utt
.
fetch_seed
())
def
f
(
vsample_tm1
):
return
trng
.
binomial
(
vsample_tm1
.
shape
,
n
=
1
,
p
=
0.3
,
dtype
=
'float32'
)
*
vsample_tm1
theano_vsamples
,
updates
=
theano
.
scan
(
f
,
[],
vsample
,
[],
n_steps
=
10
,
truncate_gradient
=-
1
,
go_backwards
=
False
,
mode
=
self
.
mode_with_gpu
)
my_f
=
theano
.
function
([],
theano_vsamples
[
-
1
],
updates
=
updates
,
allow_input_downcast
=
True
,
mode
=
self
.
mode_with_gpu
)
# I leave this to tested by debugmode, this test was anyway more of
# doest the graph compile kind of test
t_result
=
my_f
()
def
test_gpu_memory_usage
(
self
):
# This test validates that the memory usage of the defined theano
# function is reasonnable when executed on the GPU. It checks for
# a bug in which one of scan's optimization was not applied which
# made the scan node compute large and unnecessary outputs which
# brought memory usage on the GPU to ~12G.
# Dimensionality of input and output data (not one-hot coded)
n_in
=
100
n_out
=
100
# Number of neurons in hidden layer
n_hid
=
4000
# Number of minibatches
mb_size
=
2
# Time steps in minibatch
mb_length
=
200
# Define input variables
xin
=
tensor
.
ftensor3
(
name
=
'xin'
)
yout
=
tensor
.
ftensor3
(
name
=
'yout'
)
# Initialize the network parameters
floatX
=
theano
.
config
.
floatX
U
=
theano
.
shared
(
numpy
.
zeros
((
n_in
,
n_hid
),
dtype
=
"float32"
),
name
=
'W_xin_to_l1'
)
V
=
theano
.
shared
(
numpy
.
zeros
((
n_hid
,
n_hid
),
dtype
=
"float32"
),
name
=
'W_l1_to_l1'
)
W
=
theano
.
shared
(
numpy
.
zeros
((
n_hid
,
n_out
),
dtype
=
"float32"
),
name
=
'W_l1_to_l2'
)
nparams
=
[
U
,
V
,
W
]
# Build the forward pass
l1_base
=
tensor
.
dot
(
xin
,
U
)
def
scan_l
(
baseline
,
last_step
):
return
baseline
+
tensor
.
dot
(
last_step
,
V
)
zero_output
=
tensor
.
alloc
(
numpy
.
asarray
(
0.
,
dtype
=
"float32"
),
mb_size
,
n_hid
)
l1_out
,
_
=
theano
.
scan
(
scan_l
,
sequences
=
[
l1_base
],
outputs_info
=
[
zero_output
],
mode
=
self
.
mode_with_gpu
)
l2_out
=
tensor
.
dot
(
l1_out
,
W
)
# Compute the cost and take the gradient wrt params
cost
=
tensor
.
sum
((
l2_out
-
yout
)
**
2
)
grads
=
tensor
.
grad
(
cost
,
nparams
)
updates
=
zip
(
nparams
,
[
n
-
g
for
n
,
g
in
zip
(
nparams
,
grads
)])
# Compile the theano function
feval_backprop
=
theano
.
function
([
xin
,
yout
],
cost
,
updates
=
updates
,
mode
=
self
.
mode_with_gpu
)
# Validate that the PushOutScanOutput optimization has been applied
# by checking the number of outputs of the grad Scan node in the
# compiled function.
nodes
=
feval_backprop
.
maker
.
fgraph
.
toposort
()
scan_nodes
=
[
n
for
n
in
nodes
if
isinstance
(
n
.
op
,
theano
.
scan_module
.
scan_op
.
Scan
)]
# The grad scan is always the 2nd one according to toposort. If the
# optimization has been applied, it has 2 outputs, otherwise 3.
grad_scan_node
=
scan_nodes
[
1
]
assert
len
(
grad_scan_node
.
outputs
)
==
2
# Call the theano function to ensure the absence of a memory error
feval_backprop
(
numpy
.
zeros
((
mb_length
,
mb_size
,
n_in
),
dtype
=
"float32"
),
numpy
.
zeros
((
mb_length
,
mb_size
,
n_out
),
dtype
=
"float32"
))
def
test_memory_reuse_gpudimshuffle
(
self
):
# Test the memory pre-allocation feature in scan when one output is
# the result of a GpuDimshuffle (because an optimization in
# GpuDimshuffle can cause issues with the memory pre-allocation
# where it falsely thinks that a pre-allocated memory region has
# been used when it hasn't).
def
inner_fn
(
seq1
,
recurrent_out
):
temp
=
seq1
+
recurrent_out
.
sum
()
output1
=
temp
.
dimshuffle
(
1
,
0
)
output2
=
temp
.
sum
()
+
recurrent_out
return
output1
,
output2
input1
=
theano
.
tensor
.
ftensor3
()
init
=
theano
.
tensor
.
ftensor3
()
outputs_info
=
[
None
,
init
]
out
,
_
=
theano
.
scan
(
inner_fn
,
sequences
=
[
input1
],
outputs_info
=
outputs_info
,
mode
=
self
.
mode_with_gpu
)
out1
=
out
[
0
]
.
flatten
()
out2
=
out
[
1
]
.
flatten
()
fct
=
theano
.
function
([
input1
,
init
],
[
out1
,
out2
],
mode
=
self
.
mode_with_gpu
)
output
=
fct
(
numpy
.
ones
((
2
,
1
,
1
),
dtype
=
"float32"
),
numpy
.
ones
((
1
,
1
,
1
),
dtype
=
"float32"
))
expected_output
=
(
numpy
.
array
([
2
,
4
],
dtype
=
"float32"
),
numpy
.
array
([
3
,
7
],
dtype
=
"float32"
))
utt
.
assert_allclose
(
output
,
expected_output
)
class
T_Scan_Cuda
(
unittest
.
TestCase
,
ScanGpuTests
):
"""This class takes the gpu tests for scan that are defined in
class ScanGpuTests and runs them using the cuda backend. It also adds
tests specific to the cuda backend
"""
def
__init__
(
self
,
*
args
,
**
kwargs
):
from
theano.sandbox
import
cuda
self
.
gpu_backend
=
cuda
self
.
mode_with_gpu
=
mode_with_opt
.
including
(
'gpu'
,
'scan'
)
super
(
T_Scan_Cuda
,
self
)
.
__init__
(
*
args
,
**
kwargs
)
def
setUp
(
self
):
# Skip the test if cuda is not available
if
not
self
.
gpu_backend
.
cuda_available
:
raise
SkipTest
(
'Optional package cuda disabled'
)
utt
.
seed_rng
()
super
(
T_Scan_Cuda
,
self
)
.
setUp
()
def
is_scan_on_gpu
(
self
,
node
):
return
node
.
op
.
info
.
get
(
'gpu'
,
False
)
def
test_inconsistent_inner_fct
(
self
):
# Test that scan can detect inconsistencies in the inner graph and
# raises an appropriate exception. The pickled file used in this test
# relies on the cuda backend.
# This test has not been extensively tested for Python 3 so it should
# be skipped if python version is >=3
version
=
sys
.
version_info
if
version
>=
(
3
,):
raise
SkipTest
(
"This test relies on a pickled file produced with "
"Python 2. The current python version "
"(
%
i.
%
i.
%
i.
%
i) is >= 3 so the test will be "
"skipped."
%
(
version
.
major
,
version
.
minor
,
version
.
micro
,
version
.
serial
))
# When unpickled, the scan op should perform validation on its inner
# graph, detect the inconsistencies and raise a TypeError
folder
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
path
=
os
.
path
.
join
(
folder
,
"inconsistent_scan.pkl"
)
assert_raises
(
TypeError
,
cPickle
.
load
,
open
(
path
,
"r"
))
def
test_consistent_inner_fct
(
self
):
# Test that scan does not falsely detect inconsistencies in a valid
# inner graph
rs
=
theano
.
sandbox
.
rng_mrg
.
MRG_RandomStreams
(
use_cuda
=
True
)
output
,
_
=
theano
.
scan
(
lambda
:
rs
.
uniform
((
3
,),
dtype
=
"float32"
),
n_steps
=
3
)
cPickle
.
loads
(
cPickle
.
dumps
(
output
))
# Also ensure that, after compilation, the Scan has been moved
# on the gpu
fct
=
theano
.
function
([],
output
,
mode
=
self
.
mode_with_gpu
)
scan_nodes
=
scan_nodes_from_fct
(
fct
)
assert
len
(
scan_nodes
)
==
1
assert
self
.
is_scan_on_gpu
(
scan_nodes
[
0
])
class
T_Scan_Gpuarray
(
unittest
.
TestCase
,
ScanGpuTests
):
"""This class takes the gpu tests for scan that are defined in
class ScanGpuTests and runs them using the gpuarray backend.
"""
def
__init__
(
self
,
*
args
,
**
kwargs
):
from
theano.sandbox
import
gpuarray
self
.
gpu_backend
=
gpuarray
self
.
mode_with_gpu
=
mode_with_opt
.
including
(
'gpuarray_opt'
,
'scan'
)
super
(
T_Scan_Gpuarray
,
self
)
.
__init__
(
*
args
,
**
kwargs
)
def
setUp
(
self
):
# Skip the test if pygpu is not available
if
not
self
.
gpu_backend
.
pygpu_activated
:
raise
SkipTest
(
'Optional package pygpu disabled'
)
utt
.
seed_rng
()
super
(
T_Scan_Gpuarray
,
self
)
.
setUp
()
def
is_scan_on_gpu
(
self
,
node
):
return
node
.
op
.
info
.
get
(
'gpua'
,
False
)
def
test_speed
():
def
test_speed
():
#
#
# This function prints out the speed of very simple recurrent
# This function prints out the speed of very simple recurrent
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论