Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
500601a2
提交
500601a2
authored
4月 23, 2014
作者:
Frédéric Bastien
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #1807 from abergeron/gpuarray_scan
Make scan work with new backend.
上级
d4f1d4eb
c5dc5576
隐藏空白字符变更
内嵌
并排
正在显示
8 个修改的文件
包含
447 行增加
和
100 行删除
+447
-100
opt.py
theano/sandbox/cuda/opt.py
+6
-7
basic_ops.py
theano/sandbox/gpuarray/basic_ops.py
+1
-1
opt.py
theano/sandbox/gpuarray/opt.py
+140
-16
subtensor.py
theano/sandbox/gpuarray/subtensor.py
+1
-0
test_opt.py
theano/sandbox/gpuarray/tests/test_opt.py
+23
-21
test_scan.py
theano/sandbox/gpuarray/tests/test_scan.py
+243
-0
scan_op.py
theano/scan_module/scan_op.py
+29
-53
scan_opt.py
theano/scan_module/scan_opt.py
+4
-2
没有找到文件。
theano/sandbox/cuda/opt.py
浏览文件 @
500601a2
...
@@ -1535,6 +1535,11 @@ def local_gpu_extract_diagonal(node):
...
@@ -1535,6 +1535,11 @@ def local_gpu_extract_diagonal(node):
gpu_from_host
(
diag_node
.
inputs
[
0
]))]
gpu_from_host
(
diag_node
.
inputs
[
0
]))]
return
False
return
False
def typeConstructor(broadcastable, dtype):
    """Build the variable type used for scan outputs on the CUDA backend.

    :param broadcastable: broadcastable pattern forwarded to the type.
    :param dtype: dtype name of the output.

    Returns a ``CudaNdarrayType`` when ``dtype`` is ``'float32'`` and a
    regular ``tensor.TensorType`` of the requested dtype otherwise.
    """
    if dtype != 'float32':
        # Anything that is not float32 is built as a plain TensorType.
        return tensor.TensorType(broadcastable=broadcastable, dtype=dtype)
    return CudaNdarrayType(broadcastable=broadcastable)
@register_opt
(
'scan'
)
@register_opt
(
'scan'
)
@local_optimizer
([
gpu_from_host
,
scan_op
.
Scan
])
@local_optimizer
([
gpu_from_host
,
scan_op
.
Scan
])
...
@@ -1593,8 +1598,6 @@ def gpuScanOptimization(node):
...
@@ -1593,8 +1598,6 @@ def gpuScanOptimization(node):
_cmodule_key
=
gof
.
CLinker
()
.
cmodule_key_
(
local_fgraph
,
[])
_cmodule_key
=
gof
.
CLinker
()
.
cmodule_key_
(
local_fgraph
,
[])
info
[
'gpu_hash'
]
=
hash
(
_cmodule_key
)
info
[
'gpu_hash'
]
=
hash
(
_cmodule_key
)
typeConstructor
=
lambda
broadcastable
,
dtype
:
CudaNdarrayType
(
broadcastable
=
broadcastable
)
nw_op
=
scan_op
.
Scan
(
scan_ins
,
nw_op
=
scan_op
.
Scan
(
scan_ins
,
scan_outs
,
scan_outs
,
info
,
info
,
...
@@ -1642,10 +1645,6 @@ def gpuScanOptimization(node):
...
@@ -1642,10 +1645,6 @@ def gpuScanOptimization(node):
_cmodule_key
=
gof
.
CLinker
()
.
cmodule_key_
(
local_fgraph
,
[])
_cmodule_key
=
gof
.
CLinker
()
.
cmodule_key_
(
local_fgraph
,
[])
info
[
'gpu_hash'
]
=
hash
(
_cmodule_key
)
info
[
'gpu_hash'
]
=
hash
(
_cmodule_key
)
def
typeConstructor
(
broadcastable
,
dtype
):
assert
dtype
==
'float32'
return
CudaNdarrayType
(
broadcastable
=
broadcastable
)
_outputs
=
scan_op
.
Scan
(
_outputs
=
scan_op
.
Scan
(
scan_ins
,
scan_ins
,
scan_outs
,
scan_outs
,
...
@@ -1662,7 +1661,7 @@ def gpuScanOptimization(node):
...
@@ -1662,7 +1661,7 @@ def gpuScanOptimization(node):
optdb
.
register
(
'gpu_scanOp_make_inplace'
,
optdb
.
register
(
'gpu_scanOp_make_inplace'
,
scan_opt
.
ScanInplaceOptimizer
(
typeConstructor
=
CudaNdarrayType
,
scan_opt
.
ScanInplaceOptimizer
(
typeConstructor
=
typeConstructor
,
gpu_flag
=
True
),
gpu_flag
=
True
),
75
,
75
,
'gpu'
,
'gpu'
,
...
...
theano/sandbox/gpuarray/basic_ops.py
浏览文件 @
500601a2
...
@@ -161,7 +161,7 @@ class HostFromGpu(Op):
...
@@ -161,7 +161,7 @@ class HostFromGpu(Op):
raise
TypeError
(
x
)
raise
TypeError
(
x
)
return
Apply
(
self
,
[
x
],
return
Apply
(
self
,
[
x
],
[
tensor
.
TensorType
(
dtype
=
x
.
dtype
,
[
tensor
.
TensorType
(
dtype
=
x
.
dtype
,
broadcastable
=
x
.
broadcastable
,
)()])
broadcastable
=
x
.
broadcastable
)()])
def
perform
(
self
,
node
,
inp
,
out
):
def
perform
(
self
,
node
,
inp
,
out
):
x
,
=
inp
x
,
=
inp
...
...
theano/sandbox/gpuarray/opt.py
浏览文件 @
500601a2
import
copy
import
copy
import
theano
import
theano
import
numpy
import
numpy
from
theano
import
tensor
,
scalar
from
theano
import
tensor
,
scalar
,
gof
from
theano.compile
import
optdb
from
theano.compile
import
optdb
from
theano.gof
import
(
local_optimizer
,
EquilibriumDB
,
from
theano.gof
import
(
local_optimizer
,
EquilibriumDB
,
SequenceDB
,
ProxyDB
,
SequenceDB
,
ProxyDB
,
Optimizer
,
toolbox
,
Optimizer
,
toolbox
,
InconsistencyError
,
EquilibriumOptimizer
)
InconsistencyError
,
EquilibriumOptimizer
)
from
theano.scan_module
import
scan_utils
,
scan_op
,
scan_opt
from
theano.gof.python25
import
all
,
any
from
theano.gof.python25
import
all
,
any
from
theano.tensor.nnet.conv
import
ConvOp
from
theano.tensor.nnet.conv
import
ConvOp
from
theano.sandbox.gpuarray.type
import
GpuArrayType
from
theano.sandbox.gpuarray.type
import
GpuArrayType
from
theano.sandbox.gpuarray.basic_ops
import
(
host_from_gpu
,
from
theano.sandbox.gpuarray.basic_ops
import
(
gpu_from_host
,
host_from_gpu
,
gpu_from_host
,
HostFromGpu
,
gpu_alloc
,
gpu_alloc
,
GpuAlloc
,
GpuReshape
,
GpuEye
GpuAlloc
,
)
GpuReshape
,
GpuEye
)
from
theano.sandbox.gpuarray.blas
import
gpu_dot22
,
GpuGemv
,
GpuGemm
,
GpuGer
from
theano.sandbox.gpuarray.blas
import
gpu_dot22
,
GpuGemv
,
GpuGemm
,
GpuGer
from
theano.sandbox.gpuarray.conv
import
GpuConv
from
theano.sandbox.gpuarray.conv
import
GpuConv
from
theano.sandbox.gpuarray.nnet
import
(
GpuCrossentropySoftmaxArgmax1HotWithBias
,
from
theano.sandbox.gpuarray.nnet
import
(
GpuCrossentropySoftmax1HotWithBiasDx
,
GpuCrossentropySoftmaxArgmax1HotWithBias
,
GpuSoftmaxWithBias
,
GpuCrossentropySoftmax1HotWithBiasDx
,
GpuSoftmax
)
GpuSoftmaxWithBias
,
GpuSoftmax
)
from
theano.sandbox.gpuarray.elemwise
import
(
GpuElemwise
,
_is_scalar
,
from
theano.sandbox.gpuarray.elemwise
import
(
GpuElemwise
,
_is_scalar
,
GpuDimShuffle
,
GpuCAReduceCuda
)
GpuDimShuffle
,
GpuCAReduceCuda
)
from
theano.sandbox.gpuarray.subtensor
import
GpuIncSubtensor
,
GpuSubtensor
from
theano.sandbox.gpuarray.subtensor
import
GpuIncSubtensor
,
GpuSubtensor
...
@@ -54,6 +55,20 @@ def register_opt(*tags, **kwargs):
...
@@ -54,6 +55,20 @@ def register_opt(*tags, **kwargs):
register_opt
()(
theano
.
tensor
.
opt
.
local_track_shape_i
)
register_opt
()(
theano
.
tensor
.
opt
.
local_track_shape_i
)
def safe_to_gpu(x):
    """Return ``gpu_from_host(x)`` if ``x`` has a ``TensorType`` type.

    Variables of any other type are returned unchanged.
    """
    is_host_tensor = isinstance(x.type, tensor.TensorType)
    return gpu_from_host(x) if is_host_tensor else x
def safe_to_cpu(x):
    """Return ``host_from_gpu(x)`` if ``x`` has a ``GpuArrayType`` type.

    Variables of any other type are returned unchanged.
    """
    is_gpu_array = isinstance(x.type, GpuArrayType)
    return host_from_gpu(x) if is_gpu_array else x
def
op_lifter
(
OP
):
def
op_lifter
(
OP
):
"""
"""
OP(..., host_from_gpu(), ...) -> host_from_gpu(GpuOP(...))
OP(..., host_from_gpu(), ...) -> host_from_gpu(GpuOP(...))
...
@@ -73,10 +88,10 @@ def op_lifter(OP):
...
@@ -73,10 +88,10 @@ def op_lifter(OP):
# This is needed as sometimes new_op inherits from OP.
# This is needed as sometimes new_op inherits from OP.
if
new_op
and
new_op
!=
node
.
op
:
if
new_op
and
new_op
!=
node
.
op
:
if
isinstance
(
new_op
,
theano
.
Op
):
if
isinstance
(
new_op
,
theano
.
Op
):
return
[
host_from_g
pu
(
o
)
for
o
in
return
[
safe_to_c
pu
(
o
)
for
o
in
new_op
(
*
node
.
inputs
,
return_list
=
True
)]
new_op
(
*
node
.
inputs
,
return_list
=
True
)]
elif
isinstance
(
new_op
,
(
tuple
,
list
)):
elif
isinstance
(
new_op
,
(
tuple
,
list
)):
return
[
host_from_g
pu
(
o
)
for
o
in
new_op
]
return
[
safe_to_c
pu
(
o
)
for
o
in
new_op
]
else
:
# suppose it is a variable on the GPU
else
:
# suppose it is a variable on the GPU
return
[
host_from_gpu
(
new_op
)]
return
[
host_from_gpu
(
new_op
)]
return
False
return
False
...
@@ -132,7 +147,17 @@ optdb['canonicalize'].register('local_cut_gpua_host_gpua',
...
@@ -132,7 +147,17 @@ optdb['canonicalize'].register('local_cut_gpua_host_gpua',
@register_opt
()
@register_opt
()
@op_lifter
([
tensor
.
Alloc
])
@op_lifter
([
tensor
.
Alloc
])
def
local_gpualloc
(
node
):
def
local_gpualloc
(
node
):
return
gpu_alloc
new_out
=
gpu_alloc
(
*
node
.
inputs
)
# We need to hide new broadcastable dimensions because
# ReplaceValidate doesn't like when they change.
if
new_out
.
broadcastable
!=
node
.
outputs
[
0
]
.
broadcastable
:
# but if a dim is suddenly not broadcastable anymore then that's a bug
for
b_old
,
b_new
in
zip
(
node
.
outputs
[
0
]
.
broadcastable
,
new_out
.
broadcastable
):
assert
b_new
or
(
not
b_old
)
new_out
=
tensor
.
patternbroadcast
(
new_out
,
node
.
outputs
[
0
]
.
broadcastable
)
return
(
new_out
,)
@register_opt
()
@register_opt
()
...
@@ -158,6 +183,13 @@ def local_gpureshape(node):
...
@@ -158,6 +183,13 @@ def local_gpureshape(node):
return
res
return
res
@register_opt()
@op_lifter([tensor.Rebroadcast])
def local_gpu_rebroadcast(node):
    """Apply Rebroadcast directly to the GPU variable behind a HostFromGpu.

    If the input of the Rebroadcast node is produced by ``HostFromGpu``,
    return the same op applied to the input of that transfer. In every
    other case return ``None`` so that no replacement is proposed.
    """
    producer = node.inputs[0].owner
    # A variable without an owner (e.g. a graph input) has ``owner`` set to
    # None; guard against it instead of failing on ``None.op``.
    if producer is not None and isinstance(producer.op, HostFromGpu):
        return node.op(producer.inputs[0])
@register_opt
()
@register_opt
()
@op_lifter
([
tensor
.
Flatten
])
@op_lifter
([
tensor
.
Flatten
])
def
local_gpuflatten
(
node
):
def
local_gpuflatten
(
node
):
...
@@ -176,8 +208,6 @@ def local_gpuflatten(node):
...
@@ -176,8 +208,6 @@ def local_gpuflatten(node):
def
local_gpu_elemwise
(
node
):
def
local_gpu_elemwise
(
node
):
op
=
node
.
op
op
=
node
.
op
name
=
op
.
name
name
=
op
.
name
if
node
.
outputs
[
0
]
.
ndim
==
0
:
return
if
name
:
if
name
:
name
=
'Gpu'
+
name
name
=
'Gpu'
+
name
res
=
GpuElemwise
(
op
.
scalar_op
,
name
=
name
,
res
=
GpuElemwise
(
op
.
scalar_op
,
name
=
name
,
...
@@ -432,3 +462,97 @@ def local_gpu_conv(node):
...
@@ -432,3 +462,97 @@ def local_gpu_conv(node):
out
=
gpu_from_host
(
out
)
out
=
gpu_from_host
(
out
)
out
.
values_eq_approx
=
values_eq_approx
out
.
values_eq_approx
=
values_eq_approx
return
[
out
]
return
[
out
]
def tensor_to_gpu(x):
    """Create a fresh ``GpuArrayType`` variable mirroring a host tensor.

    If ``x`` has a ``TensorType`` type, a new variable with the same
    broadcastable pattern and dtype is created; when ``x`` has a (non-empty)
    name, the new variable is named ``x.name + '[Gpua]'``. Variables of any
    other type are returned unchanged.
    """
    if not isinstance(x.type, tensor.TensorType):
        return x

    gpu_type = GpuArrayType(broadcastable=x.type.broadcastable,
                            dtype=x.type.dtype)
    gpu_var = gpu_type()
    if x.name:
        gpu_var.name = x.name + '[Gpua]'
    return gpu_var
def gpu_safe_new(x, tag=''):
    """
    Internal function that constructs a new variable from x with the same
    type, but with a different name (old name + tag). This function is used
    by gradient, or the R-op to construct new variables for the inputs of
    the inner graph such that there is no interference between the original
    graph and the newly constructed graph.

    Constants are not replaced by a new variable: a clone of the constant
    is returned instead.
    """
    named = hasattr(x, 'name') and x.name is not None
    nw_name = x.name + tag if named else None

    if isinstance(x, theano.Constant):
        return x.clone()

    fresh = x.type()
    fresh.name = nw_name
    return fresh
def gpu_reconstruct_graph(inputs, outputs, tag=None):
    """
    Different interface to clone, that allows you to pass inputs.
    Compared to clone, this method always replaces the inputs with
    new variables of the same type, and returns those (in the same
    order as the original inputs).

    :param inputs: variables to be replaced in the cloned graph.
    :param outputs: variables to clone.
    :param tag: suffix appended to the names of the new inputs
        (``None`` is treated as the empty string).
    :return: the pair ``(new_inputs, new_outputs)``.
    """
    suffix = '' if tag is None else tag
    nw_inputs = [gpu_safe_new(var, suffix) for var in inputs]
    # Map every original input to its replacement.
    givens = dict(zip(inputs, nw_inputs))
    nw_outputs = scan_utils.clone(outputs, replace=givens)
    return (nw_inputs, nw_outputs)
@register_opt('scan')
@op_lifter([scan_op.Scan])
def local_scan_to_gpua(node):
    """Rebuild a ``Scan`` node so that it works on gpuarray variables.

    The op's ``info`` is deep-copied and flagged with ``'gpua'``. The outer
    inputs are moved to the GPU with ``safe_to_gpu``, except the first
    input and a slice of ``n_nit_sot`` inputs, which are forwarded as they
    are. The inner graph is rebuilt on GPU variables, and a new ``Scan``
    op using ``GpuArrayType`` as type constructor is applied to the new
    inputs.

    :return: the outputs of the new scan node.
    """
    scan = node.op
    info = copy.deepcopy(scan.info)
    info['gpua'] = True

    # Index of the first input that is forwarded untouched
    # (presumably the nit_sot arguments -- there are ``n_nit_sot`` of them).
    untouched_start = (1 +
                       scan.n_seqs +
                       scan.n_mit_mot +
                       scan.n_mit_sot +
                       scan.n_sit_sot +
                       scan.n_shared_outs)
    untouched_stop = untouched_start + scan.n_nit_sot

    nw_ins = [node.inputs[0]]
    nw_ins.extend([safe_to_gpu(x) for x in node.inputs[1:untouched_start]])
    nw_ins.extend(node.inputs[untouched_start:untouched_stop])
    nw_ins.extend([safe_to_gpu(x) for x in node.inputs[untouched_stop:]])

    # Inner graph: new GPU inputs, and outputs transferred to the GPU with
    # the old inputs replaced by (host views of) the new ones.
    scan_ins = [tensor_to_gpu(x) for x in scan.inputs]
    gpu_outs = [safe_to_gpu(x) for x in scan.outputs]
    host_views = [safe_to_cpu(x) for x in scan_ins]
    scan_outs = scan_utils.clone(gpu_outs,
                                 replace=zip(scan.inputs, host_views))

    # We need to construct the hash here, because scan
    # __init__ does not know about the gpu and can not
    # handle graphs with inputs being on the gpu
    tmp_in, tmp_out = gpu_reconstruct_graph(scan_ins, scan_outs)
    local_fgraph = gof.FunctionGraph(tmp_in, tmp_out, clone=False)
    _cmodule_key = gof.CLinker().cmodule_key_(local_fgraph, [])
    info['gpu_hash'] = hash(_cmodule_key)

    new_scan = scan_op.Scan(scan_ins,
                            scan_outs,
                            info,
                            typeConstructor=GpuArrayType)
    return new_scan.make_node(*nw_ins).outputs
optdb
.
register
(
'gpua_scanOp_make_inplace'
,
scan_opt
.
ScanInplaceOptimizer
(
typeConstructor
=
GpuArrayType
,
gpua_flag
=
True
),
75
,
'gpua'
,
'fast_run'
,
'inplace'
,
'scan'
)
theano/sandbox/gpuarray/subtensor.py
浏览文件 @
500601a2
...
@@ -7,6 +7,7 @@ import theano
...
@@ -7,6 +7,7 @@ import theano
from
theano
import
tensor
,
gof
from
theano
import
tensor
,
gof
from
theano.gof.python25
import
all
,
any
from
theano.gof.python25
import
all
,
any
from
theano.tensor.subtensor
import
IncSubtensor
,
Subtensor
,
get_idx_list
from
theano.tensor.subtensor
import
IncSubtensor
,
Subtensor
,
get_idx_list
import
theano.tensor.inplace
from
theano.sandbox.cuda.nvcc_compiler
import
NVCC_compiler
from
theano.sandbox.cuda.nvcc_compiler
import
NVCC_compiler
try
:
try
:
...
...
theano/sandbox/gpuarray/tests/test_opt.py
浏览文件 @
500601a2
import
numpy
import
numpy
import
theano
import
theano
from
theano
import
tensor
from
theano.tests
import
unittest_tools
as
utt
from
theano.tests
import
unittest_tools
as
utt
import
theano.sandbox.gpuarray
from
theano.sandbox.gpuarray.type
import
GpuArrayType
from
theano.sandbox.gpuarray.basic_ops
import
GpuAlloc
,
GpuReshape
,
gpu_alloc
from
theano.sandbox.gpuarray.basic_ops
import
GpuAlloc
,
GpuReshape
,
gpu_alloc
from
theano.sandbox.gpuarray.elemwise
import
GpuCAReduceCuda
from
theano.sandbox.gpuarray.elemwise
import
GpuCAReduceCuda
import
theano.sandbox.gpuarray
from
theano.sandbox.gpuarray.tests.test_basic_ops
import
(
rand_gpuarray
,
mode_with_gpu
,
mode_without_gpu
)
from
theano.tests.unittest_tools
import
SkipTest
from
theano.tests.unittest_tools
import
SkipTest
if
theano
.
sandbox
.
gpuarray
.
pygpu
is
None
:
raise
SkipTest
(
"pygpu not installed"
)
import
theano.sandbox.cuda
as
cuda_ndarray
if
cuda_ndarray
.
cuda_available
and
not
theano
.
sandbox
.
gpuarray
.
pygpu_activated
:
if
not
cuda_ndarray
.
use
.
device_number
:
cuda_ndarray
.
use
(
'gpu'
)
theano
.
sandbox
.
gpuarray
.
init_dev
(
'cuda'
)
if
not
theano
.
sandbox
.
gpuarray
.
pygpu_activated
:
raise
SkipTest
(
"pygpu disabled"
)
if
theano
.
config
.
mode
==
'FAST_COMPILE'
:
mode_with_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
including
(
'gpuarray'
)
.
excluding
(
'gpu'
)
mode_without_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
excluding
(
'gpuarray'
)
else
:
mode_with_gpu
=
theano
.
compile
.
mode
.
get_default_mode
()
.
including
(
'gpuarray'
)
.
excluding
(
'gpu'
)
mode_without_gpu
=
theano
.
compile
.
mode
.
get_default_mode
()
.
excluding
(
'gpuarray'
)
def
test_flatten
():
def
test_flatten
():
m
=
theano
.
tensor
.
fmatrix
()
m
=
theano
.
tensor
.
fmatrix
()
f
=
theano
.
function
([
m
],
m
.
flatten
(),
mode
=
mode_with_gpu
)
f
=
theano
.
function
([
m
],
m
.
flatten
(),
mode
=
mode_with_gpu
)
...
@@ -104,3 +89,20 @@ def test_local_gpualloc_memset_0():
...
@@ -104,3 +89,20 @@ def test_local_gpualloc_memset_0():
assert
isinstance
(
topo
[
0
]
.
op
,
GpuAlloc
)
assert
isinstance
(
topo
[
0
]
.
op
,
GpuAlloc
)
assert
not
topo
[
0
]
.
op
.
memset_0
assert
not
topo
[
0
]
.
op
.
memset_0
assert
(
numpy
.
asarray
(
f
(
2
))
==
1
)
.
all
()
assert
(
numpy
.
asarray
(
f
(
2
))
==
1
)
.
all
()
def test_rebroadcast():
    """Check that Rebroadcast stays on the GPU in the compiled graph.

    The compiled function must contain exactly one ``Rebroadcast`` node,
    and both its input and its output must be of type ``GpuArrayType``.
    """
    data = numpy.random.rand(10, 10).astype('float32')
    v = theano.tensor.fmatrix()
    summed = v.sum().dimshuffle('x', 'x')
    up = tensor.unbroadcast(summed, 0, 1)
    f = theano.function([v], [up], mode=mode_with_gpu)

    # Run once to make sure the compiled function executes.
    f(data)

    rebrs = []
    for node in f.maker.fgraph.toposort():
        if isinstance(node.op, tensor.Rebroadcast):
            rebrs.append(node)
    assert len(rebrs) == 1

    rebr = rebrs[0]
    assert isinstance(rebr.inputs[0].type, GpuArrayType)
    assert isinstance(rebr.outputs[0].type, GpuArrayType)
theano/sandbox/gpuarray/tests/test_scan.py
0 → 100644
浏览文件 @
500601a2
from
unittest
import
TestCase
import
numpy
import
theano
from
theano.tests
import
unittest_tools
as
utt
import
theano.sandbox.rng_mrg
from
theano.sandbox.gpuarray.basic_ops
import
(
gpu_from_host
,
GpuFromHost
,
HostFromGpu
)
from
theano.sandbox.gpuarray.elemwise
import
GpuElemwise
from
theano.sandbox.gpuarray.tests.test_basic_ops
import
mode_with_gpu
class T_Scan(TestCase):
    """Tests of ``theano.scan`` compiled with the gpuarray backend mode."""

    def setUp(self):
        utt.seed_rng()

    @staticmethod
    def _count_ops(nodes, op_class):
        """Number of nodes in ``nodes`` whose op is an ``op_class``."""
        return sum([isinstance(node.op, op_class) for node in nodes])

    @staticmethod
    def _single_scan_node(nodes):
        """Return the only Scan node of ``nodes`` (asserts there is one)."""
        found = [node for node in nodes
                 if isinstance(node.op, theano.scan_module.scan_op.Scan)]
        assert len(found) == 1
        return found[0]

    def test_one_sequence_one_output_weights_gpu1(self):
        def f_rnn(u_t, x_tm1, W_in, W):
            return u_t * W_in + x_tm1 * W

        u = theano.tensor.fvector('u')
        x0 = theano.tensor.fscalar('x0')
        W_in = theano.tensor.fscalar('win')
        W = theano.tensor.fscalar('w')

        mode = mode_with_gpu.excluding('InputToGpuOptimizer')
        output, updates = theano.scan(f_rnn,
                                      u,
                                      x0,
                                      [W_in, W],
                                      n_steps=None,
                                      truncate_gradient=-1,
                                      go_backwards=False,
                                      mode=mode)

        # Request the result on the GPU explicitly.
        output = gpu_from_host(output)
        f2 = theano.function([u, x0, W_in, W],
                             output,
                             updates=updates,
                             allow_input_downcast=True,
                             mode=mode)

        # get random initial values (drawn in the order u, x0, W, W_in)
        rng = numpy.random.RandomState(utt.fetch_seed())
        v_u = rng.uniform(size=(4,), low=-5., high=5.)
        v_x0 = rng.uniform()
        v_W = rng.uniform()
        v_W_in = rng.uniform()

        v_u = numpy.asarray(v_u, dtype='float32')
        v_x0 = numpy.asarray(v_x0, dtype='float32')
        v_W = numpy.asarray(v_W, dtype='float32')
        v_W_in = numpy.asarray(v_W_in, dtype='float32')

        # compute the output in numpy
        v_out = numpy.zeros((4,))
        v_out[0] = v_u[0] * v_W_in + v_x0 * v_W
        for step in xrange(1, 4):
            v_out[step] = v_u[step] * v_W_in + v_out[step - 1] * v_W
        theano_values = f2(v_u, v_x0, v_W_in, v_W)
        utt.assert_allclose(theano_values, v_out)

        # TO DEL (marked for deletion by the original author; this lookup
        # is repeated below)
        topo = f2.maker.fgraph.toposort()
        scan_node = self._single_scan_node(topo)

        topo = f2.maker.fgraph.toposort()
        assert self._count_ops(topo, HostFromGpu) == 0
        assert self._count_ops(topo, GpuFromHost) == 4

        scan_node = self._single_scan_node(topo)
        scan_node_topo = scan_node.op.fn.maker.fgraph.toposort()

        # check that there is no gpu transfer in the inner loop.
        assert self._count_ops(scan_node_topo, GpuElemwise) > 0
        assert self._count_ops(scan_node_topo, HostFromGpu) == 0
        assert self._count_ops(scan_node_topo, GpuFromHost) == 0

    # This second version tests the second case in the optimizer to the gpu.
    def test_one_sequence_one_output_weights_gpu2(self):
        def f_rnn(u_t, x_tm1, W_in, W):
            return u_t * W_in + x_tm1 * W

        u = theano.tensor.fvector('u')
        x0 = theano.tensor.fscalar('x0')
        W_in = theano.tensor.fscalar('win')
        W = theano.tensor.fscalar('w')
        output, updates = theano.scan(f_rnn,
                                      u,
                                      x0,
                                      [W_in, W],
                                      n_steps=None,
                                      truncate_gradient=-1,
                                      go_backwards=False,
                                      mode=mode_with_gpu)

        f2 = theano.function([u, x0, W_in, W],
                             output,
                             updates=updates,
                             allow_input_downcast=True,
                             mode=mode_with_gpu)

        # get random initial values (drawn in the order u, x0, W, W_in)
        rng = numpy.random.RandomState(utt.fetch_seed())
        v_u = rng.uniform(size=(4,), low=-5., high=5.)
        v_x0 = rng.uniform()
        v_W = rng.uniform()
        v_W_in = rng.uniform()

        # compute the output in numpy
        v_out = numpy.zeros((4,))
        v_out[0] = v_u[0] * v_W_in + v_x0 * v_W
        for step in xrange(1, 4):
            v_out[step] = v_u[step] * v_W_in + v_out[step - 1] * v_W
        theano_values = f2(v_u, v_x0, v_W_in, v_W)
        utt.assert_allclose(theano_values, v_out)

        topo = f2.maker.fgraph.toposort()
        assert self._count_ops(topo, HostFromGpu) == 1
        assert self._count_ops(topo, GpuFromHost) == 4

        scan_node = self._single_scan_node(topo)
        scan_node_topo = scan_node.op.fn.maker.fgraph.toposort()

        # check that there is no gpu transfer in the inner loop.
        assert self._count_ops(scan_node_topo, GpuElemwise) > 0
        assert self._count_ops(scan_node_topo, HostFromGpu) == 0
        assert self._count_ops(scan_node_topo, GpuFromHost) == 0

    # This third test checks that scan can deal with a mixture of dtypes as
    # outputs when it is running on the GPU
    def test_gpu3_mixture_dtype_outputs(self):
        def f_rnn(u_t, x_tm1, W_in, W):
            return (u_t * W_in + x_tm1 * W,
                    theano.tensor.cast(u_t + x_tm1, 'int64'))

        u = theano.tensor.fvector('u')
        x0 = theano.tensor.fscalar('x0')
        W_in = theano.tensor.fscalar('win')
        W = theano.tensor.fscalar('w')
        output, updates = theano.scan(f_rnn,
                                      u,
                                      [x0, None],
                                      [W_in, W],
                                      n_steps=None,
                                      truncate_gradient=-1,
                                      go_backwards=False,
                                      mode=mode_with_gpu)

        f2 = theano.function([u, x0, W_in, W],
                             output,
                             updates=updates,
                             allow_input_downcast=True,
                             mode=mode_with_gpu)

        # get random initial values (drawn in the order u, x0, W, W_in)
        rng = numpy.random.RandomState(utt.fetch_seed())
        v_u = rng.uniform(size=(4,), low=-5., high=5.)
        v_x0 = rng.uniform()
        v_W = rng.uniform()
        v_W_in = rng.uniform()

        # compute the output in numpy
        v_out1 = numpy.zeros((4,))
        v_out2 = numpy.zeros((4,), dtype='int64')
        v_out1[0] = v_u[0] * v_W_in + v_x0 * v_W
        v_out2[0] = v_u[0] + v_x0
        for step in xrange(1, 4):
            v_out1[step] = v_u[step] * v_W_in + v_out1[step - 1] * v_W
            v_out2[step] = numpy.int64(v_u[step] + v_out1[step - 1])

        theano_out1, theano_out2 = f2(v_u, v_x0, v_W_in, v_W)
        utt.assert_allclose(theano_out1, v_out1)
        utt.assert_allclose(theano_out2, v_out2)

        topo = f2.maker.fgraph.toposort()
        scan_node = self._single_scan_node(topo)
        assert scan_node.op.gpua

        scan_node_topo = scan_node.op.fn.maker.fgraph.toposort()

        # check that there is no gpu transfer in the inner loop.
        assert self._count_ops(scan_node_topo, HostFromGpu) == 0
        assert self._count_ops(scan_node_topo, GpuFromHost) == 0

    def test_gpu4_gibbs_chain(self):
        rng = numpy.random.RandomState(utt.fetch_seed())
        initial_sample = rng.binomial(1, .5, size=(3, 20),)
        v_vsample = numpy.array(initial_sample, dtype='float32')
        vsample = theano.shared(v_vsample)
        trng = theano.sandbox.rng_mrg.MRG_RandomStreams(
            utt.fetch_seed())

        def f(vsample_tm1):
            mask = trng.binomial(vsample_tm1.shape,
                                 n=1,
                                 p=0.3,
                                 dtype='float32')
            return mask * vsample_tm1

        theano_vsamples, updates = theano.scan(f,
                                               [],
                                               vsample,
                                               [],
                                               n_steps=10,
                                               truncate_gradient=-1,
                                               go_backwards=False,
                                               mode=mode_with_gpu)
        my_f = theano.function([],
                               theano_vsamples[-1],
                               updates=updates,
                               allow_input_downcast=True,
                               mode=mode_with_gpu)

        # The numerical check is left to DebugMode; this test mostly
        # verifies that the graph compiles and runs.
        t_result = my_f()
theano/scan_module/scan_op.py
浏览文件 @
500601a2
...
@@ -56,23 +56,24 @@ class Scan(PureOp):
...
@@ -56,23 +56,24 @@ class Scan(PureOp):
the scan op (like number of different types of
the scan op (like number of different types of
arguments, name, mode, if it should run on GPU or
arguments, name, mode, if it should run on GPU or
not, etc.)
not, etc.)
:param typeConstructor: function that constructs a Theano TensorType
:param typeConstructor: function that constructs an equivalent
able to represent a float32 ndarray.
to Theano TensorType
Note: ``typeConstructor`` had been added to refactor how Theano
deals with the GPU. If it runs on the GPU, scan needs to construct
Note: ``typeConstructor`` had been added to refactor how
certain outputs (those who reside in the GPU memory) as CudaNdarray.
Theano deals with the GPU. If it runs on the GPU, scan needs
However we can not import cuda in this file (as it is in sandbox,
to construct certain outputs (those who reside in the GPU
and not available on each machine) so the workaround is that the GPU
memory) as the GPU-specific type. However we can not import
optimization (which is aware of cuda types) passes to the
gpu code in this file (as it is in sandbox, and not available
constructor of this class a function that is able to construct
on each machine) so the workaround is that the GPU
CudaNdarray. This way the class Scan does not need to be aware of
optimization passes to the constructor of this class a
CudaNdarray, it just constructs any float32 tensor using this
function that is able to construct a GPU type. This way the
function (which by default constructs normal tensors). Note that the
class Scan does not need to be aware of the details for the
second assumption in this code is that any float32 output or input
GPU, it just constructs any tensor using this function (which
will be moved on the GPU if the optimization gets applied (following
by default constructs normal tensors).
Theano's philosophy of moving as much as possible on gpu).
"""
"""
if
'gpua'
not
in
info
:
info
[
'gpua'
]
=
False
# adding properties into self
# adding properties into self
self
.
inputs
=
inputs
self
.
inputs
=
inputs
self
.
outputs
=
outputs
self
.
outputs
=
outputs
...
@@ -95,23 +96,10 @@ class Scan(PureOp):
...
@@ -95,23 +96,10 @@ class Scan(PureOp):
# Note that for mit_mot there are several output slices per
# Note that for mit_mot there are several output slices per
# output sequence
# output sequence
o
=
outputs
[
idx
]
o
=
outputs
[
idx
]
# Scan assumes that only variables of dtype float32 might need a
self
.
output_types
.
append
(
# special constructor (i.e. CudaNdarray constructor) when the
typeConstructor
(
# code is running on GPU, as it is the only type supported by
broadcastable
=
(
False
,)
+
o
.
type
.
broadcastable
,
# Theano yet. Therefore only for dtype float32 we use the passed
dtype
=
o
.
type
.
dtype
))
# type constructor ``typeConstructor``. For anything else we
# know that even if we run it on the GPU we still construct
# normal Theano tensors.
if
o
.
type
.
dtype
in
[
'float32'
]:
self
.
output_types
.
append
(
typeConstructor
(
broadcastable
=
(
False
,)
+
o
.
type
.
broadcastable
,
dtype
=
o
.
type
.
dtype
))
else
:
self
.
output_types
.
append
(
tensorConstructor
(
broadcastable
=
(
False
,)
+
o
.
type
.
broadcastable
,
dtype
=
o
.
type
.
dtype
))
idx
+=
len
(
self
.
mit_mot_out_slices
[
jdx
])
idx
+=
len
(
self
.
mit_mot_out_slices
[
jdx
])
jdx
+=
1
jdx
+=
1
...
@@ -120,23 +108,11 @@ class Scan(PureOp):
...
@@ -120,23 +108,11 @@ class Scan(PureOp):
end
=
idx
+
self
.
n_mit_sot
+
self
.
n_sit_sot
+
self
.
n_nit_sot
end
=
idx
+
self
.
n_mit_sot
+
self
.
n_sit_sot
+
self
.
n_nit_sot
for
o
in
outputs
[
idx
:
end
]:
for
o
in
outputs
[
idx
:
end
]:
# Scan assumes that only variables of dtype float32 might need a
self
.
output_types
.
append
(
# special constructor (i.e. CudaNdarray constructor) when the
typeConstructor
(
# code is running on GPU, as it is the only type supported by
broadcastable
=
(
False
,)
+
o
.
type
.
broadcastable
,
# Theano yet. Therefore only for dtype float32 we use the passed
dtype
=
o
.
type
.
dtype
))
# type constructor ``typeConstructor``. For anything else we
# know that even if we run it on the GPU we still construct
# normal Theano tensors.
if
o
.
type
.
dtype
in
[
'float32'
]:
self
.
output_types
.
append
(
typeConstructor
(
broadcastable
=
(
False
,)
+
o
.
type
.
broadcastable
,
dtype
=
o
.
type
.
dtype
))
else
:
self
.
output_types
.
append
(
tensorConstructor
(
broadcastable
=
(
False
,)
+
o
.
type
.
broadcastable
,
dtype
=
o
.
type
.
dtype
))
# shared outputs + possibly the ending condition
# shared outputs + possibly the ending condition
for
o
in
outputs
[
end
:]:
for
o
in
outputs
[
end
:]:
self
.
output_types
.
append
(
o
.
type
)
self
.
output_types
.
append
(
o
.
type
)
...
@@ -182,14 +158,14 @@ class Scan(PureOp):
...
@@ -182,14 +158,14 @@ class Scan(PureOp):
self
.
n_shared_outs
)
self
.
n_shared_outs
)
self
.
n_outs
=
self
.
n_mit_mot
+
self
.
n_mit_sot
+
self
.
n_sit_sot
self
.
n_outs
=
self
.
n_mit_mot
+
self
.
n_mit_sot
+
self
.
n_sit_sot
self
.
n_tap_outs
=
self
.
n_mit_mot
+
self
.
n_mit_sot
self
.
n_tap_outs
=
self
.
n_mit_mot
+
self
.
n_mit_sot
if
not
self
.
info
[
'gpu'
]:
if
self
.
info
[
'gpu'
]
or
self
.
info
[
'gpua'
]:
self
.
_hash_inner_graph
=
self
.
info
[
'gpu_hash'
]
else
:
tmp_in
,
tmp_out
=
scan_utils
.
reconstruct_graph
(
self
.
inputs
,
tmp_in
,
tmp_out
=
scan_utils
.
reconstruct_graph
(
self
.
inputs
,
self
.
outputs
)
self
.
outputs
)
local_fgraph
=
gof
.
FunctionGraph
(
tmp_in
,
tmp_out
,
clone
=
False
)
local_fgraph
=
gof
.
FunctionGraph
(
tmp_in
,
tmp_out
,
clone
=
False
)
self
.
_cmodule_key
=
gof
.
CLinker
()
.
cmodule_key_
(
local_fgraph
,
[])
self
.
_cmodule_key
=
gof
.
CLinker
()
.
cmodule_key_
(
local_fgraph
,
[])
self
.
_hash_inner_graph
=
hash
(
self
.
_cmodule_key
)
self
.
_hash_inner_graph
=
hash
(
self
.
_cmodule_key
)
else
:
self
.
_hash_inner_graph
=
self
.
info
[
'gpu_hash'
]
def
make_node
(
self
,
*
inputs
):
def
make_node
(
self
,
*
inputs
):
"""
"""
...
...
theano/scan_module/scan_opt.py
浏览文件 @
500601a2
...
@@ -537,10 +537,11 @@ class PushOutSeqScan(gof.Optimizer):
...
@@ -537,10 +537,11 @@ class PushOutSeqScan(gof.Optimizer):
class
ScanInplaceOptimizer
(
Optimizer
):
class
ScanInplaceOptimizer
(
Optimizer
):
"""Graph optimizer for Scan(makes it run inplace)"""
"""Graph optimizer for Scan(makes it run inplace)"""
def
__init__
(
self
,
typeConstructor
=
None
,
gpu_flag
=
False
):
def
__init__
(
self
,
typeConstructor
=
None
,
gpu_flag
=
False
,
gpua_flag
=
False
):
Optimizer
.
__init__
(
self
)
Optimizer
.
__init__
(
self
)
self
.
typeConstructor
=
typeConstructor
self
.
typeConstructor
=
typeConstructor
self
.
gpu_flag
=
gpu_flag
self
.
gpu_flag
=
gpu_flag
self
.
gpua_flag
=
gpua_flag
def
add_requirements
(
self
,
fgraph
):
def
add_requirements
(
self
,
fgraph
):
fgraph
.
attach_feature
(
toolbox
.
ReplaceValidate
())
fgraph
.
attach_feature
(
toolbox
.
ReplaceValidate
())
...
@@ -551,7 +552,8 @@ class ScanInplaceOptimizer(Optimizer):
...
@@ -551,7 +552,8 @@ class ScanInplaceOptimizer(Optimizer):
nodes
=
fgraph
.
toposort
()
nodes
=
fgraph
.
toposort
()
scan_nodes
=
[
x
for
x
in
nodes
scan_nodes
=
[
x
for
x
in
nodes
if
(
isinstance
(
x
.
op
,
scan_op
.
Scan
)
and
if
(
isinstance
(
x
.
op
,
scan_op
.
Scan
)
and
x
.
op
.
info
[
'gpu'
]
==
self
.
gpu_flag
)]
x
.
op
.
info
[
'gpu'
]
==
self
.
gpu_flag
and
x
.
op
.
info
[
'gpua'
]
==
self
.
gpua_flag
)]
for
scan_idx
in
xrange
(
len
(
scan_nodes
)):
for
scan_idx
in
xrange
(
len
(
scan_nodes
)):
node
=
scan_nodes
[
scan_idx
]
node
=
scan_nodes
[
scan_idx
]
op
=
node
.
op
op
=
node
.
op
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论