Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
c022347b
提交
c022347b
authored
9月 16, 2014
作者:
abergeron
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #1967 from nouiz/fast_compile_gpu
[WIP] Fast compile gpu
上级
1563ea38
c4fed2b2
隐藏空白字符变更
内嵌
并排
正在显示
9 个修改的文件
包含
156 行增加
和
94 行删除
+156
-94
opt.py
theano/gof/opt.py
+1
-2
__init__.py
theano/sandbox/cuda/__init__.py
+1
-0
opt.py
theano/sandbox/cuda/opt.py
+24
-16
test_opt.py
theano/sandbox/cuda/tests/test_opt.py
+26
-0
opt.py
theano/sandbox/gpuarray/opt.py
+48
-41
test_opt.py
theano/sandbox/gpuarray/tests/test_opt.py
+2
-1
blas.py
theano/tensor/blas.py
+3
-2
nnet.py
theano/tensor/nnet/nnet.py
+18
-11
opt.py
theano/tensor/opt.py
+33
-21
没有找到文件。
theano/gof/opt.py
浏览文件 @
c022347b
...
@@ -1837,8 +1837,7 @@ class EquilibriumOptimizer(NavigatorOptimizer):
...
@@ -1837,8 +1837,7 @@ class EquilibriumOptimizer(NavigatorOptimizer):
process_count
[
process
]
+=
count
process_count
[
process
]
+=
count
else
:
else
:
process_count
[
process
]
=
count
process_count
[
process
]
=
count
for
i
in
range
(
len
(
loop_process_count
),
len
(
prof2
[
2
])):
loop_process_count
.
extend
(
prof2
[
2
][
len
(
loop_process_count
):])
loop_process_count
.
append
(
list
(
prof2
[
2
]))
max_nb_nodes
=
max
(
prof1
[
3
],
prof2
[
3
])
max_nb_nodes
=
max
(
prof1
[
3
],
prof2
[
3
])
...
...
theano/sandbox/cuda/__init__.py
浏览文件 @
c022347b
...
@@ -415,6 +415,7 @@ def use(device,
...
@@ -415,6 +415,7 @@ def use(device,
if
default_to_move_computation_to_gpu
:
if
default_to_move_computation_to_gpu
:
optdb
.
add_tags
(
'gpu_opt'
,
optdb
.
add_tags
(
'gpu_opt'
,
'fast_compile'
,
'fast_run'
,
'fast_run'
,
'inplace'
)
'inplace'
)
optdb
.
add_tags
(
'gpu_after_fusion'
,
optdb
.
add_tags
(
'gpu_after_fusion'
,
...
...
theano/sandbox/cuda/opt.py
浏览文件 @
c022347b
...
@@ -55,10 +55,10 @@ gpu_optimizer = EquilibriumDB(ignore_newtrees=False)
...
@@ -55,10 +55,10 @@ gpu_optimizer = EquilibriumDB(ignore_newtrees=False)
gpu_cut_copies
=
EquilibriumDB
()
gpu_cut_copies
=
EquilibriumDB
()
gpu_seqopt
=
SequenceDB
()
gpu_seqopt
=
SequenceDB
()
gpu_seqopt
.
register
(
'gpu_local_optimizations'
,
gpu_optimizer
,
1
,
gpu_seqopt
.
register
(
'gpu_local_optimizations'
,
gpu_optimizer
,
1
,
'fast_run'
,
'inplace'
,
'gpu'
)
'fast_run'
,
'
fast_compile'
,
'
inplace'
,
'gpu'
)
gpu_seqopt
.
register
(
'gpu_cut_transfers'
,
gpu_cut_copies
,
2
,
gpu_seqopt
.
register
(
'gpu_cut_transfers'
,
gpu_cut_copies
,
2
,
'fast_run'
,
'gpu'
)
'fast_run'
,
'
fast_compile'
,
'
gpu'
)
# DO NOT PUT fast_run in gpu_opt! This will ALWAYS enable the GPU!
# DO NOT PUT fast_run
or fast_compile
in gpu_opt! This will ALWAYS enable the GPU!
optdb
.
register
(
'gpu_opt'
,
optdb
.
register
(
'gpu_opt'
,
gpu_seqopt
,
gpu_seqopt
,
optdb
.
__position__
.
get
(
'add_destroy_handler'
,
49.5
)
-
1
,
optdb
.
__position__
.
get
(
'add_destroy_handler'
,
49.5
)
-
1
,
...
@@ -72,13 +72,15 @@ optdb.register('gpu_after_fusion',
...
@@ -72,13 +72,15 @@ optdb.register('gpu_after_fusion',
'gpu'
)
'gpu'
)
## Register merge_optimizer as a global opt
## Register merge_optimizer as a global opt
gpu_optimizer
.
register
(
'gpu_merge'
,
theano
.
gof
.
opt
.
merge_optimizer
,
'fast_run'
)
gpu_optimizer
.
register
(
'gpu_merge'
,
theano
.
gof
.
opt
.
merge_optimizer
,
'fast_run'
,
'fast_compile'
)
def
register_opt
(
*
tags
,
**
kwargs
):
def
register_opt
(
*
tags
,
**
kwargs
):
def
f
(
local_opt
):
def
f
(
local_opt
):
name
=
(
kwargs
and
kwargs
.
pop
(
'name'
))
or
local_opt
.
__name__
name
=
(
kwargs
and
kwargs
.
pop
(
'name'
))
or
local_opt
.
__name__
gpu_optimizer
.
register
(
name
,
local_opt
,
'fast_run'
,
'gpu'
,
*
tags
)
gpu_optimizer
.
register
(
name
,
local_opt
,
'fast_run'
,
'fast_compile'
,
'gpu'
,
*
tags
)
return
local_opt
return
local_opt
return
f
return
f
...
@@ -163,14 +165,15 @@ def local_cut_gpu_host_gpu(node):
...
@@ -163,14 +165,15 @@ def local_cut_gpu_host_gpu(node):
return
[
node
.
inputs
[
0
]
.
owner
.
inputs
[
0
]]
return
[
node
.
inputs
[
0
]
.
owner
.
inputs
[
0
]]
return
False
return
False
gpu_cut_copies
.
register
(
'cut_gpu_host_transfers'
,
local_cut_gpu_host_gpu
,
gpu_cut_copies
.
register
(
'cut_gpu_host_transfers'
,
local_cut_gpu_host_gpu
,
'fast_run
'
,
'gpu'
)
'fast_run'
,
'fast_compile
'
,
'gpu'
)
gpu_cut_copies
.
register
(
'cut_gpu_constant_transfers'
,
gpu_cut_copies
.
register
(
'cut_gpu_constant_transfers'
,
tensor
.
opt
.
constant_folding
,
tensor
.
opt
.
constant_folding
,
'fast_run'
,
'gpu'
)
'fast_run'
,
'
fast_compile'
,
'
gpu'
)
#register it into canonicalize to allow other optimization to work without
#register it into canonicalize to allow other optimization to work without
#botering with this useless pattern.
#botering with this useless pattern.
optdb
[
'canonicalize'
]
.
register
(
'local_cut_gpu_host_gpu'
,
optdb
[
'canonicalize'
]
.
register
(
'local_cut_gpu_host_gpu'
,
local_cut_gpu_host_gpu
,
'fast_run'
,
'gpu'
)
local_cut_gpu_host_gpu
,
'fast_run'
,
'fast_compile'
,
'gpu'
)
# 'float64', 'complex128' and 'complex64' are not supported in elemwise
# 'float64', 'complex128' and 'complex64' are not supported in elemwise
# on the gpu.
# on the gpu.
...
@@ -347,7 +350,7 @@ def local_gpu_specifyShape_0(node):
...
@@ -347,7 +350,7 @@ def local_gpu_specifyShape_0(node):
@register_opt
()
@register_opt
()
@local_optimizer
([
gpu_from_host
])
# XXX: broken: tensor.basic.dot is not an op
@local_optimizer
([
gpu_from_host
,
tensor
.
basic
.
Dot
])
def
local_gpu_dot_to_dot22
(
node
):
def
local_gpu_dot_to_dot22
(
node
):
"""
"""
gpu_from_host(dot) -> gpudot(gpu_from_host)
gpu_from_host(dot) -> gpudot(gpu_from_host)
...
@@ -358,6 +361,8 @@ def local_gpu_dot_to_dot22(node):
...
@@ -358,6 +361,8 @@ def local_gpu_dot_to_dot22(node):
the output.
the output.
A more suitable solution would be to use the right cublas call
A more suitable solution would be to use the right cublas call
This is needed in fast_compile
"""
"""
# In case the got do input upcast, we much check that we can
# In case the got do input upcast, we much check that we can
...
@@ -366,17 +371,18 @@ def local_gpu_dot_to_dot22(node):
...
@@ -366,17 +371,18 @@ def local_gpu_dot_to_dot22(node):
if
node
.
outputs
[
0
]
.
type
.
dtype
!=
'float32'
:
if
node
.
outputs
[
0
]
.
type
.
dtype
!=
'float32'
:
return
False
return
False
host_input
=
node
.
inputs
[
0
]
host_input
=
node
.
inputs
[
0
]
if
host_input
.
owner
and
host_input
.
owner
.
op
==
tensor
.
basic
.
dot
:
if
host_input
.
owner
and
isinstance
(
host_input
.
owner
.
op
,
tensor
.
basic
.
Dot
):
x
,
y
=
host_input
.
owner
.
inputs
x
,
y
=
host_input
.
owner
.
inputs
# case one: vector X matrix
# case one: vector X matrix
if
_is_real_vector
(
x
)
and
_is_real_matrix
(
y
):
if
_is_real_vector
(
x
)
and
_is_real_matrix
(
y
):
new_op
=
GpuDimShuffle
((
False
,),
[
'x'
,
0
]
)
new_op
=
GpuDimShuffle
((
False
,),
(
'x'
,
0
)
)
shape_out
=
y
.
shape
[
1
]
.
dimshuffle
([
'x'
])
shape_out
=
y
.
shape
[
1
]
.
dimshuffle
([
'x'
])
gpu_x
=
new_op
(
gpu_from_host
(
x
))
gpu_x
=
new_op
(
gpu_from_host
(
x
))
gpu_y
=
gpu_from_host
(
y
)
gpu_y
=
gpu_from_host
(
y
)
# case two: matrix X vector
# case two: matrix X vector
elif
_is_real_matrix
(
x
)
and
_is_real_vector
(
y
):
elif
_is_real_matrix
(
x
)
and
_is_real_vector
(
y
):
new_op
=
GpuDimShuffle
((
False
,),
[
0
,
'x'
]
)
new_op
=
GpuDimShuffle
((
False
,),
(
0
,
'x'
)
)
shape_out
=
x
.
shape
[
0
]
.
dimshuffle
([
'x'
])
shape_out
=
x
.
shape
[
0
]
.
dimshuffle
([
'x'
])
gpu_x
=
gpu_from_host
(
x
)
gpu_x
=
gpu_from_host
(
x
)
gpu_y
=
new_op
(
gpu_from_host
(
y
))
gpu_y
=
new_op
(
gpu_from_host
(
y
))
...
@@ -384,20 +390,20 @@ def local_gpu_dot_to_dot22(node):
...
@@ -384,20 +390,20 @@ def local_gpu_dot_to_dot22(node):
return
False
return
False
return
[
GpuReshape
(
1
)(
gpu_dot22
(
gpu_x
,
gpu_y
),
shape_out
)]
return
[
GpuReshape
(
1
)(
gpu_dot22
(
gpu_x
,
gpu_y
),
shape_out
)]
if
node
.
op
==
tensor
.
basic
.
dot
:
if
isinstance
(
node
.
op
,
tensor
.
basic
.
Dot
)
:
if
node
.
outputs
[
0
]
.
type
.
dtype
!=
'float32'
:
if
node
.
outputs
[
0
]
.
type
.
dtype
!=
'float32'
:
return
False
return
False
if
any
([
i
.
owner
and
isinstance
(
i
.
owner
.
op
,
HostFromGpu
)
if
any
([
i
.
owner
and
isinstance
(
i
.
owner
.
op
,
HostFromGpu
)
for
i
in
node
.
inputs
]):
for
i
in
node
.
inputs
]):
x
,
y
=
node
.
inputs
x
,
y
=
node
.
inputs
if
_is_real_vector
(
x
)
and
_is_real_matrix
(
y
):
if
_is_real_vector
(
x
)
and
_is_real_matrix
(
y
):
new_op
=
GpuDimShuffle
((
False
,),
[
'x'
,
0
]
)
new_op
=
GpuDimShuffle
((
False
,),
(
'x'
,
0
)
)
shape_out
=
y
.
shape
[
1
]
.
dimshuffle
([
'x'
])
shape_out
=
y
.
shape
[
1
]
.
dimshuffle
([
'x'
])
gpu_x
=
new_op
(
gpu_from_host
(
x
))
gpu_x
=
new_op
(
gpu_from_host
(
x
))
gpu_y
=
gpu_from_host
(
y
)
gpu_y
=
gpu_from_host
(
y
)
elif
_is_real_matrix
(
x
)
and
_is_real_vector
(
y
):
elif
_is_real_matrix
(
x
)
and
_is_real_vector
(
y
):
new_op
=
GpuDimShuffle
((
False
,),
[
0
,
'x'
]
)
new_op
=
GpuDimShuffle
((
False
,),
(
0
,
'x'
)
)
shape_out
=
x
.
shape
[
0
]
.
dimshuffle
([
'x'
])
shape_out
=
x
.
shape
[
0
]
.
dimshuffle
([
'x'
])
gpu_x
=
gpu_from_host
(
x
)
gpu_x
=
gpu_from_host
(
x
)
gpu_y
=
new_op
(
gpu_from_host
(
y
))
gpu_y
=
new_op
(
gpu_from_host
(
y
))
...
@@ -1629,8 +1635,10 @@ else:
...
@@ -1629,8 +1635,10 @@ else:
#GpuElemwise inplace
#GpuElemwise inplace
gpu_inplace_elemwise_optimizer
=
tensor
.
opt
.
inplace_elemwise_optimizer_op
(
gpu_inplace_elemwise_optimizer
=
tensor
.
opt
.
inplace_elemwise_optimizer_op
(
GpuElemwise
)
GpuElemwise
)
# DO NOT PLACE add a 'gpu' tag here! This would enable it in fast_compile.
# It still will be run in fast_run with device=gpu with the current tag.
optdb
.
register
(
'gpu_inplace_elemwise_opt'
,
gpu_inplace_elemwise_optimizer
,
75
,
optdb
.
register
(
'gpu_inplace_elemwise_opt'
,
gpu_inplace_elemwise_optimizer
,
75
,
'fast_run'
,
'inplace'
,
'gpu_inplace'
,
'gpu'
)
'fast_run'
,
'inplace'
,
'gpu_inplace'
)
@register_opt
()
@register_opt
()
...
...
theano/sandbox/cuda/tests/test_opt.py
浏览文件 @
c022347b
...
@@ -404,6 +404,32 @@ def test_erfinvgpu():
...
@@ -404,6 +404,32 @@ def test_erfinvgpu():
assert
numpy
.
allclose
(
f
(
xv
),
f2
(
xv
))
assert
numpy
.
allclose
(
f
(
xv
),
f2
(
xv
))
def
test_local_gpu_dot_to_dot22dot
():
def
cmp
(
a_shp
,
b_shp
):
a0
=
numpy
.
random
.
rand
(
*
a_shp
)
.
astype
(
'float32'
)
a
=
cuda
.
shared_constructor
(
a0
,
'a'
)
b0
=
numpy
.
random
.
rand
(
*
b_shp
)
.
astype
(
'float32'
)
b
=
cuda
.
shared_constructor
(
b0
,
'a'
)
f
=
pfunc
([],
tensor
.
dot
(
a
,
b
),
mode
=
mode_with_gpu
)
assert
cuda
.
opt
.
local_gpu_dot_to_dot22
.
transform
(
tensor
.
dot
(
a
,
b
)
.
owner
)
out
=
f
()
assert
numpy
.
allclose
(
numpy
.
dot
(
a0
,
b0
),
out
)
# Try with a matrix equal to a0, but with strides in both dims
a
.
set_value
(
a0
)
a
.
set_value
(
a
.
get_value
(
borrow
=
True
,
return_internal_type
=
True
)[::
-
1
],
borrow
=
True
)
f
()
cmp
((
4
,),
(
4
,
5
))
cmp
((
3
,
4
),
(
4
,))
class
test_diag
(
theano
.
tensor
.
tests
.
test_nlinalg
.
test_diag
):
class
test_diag
(
theano
.
tensor
.
tests
.
test_nlinalg
.
test_diag
):
mode
=
mode_with_gpu
mode
=
mode_with_gpu
shared
=
staticmethod
(
cuda
.
shared_constructor
)
shared
=
staticmethod
(
cuda
.
shared_constructor
)
...
...
theano/sandbox/gpuarray/opt.py
浏览文件 @
c022347b
...
@@ -44,9 +44,9 @@ gpu_cut_copies = EquilibriumDB()
...
@@ -44,9 +44,9 @@ gpu_cut_copies = EquilibriumDB()
gpu_seqopt
=
SequenceDB
()
gpu_seqopt
=
SequenceDB
()
gpu_seqopt
.
register
(
'gpuarray_local_optimiziations'
,
gpu_optimizer
,
1
,
gpu_seqopt
.
register
(
'gpuarray_local_optimiziations'
,
gpu_optimizer
,
1
,
'fast_run'
,
'inplace'
,
'gpuarray'
)
'fast_
compile'
,
'fast_
run'
,
'inplace'
,
'gpuarray'
)
gpu_seqopt
.
register
(
'gpuarray_cut_transfers'
,
gpu_cut_copies
,
2
,
gpu_seqopt
.
register
(
'gpuarray_cut_transfers'
,
gpu_cut_copies
,
2
,
'fast_run'
,
'gpuarray'
)
'fast_
compile'
,
'fast_
run'
,
'gpuarray'
)
# do not add 'fast_run' to these two as this would always enable gpuarray mode
# do not add 'fast_run' to these two as this would always enable gpuarray mode
optdb
.
register
(
'gpuarray_opt'
,
gpu_seqopt
,
optdb
.
register
(
'gpuarray_opt'
,
gpu_seqopt
,
...
@@ -61,7 +61,7 @@ def register_opt(*tags, **kwargs):
...
@@ -61,7 +61,7 @@ def register_opt(*tags, **kwargs):
return
local_opt
return
local_opt
return
f
return
f
register_opt
()(
theano
.
tensor
.
opt
.
local_track_shape_i
)
register_opt
(
'fast_compile'
)(
theano
.
tensor
.
opt
.
local_track_shape_i
)
def
safe_to_gpu
(
x
):
def
safe_to_gpu
(
x
):
...
@@ -145,19 +145,20 @@ def local_cut_gpu_host_gpu(node):
...
@@ -145,19 +145,20 @@ def local_cut_gpu_host_gpu(node):
return
[
node
.
inputs
[
0
]
.
owner
.
inputs
[
0
]]
return
[
node
.
inputs
[
0
]
.
owner
.
inputs
[
0
]]
return
False
return
False
gpu_cut_copies
.
register
(
'cut_gpua_host_transfers'
,
local_cut_gpu_host_gpu
,
gpu_cut_copies
.
register
(
'cut_gpua_host_transfers'
,
local_cut_gpu_host_gpu
,
'fast_run'
,
'inplace'
,
'gpuarray'
)
'fast_
compile'
,
'fast_
run'
,
'inplace'
,
'gpuarray'
)
gpu_cut_copies
.
register
(
'cut_gpua_constant_transfers'
,
gpu_cut_copies
.
register
(
'cut_gpua_constant_transfers'
,
tensor
.
opt
.
constant_folding
,
tensor
.
opt
.
constant_folding
,
'fast_run'
,
'gpuarray'
)
'fast_
compile'
,
'fast_
run'
,
'gpuarray'
)
optdb
[
'canonicalize'
]
.
register
(
'local_cut_gpua_host_gpua'
,
optdb
[
'canonicalize'
]
.
register
(
'local_cut_gpua_host_gpua'
,
local_cut_gpu_host_gpu
,
'fast_run'
,
'gpuarray'
)
local_cut_gpu_host_gpu
,
'fast_compile'
,
'fast_run'
,
'gpuarray'
)
@register_opt
()
@register_opt
(
'fast_compile'
)
@local_optimizer
([
tensor
.
Alloc
])
@local_optimizer
([
tensor
.
Alloc
])
def
local_gpuaalloc2
(
node
):
def
local_gpuaalloc2
(
node
):
"""
"""
Join(axis,
Alloc, Alloc
, ...) -> Join(axis, GpuAlloc, Alloc, ...)
Join(axis,
{Alloc or HostFromGPU}
, ...) -> Join(axis, GpuAlloc, Alloc, ...)
Moves an alloc that is an input to join to the gpu.
Moves an alloc that is an input to join to the gpu.
"""
"""
...
@@ -171,7 +172,7 @@ def local_gpuaalloc2(node):
...
@@ -171,7 +172,7 @@ def local_gpuaalloc2(node):
return
[
host_from_gpu
(
gpu_alloc
(
*
node
.
inputs
))]
return
[
host_from_gpu
(
gpu_alloc
(
*
node
.
inputs
))]
@register_opt
()
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
Alloc
])
@op_lifter
([
tensor
.
Alloc
])
def
local_gpuaalloc
(
node
):
def
local_gpuaalloc
(
node
):
new_out
=
gpu_alloc
(
*
node
.
inputs
)
new_out
=
gpu_alloc
(
*
node
.
inputs
)
...
@@ -199,7 +200,7 @@ def local_gpualloc_memset_0(node):
...
@@ -199,7 +200,7 @@ def local_gpualloc_memset_0(node):
return
[
new_out
]
return
[
new_out
]
@register_opt
()
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
Reshape
])
@op_lifter
([
tensor
.
Reshape
])
def
local_gpureshape
(
node
):
def
local_gpureshape
(
node
):
op
=
node
.
op
op
=
node
.
op
...
@@ -210,14 +211,14 @@ def local_gpureshape(node):
...
@@ -210,14 +211,14 @@ def local_gpureshape(node):
return
res
return
res
@register_opt
()
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
Rebroadcast
])
@op_lifter
([
tensor
.
Rebroadcast
])
def
local_gpu_rebroadcast
(
node
):
def
local_gpu_rebroadcast
(
node
):
if
isinstance
(
node
.
inputs
[
0
]
.
owner
.
op
,
HostFromGpu
):
if
isinstance
(
node
.
inputs
[
0
]
.
owner
.
op
,
HostFromGpu
):
return
node
.
op
(
node
.
inputs
[
0
]
.
owner
.
inputs
[
0
])
return
node
.
op
(
node
.
inputs
[
0
]
.
owner
.
inputs
[
0
])
@register_opt
()
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
Flatten
])
@op_lifter
([
tensor
.
Flatten
])
def
local_gpuflatten
(
node
):
def
local_gpuflatten
(
node
):
op
=
node
.
op
op
=
node
.
op
...
@@ -230,7 +231,7 @@ def local_gpuflatten(node):
...
@@ -230,7 +231,7 @@ def local_gpuflatten(node):
return
o
return
o
@register_opt
()
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
Elemwise
])
@op_lifter
([
tensor
.
Elemwise
])
def
local_gpu_elemwise
(
node
):
def
local_gpu_elemwise
(
node
):
op
=
node
.
op
op
=
node
.
op
...
@@ -273,14 +274,14 @@ optdb.register('gpua_inplace_opt', inplace_gpu_elemwise_opt, 75,
...
@@ -273,14 +274,14 @@ optdb.register('gpua_inplace_opt', inplace_gpu_elemwise_opt, 75,
'inplace_elemwise_optimizer'
,
'fast_run'
,
'inplace'
,
'gpuarray'
)
'inplace_elemwise_optimizer'
,
'fast_run'
,
'inplace'
,
'gpuarray'
)
@register_opt
()
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
DimShuffle
])
@op_lifter
([
tensor
.
DimShuffle
])
def
local_gpua_dimshuffle
(
node
):
def
local_gpua_dimshuffle
(
node
):
return
GpuDimShuffle
(
node
.
op
.
input_broadcastable
,
return
GpuDimShuffle
(
node
.
op
.
input_broadcastable
,
node
.
op
.
new_order
)
node
.
op
.
new_order
)
@register_opt
()
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
SpecifyShape
])
@op_lifter
([
tensor
.
SpecifyShape
])
def
local_gpua_specifyShape
(
node
):
def
local_gpua_specifyShape
(
node
):
if
isinstance
(
node
.
inputs
[
0
]
.
type
,
GpuArrayType
):
if
isinstance
(
node
.
inputs
[
0
]
.
type
,
GpuArrayType
):
...
@@ -289,11 +290,21 @@ def local_gpua_specifyShape(node):
...
@@ -289,11 +290,21 @@ def local_gpua_specifyShape(node):
return
tensor
.
specify_shape
(
*
inp
)
return
tensor
.
specify_shape
(
*
inp
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
theano
.
compile
.
ops
.
Shape
])
def
local_gpua_shape
(
node
):
# op_lifter will call this opt too frequently as the output is
# always on the CPU.
if
isinstance
(
node
.
inputs
[
0
]
.
type
,
GpuArrayType
):
return
return
[
gpu_from_host
(
node
.
inputs
[
0
])
.
shape
]
def
gpu_print_wrapper
(
op
,
cnda
):
def
gpu_print_wrapper
(
op
,
cnda
):
op
.
old_op
.
global_fn
(
op
.
old_op
,
numpy
.
asarray
(
cnda
))
op
.
old_op
.
global_fn
(
op
.
old_op
,
numpy
.
asarray
(
cnda
))
@register_opt
()
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
printing
.
Print
])
@op_lifter
([
tensor
.
printing
.
Print
])
def
local_gpu_print_op
(
node
):
def
local_gpu_print_op
(
node
):
x
,
=
node
.
inputs
x
,
=
node
.
inputs
...
@@ -303,13 +314,13 @@ def local_gpu_print_op(node):
...
@@ -303,13 +314,13 @@ def local_gpu_print_op(node):
return
new_op
(
gpu_x
)
return
new_op
(
gpu_x
)
@register_opt
()
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
Join
])
@op_lifter
([
tensor
.
Join
])
def
local_gpua_join
(
node
):
def
local_gpua_join
(
node
):
return
gpu_join
return
gpu_join
@register_opt
()
@register_opt
(
'fast_compile'
)
@local_optimizer
([
GpuJoin
])
@local_optimizer
([
GpuJoin
])
def
local_gpuajoin_1
(
node
):
def
local_gpuajoin_1
(
node
):
# join of a single element
# join of a single element
...
@@ -318,19 +329,19 @@ def local_gpuajoin_1(node):
...
@@ -318,19 +329,19 @@ def local_gpuajoin_1(node):
return
[
node
.
inputs
[
1
]]
return
[
node
.
inputs
[
1
]]
@register_opt
()
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
Split
])
@op_lifter
([
tensor
.
Split
])
def
local_gpua_split
(
node
):
def
local_gpua_split
(
node
):
return
GpuSplit
(
node
.
op
.
len_splits
)
return
GpuSplit
(
node
.
op
.
len_splits
)
@register_opt
()
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
Subtensor
])
@op_lifter
([
tensor
.
Subtensor
])
def
local_gpua_subtensor
(
node
):
def
local_gpua_subtensor
(
node
):
return
GpuSubtensor
(
node
.
op
.
idx_list
)
return
GpuSubtensor
(
node
.
op
.
idx_list
)
@register_opt
()
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
IncSubtensor
])
@op_lifter
([
tensor
.
IncSubtensor
])
def
local_gpua_incsubtensor
(
node
):
def
local_gpua_incsubtensor
(
node
):
return
GpuIncSubtensor
(
node
.
op
.
idx_list
,
node
.
op
.
inplace
,
return
GpuIncSubtensor
(
node
.
op
.
idx_list
,
node
.
op
.
inplace
,
...
@@ -338,7 +349,7 @@ def local_gpua_incsubtensor(node):
...
@@ -338,7 +349,7 @@ def local_gpua_incsubtensor(node):
node
.
op
.
destroyhandler_tolerate_aliased
)
node
.
op
.
destroyhandler_tolerate_aliased
)
@register_opt
()
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
AdvancedIncSubtensor1
])
@op_lifter
([
tensor
.
AdvancedIncSubtensor1
])
def
local_gpua_advanced_incsubtensor
(
node
):
def
local_gpua_advanced_incsubtensor
(
node
):
...
@@ -362,7 +373,7 @@ def local_gpua_advanced_incsubtensor(node):
...
@@ -362,7 +373,7 @@ def local_gpua_advanced_incsubtensor(node):
set_instead_of_inc
=
set_instead_of_inc
)
set_instead_of_inc
=
set_instead_of_inc
)
@register_opt
()
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
CAReduce
,
tensor
.
Sum
,
tensor
.
elemwise
.
Prod
])
@op_lifter
([
tensor
.
CAReduce
,
tensor
.
Sum
,
tensor
.
elemwise
.
Prod
])
def
local_gpua_careduce
(
node
):
def
local_gpua_careduce
(
node
):
if
isinstance
(
node
.
op
.
scalar_op
,
(
scalar
.
Add
,
scalar
.
Mul
,
if
isinstance
(
node
.
op
.
scalar_op
,
(
scalar
.
Add
,
scalar
.
Mul
,
...
@@ -442,71 +453,67 @@ def local_gpua_careduce(node):
...
@@ -442,71 +453,67 @@ def local_gpua_careduce(node):
return
[
unreshaped_reduce
]
return
[
unreshaped_reduce
]
@register_opt
()
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
blas
.
Gemv
,
tensor
.
blas_c
.
CGemv
])
@op_lifter
([
tensor
.
blas
.
Gemv
,
tensor
.
blas_c
.
CGemv
])
def
local_gpua_gemv
(
node
):
def
local_gpua_gemv
(
node
):
return
GpuGemv
(
inplace
=
node
.
op
.
inplace
)
return
GpuGemv
(
inplace
=
node
.
op
.
inplace
)
@register_opt
()
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
blas
.
Gemm
])
@op_lifter
([
tensor
.
blas
.
Gemm
])
def
local_gpua_gemm
(
node
):
def
local_gpua_gemm
(
node
):
return
GpuGemm
(
inplace
=
node
.
op
.
inplace
)
return
GpuGemm
(
inplace
=
node
.
op
.
inplace
)
@register_opt
()
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
blas
.
Ger
,
tensor
.
blas_c
.
CGer
,
tensor
.
blas_scipy
.
ScipyGer
])
@op_lifter
([
tensor
.
blas
.
Ger
,
tensor
.
blas_c
.
CGer
,
tensor
.
blas_scipy
.
ScipyGer
])
def
local_gpua_ger
(
node
):
def
local_gpua_ger
(
node
):
return
GpuGer
(
destructive
=
node
.
op
.
destructive
)
return
GpuGer
(
destructive
=
node
.
op
.
destructive
)
@register_opt
()
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
blas
.
Dot22
])
@op_lifter
([
tensor
.
blas
.
Dot22
])
def
local_gpua_dot22
(
node
):
def
local_gpua_dot22
(
node
):
return
gpu_dot22
return
gpu_dot22
@register_opt
()
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
basic
.
Eye
])
@op_lifter
([
tensor
.
basic
.
Eye
])
def
local_gpua_eye
(
node
):
def
local_gpua_eye
(
node
):
return
GpuEye
(
dtype
=
node
.
op
.
dtype
)
return
GpuEye
(
dtype
=
node
.
op
.
dtype
)
@register_opt
()
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
nnet
.
CrossentropySoftmaxArgmax1HotWithBias
])
@op_lifter
([
tensor
.
nnet
.
CrossentropySoftmaxArgmax1HotWithBias
])
def
local_gpua_crossentropysoftmaxargmax1hotwithbias
(
node
):
def
local_gpua_crossentropysoftmaxargmax1hotwithbias
(
node
):
return
GpuCrossentropySoftmaxArgmax1HotWithBias
()
return
GpuCrossentropySoftmaxArgmax1HotWithBias
()
@register_opt
()
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
nnet
.
CrossentropySoftmax1HotWithBiasDx
])
@op_lifter
([
tensor
.
nnet
.
CrossentropySoftmax1HotWithBiasDx
])
def
local_gpua_crossentropysoftmax1hotwithbiasdx
(
node
):
def
local_gpua_crossentropysoftmax1hotwithbiasdx
(
node
):
return
GpuCrossentropySoftmax1HotWithBiasDx
()
return
GpuCrossentropySoftmax1HotWithBiasDx
()
@register_opt
()
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
nnet
.
Softmax
])
@op_lifter
([
tensor
.
nnet
.
Softmax
])
def
local_gpua_softmax
(
node
):
def
local_gpua_softmax
(
node
):
return
GpuSoftmax
()
return
GpuSoftmax
()
@register_opt
()
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
nnet
.
SoftmaxWithBias
])
@op_lifter
([
tensor
.
nnet
.
SoftmaxWithBias
])
def
local_gpua_softmaxwithbias
(
node
):
def
local_gpua_softmaxwithbias
(
node
):
return
GpuSoftmaxWithBias
()
return
GpuSoftmaxWithBias
()
@register_opt
()
@register_opt
(
'fast_compile'
)
@
local_optimiz
er
([
theano
.
tensor
.
opt
.
Assert
])
@
op_lift
er
([
theano
.
tensor
.
opt
.
Assert
])
def
local_assert
(
node
):
def
local_assert
(
node
):
if
(
isinstance
(
node
.
op
,
theano
.
tensor
.
opt
.
Assert
)
and
return
[
host_from_gpu
(
node
.
op
(
node
.
inputs
[
0
]
.
owner
.
inputs
[
0
]))]
node
.
inputs
[
0
]
.
owner
and
isinstance
(
node
.
inputs
[
0
]
.
owner
.
op
,
HostFromGpu
)):
return
[
host_from_gpu
(
node
.
op
(
node
.
inputs
[
0
]
.
owner
.
inputs
[
0
]))]
@register_opt
()
@register_opt
(
'fast_compile'
)
@op_lifter
([
gpu_from_host
,
ConvOp
])
@op_lifter
([
gpu_from_host
,
ConvOp
])
def
local_gpu_conv
(
node
):
def
local_gpu_conv
(
node
):
"""
"""
...
@@ -654,7 +661,7 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None):
...
@@ -654,7 +661,7 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None):
return
(
nw_inputs
,
nw_outputs
)
return
(
nw_inputs
,
nw_outputs
)
@register_opt
(
'scan'
)
@register_opt
(
'scan'
,
'fast_compile'
)
@op_lifter
([
scan_op
.
Scan
])
@op_lifter
([
scan_op
.
Scan
])
def
local_scan_to_gpua
(
node
):
def
local_scan_to_gpua
(
node
):
info
=
copy
.
deepcopy
(
node
.
op
.
info
)
info
=
copy
.
deepcopy
(
node
.
op
.
info
)
...
...
theano/sandbox/gpuarray/tests/test_opt.py
浏览文件 @
c022347b
...
@@ -4,7 +4,8 @@ import theano
...
@@ -4,7 +4,8 @@ import theano
from
theano
import
tensor
from
theano
import
tensor
from
theano.tests
import
unittest_tools
as
utt
from
theano.tests
import
unittest_tools
as
utt
import
theano.sandbox.gpuarray
import
theano.sandbox.gpuarray
from
theano.sandbox.gpuarray.type
import
GpuArrayType
from
theano.sandbox.gpuarray.type
import
(
GpuArrayType
,
gpuarray_shared_constructor
)
from
theano.sandbox.gpuarray.basic_ops
import
(
from
theano.sandbox.gpuarray.basic_ops
import
(
GpuAlloc
,
GpuReshape
,
gpu_alloc
,
gpu_from_host
,
host_from_gpu
)
GpuAlloc
,
GpuReshape
,
gpu_alloc
,
gpu_from_host
,
host_from_gpu
)
from
theano.sandbox.gpuarray.elemwise
import
(
from
theano.sandbox.gpuarray.elemwise
import
(
...
...
theano/tensor/blas.py
浏览文件 @
c022347b
...
@@ -1815,13 +1815,14 @@ def local_dot22_to_ger_or_gemv(node):
...
@@ -1815,13 +1815,14 @@ def local_dot22_to_ger_or_gemv(node):
blas_optdb
=
SequenceDB
()
blas_optdb
=
SequenceDB
()
# run after numerical stability optimizations (1.5)
# run after numerical stability optimizations (1.5)
optdb
.
register
(
'BlasOpt'
,
blas_optdb
,
1.7
,
'fast_run'
)
optdb
.
register
(
'BlasOpt'
,
blas_optdb
,
1.7
,
'fast_run'
,
'fast_compile'
)
# run before specialize (2.0) because specialize is basically a
# run before specialize (2.0) because specialize is basically a
# free-for-all that makes the graph crazy.
# free-for-all that makes the graph crazy.
#fast_compile is needed to have GpuDot22 created.
blas_optdb
.
register
(
'local_dot_to_dot22'
,
blas_optdb
.
register
(
'local_dot_to_dot22'
,
in2out
(
local_dot_to_dot22
),
in2out
(
local_dot_to_dot22
),
0
,
'fast_run'
)
0
,
'fast_run'
,
'fast_compile'
)
blas_optdb
.
register
(
'gemm_optimizer'
,
blas_optdb
.
register
(
'gemm_optimizer'
,
GemmOptimizer
(),
GemmOptimizer
(),
10
,
'fast_run'
)
10
,
'fast_run'
)
...
...
theano/tensor/nnet/nnet.py
浏览文件 @
c022347b
"""Provides neural-network specific Ops.
"""Provides neural-network specific Ops.
:note: TODO: factor this out into a neural-network toolbox.
:note: TODO: factor this out into a neural-network toolbox.
:note: We register all optimization with the gpu tag as we don't
implement all the intermediate case on the GPU (in particular
AdvancedSubtensor). So to make sure it run well on the gpu with
fast_compile, we register them as needed for the GPU. This can be
revisited later when all the intermediate part are on the GPU.
"""
"""
import
logging
import
logging
import
numpy
import
numpy
...
@@ -570,7 +577,7 @@ class Softmax(gof.Op):
...
@@ -570,7 +577,7 @@ class Softmax(gof.Op):
softmax
=
Softmax
()
softmax
=
Softmax
()
@opt.register_specialize
@opt.register_specialize
(
'gpu'
)
@gof.local_optimizer
([
softmax
])
@gof.local_optimizer
([
softmax
])
def
local_softmax_with_bias
(
node
):
def
local_softmax_with_bias
(
node
):
"""Try to turn softmax(sum_of_stuff) -> softmax_w_bias(matrix, bias)
"""Try to turn softmax(sum_of_stuff) -> softmax_w_bias(matrix, bias)
...
@@ -1323,8 +1330,8 @@ class CrossentropyCategorical1Hot(gof.Op):
...
@@ -1323,8 +1330,8 @@ class CrossentropyCategorical1Hot(gof.Op):
crossentropy_categorical_1hot
=
CrossentropyCategorical1Hot
()
crossentropy_categorical_1hot
=
CrossentropyCategorical1Hot
()
@opt.register_stabilize
@opt.register_stabilize
(
'gpu'
)
@opt.register_specialize
@opt.register_specialize
(
'gpu'
)
@gof.optimizer
@gof.optimizer
def
crossentropy_to_crossentropy_with_softmax_with_bias
(
fgraph
):
def
crossentropy_to_crossentropy_with_softmax_with_bias
(
fgraph
):
"""This is a stabilization optimization
"""This is a stabilization optimization
...
@@ -1397,9 +1404,10 @@ def crossentropy_to_crossentropy_with_softmax(fgraph):
...
@@ -1397,9 +1404,10 @@ def crossentropy_to_crossentropy_with_softmax(fgraph):
optdb
.
register
(
'crossentropy_to_crossentropy_with_softmax'
,
optdb
.
register
(
'crossentropy_to_crossentropy_with_softmax'
,
crossentropy_to_crossentropy_with_softmax
,
2.01
,
crossentropy_to_crossentropy_with_softmax
,
2.01
,
'fast_run'
,
'xent'
)
'fast_run'
,
'xent'
,
'gpu'
)
@opt.register_specialize
(
'gpu'
)
@gof.local_optimizer
([
softmax_grad
])
@gof.local_optimizer
([
softmax_grad
])
def
local_crossentropy_to_crossentropy_with_softmax_grad
(
node
):
def
local_crossentropy_to_crossentropy_with_softmax_grad
(
node
):
if
node
.
op
==
softmax_grad
:
if
node
.
op
==
softmax_grad
:
...
@@ -1410,10 +1418,9 @@ def local_crossentropy_to_crossentropy_with_softmax_grad(node):
...
@@ -1410,10 +1418,9 @@ def local_crossentropy_to_crossentropy_with_softmax_grad(node):
dx
=
crossentropy_softmax_1hot_with_bias_dx
(
g_nll
,
dx
=
crossentropy_softmax_1hot_with_bias_dx
(
g_nll
,
coding_dist
,
true_one_of_n
)
coding_dist
,
true_one_of_n
)
return
[
dx
]
return
[
dx
]
opt
.
register_specialize
(
local_crossentropy_to_crossentropy_with_softmax_grad
)
@opt.register_specialize
@opt.register_specialize
(
'gpu'
)
@gof.local_optimizer
([
tensor
.
_max_and_argmax
])
@gof.local_optimizer
([
tensor
.
_max_and_argmax
])
def
local_argmax_pushdown
(
node
):
def
local_argmax_pushdown
(
node
):
if
node
.
op
==
tensor
.
_max_and_argmax
and
node
.
inputs
[
0
]
.
owner
and
\
if
node
.
op
==
tensor
.
_max_and_argmax
and
node
.
inputs
[
0
]
.
owner
and
\
...
@@ -1499,7 +1506,7 @@ def _is_const(z, val, approx=False):
...
@@ -1499,7 +1506,7 @@ def _is_const(z, val, approx=False):
return
numpy
.
all
(
maybe
==
val
)
return
numpy
.
all
(
maybe
==
val
)
@opt.register_specialize
@opt.register_specialize
(
'gpu'
)
@gof.local_optimizer
([
subtensor
.
AdvancedSubtensor
,
tensor
.
log
])
@gof.local_optimizer
([
subtensor
.
AdvancedSubtensor
,
tensor
.
log
])
def
local_advanced_indexing_crossentropy_onehot
(
node
):
def
local_advanced_indexing_crossentropy_onehot
(
node
):
log
=
None
log
=
None
...
@@ -1540,7 +1547,7 @@ def local_advanced_indexing_crossentropy_onehot(node):
...
@@ -1540,7 +1547,7 @@ def local_advanced_indexing_crossentropy_onehot(node):
labels
)[
0
]]
labels
)[
0
]]
@opt.register_specialize
@opt.register_specialize
(
'gpu'
)
@gof.local_optimizer
([
softmax_grad
])
@gof.local_optimizer
([
softmax_grad
])
def
local_advanced_indexing_crossentropy_onehot_grad
(
node
):
def
local_advanced_indexing_crossentropy_onehot_grad
(
node
):
if
not
(
node
.
op
==
softmax_grad
):
if
not
(
node
.
op
==
softmax_grad
):
...
@@ -1763,7 +1770,7 @@ def local_advanced_indexing_crossentropy_onehot_grad(node):
...
@@ -1763,7 +1770,7 @@ def local_advanced_indexing_crossentropy_onehot_grad(node):
return
return
@opt.register_specialize
@opt.register_specialize
(
'gpu'
)
@gof.local_optimizer
([
softmax_with_bias
])
@gof.local_optimizer
([
softmax_with_bias
])
def
graph_merge_softmax_with_crossentropy_softmax
(
node
):
def
graph_merge_softmax_with_crossentropy_softmax
(
node
):
if
node
.
op
==
softmax_with_bias
:
if
node
.
op
==
softmax_with_bias
:
...
@@ -1963,10 +1970,10 @@ def make_out_pattern(X):
...
@@ -1963,10 +1970,10 @@ def make_out_pattern(X):
# Pattern rewrite: log(softmax(x)) -> make_out_pattern(x).
# make_out_pattern is defined earlier in this file; presumably it builds a
# numerically stable log-softmax graph -- TODO confirm against its definition.
local_log_softmax = gof.PatternSub(in_pattern=(tensor.log, (softmax, 'x')),
                                   out_pattern=(make_out_pattern, 'x'),
                                   allow_multiple_clients=True)

#don't do register_stabilize, this is to make local_log_softmax run
#only after another more specific optimization that stabilizes cross entropy
#opt.register_stabilize(local_log_softmax, name = 'local_log_softmax')
# Also tagged 'gpu' so this specialization is applied when the gpu tags
# are enabled by the optimizer database.
opt.register_specialize(local_log_softmax, 'gpu', name='local_log_softmax')
theano/tensor/opt.py
浏览文件 @
c022347b
...
@@ -310,21 +310,36 @@ compile.optdb.register('inplace_elemwise_opt', inplace_elemwise_optimizer, 75,
...
@@ -310,21 +310,36 @@ compile.optdb.register('inplace_elemwise_opt', inplace_elemwise_optimizer, 75,
def register_canonicalize(lopt, *tags, **kwargs):
    """Register `lopt` in the 'canonicalize' stage of the optimization DB.

    Can be called directly::

        register_canonicalize(my_opt, 'extra_tag')

    or used as a decorator that takes only tag strings::

        @register_canonicalize('fast_compile')
        def my_opt(node): ...

    :param lopt: a local optimizer, or a tag string (decorator form).
    :param tags: additional tags to register the optimizer under.
    :param kwargs: may contain ``name`` to override ``lopt.__name__``.
    :returns: `lopt` (so this works as a plain decorator), or the inner
        decorator in the string form.
    """
    if isinstance(lopt, str):
        # Decorator-with-tags form: `lopt` is really the first tag string,
        # so it must be forwarded to the recursive call -- otherwise
        # @register_canonicalize('fast_compile') would silently drop the tag.
        def register(inner_lopt):
            return register_canonicalize(inner_lopt, lopt, *tags, **kwargs)
        return register
    else:
        # pop with a default so a non-empty kwargs without 'name' cannot
        # raise KeyError.
        name = kwargs.pop('name', None) or lopt.__name__
        compile.optdb['canonicalize'].register(name, lopt, 'fast_run', *tags)
        return lopt
def register_stabilize(lopt, *tags, **kwargs):
    """Register `lopt` in the 'stabilize' stage of the optimization DB.

    Can be called directly::

        register_stabilize(my_opt, 'extra_tag')

    or used as a decorator that takes only tag strings::

        @register_stabilize('fast_compile')
        def my_opt(node): ...

    :param lopt: a local optimizer, or a tag string (decorator form).
    :param tags: additional tags to register the optimizer under.
    :param kwargs: may contain ``name`` to override ``lopt.__name__``.
    :returns: `lopt` (so this works as a plain decorator), or the inner
        decorator in the string form.
    """
    if isinstance(lopt, str):
        # Decorator-with-tags form: `lopt` is really the first tag string,
        # so it must be forwarded to the recursive call -- otherwise
        # @register_stabilize('fast_compile') would silently drop the tag.
        def register(inner_lopt):
            return register_stabilize(inner_lopt, lopt, *tags, **kwargs)
        return register
    else:
        # pop with a default so a non-empty kwargs without 'name' cannot
        # raise KeyError.
        name = kwargs.pop('name', None) or lopt.__name__
        compile.optdb['stabilize'].register(name, lopt, 'fast_run', *tags)
        return lopt
def register_specialize(lopt, *tags, **kwargs):
    """Register `lopt` in the 'specialize' stage of the optimization DB.

    Can be called directly::

        register_specialize(my_opt, 'extra_tag')

    or used as a decorator that takes only tag strings::

        @register_specialize('gpu')
        def my_opt(node): ...

    :param lopt: a local optimizer, or a tag string (decorator form).
    :param tags: additional tags to register the optimizer under.
    :param kwargs: may contain ``name`` to override ``lopt.__name__``.
    :returns: `lopt` (so this works as a plain decorator), or the inner
        decorator in the string form.
    """
    if isinstance(lopt, str):
        # Decorator-with-tags form: `lopt` is really the first tag string,
        # so it must be forwarded to the recursive call -- otherwise
        # @register_specialize('gpu') would silently drop the tag.
        def register(inner_lopt):
            return register_specialize(inner_lopt, lopt, *tags, **kwargs)
        return register
    else:
        # pop with a default so a non-empty kwargs without 'name' cannot
        # raise KeyError.
        name = kwargs.pop('name', None) or lopt.__name__
        compile.optdb['specialize'].register(name, lopt, 'fast_run', *tags)
        return lopt
def
register_uncanonicalize
(
lopt
,
*
tags
,
**
kwargs
):
def
register_uncanonicalize
(
lopt
,
*
tags
,
**
kwargs
):
...
@@ -1304,7 +1319,7 @@ def local_track_shape_i(node):
...
@@ -1304,7 +1319,7 @@ def local_track_shape_i(node):
@register_specialize
@register_specialize
@register_canonicalize
@register_canonicalize
(
'gpu'
)
@gof.local_optimizer
([
Subtensor
])
@gof.local_optimizer
([
Subtensor
])
def
local_subtensor_make_vector
(
node
):
def
local_subtensor_make_vector
(
node
):
# replace all subtensor(make_vector) like:
# replace all subtensor(make_vector) like:
...
@@ -1354,8 +1369,7 @@ def local_subtensor_make_vector(node):
...
@@ -1354,8 +1369,7 @@ def local_subtensor_make_vector(node):
#TODO: the other optimization for and, or, xor, le and ge see ticket #496.
#TODO: the other optimization for and, or, xor, le and ge see ticket #496.
@register_canonicalize
(
'fast_compile'
)
@register_canonicalize
@register_specialize
@register_specialize
@gof.local_optimizer
([
T
.
Elemwise
])
@gof.local_optimizer
([
T
.
Elemwise
])
def
local_useless_elemwise
(
node
):
def
local_useless_elemwise
(
node
):
...
@@ -3508,7 +3522,7 @@ def local_reduce_join(node):
...
@@ -3508,7 +3522,7 @@ def local_reduce_join(node):
#else the reduction do something about the dtype.
#else the reduction do something about the dtype.
@register_canonicalize
@register_canonicalize
(
'fast_compile'
)
@gof.local_optimizer
(
ALL_REDUCE
)
@gof.local_optimizer
(
ALL_REDUCE
)
def
local_cut_useless_reduce
(
node
):
def
local_cut_useless_reduce
(
node
):
"""Sum(a, axis=[]) -> a """
"""Sum(a, axis=[]) -> a """
...
@@ -4152,6 +4166,8 @@ def attempt_distribution(factor, num, denum, out_type):
...
@@ -4152,6 +4166,8 @@ def attempt_distribution(factor, num, denum, out_type):
neg_pairs
))),
num
,
denum
neg_pairs
))),
num
,
denum
@register_canonicalize
@register_stabilize
@gof.local_optimizer
([
T
.
mul
,
T
.
true_div
,
T
.
inv
])
@gof.local_optimizer
([
T
.
mul
,
T
.
true_div
,
T
.
inv
])
def
local_greedy_distributor
(
node
):
def
local_greedy_distributor
(
node
):
"""
"""
...
@@ -4216,10 +4232,10 @@ def local_greedy_distributor(node):
...
@@ -4216,10 +4232,10 @@ def local_greedy_distributor(node):
return
[
rval
]
return
[
rval
]
register_canonicalize
(
local_greedy_distributor
)
register_stabilize
(
local_greedy_distributor
)
@register_canonicalize
(
'fast_compile'
)
@register_stabilize
(
'fast_compile'
)
@register_specialize
(
'fast_compile'
)
@gof.local_optimizer
(
None
)
@gof.local_optimizer
(
None
)
def
constant_folding
(
node
):
def
constant_folding
(
node
):
for
input
in
node
.
inputs
:
for
input
in
node
.
inputs
:
...
@@ -4253,10 +4269,6 @@ def constant_folding(node):
...
@@ -4253,10 +4269,6 @@ def constant_folding(node):
rval
.
append
(
constant
(
output
.
type
,
storage_map
[
output
][
0
]))
rval
.
append
(
constant
(
output
.
type
,
storage_map
[
output
][
0
]))
return
rval
return
rval
register_canonicalize
(
constant_folding
,
'fast_compile'
)
register_stabilize
(
constant_folding
,
'fast_compile'
)
register_specialize
(
constant_folding
,
'fast_compile'
)
def
_is_1
(
expr
):
def
_is_1
(
expr
):
"""rtype bool. True iff expr is a constant close to 1
"""rtype bool. True iff expr is a constant close to 1
...
@@ -5145,4 +5157,4 @@ else:
...
@@ -5145,4 +5157,4 @@ else:
# Although the op just returns its input, it should be removed from
# the graph to make sure all possible optimizations can be applied.
# Tagged 'fast_compile' and 'fast_run' so the removal happens in both
# compilation modes.
register_canonicalize(gof.OpRemove(theano.gradient.consider_constant_),
                      'fast_compile', 'fast_run',
                      name='remove_consider_constant')
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论