Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
e45b6cd6
提交
e45b6cd6
authored
6月 02, 2016
作者:
Frederic Bastien
提交者:
sentient07
7月 08, 2016
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Advance GraphToGPU. Make register_opt2 to register to it to bypass op_lifter
上级
d9d4fc8b
隐藏空白字符变更
内嵌
并排
正在显示
1 个修改的文件
包含
132 行增加
和
66 行删除
+132
-66
opt.py
theano/gpuarray/opt.py
+132
-66
没有找到文件。
theano/gpuarray/opt.py
浏览文件 @
e45b6cd6
...
@@ -55,6 +55,7 @@ _logger = logging.getLogger("theano.gpuarray.opt")
...
@@ -55,6 +55,7 @@ _logger = logging.getLogger("theano.gpuarray.opt")
gpu_optimizer
=
EquilibriumDB
()
gpu_optimizer
=
EquilibriumDB
()
gpu_optimizer2
=
EquilibriumDB
()
gpu_cut_copies
=
EquilibriumDB
()
gpu_cut_copies
=
EquilibriumDB
()
...
@@ -66,19 +67,17 @@ class GraphToGPUDB(DB):
...
@@ -66,19 +67,17 @@ class GraphToGPUDB(DB):
"""
"""
def
query
(
self
,
*
tags
,
**
kwtags
):
def
query
(
self
,
*
tags
,
**
kwtags
):
opt
=
gpu_optimizer
.
query
(
*
tags
,
**
kwtags
)
opt
=
gpu_optimizer
2
.
query
(
*
tags
,
**
kwtags
)
return
GraphToGPU
(
opt
.
local_optimizers_all
,
opt
.
local_optimizers_map
)
return
GraphToGPU
(
opt
.
local_optimizers_all
,
opt
.
local_optimizers_map
)
graph_optimizer
=
GraphToGPUDB
()
gpu_seqopt
=
SequenceDB
()
gpu_seqopt
=
SequenceDB
()
# Don't register this right now
# Don't register this right now
conv_groupopt
=
LocalGroupDB
()
conv_groupopt
=
LocalGroupDB
()
conv_groupopt
.
__name__
=
"gpua_conv_opts"
conv_groupopt
.
__name__
=
"gpua_conv_opts"
gpu_seqopt
.
register
(
'gpu
_graph_optimization'
,
graph_optimizer
,
-
0.5
,
gpu_seqopt
.
register
(
'gpu
array_graph_optimization'
,
GraphToGPUDB
()
,
-
0.5
,
'fast_compile'
,
'fast_run'
,
'gpuarray'
)
'fast_compile'
,
'fast_run'
,
'gpuarray'
)
gpu_seqopt
.
register
(
'gpuarray_local_optimiziations'
,
gpu_optimizer
,
1
,
gpu_seqopt
.
register
(
'gpuarray_local_optimiziations'
,
gpu_optimizer
,
1
,
...
@@ -100,6 +99,15 @@ def register_opt(*tags, **kwargs):
...
@@ -100,6 +99,15 @@ def register_opt(*tags, **kwargs):
return
f
return
f
def
register_opt2
(
tracks
,
*
tags
,
**
kwargs
):
def
f
(
local_opt
):
name
=
(
kwargs
and
kwargs
.
pop
(
'name'
))
or
local_opt
.
__name__
opt
=
theano
.
gof
.
local_optimizer
(
tracks
)(
local_opt
)
gpu_optimizer2
.
register
(
name
,
opt
,
'fast_run'
,
'gpuarray'
,
*
tags
)
return
local_opt
return
f
def
register_inplace
(
*
tags
,
**
kwargs
):
def
register_inplace
(
*
tags
,
**
kwargs
):
def
f
(
local_opt
):
def
f
(
local_opt
):
name
=
(
kwargs
and
kwargs
.
pop
(
'name'
))
or
local_opt
.
__name__
name
=
(
kwargs
and
kwargs
.
pop
(
'name'
))
or
local_opt
.
__name__
...
@@ -176,7 +184,10 @@ def op_lifter(OP, cuda_only=False):
...
@@ -176,7 +184,10 @@ def op_lifter(OP, cuda_only=False):
# the context was derived from the outputs
# the context was derived from the outputs
for
i
in
node
.
inputs
:
for
i
in
node
.
inputs
:
i
.
tag
.
context_name
=
context_name
i
.
tag
.
context_name
=
context_name
new_op
=
maker
(
node
,
context_name
)
try
:
new_op
=
maker
(
node
,
context_name
,
node
.
inputs
)
except
TypeError
:
new_op
=
maker
(
node
,
context_name
)
# This is needed as sometimes new_op inherits from OP.
# This is needed as sometimes new_op inherits from OP.
if
new_op
and
new_op
!=
node
.
op
:
if
new_op
and
new_op
!=
node
.
op
:
if
isinstance
(
new_op
,
theano
.
Op
):
if
isinstance
(
new_op
,
theano
.
Op
):
...
@@ -253,7 +264,7 @@ class GraphToGPU(Optimizer):
...
@@ -253,7 +264,7 @@ class GraphToGPU(Optimizer):
# Iterating through inputs of graph
# Iterating through inputs of graph
for
i
in
fgraph
.
inputs
:
for
i
in
fgraph
.
inputs
:
if
isinstance
(
i
.
type
,
tensor
.
TensorType
):
if
isinstance
(
i
.
type
,
tensor
.
TensorType
):
mapping
[
i
]
=
GpuFromHost
(
None
)(
i
)
mapping
[
i
]
=
as_gpuarray_variable
(
i
,
None
)
# TODO context
else
:
else
:
mapping
[
i
]
=
i
mapping
[
i
]
=
i
for
i
in
fgraph
.
variables
:
for
i
in
fgraph
.
variables
:
...
@@ -262,39 +273,60 @@ class GraphToGPU(Optimizer):
...
@@ -262,39 +273,60 @@ class GraphToGPU(Optimizer):
for
node
in
fgraph
.
toposort
():
for
node
in
fgraph
.
toposort
():
# The Extra condition
if
isinstance
(
node
.
op
,
HostFromGpu
):
if
any
([
isinstance
(
i
,
GpuArrayVariable
)
or
mapping
[
node
.
outputs
[
0
]]
=
node
.
inputs
[
0
]
isinstance
(
i
,
GpuArraySharedVariable
)
continue
for
i
in
node
.
inputs
+
node
.
outputs
]):
move_to_GPU
=
False
# Move only if any of the inputs are on the GPU.
move_to_GPU
=
False
if
any
([
isinstance
(
i
,
GpuArrayVariable
)
or
isinstance
(
i
,
GpuArraySharedVariable
)
for
i
in
[
mapping
[
v
]
for
v
in
node
.
inputs
]
+
node
.
outputs
]):
# Oplifter's condition
move_to_GPU
=
True
# Will return a list of OP
# If None, means can't be moved.
new_ops
=
None
new_ops
=
None
# Apply the lifter
# Selecting the best optimizer
# TODO : the tag should be updated to the one user provides
# currently using fast_run and fast_compile tag
for
lopt
in
(
self
.
local_optimizers_all
+
for
lopt
in
(
self
.
local_optimizers_all
+
self
.
local_optimizers_map
.
get
(
type
(
node
.
op
),
[])
+
self
.
local_optimizers_map
.
get
(
type
(
node
.
op
),
[])
+
self
.
local_optimizers_map
.
get
(
node
.
op
,
[])):
self
.
local_optimizers_map
.
get
(
node
.
op
,
[])):
replace
=
False
new_ops
=
lopt
.
transform
(
node
)
or
lopt
(
node
)
for
i
in
[
mapping
[
i
]
for
i
in
node
.
inputs
]:
break
if
isinstance
(
i
.
type
,
GpuArrayType
):
context_name
=
i
.
type
.
context_name
if
not
new_ops
or
not
isinstance
(
new_ops
,
theano
.
Op
):
replace
=
True
move_to_GPU
=
False
break
if
replace
:
if
move_to_GPU
:
try
:
newnode
=
new_ops
(
*
[
mapping
.
get
(
i
)
for
i
in
node
.
inputs
])
new_ops
=
lopt
.
transform
(
for
new_o
,
old_o
in
zip
(
newnode
.
outputs
,
node
.
outputs
):
node
,
context_name
,
mapping
[
old_o
]
=
new_o
[
mapping
[
i
]
for
i
in
node
.
inputs
])
except
TypeError
:
new_ops
=
lopt
.
transform
(
node
,
context_name
)
if
new_ops
:
break
if
not
new_ops
:
newnode
=
node
.
clone_with_new_inputs
([
mapping
.
get
(
i
)
for
i
in
node
.
inputs
])
outputs
=
newnode
.
outputs
elif
isinstance
(
new_ops
,
(
tuple
,
list
)):
outputs
=
[]
for
o
in
new_ops
:
if
o
.
owner
and
isinstance
(
o
.
owner
.
op
,
HostFromGpu
):
outputs
.
append
(
o
.
owner
.
inputs
[
0
])
else
:
outputs
.
append
(
o
)
elif
isinstance
(
new_ops
,
theano
.
Variable
):
if
new_ops
.
owner
and
isinstance
(
new_ops
.
owner
.
op
,
HostFromGpu
):
outputs
=
new_ops
.
owner
.
inputs
else
:
outputs
=
[
new_ops
]
else
:
else
:
for
o
in
node
.
outputs
:
outputs
=
new_ops
(
*
[
mapping
[
i
]
for
i
in
node
.
inputs
],
mapping
[
o
]
=
o
return_list
=
True
)
for
new_o
,
old_o
in
zip
(
outputs
,
node
.
outputs
):
mapping
[
old_o
]
=
new_o
new_nodes
=
[]
new_nodes
=
[]
for
o
in
fgraph
.
outputs
:
for
o
in
fgraph
.
outputs
:
...
@@ -389,12 +421,14 @@ def local_gpuaalloc2(node):
...
@@ -389,12 +421,14 @@ def local_gpuaalloc2(node):
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
Alloc
])
@op_lifter
([
tensor
.
Alloc
])
@register_opt2
([
tensor
.
Alloc
],
'fast_compile'
)
def
local_gpuaalloc
(
node
,
context_name
):
def
local_gpuaalloc
(
node
,
context_name
):
return
GpuAlloc
(
context_name
)(
*
node
.
inputs
)
return
GpuAlloc
(
context_name
)(
*
node
.
inputs
)
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
AllocEmpty
])
@op_lifter
([
tensor
.
AllocEmpty
])
@register_opt2
([
tensor
.
AllocEmpty
],
'fast_compile'
)
def
local_gpuaallocempty
(
node
,
context_name
):
def
local_gpuaallocempty
(
node
,
context_name
):
# We use _props_dict() to make sure that the GPU op know all the
# We use _props_dict() to make sure that the GPU op know all the
# CPU op props.
# CPU op props.
...
@@ -444,12 +478,14 @@ def local_gpu_contiguous_gpu_contiguous(node):
...
@@ -444,12 +478,14 @@ def local_gpu_contiguous_gpu_contiguous(node):
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
extra_ops
.
CpuContiguous
])
@op_lifter
([
tensor
.
extra_ops
.
CpuContiguous
])
@register_opt2
([
tensor
.
extra_ops
.
CpuContiguous
],
'fast_compile'
)
def
local_gpu_contiguous
(
node
,
context_name
):
def
local_gpu_contiguous
(
node
,
context_name
):
return
gpu_contiguous
return
gpu_contiguous
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
Reshape
])
@op_lifter
([
tensor
.
Reshape
])
@register_opt2
([
tensor
.
Reshape
],
'fast_compile'
)
def
local_gpureshape
(
node
,
context_name
):
def
local_gpureshape
(
node
,
context_name
):
op
=
node
.
op
op
=
node
.
op
name
=
op
.
name
name
=
op
.
name
...
@@ -461,26 +497,29 @@ def local_gpureshape(node, context_name):
...
@@ -461,26 +497,29 @@ def local_gpureshape(node, context_name):
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
Rebroadcast
])
@op_lifter
([
tensor
.
Rebroadcast
])
def
local_gpu_rebroadcast
(
node
,
context_name
):
@register_opt2
([
tensor
.
Rebroadcast
],
'fast_compile'
)
return
node
.
op
(
as_gpuarray_variable
(
node
.
inputs
[
0
],
context_name
))
def
local_gpu_rebroadcast
(
node
,
context_name
,
inputs
):
return
node
.
op
(
as_gpuarray_variable
(
inputs
[
0
],
context_name
))
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
Flatten
])
@op_lifter
([
tensor
.
Flatten
])
def
local_gpuflatten
(
node
,
context_name
):
@register_opt2
([
tensor
.
Flatten
],
'fast_compile'
)
def
local_gpuflatten
(
node
,
context_name
,
inputs
):
op
=
node
.
op
op
=
node
.
op
shp
=
[]
shp
=
[]
if
op
.
outdim
!=
1
:
if
op
.
outdim
!=
1
:
shp
=
[
node
.
inputs
[
0
]
.
shape
[
i
]
for
i
in
range
(
op
.
outdim
-
1
)]
shp
=
[
inputs
[
0
]
.
shape
[
i
]
for
i
in
range
(
op
.
outdim
-
1
)]
shp
+=
[
-
1
]
shp
+=
[
-
1
]
res
=
GpuReshape
(
op
.
outdim
,
None
)
res
=
GpuReshape
(
op
.
outdim
,
None
)
o
=
res
(
node
.
inputs
[
0
],
theano
.
tensor
.
as_tensor_variable
(
shp
))
o
=
res
(
inputs
[
0
],
theano
.
tensor
.
as_tensor_variable
(
shp
))
return
o
return
o
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
Elemwise
])
@op_lifter
([
tensor
.
Elemwise
])
def
local_gpu_elemwise
(
node
,
context_name
):
@register_opt2
([
tensor
.
Elemwise
],
'fast_compile'
)
def
local_gpu_elemwise
(
node
,
context_name
,
inputs
):
op
=
node
.
op
op
=
node
.
op
scal_op
=
op
.
scalar_op
scal_op
=
op
.
scalar_op
name
=
op
.
name
name
=
op
.
name
...
@@ -505,7 +544,7 @@ def local_gpu_elemwise(node, context_name):
...
@@ -505,7 +544,7 @@ def local_gpu_elemwise(node, context_name):
# Transfer the inputs on the GPU and cast them to the right dtype.
# Transfer the inputs on the GPU and cast them to the right dtype.
new_inputs
=
[]
new_inputs
=
[]
for
inp
in
node
.
inputs
:
for
inp
in
inputs
:
if
inp
.
dtype
!=
out_dtype
:
if
inp
.
dtype
!=
out_dtype
:
gpu_cast_op
=
GpuElemwise
(
Cast
(
Scalar
(
out_dtype
)))
gpu_cast_op
=
GpuElemwise
(
Cast
(
Scalar
(
out_dtype
)))
new_inputs
.
append
(
gpu_cast_op
(
as_gpuarray_variable
(
inp
,
context_name
)))
new_inputs
.
append
(
gpu_cast_op
(
as_gpuarray_variable
(
inp
,
context_name
)))
...
@@ -553,6 +592,7 @@ optdb.register('gpua_inplace_opt', inplace_gpu_elemwise_opt, 75,
...
@@ -553,6 +592,7 @@ optdb.register('gpua_inplace_opt', inplace_gpu_elemwise_opt, 75,
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
DimShuffle
])
@op_lifter
([
tensor
.
DimShuffle
])
@register_opt2
([
tensor
.
DimShuffle
],
'fast_compile'
)
def
local_gpua_dimshuffle
(
node
,
context_name
):
def
local_gpua_dimshuffle
(
node
,
context_name
):
return
GpuDimShuffle
(
node
.
op
.
input_broadcastable
,
return
GpuDimShuffle
(
node
.
op
.
input_broadcastable
,
node
.
op
.
new_order
)
node
.
op
.
new_order
)
...
@@ -560,22 +600,24 @@ def local_gpua_dimshuffle(node, context_name):
...
@@ -560,22 +600,24 @@ def local_gpua_dimshuffle(node, context_name):
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
SpecifyShape
])
@op_lifter
([
tensor
.
SpecifyShape
])
def
local_gpua_specifyShape
(
node
,
context_name
):
@register_opt2
([
tensor
.
SpecifyShape
],
'fast_compile'
)
if
isinstance
(
node
.
inputs
[
0
]
.
type
,
GpuArrayType
):
def
local_gpua_specifyShape
(
node
,
context_name
,
inputs
):
if
isinstance
(
inputs
[
0
]
.
type
,
GpuArrayType
):
return
return
inp
=
[
as_gpuarray_variable
(
node
.
inputs
[
0
],
context_name
)]
inp
=
[
as_gpuarray_variable
(
inputs
[
0
],
context_name
)]
inp
+=
node
.
inputs
[
1
:]
inp
+=
inputs
[
1
:]
return
tensor
.
specify_shape
(
*
inp
)
return
tensor
.
specify_shape
(
*
inp
)
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
theano
.
compile
.
ops
.
Shape
])
@op_lifter
([
theano
.
compile
.
ops
.
Shape
])
def
local_gpua_shape
(
node
,
context_name
):
@register_opt2
([
tensor
.
compile
.
ops
.
Shape
],
'fast_compile'
)
def
local_gpua_shape
(
node
,
context_name
,
inputs
):
# op_lifter will call this opt too frequently as the output is
# op_lifter will call this opt too frequently as the output is
# always on the CPU.
# always on the CPU.
if
isinstance
(
node
.
inputs
[
0
]
.
type
,
GpuArrayType
):
if
isinstance
(
inputs
[
0
]
.
type
,
GpuArrayType
):
return
return
return
[
as_gpuarray_variable
(
node
.
inputs
[
0
],
context_name
)
.
shape
]
return
[
as_gpuarray_variable
(
inputs
[
0
],
context_name
)
.
shape
]
def
gpu_print_wrapper
(
op
,
cnda
):
def
gpu_print_wrapper
(
op
,
cnda
):
...
@@ -584,8 +626,9 @@ def gpu_print_wrapper(op, cnda):
...
@@ -584,8 +626,9 @@ def gpu_print_wrapper(op, cnda):
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
printing
.
Print
])
@op_lifter
([
tensor
.
printing
.
Print
])
def
local_gpu_print_op
(
node
,
context_name
):
@register_opt2
([
tensor
.
printing
.
Print
],
'fast_compile'
)
x
,
=
node
.
inputs
def
local_gpu_print_op
(
node
,
context_name
,
inputs
):
x
,
=
inputs
gpu_x
=
as_gpuarray_variable
(
x
,
context_name
=
context_name
)
gpu_x
=
as_gpuarray_variable
(
x
,
context_name
=
context_name
)
new_op
=
node
.
op
.
__class__
(
global_fn
=
gpu_print_wrapper
)
new_op
=
node
.
op
.
__class__
(
global_fn
=
gpu_print_wrapper
)
new_op
.
old_op
=
node
.
op
new_op
.
old_op
=
node
.
op
...
@@ -662,12 +705,13 @@ def local_gpu_pdbbreakpoint_op(node):
...
@@ -662,12 +705,13 @@ def local_gpu_pdbbreakpoint_op(node):
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
IfElse
])
@op_lifter
([
IfElse
])
def
local_gpua_lazy_ifelse
(
node
,
context_name
):
@register_opt2
([
IfElse
],
'fast_compile'
)
def
local_gpua_lazy_ifelse
(
node
,
context_name
,
inputs
):
if
node
.
op
.
gpu
:
if
node
.
op
.
gpu
:
return
return
c
=
node
.
inputs
[
0
]
c
=
inputs
[
0
]
inps
=
[]
inps
=
[]
for
v
in
node
.
inputs
[
1
:]:
for
v
in
inputs
[
1
:]:
if
isinstance
(
v
.
type
,
(
tensor
.
TensorType
,
GpuArrayType
)):
if
isinstance
(
v
.
type
,
(
tensor
.
TensorType
,
GpuArrayType
)):
inps
.
append
(
as_gpuarray_variable
(
v
,
context_name
))
inps
.
append
(
as_gpuarray_variable
(
v
,
context_name
))
else
:
else
:
...
@@ -677,6 +721,7 @@ def local_gpua_lazy_ifelse(node, context_name):
...
@@ -677,6 +721,7 @@ def local_gpua_lazy_ifelse(node, context_name):
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
Join
])
@op_lifter
([
tensor
.
Join
])
@register_opt2
([
tensor
.
Join
],
'fast_compile'
)
def
local_gpua_join
(
node
,
context_name
):
def
local_gpua_join
(
node
,
context_name
):
return
gpu_join
return
gpu_join
...
@@ -692,12 +737,15 @@ def local_gpuajoin_1(node):
...
@@ -692,12 +737,15 @@ def local_gpuajoin_1(node):
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
Split
])
@op_lifter
([
tensor
.
Split
])
@register_opt2
([
tensor
.
Split
],
'fast_compile'
)
def
local_gpua_split
(
node
,
context_name
):
def
local_gpua_split
(
node
,
context_name
):
#TODO use props
return
GpuSplit
(
node
.
op
.
len_splits
)
return
GpuSplit
(
node
.
op
.
len_splits
)
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
Subtensor
])
@op_lifter
([
tensor
.
Subtensor
])
@register_opt2
([
tensor
.
Subtensor
],
'fast_compile'
)
def
local_gpua_subtensor
(
node
,
context_name
):
def
local_gpua_subtensor
(
node
,
context_name
):
x
=
node
.
inputs
[
0
]
x
=
node
.
inputs
[
0
]
if
(
x
.
owner
and
isinstance
(
x
.
owner
.
op
,
HostFromGpu
)):
if
(
x
.
owner
and
isinstance
(
x
.
owner
.
op
,
HostFromGpu
)):
...
@@ -719,11 +767,12 @@ def local_gpua_subtensor(node, context_name):
...
@@ -719,11 +767,12 @@ def local_gpua_subtensor(node, context_name):
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
IncSubtensor
])
@op_lifter
([
tensor
.
IncSubtensor
])
def
local_gpua_incsubtensor
(
node
,
context_name
):
@register_opt2
([
tensor
.
IncSubtensor
],
'fast_compile'
)
def
local_gpua_incsubtensor
(
node
,
context_name
,
inputs
):
op
=
GpuIncSubtensor
(
node
.
op
.
idx_list
,
node
.
op
.
inplace
,
op
=
GpuIncSubtensor
(
node
.
op
.
idx_list
,
node
.
op
.
inplace
,
node
.
op
.
set_instead_of_inc
,
node
.
op
.
set_instead_of_inc
,
node
.
op
.
destroyhandler_tolerate_aliased
)
node
.
op
.
destroyhandler_tolerate_aliased
)
ret
=
op
(
*
node
.
inputs
)
ret
=
op
(
*
inputs
)
val
=
getattr
(
node
.
outputs
[
0
]
.
tag
,
'nan_guard_mode_check'
,
True
)
val
=
getattr
(
node
.
outputs
[
0
]
.
tag
,
'nan_guard_mode_check'
,
True
)
ret
.
tag
.
nan_guard_mode_check
=
val
ret
.
tag
.
nan_guard_mode_check
=
val
return
ret
return
ret
...
@@ -731,12 +780,14 @@ def local_gpua_incsubtensor(node, context_name):
...
@@ -731,12 +780,14 @@ def local_gpua_incsubtensor(node, context_name):
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
AdvancedSubtensor1
])
@op_lifter
([
tensor
.
AdvancedSubtensor1
])
@register_opt2
([
tensor
.
AdvancedSubtensor1
],
'fast_compile'
)
def
local_gpua_advanced_subtensor
(
node
,
context_name
):
def
local_gpua_advanced_subtensor
(
node
,
context_name
):
return
GpuAdvancedSubtensor1
()
return
GpuAdvancedSubtensor1
()
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
AdvancedIncSubtensor1
])
@op_lifter
([
tensor
.
AdvancedIncSubtensor1
])
@register_opt2
([
tensor
.
AdvancedIncSubtensor1
],
'fast_compile'
)
def
local_gpua_advanced_incsubtensor
(
node
,
context_name
):
def
local_gpua_advanced_incsubtensor
(
node
,
context_name
):
context
=
get_context
(
context_name
)
context
=
get_context
(
context_name
)
# This is disabled on non-cuda contexts
# This is disabled on non-cuda contexts
...
@@ -776,6 +827,7 @@ def local_advincsub1_gpua_inplace(node):
...
@@ -776,6 +827,7 @@ def local_advincsub1_gpua_inplace(node):
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
CAReduce
,
tensor
.
Sum
,
tensor
.
elemwise
.
Prod
])
@op_lifter
([
tensor
.
CAReduce
,
tensor
.
Sum
,
tensor
.
elemwise
.
Prod
])
@register_opt2
([
tensor
.
CAReduce
,
tensor
.
Sum
,
tensor
.
elemwise
.
Prod
],
'fast_compile'
)
def
local_gpua_careduce
(
node
,
context_name
):
def
local_gpua_careduce
(
node
,
context_name
):
if
isinstance
(
node
.
op
.
scalar_op
,
(
scalar
.
Add
,
scalar
.
Mul
,
if
isinstance
(
node
.
op
.
scalar_op
,
(
scalar
.
Add
,
scalar
.
Mul
,
scalar
.
Maximum
,
scalar
.
Minimum
)):
scalar
.
Maximum
,
scalar
.
Minimum
)):
...
@@ -859,6 +911,7 @@ def local_gpua_careduce(node, context_name):
...
@@ -859,6 +911,7 @@ def local_gpua_careduce(node, context_name):
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
blas
.
Gemv
,
tensor
.
blas_c
.
CGemv
])
@op_lifter
([
tensor
.
blas
.
Gemv
,
tensor
.
blas_c
.
CGemv
])
@register_opt2
([
tensor
.
blas
.
Gemv
],
'fast_compile'
)
def
local_gpua_gemv
(
node
,
context_name
):
def
local_gpua_gemv
(
node
,
context_name
):
if
node
.
op
.
inplace
:
if
node
.
op
.
inplace
:
return
gpugemv_inplace
return
gpugemv_inplace
...
@@ -868,6 +921,7 @@ def local_gpua_gemv(node, context_name):
...
@@ -868,6 +921,7 @@ def local_gpua_gemv(node, context_name):
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
blas
.
Gemm
])
@op_lifter
([
tensor
.
blas
.
Gemm
])
@register_opt2
([
tensor
.
blas
.
Gemm
],
'fast_compile'
)
def
local_gpua_gemm
(
node
,
context_name
):
def
local_gpua_gemm
(
node
,
context_name
):
if
node
.
op
.
inplace
:
if
node
.
op
.
inplace
:
return
gpugemm_inplace
return
gpugemm_inplace
...
@@ -877,26 +931,28 @@ def local_gpua_gemm(node, context_name):
...
@@ -877,26 +931,28 @@ def local_gpua_gemm(node, context_name):
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
blas
.
BatchedDot
])
@op_lifter
([
tensor
.
blas
.
BatchedDot
])
def
local_gpua_gemmbatch
(
node
,
context_name
):
@register_opt2
([
tensor
.
blas
.
BatchedDot
],
'fast_compile'
)
a
,
b
=
node
.
inputs
def
local_gpua_gemmbatch
(
node
,
context_name
,
inputs
):
a
,
b
=
inputs
c
=
tensor
.
AllocEmpty
(
a
.
dtype
)(
a
.
shape
[
0
],
a
.
shape
[
1
],
b
.
shape
[
2
])
c
=
tensor
.
AllocEmpty
(
a
.
dtype
)(
a
.
shape
[
0
],
a
.
shape
[
1
],
b
.
shape
[
2
])
return
gpugemmbatch_no_inplace
(
c
,
1.0
,
a
,
b
,
0.0
)
return
gpugemmbatch_no_inplace
(
c
,
1.0
,
a
,
b
,
0.0
)
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
basic
.
Dot
])
@op_lifter
([
tensor
.
basic
.
Dot
])
def
local_gpua_hgemm
(
node
,
context_name
):
@register_opt2
([
tensor
.
basic
.
Dot
],
'fast_compile'
)
def
local_gpua_hgemm
(
node
,
context_name
,
inputs
):
from
theano.sandbox.cuda
import
nvcc_compiler
from
theano.sandbox.cuda
import
nvcc_compiler
if
nvcc_compiler
.
nvcc_version
<
'7.5'
:
if
nvcc_compiler
.
nvcc_version
<
'7.5'
:
_logger
.
warning
(
"Not performing dot of float16 on the GPU since "
_logger
.
warning
(
"Not performing dot of float16 on the GPU since "
"cuda 7.5 is not available. Updating could speed up "
"cuda 7.5 is not available. Updating could speed up "
"your code."
)
"your code."
)
return
return
A
=
node
.
inputs
[
0
]
A
=
inputs
[
0
]
B
=
node
.
inputs
[
1
]
B
=
inputs
[
1
]
if
(
A
.
ndim
==
2
and
B
.
ndim
==
2
and
if
(
A
.
ndim
==
2
and
B
.
ndim
==
2
and
A
.
dtype
==
'float16'
and
B
.
dtype
==
'float16'
):
A
.
dtype
==
'float16'
and
B
.
dtype
==
'float16'
):
fgraph
=
node
.
inputs
[
0
]
.
fgraph
fgraph
=
inputs
[
0
]
.
fgraph
C
=
GpuAllocEmpty
(
dtype
=
'float16'
,
context_name
=
context_name
)(
C
=
GpuAllocEmpty
(
dtype
=
'float16'
,
context_name
=
context_name
)(
shape_i
(
A
,
0
,
fgraph
),
shape_i
(
A
,
0
,
fgraph
),
shape_i
(
B
,
1
,
fgraph
))
shape_i
(
B
,
1
,
fgraph
))
...
@@ -941,8 +997,9 @@ def local_gpua_dot22(node, context_name):
...
@@ -941,8 +997,9 @@ def local_gpua_dot22(node, context_name):
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
blas
.
Dot22Scalar
])
@op_lifter
([
tensor
.
blas
.
Dot22Scalar
])
def
local_gpua_dot22scalar
(
node
,
context_name
):
@register_opt2
([
tensor
.
blas
.
Dot22Scalar
],
'fast_compile'
)
x
,
y
,
a
=
node
.
inputs
def
local_gpua_dot22scalar
(
node
,
context_name
,
inputs
):
x
,
y
,
a
=
inputs
x
=
as_gpuarray_variable
(
x
,
context_name
)
x
=
as_gpuarray_variable
(
x
,
context_name
)
y
=
as_gpuarray_variable
(
y
,
context_name
)
y
=
as_gpuarray_variable
(
y
,
context_name
)
z
=
GpuAllocEmpty
(
x
.
dtype
,
context_name
)(
x
.
shape
[
0
],
y
.
shape
[
1
])
z
=
GpuAllocEmpty
(
x
.
dtype
,
context_name
)(
x
.
shape
[
0
],
y
.
shape
[
1
])
...
@@ -951,30 +1008,35 @@ def local_gpua_dot22scalar(node, context_name):
...
@@ -951,30 +1008,35 @@ def local_gpua_dot22scalar(node, context_name):
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
basic
.
Eye
])
@op_lifter
([
tensor
.
basic
.
Eye
])
@register_opt2
([
tensor
.
basic
.
Eye
],
'fast_compile'
)
def
local_gpua_eye
(
node
,
context_name
):
def
local_gpua_eye
(
node
,
context_name
):
return
GpuEye
(
dtype
=
node
.
op
.
dtype
,
context_name
=
context_name
)
return
GpuEye
(
dtype
=
node
.
op
.
dtype
,
context_name
=
context_name
)
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
nnet
.
CrossentropySoftmaxArgmax1HotWithBias
],
cuda_only
=
True
)
@op_lifter
([
tensor
.
nnet
.
CrossentropySoftmaxArgmax1HotWithBias
],
cuda_only
=
True
)
@register_opt2
([
tensor
.
nnet
.
CrossentropySoftmaxArgmax1HotWithBias
],
'fast_compile'
)
def
local_gpua_crossentropysoftmaxargmax1hotwithbias
(
node
,
context_name
):
def
local_gpua_crossentropysoftmaxargmax1hotwithbias
(
node
,
context_name
):
return
gpu_crossentropy_softmax_argmax_1hot_with_bias
return
gpu_crossentropy_softmax_argmax_1hot_with_bias
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
nnet
.
CrossentropySoftmax1HotWithBiasDx
],
cuda_only
=
True
)
@op_lifter
([
tensor
.
nnet
.
CrossentropySoftmax1HotWithBiasDx
],
cuda_only
=
True
)
@register_opt2
([
tensor
.
nnet
.
CrossentropySoftmax1HotWithBiasDx
],
'fast_compile'
)
def
local_gpua_crossentropysoftmax1hotwithbiasdx
(
node
,
context_name
):
def
local_gpua_crossentropysoftmax1hotwithbiasdx
(
node
,
context_name
):
return
gpu_crossentropy_softmax_1hot_with_bias_dx
return
gpu_crossentropy_softmax_1hot_with_bias_dx
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
nnet
.
Softmax
],
cuda_only
=
True
)
@op_lifter
([
tensor
.
nnet
.
Softmax
],
cuda_only
=
True
)
@register_opt2
([
tensor
.
nnet
.
Softmax
],
'fast_compile'
)
def
local_gpua_softmax
(
node
,
context_name
):
def
local_gpua_softmax
(
node
,
context_name
):
return
gpu_softmax
return
gpu_softmax
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
nnet
.
SoftmaxWithBias
],
cuda_only
=
True
)
@op_lifter
([
tensor
.
nnet
.
SoftmaxWithBias
],
cuda_only
=
True
)
@register_opt2
([
tensor
.
nnet
.
SoftmaxWithBias
],
'fast_compile'
)
def
local_gpua_softmaxwithbias
(
node
,
context_name
):
def
local_gpua_softmaxwithbias
(
node
,
context_name
):
return
gpu_softmax_with_bias
return
gpu_softmax_with_bias
...
@@ -987,8 +1049,6 @@ def local_assert(node, context_name):
...
@@ -987,8 +1049,6 @@ def local_assert(node, context_name):
if
isinstance
(
node
.
inputs
[
0
]
.
type
,
GpuArrayType
):
if
isinstance
(
node
.
inputs
[
0
]
.
type
,
GpuArrayType
):
return
return
return
[
host_from_gpu
(
node
.
op
(
as_gpuarray_variable
(
node
.
inputs
[
0
],
return
[
host_from_gpu
(
node
.
op
(
as_gpuarray_variable
(
node
.
inputs
[
0
],
context_name
),
*
node
.
inputs
[
1
:]))]
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
...
@@ -1004,6 +1064,7 @@ theano.tensor.nnet.conv2d()
...
@@ -1004,6 +1064,7 @@ theano.tensor.nnet.conv2d()
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
SparseBlockGemv
])
@op_lifter
([
SparseBlockGemv
])
@register_opt2
([
SparseBlockGemv
],
'fast_compile'
)
def
local_lift_sparseblockgemv
(
node
,
context_name
):
def
local_lift_sparseblockgemv
(
node
,
context_name
):
if
node
.
op
.
inplace
:
if
node
.
op
.
inplace
:
return
gpu_sparse_block_gemv_inplace
return
gpu_sparse_block_gemv_inplace
...
@@ -1013,6 +1074,7 @@ def local_lift_sparseblockgemv(node, context_name):
...
@@ -1013,6 +1074,7 @@ def local_lift_sparseblockgemv(node, context_name):
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
SparseBlockOuter
])
@op_lifter
([
SparseBlockOuter
])
@register_opt2
([
SparseBlockOuter
],
'fast_compile'
)
def
local_lift_sparseblockouter
(
node
,
context_name
):
def
local_lift_sparseblockouter
(
node
,
context_name
):
if
node
.
op
.
inplace
:
if
node
.
op
.
inplace
:
return
gpu_sparse_block_outer_inplace
return
gpu_sparse_block_outer_inplace
...
@@ -1039,14 +1101,17 @@ def local_inplace_sparseblockouter(node):
...
@@ -1039,14 +1101,17 @@ def local_inplace_sparseblockouter(node):
@op_lifter
([
AbstractConv2d
,
@op_lifter
([
AbstractConv2d
,
AbstractConv2d_gradWeights
,
AbstractConv2d_gradWeights
,
AbstractConv2d_gradInputs
])
AbstractConv2d_gradInputs
])
def
local_lift_abstractconv2d
(
node
,
context_name
):
@register_opt2
([
AbstractConv2d
,
AbstractConv2d_gradWeights
,
AbstractConv2d_gradInputs
],
'fast_compile'
)
def
local_lift_abstractconv2d
(
node
,
context_name
,
inputs
):
if
isinstance
(
node
.
outputs
[
0
]
.
type
,
GpuArrayType
):
if
isinstance
(
node
.
outputs
[
0
]
.
type
,
GpuArrayType
):
# Don't handle this node here, it's already on the GPU.
# Don't handle this node here, it's already on the GPU.
return
return
inps
=
list
(
node
.
inputs
)
inps
=
list
(
inputs
)
inps
[
0
]
=
as_gpuarray_variable
(
node
.
inputs
[
0
],
inps
[
0
]
=
as_gpuarray_variable
(
inputs
[
0
],
context_name
=
context_name
)
context_name
=
context_name
)
inps
[
1
]
=
as_gpuarray_variable
(
node
.
inputs
[
1
],
inps
[
1
]
=
as_gpuarray_variable
(
inputs
[
1
],
context_name
=
context_name
)
context_name
=
context_name
)
return
[
node
.
op
(
*
inps
)]
return
[
node
.
op
(
*
inps
)]
...
@@ -1155,6 +1220,7 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None):
...
@@ -1155,6 +1220,7 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None):
@register_opt
(
'scan'
,
'fast_compile'
)
@register_opt
(
'scan'
,
'fast_compile'
)
@op_lifter
([
scan_op
.
Scan
])
@op_lifter
([
scan_op
.
Scan
])
#@register_opt2([scan_op.Scan], 'fast_compile')
def
local_scan_to_gpua
(
node
,
context_name
):
def
local_scan_to_gpua
(
node
,
context_name
):
info
=
copy
.
deepcopy
(
node
.
op
.
info
)
info
=
copy
.
deepcopy
(
node
.
op
.
info
)
if
info
.
get
(
'gpua'
,
False
):
if
info
.
get
(
'gpua'
,
False
):
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论