Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
c887bc14
提交
c887bc14
authored
9月 16, 2015
作者:
Arnaud Bergeron
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Fix most of opt.py for type context.
Only scan is left.
上级
29615c83
显示空白字符变更
内嵌
并排
正在显示
1 个修改的文件
包含
171 行增加
和
118 行删除
+171
-118
opt.py
theano/sandbox/gpuarray/opt.py
+171
-118
没有找到文件。
theano/sandbox/gpuarray/opt.py
浏览文件 @
c887bc14
...
@@ -3,11 +3,6 @@ import numpy
...
@@ -3,11 +3,6 @@ import numpy
import
logging
import
logging
from
six.moves
import
xrange
from
six.moves
import
xrange
try
:
import
pygpu
except
ImportError
:
pass
import
theano
import
theano
from
theano
import
tensor
,
scalar
,
gof
from
theano
import
tensor
,
scalar
,
gof
from
theano.compile
import
optdb
from
theano.compile
import
optdb
...
@@ -22,12 +17,12 @@ from theano.scan_module import scan_utils, scan_op, scan_opt
...
@@ -22,12 +17,12 @@ from theano.scan_module import scan_utils, scan_op, scan_opt
from
theano.tensor.nnet.conv
import
ConvOp
from
theano.tensor.nnet.conv
import
ConvOp
from
theano.tests.breakpoint
import
PdbBreakpoint
from
theano.tests.breakpoint
import
PdbBreakpoint
from
.type
import
GpuArrayType
,
GpuArrayConstant
from
.type
import
GpuArrayType
,
GpuArrayConstant
,
get_context
from
.basic_ops
import
(
as_gpuarray_variable
,
from
.basic_ops
import
(
as_gpuarray_variable
,
host_from_gpu
,
gpu_from_host
,
host_from_gpu
,
GpuToGpu
,
HostFromGpu
,
GpuFromHost
,
HostFromGpu
,
GpuFromHost
,
GpuSplit
,
GpuContiguous
,
GpuSplit
,
GpuContiguous
,
gpu_alloc
,
GpuAlloc
,
GpuAllocEmpty
,
GpuReshape
,
GpuAlloc
,
GpuAllocEmpty
,
GpuReshape
,
GpuEye
,
gpu_join
,
GpuJoin
)
GpuEye
,
gpu_join
,
GpuJoin
)
from
.blas
import
(
gpu_dot22
,
GpuGemv
,
GpuGemm
,
GpuGer
,
from
.blas
import
(
gpu_dot22
,
GpuGemv
,
GpuGemm
,
GpuGer
,
gpugemm_no_inplace
)
gpugemm_no_inplace
)
...
@@ -79,9 +74,9 @@ gpu_optimizer.register('local_remove_all_assert',
...
@@ -79,9 +74,9 @@ gpu_optimizer.register('local_remove_all_assert',
'unsafe'
)
'unsafe'
)
def
safe_to_gpu
(
x
):
def
safe_to_gpu
(
x
,
ctx_name
):
if
isinstance
(
x
.
type
,
tensor
.
TensorType
):
if
isinstance
(
x
.
type
,
tensor
.
TensorType
):
return
gpu_from_host
(
x
)
return
GpuFromHost
(
ctx_name
)
(
x
)
else
:
else
:
return
x
return
x
...
@@ -102,24 +97,49 @@ def op_lifter(OP, cuda_only=False):
...
@@ -102,24 +97,49 @@ def op_lifter(OP, cuda_only=False):
"""
"""
def
f
(
maker
):
def
f
(
maker
):
def
local_opt
(
node
):
def
local_opt
(
node
):
dev
=
theano
.
sandbox
.
gpuarray
.
init_dev
.
device
if
cuda_only
and
not
dev
.
startswith
(
'cuda'
):
return
if
type
(
node
.
op
)
in
OP
:
if
type
(
node
.
op
)
in
OP
:
# Either one of our inputs is on the gpu or
# Either one of our inputs is on the gpu or
# all of our client are on the gpu
# all of our clients are on the gpu
if
(
any
([
i
.
owner
and
i
.
owner
.
op
==
host_from_gpu
replace
=
False
for
i
in
node
.
inputs
])
or
# TODO: Maybe set context_name with infer_context_name()?
all
([
c
!=
'output'
and
c
.
op
==
gpu_from_host
context_name
=
None
for
c
,
idx
in
node
.
outputs
[
0
]
.
clients
])):
# We replace if any input is a host_from_gpu
new_op
=
maker
(
node
)
for
i
in
node
.
inputs
:
# This is needed as sometimes new_op inherit from OP.
if
i
.
owner
and
i
.
owner
.
op
==
host_from_gpu
:
context_name
=
i
.
owner
.
inputs
[
0
]
.
type
.
context_name
replace
=
True
break
if
not
replace
:
# We replace if *all* clients are on the GPU
clients
=
[
c
for
o
in
node
.
outputs
for
c
in
o
.
clients
]
replace
=
len
(
clients
)
!=
0
for
c
,
idx
in
clients
:
if
(
c
==
'output'
or
not
isinstance
(
c
.
op
,
GpuFromHost
)):
replace
=
False
# TODO: check that the clients want the same context?
if
replace
:
# All clients are GpuFromHost and we have at least one
context_name
=
clients
[
0
][
0
]
.
op
.
context_name
# Check if we should replace
if
(
not
replace
or
(
cuda_only
and
get_context
(
context_name
)
.
kind
!=
'cuda'
)):
return
False
new_op
=
maker
(
node
,
context_name
)
# This is needed as sometimes new_op inherits from OP.
if
new_op
and
new_op
!=
node
.
op
:
if
new_op
and
new_op
!=
node
.
op
:
if
isinstance
(
new_op
,
theano
.
Op
):
if
isinstance
(
new_op
,
theano
.
Op
):
# tag the inputs with the context in case
# the context was derived from the outputs
def
tag
(
i
,
ctx
):
i
.
tag
.
context_name
=
ctx
return
i
inputs
=
[
tag
(
i
,
context_name
)
for
i
in
node
.
inputs
]
return
[
safe_to_cpu
(
o
)
for
o
in
return
[
safe_to_cpu
(
o
)
for
o
in
new_op
(
*
node
.
inputs
,
return_list
=
True
)]
new_op
(
*
inputs
,
return_list
=
True
)]
elif
isinstance
(
new_op
,
(
tuple
,
list
)):
elif
isinstance
(
new_op
,
(
tuple
,
list
)):
return
[
safe_to_cpu
(
o
)
for
o
in
new_op
]
return
[
safe_to_cpu
(
o
)
for
o
in
new_op
]
else
:
# suppose it is a variable on the GPU
else
:
# suppose it is a variable on the GPU
...
@@ -146,35 +166,80 @@ class InputToGpuOptimizer(Optimizer):
...
@@ -146,35 +166,80 @@ class InputToGpuOptimizer(Optimizer):
if
(
len
(
input
.
clients
)
==
1
and
if
(
len
(
input
.
clients
)
==
1
and
(
input
.
clients
[
0
][
0
]
==
'output'
or
(
input
.
clients
[
0
][
0
]
==
'output'
or
i
nput
.
clients
[
0
][
0
]
.
op
==
gpu_from_host
)):
i
sinstance
(
input
.
clients
[
0
][
0
]
.
op
,
GpuFromHost
)
)):
continue
continue
try
:
try
:
new_input
=
host_from_gpu
(
gpu_from_host
(
input
))
ctx
=
getattr
(
input
.
tag
,
'context_name'
,
None
)
new_input
=
host_from_gpu
(
GpuFromHost
(
ctx
)(
input
))
fgraph
.
replace_validate
(
input
,
new_input
,
fgraph
.
replace_validate
(
input
,
new_input
,
"InputToGpuOptimizer"
)
"InputToGpuOptimizer"
)
except
TypeError
:
except
TypeError
:
# This could fail if the inputs are not TensorTypes
# This could fail if the inputs are not TensorTypes
pass
pass
except
ValueError
:
# If there is no context tag and no default context
# then it stays on the CPU
assert
ctx
is
None
pass
gpu_seqopt
.
register
(
'InputToGpuArrayOptimizer'
,
InputToGpuOptimizer
(),
gpu_seqopt
.
register
(
'InputToGpuArrayOptimizer'
,
InputToGpuOptimizer
(),
0
,
'fast_run'
,
'fast_compile'
,
'merge'
)
0
,
'fast_run'
,
'fast_compile'
,
'merge'
)
@local_optimizer
([
gpu_from_host
,
host_from_gpu
])
@local_optimizer
([
GpuFromHost
,
GpuToGpu
,
host_from_gpu
])
def
local_cut_gpu_host_gpu
(
node
):
def
local_cut_gpu_transfers
(
node
):
if
tensor
.
opt
.
opt
.
check_chain
(
node
,
gpu_from_host
,
host_from_gpu
):
# gpu[ab] -> host -> gpub
return
[
node
.
inputs
[
0
]
.
owner
.
inputs
[
0
]]
if
(
isinstance
(
node
.
op
,
GpuFromHost
)
and
if
tensor
.
opt
.
opt
.
check_chain
(
node
,
host_from_gpu
,
gpu_from_host
):
node
.
inputs
[
0
]
.
owner
and
return
[
node
.
inputs
[
0
]
.
owner
.
inputs
[
0
]]
node
.
inputs
[
0
]
.
owner
.
op
==
host_from_gpu
):
return
False
other
=
node
.
inputs
[
0
]
.
owner
.
inputs
[
0
]
gpu_cut_copies
.
register
(
'cut_gpua_host_transfers'
,
local_cut_gpu_host_gpu
,
if
node
.
op
.
context_name
==
other
.
type
.
context_name
:
return
[
other
]
else
:
return
[
GpuToGpu
(
node
.
op
.
context_name
)(
other
)]
# ? -> gpua -> host
elif
(
node
.
op
==
host_from_gpu
and
node
.
inputs
[
0
]
.
owner
):
n2
=
node
.
inputs
[
0
]
.
owner
# host ->
if
isinstance
(
n2
.
op
,
GpuFromHost
):
return
[
n2
.
inputs
[
0
]]
# gpub ->
if
isinstance
(
n2
.
op
,
GpuToGpu
):
return
[
host_from_gpu
(
n2
.
inputs
[
0
])]
# ? -> gpua -> gpub
elif
isinstance
(
node
.
op
,
GpuToGpu
):
# Transfer within same context
if
node
.
inputs
[
0
]
.
type
.
context_name
==
node
.
op
.
context_name
:
return
[
node
.
inputs
[
0
]]
if
node
.
inputs
[
0
]
.
owner
:
n2
=
node
.
inputs
[
0
]
.
owner
# host ->
if
isinstance
(
n2
.
op
,
GpuFromHost
):
return
[
GpuFromHost
(
node
.
op
.
context_name
)(
n2
.
inputs
[
0
])]
# gpuc ->
if
isinstance
(
n2
.
op
,
GpuToGpu
):
if
node
.
op
.
context_name
==
n2
.
inputs
[
0
]
.
type
.
context_name
:
return
[
n2
.
inputs
[
0
]]
else
:
return
[
node
.
op
(
n2
.
inputs
[
0
])]
gpu_cut_copies
.
register
(
'cut_gpua_host_transfers'
,
local_cut_gpu_transfers
,
'fast_compile'
,
'fast_run'
,
'inplace'
,
'gpuarray'
)
'fast_compile'
,
'fast_run'
,
'inplace'
,
'gpuarray'
)
gpu_cut_copies
.
register
(
'cut_gpua_constant_transfers'
,
gpu_cut_copies
.
register
(
'cut_gpua_constant_transfers'
,
tensor
.
opt
.
constant_folding
,
tensor
.
opt
.
constant_folding
,
'fast_compile'
,
'fast_run'
,
'gpuarray'
)
'fast_compile'
,
'fast_run'
,
'gpuarray'
)
optdb
[
'canonicalize'
]
.
register
(
'local_cut_gpua_host_gpua'
,
optdb
[
'canonicalize'
]
.
register
(
'local_cut_gpua_host_gpua'
,
local_cut_gpu_
host_gpu
,
local_cut_gpu_
transfers
,
'fast_compile'
,
'fast_run'
,
'gpuarray'
)
'fast_compile'
,
'fast_run'
,
'gpuarray'
)
...
@@ -187,6 +252,11 @@ def local_gpuaalloc2(node):
...
@@ -187,6 +252,11 @@ def local_gpuaalloc2(node):
Moves an alloc that is an input to join to the gpu.
Moves an alloc that is an input to join to the gpu.
"""
"""
try
:
get_context
(
None
)
except
ValueError
:
# If there is no default context then we do not perform the move here.
return
if
(
isinstance
(
node
.
op
,
tensor
.
Alloc
)
and
if
(
isinstance
(
node
.
op
,
tensor
.
Alloc
)
and
all
(
c
!=
'output'
and
all
(
c
!=
'output'
and
c
.
op
==
tensor
.
join
and
c
.
op
==
tensor
.
join
and
...
@@ -194,23 +264,13 @@ def local_gpuaalloc2(node):
...
@@ -194,23 +264,13 @@ def local_gpuaalloc2(node):
i
.
owner
.
op
in
[
host_from_gpu
,
tensor
.
alloc
]
i
.
owner
.
op
in
[
host_from_gpu
,
tensor
.
alloc
]
for
i
in
c
.
inputs
[
1
:])
for
i
in
c
.
inputs
[
1
:])
for
c
,
idx
in
node
.
outputs
[
0
]
.
clients
)):
for
c
,
idx
in
node
.
outputs
[
0
]
.
clients
)):
return
[
host_from_gpu
(
gpu_alloc
(
*
node
.
inputs
))]
return
[
host_from_gpu
(
GpuAlloc
(
None
)
(
*
node
.
inputs
))]
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
Alloc
])
@op_lifter
([
tensor
.
Alloc
])
def
local_gpuaalloc
(
node
):
def
local_gpuaalloc
(
node
,
context_name
):
new_out
=
gpu_alloc
(
*
node
.
inputs
)
return
GpuAlloc
(
context_name
)(
*
node
.
inputs
)
# We need to hide new broadcastable dimensions because
# ReplaceValidate doesn't like when they change.
if
new_out
.
broadcastable
!=
node
.
outputs
[
0
]
.
broadcastable
:
# but if a dim is suddenly not broadcastable anymore then that's a bug
for
b_old
,
b_new
in
zip
(
node
.
outputs
[
0
]
.
broadcastable
,
new_out
.
broadcastable
):
assert
b_new
or
(
not
b_old
)
new_out
=
tensor
.
patternbroadcast
(
new_out
,
node
.
outputs
[
0
]
.
broadcastable
)
return
(
new_out
,)
@register_opt
()
@register_opt
()
...
@@ -221,8 +281,8 @@ def local_gpualloc_memset_0(node):
...
@@ -221,8 +281,8 @@ def local_gpualloc_memset_0(node):
if
(
isinstance
(
inp
,
GpuArrayConstant
)
and
if
(
isinstance
(
inp
,
GpuArrayConstant
)
and
inp
.
data
.
size
==
1
and
inp
.
data
.
size
==
1
and
(
numpy
.
asarray
(
inp
.
data
)
==
0
)
.
all
()):
(
numpy
.
asarray
(
inp
.
data
)
==
0
)
.
all
()):
new_o
ut
=
GpuAlloc
(
memset_0
=
True
)(
*
node
.
inputs
)
new_o
p
=
GpuAlloc
(
node
.
op
.
context_name
,
memset_0
=
True
)
return
[
new_o
ut
]
return
[
new_o
p
(
*
node
.
inputs
)
]
@register_opt
()
@register_opt
()
...
@@ -240,7 +300,7 @@ def local_gpu_contiguous_gpu_contiguous(node):
...
@@ -240,7 +300,7 @@ def local_gpu_contiguous_gpu_contiguous(node):
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
Reshape
])
@op_lifter
([
tensor
.
Reshape
])
def
local_gpureshape
(
node
):
def
local_gpureshape
(
node
,
context_name
):
op
=
node
.
op
op
=
node
.
op
name
=
op
.
name
name
=
op
.
name
if
name
:
if
name
:
...
@@ -251,14 +311,14 @@ def local_gpureshape(node):
...
@@ -251,14 +311,14 @@ def local_gpureshape(node):
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
Rebroadcast
])
@op_lifter
([
tensor
.
Rebroadcast
])
def
local_gpu_rebroadcast
(
node
):
def
local_gpu_rebroadcast
(
node
,
context_name
):
if
isinstance
(
node
.
inputs
[
0
]
.
owner
.
op
,
HostFromGpu
):
if
isinstance
(
node
.
inputs
[
0
]
.
owner
.
op
,
HostFromGpu
):
return
node
.
op
(
node
.
inputs
[
0
]
.
owner
.
inputs
[
0
])
return
node
.
op
(
node
.
inputs
[
0
]
.
owner
.
inputs
[
0
])
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
Flatten
])
@op_lifter
([
tensor
.
Flatten
])
def
local_gpuflatten
(
node
):
def
local_gpuflatten
(
node
,
context_name
):
op
=
node
.
op
op
=
node
.
op
shp
=
[]
shp
=
[]
if
op
.
outdim
!=
1
:
if
op
.
outdim
!=
1
:
...
@@ -271,7 +331,7 @@ def local_gpuflatten(node):
...
@@ -271,7 +331,7 @@ def local_gpuflatten(node):
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
Elemwise
])
@op_lifter
([
tensor
.
Elemwise
])
def
local_gpu_elemwise
(
node
):
def
local_gpu_elemwise
(
node
,
context_name
):
op
=
node
.
op
op
=
node
.
op
scal_op
=
op
.
scalar_op
scal_op
=
op
.
scalar_op
name
=
op
.
name
name
=
op
.
name
...
@@ -344,28 +404,28 @@ optdb.register('gpua_inplace_opt', inplace_gpu_elemwise_opt, 75,
...
@@ -344,28 +404,28 @@ optdb.register('gpua_inplace_opt', inplace_gpu_elemwise_opt, 75,
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
DimShuffle
])
@op_lifter
([
tensor
.
DimShuffle
])
def
local_gpua_dimshuffle
(
node
):
def
local_gpua_dimshuffle
(
node
,
context_name
):
return
GpuDimShuffle
(
node
.
op
.
input_broadcastable
,
return
GpuDimShuffle
(
node
.
op
.
input_broadcastable
,
node
.
op
.
new_order
)
node
.
op
.
new_order
)
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
SpecifyShape
])
@op_lifter
([
tensor
.
SpecifyShape
])
def
local_gpua_specifyShape
(
node
):
def
local_gpua_specifyShape
(
node
,
context_name
):
if
isinstance
(
node
.
inputs
[
0
]
.
type
,
GpuArrayType
):
if
isinstance
(
node
.
inputs
[
0
]
.
type
,
GpuArrayType
):
return
return
inp
=
[
gpu_from_host
(
node
.
inputs
[
0
])]
+
node
.
inputs
[
1
:]
inp
=
[
GpuFromHost
(
context_name
)
(
node
.
inputs
[
0
])]
+
node
.
inputs
[
1
:]
return
tensor
.
specify_shape
(
*
inp
)
return
tensor
.
specify_shape
(
*
inp
)
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
theano
.
compile
.
ops
.
Shape
])
@op_lifter
([
theano
.
compile
.
ops
.
Shape
])
def
local_gpua_shape
(
node
):
def
local_gpua_shape
(
node
,
context_name
):
# op_lifter will call this opt too frequently as the output is
# op_lifter will call this opt too frequently as the output is
# always on the CPU.
# always on the CPU.
if
isinstance
(
node
.
inputs
[
0
]
.
type
,
GpuArrayType
):
if
isinstance
(
node
.
inputs
[
0
]
.
type
,
GpuArrayType
):
return
return
return
[
gpu_from_host
(
node
.
inputs
[
0
])
.
shape
]
return
[
GpuFromHost
(
context_name
)
(
node
.
inputs
[
0
])
.
shape
]
def
gpu_print_wrapper
(
op
,
cnda
):
def
gpu_print_wrapper
(
op
,
cnda
):
...
@@ -374,7 +434,7 @@ def gpu_print_wrapper(op, cnda):
...
@@ -374,7 +434,7 @@ def gpu_print_wrapper(op, cnda):
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
printing
.
Print
])
@op_lifter
([
tensor
.
printing
.
Print
])
def
local_gpu_print_op
(
node
):
def
local_gpu_print_op
(
node
,
context_name
):
x
,
=
node
.
inputs
x
,
=
node
.
inputs
gpu_x
,
=
x
.
owner
.
inputs
gpu_x
,
=
x
.
owner
.
inputs
new_op
=
node
.
op
.
__class__
(
global_fn
=
gpu_print_wrapper
)
new_op
=
node
.
op
.
__class__
(
global_fn
=
gpu_print_wrapper
)
...
@@ -404,9 +464,13 @@ def local_gpu_pdbbreakpoint_op(node):
...
@@ -404,9 +464,13 @@ def local_gpu_pdbbreakpoint_op(node):
input_is_from_gpu
=
(
inp
.
owner
and
input_is_from_gpu
=
(
inp
.
owner
and
isinstance
(
inp
.
owner
.
op
,
HostFromGpu
))
isinstance
(
inp
.
owner
.
op
,
HostFromGpu
))
output_goes_to_gpu
=
any
([
c
[
0
]
!=
"output"
and
for
c
in
out
.
clients
:
isinstance
(
c
[
0
]
.
op
,
GpuFromHost
)
if
c
==
'output'
:
for
c
in
out
.
clients
])
continue
if
isinstance
(
c
[
0
]
.
op
,
GpuFromHost
):
output_goes_to_gpu
=
True
context_name
=
c
[
0
]
.
op
.
context_name
break
if
input_is_from_gpu
:
if
input_is_from_gpu
:
# The op should be applied on the GPU version of the input
# The op should be applied on the GPU version of the input
...
@@ -415,7 +479,7 @@ def local_gpu_pdbbreakpoint_op(node):
...
@@ -415,7 +479,7 @@ def local_gpu_pdbbreakpoint_op(node):
elif
output_goes_to_gpu
:
elif
output_goes_to_gpu
:
# The input should be transfered to the gpu
# The input should be transfered to the gpu
new_inputs
.
append
(
gpu_from_host
(
inp
))
new_inputs
.
append
(
GpuFromHost
(
context_name
)
(
inp
))
input_transfered
.
append
(
True
)
input_transfered
.
append
(
True
)
else
:
else
:
...
@@ -447,7 +511,7 @@ def local_gpu_pdbbreakpoint_op(node):
...
@@ -447,7 +511,7 @@ def local_gpu_pdbbreakpoint_op(node):
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
Join
])
@op_lifter
([
tensor
.
Join
])
def
local_gpua_join
(
node
):
def
local_gpua_join
(
node
,
context_name
):
return
gpu_join
return
gpu_join
...
@@ -462,13 +526,13 @@ def local_gpuajoin_1(node):
...
@@ -462,13 +526,13 @@ def local_gpuajoin_1(node):
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
Split
])
@op_lifter
([
tensor
.
Split
])
def
local_gpua_split
(
node
):
def
local_gpua_split
(
node
,
context_name
):
return
GpuSplit
(
node
.
op
.
len_splits
)
return
GpuSplit
(
node
.
op
.
len_splits
)
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
Subtensor
])
@op_lifter
([
tensor
.
Subtensor
])
def
local_gpua_subtensor
(
node
):
def
local_gpua_subtensor
(
node
,
context_name
):
x
=
node
.
inputs
[
0
]
x
=
node
.
inputs
[
0
]
if
(
x
.
owner
and
isinstance
(
x
.
owner
.
op
,
HostFromGpu
)):
if
(
x
.
owner
and
isinstance
(
x
.
owner
.
op
,
HostFromGpu
)):
gpu_x
=
x
.
owner
.
inputs
[
0
]
gpu_x
=
x
.
owner
.
inputs
[
0
]
...
@@ -482,14 +546,14 @@ def local_gpua_subtensor(node):
...
@@ -482,14 +546,14 @@ def local_gpua_subtensor(node):
for
n
,
_
in
node
.
outputs
[
0
]
.
clients
]):
for
n
,
_
in
node
.
outputs
[
0
]
.
clients
]):
return
return
else
:
else
:
return
[
host_from_gpu
(
gpu_
from_host
(
node
.
outputs
[
0
]))]
return
[
host_from_gpu
(
gpu_
x
.
owner
.
op
(
node
.
outputs
[
0
]))]
return
GpuSubtensor
(
node
.
op
.
idx_list
)
return
GpuSubtensor
(
node
.
op
.
idx_list
)
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
IncSubtensor
])
@op_lifter
([
tensor
.
IncSubtensor
])
def
local_gpua_incsubtensor
(
node
):
def
local_gpua_incsubtensor
(
node
,
context_name
):
return
GpuIncSubtensor
(
node
.
op
.
idx_list
,
node
.
op
.
inplace
,
return
GpuIncSubtensor
(
node
.
op
.
idx_list
,
node
.
op
.
inplace
,
node
.
op
.
set_instead_of_inc
,
node
.
op
.
set_instead_of_inc
,
node
.
op
.
destroyhandler_tolerate_aliased
)
node
.
op
.
destroyhandler_tolerate_aliased
)
...
@@ -497,16 +561,16 @@ def local_gpua_incsubtensor(node):
...
@@ -497,16 +561,16 @@ def local_gpua_incsubtensor(node):
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
AdvancedSubtensor1
])
@op_lifter
([
tensor
.
AdvancedSubtensor1
])
def
local_gpua_advanced_subtensor
(
node
):
def
local_gpua_advanced_subtensor
(
node
,
context_name
):
return
GpuAdvancedSubtensor1
()
return
GpuAdvancedSubtensor1
()
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
AdvancedIncSubtensor1
])
@op_lifter
([
tensor
.
AdvancedIncSubtensor1
])
def
local_gpua_advanced_incsubtensor
(
node
):
def
local_gpua_advanced_incsubtensor
(
node
,
context_name
):
# This
optimization is disabled if cuda is not active
# This
is disabled on non-cuda contexts
if
pygpu
.
get_default_context
()
.
kind
!=
"cuda"
:
if
get_context
(
context_name
)
.
kind
!=
'cuda'
:
return
None
return
None
x
,
y
,
ilist
=
node
.
inputs
x
,
y
,
ilist
=
node
.
inputs
...
@@ -535,17 +599,19 @@ def local_gpua_advanced_incsubtensor(node):
...
@@ -535,17 +599,19 @@ def local_gpua_advanced_incsubtensor(node):
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
CAReduce
,
tensor
.
Sum
,
tensor
.
elemwise
.
Prod
])
@op_lifter
([
tensor
.
CAReduce
,
tensor
.
Sum
,
tensor
.
elemwise
.
Prod
])
def
local_gpua_careduce
(
node
):
def
local_gpua_careduce
(
node
,
context_name
):
if
isinstance
(
node
.
op
.
scalar_op
,
(
scalar
.
Add
,
scalar
.
Mul
,
if
isinstance
(
node
.
op
.
scalar_op
,
(
scalar
.
Add
,
scalar
.
Mul
,
scalar
.
Maximum
,
scalar
.
Minimum
)):
scalar
.
Maximum
,
scalar
.
Minimum
)):
dev
=
theano
.
sandbox
.
gpuarray
.
init_dev
.
device
ctx
=
get_context
(
context_name
)
if
dev
.
startswith
(
'opencl'
)
:
if
ctx
.
kind
==
'opencl'
:
op
=
GpuCAReduceCPY
op
=
GpuCAReduceCPY
if
node
.
op
.
scalar_op
not
in
[
scalar
.
add
,
scalar
.
mul
]:
if
node
.
op
.
scalar_op
not
in
[
scalar
.
add
,
scalar
.
mul
]:
# We don't support yet all reduction with cpy code.
# We don't support yet all reduction with cpy code.
return
return
el
se
:
el
if
ctx
.
kind
==
'cuda'
:
op
=
GpuCAReduceCuda
op
=
GpuCAReduceCuda
else
:
return
False
x
,
=
node
.
inputs
x
,
=
node
.
inputs
greduce
=
op
(
greduce
=
op
(
...
@@ -556,7 +622,7 @@ def local_gpua_careduce(node):
...
@@ -556,7 +622,7 @@ def local_gpua_careduce(node):
# We need to have the make node called, otherwise the mask can
# We need to have the make node called, otherwise the mask can
# be None
# be None
if
(
op
is
GpuCAReduceCPY
or
if
(
op
is
GpuCAReduceCPY
or
gvar
.
owner
.
op
.
supports_c_code
([
gpu_from_host
(
x
)])):
gvar
.
owner
.
op
.
supports_c_code
([
GpuFromHost
(
context_name
)
(
x
)])):
return
greduce
return
greduce
else
:
else
:
# Try to make a simpler pattern based on reshaping
# Try to make a simpler pattern based on reshaping
...
@@ -596,7 +662,7 @@ def local_gpua_careduce(node):
...
@@ -596,7 +662,7 @@ def local_gpua_careduce(node):
acc_dtype
=
getattr
(
node
.
op
,
'acc_dtype'
,
None
))
acc_dtype
=
getattr
(
node
.
op
,
'acc_dtype'
,
None
))
reshaped_x
=
x
.
reshape
(
tensor
.
stack
(
new_in_shp
))
reshaped_x
=
x
.
reshape
(
tensor
.
stack
(
new_in_shp
))
gpu_reshaped_x
=
gpu_from_host
(
reshaped_x
)
gpu_reshaped_x
=
GpuFromHost
(
context_name
)
(
reshaped_x
)
gvar
=
greduce
(
gpu_reshaped_x
)
gvar
=
greduce
(
gpu_reshaped_x
)
# We need to have the make node called, otherwise the mask can
# We need to have the make node called, otherwise the mask can
# be None
# be None
...
@@ -615,13 +681,13 @@ def local_gpua_careduce(node):
...
@@ -615,13 +681,13 @@ def local_gpua_careduce(node):
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
blas
.
Gemv
,
tensor
.
blas_c
.
CGemv
])
@op_lifter
([
tensor
.
blas
.
Gemv
,
tensor
.
blas_c
.
CGemv
])
def
local_gpua_gemv
(
node
):
def
local_gpua_gemv
(
node
,
context_name
):
return
GpuGemv
(
inplace
=
node
.
op
.
inplace
)
return
GpuGemv
(
inplace
=
node
.
op
.
inplace
)
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
blas
.
Gemm
])
@op_lifter
([
tensor
.
blas
.
Gemm
])
def
local_gpua_gemm
(
node
):
def
local_gpua_gemm
(
node
,
context_name
):
return
GpuGemm
(
inplace
=
node
.
op
.
inplace
)
return
GpuGemm
(
inplace
=
node
.
op
.
inplace
)
...
@@ -658,49 +724,49 @@ def local_gpuagemm_output_merge(node, *inputs):
...
@@ -658,49 +724,49 @@ def local_gpuagemm_output_merge(node, *inputs):
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
blas
.
Ger
,
tensor
.
blas_c
.
CGer
,
tensor
.
blas_scipy
.
ScipyGer
])
@op_lifter
([
tensor
.
blas
.
Ger
,
tensor
.
blas_c
.
CGer
,
tensor
.
blas_scipy
.
ScipyGer
])
def
local_gpua_ger
(
node
):
def
local_gpua_ger
(
node
,
context_name
):
return
GpuGer
(
destructive
=
node
.
op
.
destructive
)
return
GpuGer
(
destructive
=
node
.
op
.
destructive
)
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
blas
.
Dot22
])
@op_lifter
([
tensor
.
blas
.
Dot22
])
def
local_gpua_dot22
(
node
):
def
local_gpua_dot22
(
node
,
context_name
):
return
gpu_dot22
return
gpu_dot22
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
basic
.
Eye
])
@op_lifter
([
tensor
.
basic
.
Eye
])
def
local_gpua_eye
(
node
):
def
local_gpua_eye
(
node
,
context_name
):
return
GpuEye
(
dtype
=
node
.
op
.
dtype
)
return
GpuEye
(
dtype
=
node
.
op
.
dtype
)
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
nnet
.
CrossentropySoftmaxArgmax1HotWithBias
],
cuda_only
=
True
)
@op_lifter
([
tensor
.
nnet
.
CrossentropySoftmaxArgmax1HotWithBias
],
cuda_only
=
True
)
def
local_gpua_crossentropysoftmaxargmax1hotwithbias
(
node
):
def
local_gpua_crossentropysoftmaxargmax1hotwithbias
(
node
,
context_name
):
return
GpuCrossentropySoftmaxArgmax1HotWithBias
()
return
GpuCrossentropySoftmaxArgmax1HotWithBias
()
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
nnet
.
CrossentropySoftmax1HotWithBiasDx
],
cuda_only
=
True
)
@op_lifter
([
tensor
.
nnet
.
CrossentropySoftmax1HotWithBiasDx
],
cuda_only
=
True
)
def
local_gpua_crossentropysoftmax1hotwithbiasdx
(
node
):
def
local_gpua_crossentropysoftmax1hotwithbiasdx
(
node
,
context_name
):
return
GpuCrossentropySoftmax1HotWithBiasDx
()
return
GpuCrossentropySoftmax1HotWithBiasDx
()
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
nnet
.
Softmax
],
cuda_only
=
True
)
@op_lifter
([
tensor
.
nnet
.
Softmax
],
cuda_only
=
True
)
def
local_gpua_softmax
(
node
):
def
local_gpua_softmax
(
node
,
context_name
):
return
GpuSoftmax
()
return
GpuSoftmax
()
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
nnet
.
SoftmaxWithBias
],
cuda_only
=
True
)
@op_lifter
([
tensor
.
nnet
.
SoftmaxWithBias
],
cuda_only
=
True
)
def
local_gpua_softmaxwithbias
(
node
):
def
local_gpua_softmaxwithbias
(
node
,
context_name
):
return
GpuSoftmaxWithBias
()
return
GpuSoftmaxWithBias
()
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
theano
.
tensor
.
opt
.
Assert
])
@op_lifter
([
theano
.
tensor
.
opt
.
Assert
])
def
local_assert
(
node
):
def
local_assert
(
node
,
context_name
):
if
(
node
.
inputs
[
0
]
.
owner
and
if
(
node
.
inputs
[
0
]
.
owner
and
isinstance
(
node
.
inputs
[
0
]
.
owner
.
op
,
HostFromGpu
)):
isinstance
(
node
.
inputs
[
0
]
.
owner
.
op
,
HostFromGpu
)):
return
[
host_from_gpu
(
node
.
op
(
node
.
inputs
[
0
]
.
owner
.
inputs
[
0
],
return
[
host_from_gpu
(
node
.
op
(
node
.
inputs
[
0
]
.
owner
.
inputs
[
0
],
...
@@ -708,21 +774,14 @@ def local_assert(node):
...
@@ -708,21 +774,14 @@ def local_assert(node):
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
gpu_from_host
,
ConvOp
])
@op_lifter
([
ConvOp
])
def
local_gpu_conv
(
node
):
def
local_gpu_conv
(
node
,
context_name
):
"""
gpu_from_host(conv) -> gpu_conv(gpu_from_host)
conv(host_from_gpu) -> host_from_gpu(gpu_conv)
"""
def
GpuConvOp_from_ConvOp
(
op
):
def
GpuConvOp_from_ConvOp
(
op
):
logical_img_hw
=
None
logical_img_hw
=
None
if
op
.
kshp_logical
is
not
None
and
op
.
kshp_logical
!=
op
.
kshp
:
if
op
.
kshp_logical
is
not
None
and
op
.
kshp_logical
!=
op
.
kshp
:
return
None
return
None
# print op.kshp, op.imshp[1:3]
# print op.kshp_logical, logical_img_hw
ret
=
GpuConv
(
border_mode
=
op
.
out_mode
,
ret
=
GpuConv
(
border_mode
=
op
.
out_mode
,
subsample
=
(
op
.
dx
,
op
.
dy
),
subsample
=
(
op
.
dx
,
op
.
dy
),
logical_img_hw
=
logical_img_hw
,
logical_img_hw
=
logical_img_hw
,
...
@@ -735,13 +794,10 @@ def local_gpu_conv(node):
...
@@ -735,13 +794,10 @@ def local_gpu_conv(node):
imshp
=
op
.
imshp
,
imshp
=
op
.
imshp
,
nkern
=
op
.
nkern
,
nkern
=
op
.
nkern
,
bsize
=
op
.
bsize
,
bsize
=
op
.
bsize
,
fft_opt
=
op
.
fft_opt
fft_opt
=
op
.
fft_opt
)
)
if
op
.
imshp_logical
is
not
None
:
if
op
.
imshp_logical
is
not
None
:
logical_img_hw
=
op
.
imshp_logical
[
1
:
3
]
logical_img_hw
=
op
.
imshp_logical
[
1
:
3
]
if
logical_img_hw
!=
op
.
imshp
[
1
:
3
]:
if
logical_img_hw
!=
op
.
imshp
[
1
:
3
]:
# this case is not implemented
# return None
rstride
=
int
(
numpy
.
ceil
(
op
.
imshp_logical
[
1
]
/
rstride
=
int
(
numpy
.
ceil
(
op
.
imshp_logical
[
1
]
/
float
(
op
.
imshp
[
1
])))
float
(
op
.
imshp
[
1
])))
cstride
=
int
(
numpy
.
ceil
(
op
.
imshp_logical
[
2
]
/
cstride
=
int
(
numpy
.
ceil
(
op
.
imshp_logical
[
2
]
/
...
@@ -752,7 +808,7 @@ def local_gpu_conv(node):
...
@@ -752,7 +808,7 @@ def local_gpu_conv(node):
img
.
shape
[
0
],
*
op
.
imshp_logical
)
img
.
shape
[
0
],
*
op
.
imshp_logical
)
img
=
tensor
.
set_subtensor
(
buf
[:,
:,
::
rstride
,
::
cstride
],
img
=
tensor
.
set_subtensor
(
buf
[:,
:,
::
rstride
,
::
cstride
],
img
)
img
)
img
=
gpu_from_host
(
img
)
img
=
GpuFromHost
(
context_name
)
(
img
)
return
ret
(
img
,
kern
)
return
ret
(
img
,
kern
)
return
make_graph
return
make_graph
...
@@ -779,15 +835,10 @@ def local_gpu_conv(node):
...
@@ -779,15 +835,10 @@ def local_gpu_conv(node):
gpu_conv
=
GpuConvOp_from_ConvOp
(
node
.
op
)
gpu_conv
=
GpuConvOp_from_ConvOp
(
node
.
op
)
if
gpu_conv
is
None
:
if
gpu_conv
is
None
:
return
return
out
=
gpu_conv
(
gpu_from_host
(
img
),
out
=
gpu_conv
(
GpuFromHost
(
context_name
)(
img
),
gpu_from_host
(
kern
))
GpuFromHost
(
context_name
)(
kern
))
# in some case the ConvOp broadcast the last 2 dimensions
# differently then the gpu ConvOp
out
=
tensor
.
patternbroadcast
(
host_from_gpu
(
out
),
node
.
outputs
[
0
]
.
broadcastable
)
# op_lifter want the output on the GPU.
# op_lifter want the output on the GPU.
out
=
gpu_from_host
(
out
)
out
=
GpuFromHost
(
context_name
)
(
out
)
out
.
values_eq_approx
=
values_eq_approx
out
.
values_eq_approx
=
values_eq_approx
return
[
out
]
return
[
out
]
...
@@ -818,9 +869,10 @@ def local_gpu_elemwise_careduce(node):
...
@@ -818,9 +869,10 @@ def local_gpu_elemwise_careduce(node):
pre_scalar_op
=
scalar
.
basic
.
sqr
)(
inp
)]
pre_scalar_op
=
scalar
.
basic
.
sqr
)(
inp
)]
def
tensor_to_gpu
(
x
):
def
tensor_to_gpu
(
x
,
context_name
):
if
isinstance
(
x
.
type
,
tensor
.
TensorType
):
if
isinstance
(
x
.
type
,
tensor
.
TensorType
):
y
=
GpuArrayType
(
broadcastable
=
x
.
type
.
broadcastable
,
y
=
GpuArrayType
(
broadcastable
=
x
.
type
.
broadcastable
,
context_name
=
context_name
,
dtype
=
x
.
type
.
dtype
)()
dtype
=
x
.
type
.
dtype
)()
if
x
.
name
:
if
x
.
name
:
y
.
name
=
x
.
name
+
'[Gpua]'
y
.
name
=
x
.
name
+
'[Gpua]'
...
@@ -842,6 +894,7 @@ def gpu_safe_new(x, tag=''):
...
@@ -842,6 +894,7 @@ def gpu_safe_new(x, tag=''):
nw_name
=
x
.
name
+
tag
nw_name
=
x
.
name
+
tag
else
:
else
:
nw_name
=
None
nw_name
=
None
if
isinstance
(
x
,
theano
.
Constant
):
if
isinstance
(
x
,
theano
.
Constant
):
return
x
.
clone
()
return
x
.
clone
()
...
@@ -870,7 +923,7 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None):
...
@@ -870,7 +923,7 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None):
@register_opt
(
'scan'
,
'fast_compile'
)
@register_opt
(
'scan'
,
'fast_compile'
)
@op_lifter
([
scan_op
.
Scan
])
@op_lifter
([
scan_op
.
Scan
])
def
local_scan_to_gpua
(
node
):
def
local_scan_to_gpua
(
node
,
context_name
):
info
=
copy
.
deepcopy
(
node
.
op
.
info
)
info
=
copy
.
deepcopy
(
node
.
op
.
info
)
if
info
.
get
(
'gpua'
,
False
):
if
info
.
get
(
'gpua'
,
False
):
return
return
...
@@ -882,20 +935,20 @@ def local_scan_to_gpua(node):
...
@@ -882,20 +935,20 @@ def local_scan_to_gpua(node):
node
.
op
.
n_mit_sot
+
node
.
op
.
n_mit_sot
+
node
.
op
.
n_sit_sot
+
node
.
op
.
n_sit_sot
+
node
.
op
.
n_shared_outs
)
node
.
op
.
n_shared_outs
)
nw_ins
+=
[
safe_to_gpu
(
x
)
for
x
in
node
.
inputs
[
1
:
e
]]
nw_ins
+=
[
safe_to_gpu
(
x
,
context_name
)
for
x
in
node
.
inputs
[
1
:
e
]]
b
=
e
b
=
e
e
=
e
+
node
.
op
.
n_nit_sot
e
=
e
+
node
.
op
.
n_nit_sot
nw_ins
+=
node
.
inputs
[
b
:
e
]
nw_ins
+=
node
.
inputs
[
b
:
e
]
nw_ins
+=
[
safe_to_gpu
(
x
)
for
x
in
node
.
inputs
[
e
:]]
nw_ins
+=
[
safe_to_gpu
(
x
,
context_name
)
for
x
in
node
.
inputs
[
e
:]]
scan_ins
=
[
tensor_to_gpu
(
x
)
for
x
in
node
.
op
.
inputs
]
scan_ins
=
[
tensor_to_gpu
(
x
,
context_name
)
for
x
in
node
.
op
.
inputs
]
# The inner output corresponding to the looping condition should not be
# The inner output corresponding to the looping condition should not be
# moved to the gpu
# moved to the gpu
if
node
.
op
.
info
[
'as_while'
]:
if
node
.
op
.
info
[
'as_while'
]:
scan_outs
=
[
safe_to_gpu
(
x
)
for
x
in
node
.
op
.
outputs
[:
-
1
]]
scan_outs
=
[
safe_to_gpu
(
x
,
context_name
)
for
x
in
node
.
op
.
outputs
[:
-
1
]]
scan_outs
+=
[
node
.
op
.
outputs
[
-
1
]]
scan_outs
+=
[
node
.
op
.
outputs
[
-
1
]]
else
:
else
:
scan_outs
=
[
safe_to_gpu
(
x
)
for
x
in
node
.
op
.
outputs
]
scan_outs
=
[
safe_to_gpu
(
x
,
context_name
)
for
x
in
node
.
op
.
outputs
]
scan_outs
=
scan_utils
.
clone
(
scan_outs
=
scan_utils
.
clone
(
scan_outs
,
scan_outs
,
replace
=
list
(
zip
(
node
.
op
.
inputs
,
replace
=
list
(
zip
(
node
.
op
.
inputs
,
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论