Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
aeb8c035
提交
aeb8c035
authored
8月 20, 2015
作者:
Xavier Bouthillier
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Fix optimizations
上级
76b71018
隐藏空白字符变更
内嵌
并排
正在显示
5 个修改的文件
包含
213 行增加
和
297 行删除
+213
-297
blocksparse.py
theano/sandbox/blocksparse.py
+13
-50
opt.py
theano/sandbox/cuda/opt.py
+161
-85
opt.py
theano/sandbox/opt.py
+25
-153
test_blocksparse.py
theano/sandbox/tests/test_blocksparse.py
+3
-3
test_opt.py
theano/sandbox/tests/test_opt.py
+11
-6
没有找到文件。
theano/sandbox/blocksparse.py
浏览文件 @
aeb8c035
...
...
@@ -84,7 +84,19 @@ class SparseBlockGemv(Op):
return
Apply
(
self
,
[
o
,
W
,
h
,
inputIdx
,
outputIdx
],
[
output
])
def
perform
(
self
,
node
,
inp
,
out_
):
raise
NotImplementedError
(
'Optimization of SparseBlockGemv failed.'
)
o
,
W
,
h
,
iIdx
,
oIdx
=
inp
[:
5
]
if
not
self
.
inplace
:
o
=
o
.
copy
()
for
b
in
range
(
o
.
shape
[
0
]):
for
j
in
range
(
o
.
shape
[
1
]):
outputIdx
=
oIdx
[
b
,
j
]
for
i
in
range
(
h
.
shape
[
1
]):
inputIdx
=
iIdx
[
b
,
i
]
w
=
W
[
inputIdx
,
outputIdx
]
o
[
b
,
j
,
:]
+=
numpy
.
dot
(
h
[
b
,
i
],
w
)
out_
[
0
][
0
]
=
o
def
grad
(
self
,
inputs
,
grads
):
o
,
W
,
h
,
inputIdx
,
outputIdx
=
inputs
...
...
@@ -160,50 +172,6 @@ class SparseBlockOuter(Op):
return
Apply
(
self
,
[
o
,
x
,
y
,
xIdx
,
yIdx
,
alpha
],
[
output
])
def
perform
(
self
,
node
,
inp
,
out_
):
raise
NotImplementedError
(
'Optimization of SparseBlockOuter failed.'
)
def
grad
(
self
,
inputs
,
output_gradients
):
raise
NotImplementedError
(
"SparseBlockOuter has no gradient "
"implemented"
)
class
CpuSparseBlockGemv
(
SparseBlockGemv
):
"""
CPU version of SparseBlockGemv. Check SparseBlockGemv's docstring for more
information.
This should not be directly called since the interface is subject
to change without notice. Use the sandbox.blocksparse.sparse_block_dot()
function for a stable interface.
"""
def
perform
(
self
,
node
,
inp
,
out_
):
o
,
W
,
h
,
iIdx
,
oIdx
=
inp
[:
5
]
if
not
self
.
inplace
:
o
=
o
.
copy
()
for
b
in
range
(
o
.
shape
[
0
]):
for
j
in
range
(
o
.
shape
[
1
]):
outputIdx
=
oIdx
[
b
,
j
]
for
i
in
range
(
h
.
shape
[
1
]):
inputIdx
=
iIdx
[
b
,
i
]
w
=
W
[
inputIdx
,
outputIdx
]
o
[
b
,
j
,
:]
+=
numpy
.
dot
(
h
[
b
,
i
],
w
)
out_
[
0
][
0
]
=
o
class
CpuSparseBlockOuter
(
SparseBlockOuter
):
"""
CPU version of SparseBlockOuter. See SparseBlockOuter's docstring for more
information.
This op should not be called directly since its interface is
subject to change without notice. It is involved in the gradient
of GpuSparseBlockGemv. The gradient is not implemented.
"""
def
perform
(
self
,
node
,
inp
,
out_
):
o
,
x
,
y
,
xIdx
,
yIdx
,
alpha
=
inp
[:
6
]
...
...
@@ -223,11 +191,6 @@ sparse_block_gemv_inplace = SparseBlockGemv(True)
sparse_block_outer
=
SparseBlockOuter
(
False
)
sparse_block_outer_inplace
=
SparseBlockOuter
(
True
)
cpu_sparse_block_gemv
=
CpuSparseBlockGemv
(
False
)
cpu_sparse_block_gemv_inplace
=
CpuSparseBlockGemv
(
True
)
cpu_sparse_block_outer
=
CpuSparseBlockOuter
(
False
)
cpu_sparse_block_outer_inplace
=
CpuSparseBlockOuter
(
True
)
def
sparse_block_dot
(
W
,
h
,
inputIdx
,
b
,
outputIdx
):
"""
...
...
theano/sandbox/cuda/opt.py
浏览文件 @
aeb8c035
...
...
@@ -18,10 +18,9 @@ import theano.ifelse
from
six.moves
import
reduce
,
xrange
from
theano.compile
import
optdb
from
theano.gof
import
(
local_optimizer
,
EquilibriumDB
,
ProxyDB
,
Optimizer
,
toolbox
)
Optimizer
,
TopoOptimizer
,
toolbox
)
from
theano.gof.opt
import
LocalMetaOptimizer
from
theano.sandbox.cuda
import
as_cuda_ndarray_variable
from
theano.sandbox.opt
import
register_meta_opt
from
theano.sandbox.cuda.basic_ops
import
(
gpu_eye
,
gpu_contiguous
,
gpu_from_host
,
host_from_gpu
,
GpuFromHost
,
HostFromGpu
,
...
...
@@ -32,8 +31,8 @@ from theano.sandbox.cuda.basic_ops import (
GpuIncSubtensor
,
gpu_alloc
,
GpuAlloc
,
gpu_shape
,
GpuSplit
,
GpuAllocEmpty
)
from
theano.sandbox.cuda.type
import
CudaNdarrayType
from
theano.sandbox.cuda.blas
import
(
gpu_dot22
,
gpu_dot22scalar
,
gpu_gemm_inplace
,
gpu_gemm_no_inplace
,
GpuConv
,
from
theano.sandbox.cuda.blas
import
(
gpu_
dot22
,
gpu_dot22scalar
,
gpu_
gemm_inplace
,
gpu_gemm_no_inplace
,
GpuConv
,
GpuCorrMM
,
GpuCorrMM_gradInputs
,
GpuCorrMM_gradWeights
,
GpuCorr3dMM
,
GpuCorr3dMM_gradInputs
,
GpuCorr3dMM_gradWeights
)
...
...
@@ -43,16 +42,22 @@ from theano.sandbox.cuda.cula import gpu_solve
from
theano.sandbox.cuda.blas
import
gpu_gemv_no_inplace
from
theano.sandbox.cuda.blas
import
gpu_ger_inplace
from
theano.sandbox.cuda.blas
import
gpu_ger_no_inplace
from
theano.sandbox.cuda.blas
import
(
GpuDownsampleFactorMax
,
GpuDownsampleFactorMaxGrad
,
GpuDownsampleFactorMaxGradGrad
)
from
theano.sandbox.cuda.blas
import
(
GpuDownsampleFactorMax
,
GpuDownsampleFactorMaxGrad
,
GpuDownsampleFactorMaxGradGrad
)
from
theano.sandbox.blocksparse
import
SparseBlockGemv
,
SparseBlockOuter
from
theano.sandbox.cuda.blocksparse
import
GpuSparseBlockGemv
,
GpuSparseBlockOuter
from
theano.sandbox.cuda.blocksparse
import
(
GpuSparseBlockGemv
,
GpuSparseBlockOuter
,
gpu_sparse_block_gemv_inplace
,
gpu_sparse_block_outer_inplace
)
from
theano.sandbox.cuda.nnet
import
(
GpuCrossentropySoftmaxArgmax1HotWithBias
,
GpuCrossentropySoftmax1HotWithBiasDx
,
GpuSoftmax
,
GpuSoftmaxWithBias
)
GpuCrossentropySoftmaxArgmax1HotWithBias
,
GpuCrossentropySoftmax1HotWithBiasDx
,
GpuSoftmax
,
GpuSoftmaxWithBias
)
from
theano.sandbox.cuda.elemwise
import
SupportCodeError
from
theano.scalar.basic_scipy
import
Erfinv
...
...
@@ -81,10 +86,11 @@ except ImportError:
gpu_cut_copies
=
EquilibriumDB
()
gpu_seqopt
.
register
(
'gpu_local_optimizations'
,
gpu_optimizer
,
1
,
'fast_run'
,
'fast_compile'
,
'inplace'
,
'gpu'
)
'fast_run'
,
'fast_compile'
,
'inplace'
,
'gpu'
)
gpu_seqopt
.
register
(
'gpu_cut_transfers'
,
gpu_cut_copies
,
2
,
'fast_run'
,
'fast_compile'
,
'gpu'
)
# DO NOT PUT fast_run or fast_compile in gpu_opt! This will ALWAYS enable the GPU!
'fast_run'
,
'fast_compile'
,
'gpu'
)
# DO NOT PUT fast_run or fast_compile in gpu_opt! This will ALWAYS
# enable the GPU!
optdb
.
register
(
'gpu_opt'
,
gpu_seqopt
,
optdb
.
__position__
.
get
(
'add_destroy_handler'
,
49.5
)
-
1
,
...
...
@@ -270,7 +276,7 @@ def local_gpu_elemwise_0(node):
'uint16'
])
# case 1 - all inputs are already float32
if
all
([
i
.
type
.
dtype
==
'float32'
for
i
in
node
.
inputs
]):
# TODO: change this when fusion makes Elemwise with
# TODO: change this when fusion makes Elemwise with
# multiple outputs
gpu_elemwise
=
new_op
(
*
(
gpu_from_host
(
i
)
for
i
in
node
.
inputs
))
...
...
@@ -350,8 +356,8 @@ def local_gpu_split(node):
any
([
c
!=
'output'
and
isinstance
(
c
.
op
,
GpuFromHost
)
for
c
,
idx
in
outs_clients
])):
new_op
=
GpuSplit
(
node
.
op
.
len_splits
)
split_res
=
new_op
(
as_cuda_ndarray_variable
(
input
),
*
node
.
inputs
[
1
:],
return_list
=
True
)
split_res
=
new_op
(
as_cuda_ndarray_variable
(
input
),
*
node
.
inputs
[
1
:],
return_list
=
True
)
return
[
host_from_gpu
(
o
)
for
o
in
split_res
]
return
False
...
...
@@ -378,7 +384,8 @@ def local_gpu_dimshuffle_0(node):
dimshuffle_node
=
host_input
.
owner
new_op
=
GpuDimShuffle
(
dimshuffle_node
.
op
.
input_broadcastable
,
dimshuffle_node
.
op
.
new_order
)
return
[
new_op
(
as_cuda_ndarray_variable
(
dimshuffle_node
.
inputs
[
0
]))]
return
[
new_op
(
as_cuda_ndarray_variable
(
dimshuffle_node
.
inputs
[
0
]))]
return
False
...
...
@@ -393,8 +400,8 @@ def local_gpu_specifyShape_0(node):
if
isinstance
(
node
.
op
,
tensor
.
SpecifyShape
):
input
=
node
.
inputs
[
0
]
if
input
.
owner
and
isinstance
(
input
.
owner
.
op
,
HostFromGpu
):
return
[
host_from_gpu
(
tensor
.
specify_shape
(
as_cuda_ndarray_variable
(
input
),
*
node
.
inputs
[
1
:]))]
return
[
host_from_gpu
(
tensor
.
specify_shape
(
as_cuda_ndarray_variable
(
input
),
*
node
.
inputs
[
1
:]))]
if
isinstance
(
node
.
op
,
GpuFromHost
):
host_input
=
node
.
inputs
[
0
]
if
host_input
.
owner
and
isinstance
(
host_input
.
owner
.
op
,
...
...
@@ -471,11 +478,15 @@ def local_gpu_dot_to_dot22(node):
shape_out
))]
return
False
@local_optimizer
(
None
)
def
local_assert_no_cpu_op
(
node
):
if
not
isinstance
(
node
.
op
,
GpuOp
)
and
all
([
var
.
owner
and
isinstance
(
var
.
owner
.
op
,
HostFromGpu
)
for
var
in
node
.
inputs
])
and
any
([[
c
for
c
in
var
.
clients
if
isinstance
(
c
[
0
]
.
op
,
GpuFromHost
)]
for
var
in
node
.
outputs
]):
if
(
not
isinstance
(
node
.
op
,
GpuOp
)
and
all
([
var
.
owner
and
isinstance
(
var
.
owner
.
op
,
HostFromGpu
)
for
var
in
node
.
inputs
])
and
any
([[
c
for
c
in
var
.
clients
if
isinstance
(
c
[
0
]
.
op
,
GpuFromHost
)]
for
var
in
node
.
outputs
])):
if
config
.
assert_no_cpu_op
==
"warn"
:
_logger
.
warning
((
"CPU op
%
s is detected in the computational"
" graph"
)
%
node
)
...
...
@@ -496,7 +507,7 @@ theano.compile.optdb.register('assert_no_cpu_op', assert_no_cpu_op, 49.2)
@register_opt
()
@local_optimizer
([
theano
.
ifelse
.
IfElse
,
gpu_from_host
])
def
local_gpu_lazy_ifelse
(
node
):
"""
"""
gpu_from_host(ifelse) -> gpu_ifelse(gpu_from_host)
ifelse(host_from_gpu) -> host_from_gpu(ifelse)
...
...
@@ -576,7 +587,8 @@ def local_gpu_dot22(node):
if
host_input
.
owner
and
isinstance
(
host_input
.
owner
.
op
,
tensor
.
blas
.
Dot22
):
x
,
y
=
host_input
.
owner
.
inputs
return
[
gpu_dot22
(
as_cuda_ndarray_variable
(
x
),
as_cuda_ndarray_variable
(
y
))]
return
[
gpu_dot22
(
as_cuda_ndarray_variable
(
x
),
as_cuda_ndarray_variable
(
y
))]
if
isinstance
(
node
.
op
,
tensor
.
blas
.
Dot22
):
if
any
([(
i
.
owner
and
isinstance
(
i
.
owner
.
op
,
HostFromGpu
))
for
i
in
node
.
inputs
]):
...
...
@@ -601,7 +613,8 @@ def local_gpu_dot22scalar(node):
isinstance
(
host_input
.
owner
.
op
,
tensor
.
blas
.
Dot22Scalar
)):
x
,
y
,
scalar
=
host_input
.
owner
.
inputs
return
[
gpu_dot22scalar
(
as_cuda_ndarray_variable
(
x
),
as_cuda_ndarray_variable
(
y
),
return
[
gpu_dot22scalar
(
as_cuda_ndarray_variable
(
x
),
as_cuda_ndarray_variable
(
y
),
tensor
.
blas
.
_as_scalar
(
scalar
))]
if
isinstance
(
node
.
op
,
tensor
.
blas
.
Dot22Scalar
):
if
any
([
i
.
owner
and
isinstance
(
i
.
owner
.
op
,
HostFromGpu
)
...
...
@@ -629,7 +642,8 @@ def local_gpu_solve(node):
isinstance
(
host_input
.
owner
.
op
,
slinalg
.
Solve
)):
x
,
y
=
host_input
.
owner
.
inputs
return
[
gpu_solve
(
as_cuda_ndarray_variable
(
x
),
as_cuda_ndarray_variable
(
y
))]
return
[
gpu_solve
(
as_cuda_ndarray_variable
(
x
),
as_cuda_ndarray_variable
(
y
))]
if
isinstance
(
node
.
op
,
slinalg
.
Solve
):
if
any
([
i
.
owner
and
isinstance
(
i
.
owner
.
op
,
HostFromGpu
)
...
...
@@ -637,7 +651,7 @@ def local_gpu_solve(node):
x
,
y
=
node
.
inputs
return
[
host_from_gpu
(
gpu_solve
(
as_cuda_ndarray_variable
(
x
),
as_cuda_ndarray_variable
(
y
)))]
as_cuda_ndarray_variable
(
y
)))]
return
False
...
...
@@ -715,8 +729,7 @@ def local_gpu_ger(node):
as_cuda_ndarray_variable
(
z
),
a
,
as_cuda_ndarray_variable
(
x
),
as_cuda_ndarray_variable
(
y
)
))]
as_cuda_ndarray_variable
(
y
)))]
return
False
...
...
@@ -745,11 +758,12 @@ def local_gpu_gemm(node):
y_on_gpu
=
(
y
.
owner
and
isinstance
(
y
.
owner
.
op
,
HostFromGpu
))
z_on_gpu
=
(
z
.
owner
and
isinstance
(
z
.
owner
.
op
,
HostFromGpu
))
if
x_on_gpu
or
y_on_gpu
or
z_on_gpu
:
return
[
host_from_gpu
(
gpu_gemm_no_inplace
(
gpu_from_host
(
z
),
a
,
gpu_from_host
(
x
),
gpu_from_host
(
y
),
b
))]
return
[
host_from_gpu
(
gpu_gemm_no_inplace
(
as_cuda_ndarray_variable
(
z
),
a
,
as_cuda_ndarray_variable
(
x
),
as_cuda_ndarray_variable
(
y
),
b
))]
return
False
...
...
@@ -886,8 +900,8 @@ def local_gpu_elemwise_careduce(node):
# automatically add more case, as some like trigonometic
# operation with some reduction pattern will probably result
# to slow down.
isinstance
(
node
.
inputs
[
0
]
.
owner
.
op
.
scalar_op
,
scal
.
basic
.
Sqr
)
):
isinstance
(
node
.
inputs
[
0
]
.
owner
.
op
.
scalar_op
,
scal
.
basic
.
Sqr
)
):
op
=
node
.
op
inp
=
node
.
inputs
[
0
]
.
owner
.
inputs
[
0
]
return
[
GpuCAReduce
(
op
.
reduce_mask
,
op
.
scalar_op
,
scal
.
basic
.
sqr
)(
inp
)]
...
...
@@ -902,7 +916,8 @@ def local_gpu_reshape(node):
isinstance
(
host_input
.
owner
.
op
,
tensor
.
Reshape
):
rshp
=
host_input
.
owner
.
op
x
,
shp
=
host_input
.
owner
.
inputs
gpu_reshape
=
GpuReshape
(
rshp
.
ndim
)(
as_cuda_ndarray_variable
(
x
),
shp
)
gpu_reshape
=
GpuReshape
(
rshp
.
ndim
)(
as_cuda_ndarray_variable
(
x
),
shp
)
if
gpu_reshape
.
broadcastable
!=
node
.
outputs
[
0
]
.
broadcastable
:
# this can happen as we always return False for all broadcast
# dim in GpuReshape but not for Reshape
...
...
@@ -961,23 +976,27 @@ def local_gpu_subtensor(node):
# to the GPU in that case.
return
coords
=
host_input
.
owner
.
inputs
[
1
:]
return
[
GpuSubtensor
(
subt
.
idx_list
)(
as_cuda_ndarray_variable
(
x
),
*
coords
)]
return
[
GpuSubtensor
(
subt
.
idx_list
)(
as_cuda_ndarray_variable
(
x
),
*
coords
)]
if
isinstance
(
node
.
op
,
tensor
.
Subtensor
):
x
=
node
.
inputs
[
0
]
if
(
x
.
owner
and
isinstance
(
x
.
owner
.
op
,
HostFromGpu
)
and
x
.
dtype
==
"float32"
):
gpu_x
=
x
.
owner
.
inputs
[
0
]
if
(
gpu_x
.
owner
and
isinstance
(
gpu_x
.
owner
.
op
,
GpuFromHost
)
and
# And it is a shared var or an input of the graph.
not
gpu_x
.
owner
.
inputs
[
0
]
.
owner
):
if
len
(
x
.
clients
)
==
1
:
if
any
([
n
==
'output'
or
isinstance
(
n
.
op
,
GpuOp
)
for
n
,
_
in
node
.
outputs
[
0
]
.
clients
]):
for
n
,
_
in
node
.
outputs
[
0
]
.
clients
]):
return
else
:
return
[
host_from_gpu
(
as_cuda_ndarray_variable
(
node
.
outputs
[
0
]))]
return
[
host_from_gpu
(
as_cuda_ndarray_variable
(
node
.
outputs
[
0
]))]
return
gpu_x
,
=
x
.
owner
.
inputs
...
...
@@ -996,7 +1015,8 @@ def local_gpu_advanced_subtensor1(node):
host_input
.
owner
.
op
.
__class__
is
tensor
.
AdvancedSubtensor1
:
x
=
host_input
.
owner
.
inputs
[
0
]
coords
=
host_input
.
owner
.
inputs
[
1
:]
return
[
GpuAdvancedSubtensor1
()(
as_cuda_ndarray_variable
(
x
),
*
coords
)]
return
[
GpuAdvancedSubtensor1
()(
as_cuda_ndarray_variable
(
x
),
*
coords
)]
if
node
.
op
.
__class__
is
tensor
.
AdvancedSubtensor1
:
x
=
node
.
inputs
[
0
]
coords
=
node
.
inputs
[
1
:]
...
...
@@ -1032,12 +1052,14 @@ def local_gpu_advanced_incsubtensor1(node):
if
(
compute_capability
<
2
or
x
.
ndim
!=
2
or
y
.
ndim
!=
2
):
gpu_op
=
GpuAdvancedIncSubtensor1
(
set_instead_of_inc
=
set_instead_of_inc
)
else
:
gpu_op
=
GpuAdvancedIncSubtensor1_dev20
(
set_instead_of_inc
=
set_instead_of_inc
)
return
[
gpu_op
(
as_cuda_ndarray_variable
(
x
),
as_cuda_ndarray_variable
(
y
),
*
coords
)]
return
[
gpu_op
(
as_cuda_ndarray_variable
(
x
),
as_cuda_ndarray_variable
(
y
),
*
coords
)]
# Should not execute for GpuAdvancedIncSubtensor1
if
(
node
.
op
.
__class__
is
tensor
.
AdvancedIncSubtensor1
and
...
...
@@ -1188,7 +1210,7 @@ def local_gpu_pdbbreakpoint_op(node):
nb_monitored_vars
=
len
(
node
.
outputs
)
for
i
in
range
(
nb_monitored_vars
):
inp
=
old_inputs
[
i
+
1
]
inp
=
old_inputs
[
i
+
1
]
out
=
old_outputs
[
i
]
input_is_from_gpu
=
(
inp
.
owner
and
...
...
@@ -1253,18 +1275,17 @@ def local_gpu_crossentorpy_softmax_argmax_1hot_with_bias(node):
# thing if we want, since this gpu op will cast to integers
# internally anyway
int_cast_ops
=
(
tensor
.
basic
.
_convert_to_int32
,
tensor
.
basic
.
_convert_to_int8
,
tensor
.
basic
.
_convert_to_int16
,
tensor
.
basic
.
_convert_to_int64
,
)
tensor
.
basic
.
_convert_to_int32
,
tensor
.
basic
.
_convert_to_int8
,
tensor
.
basic
.
_convert_to_int16
,
tensor
.
basic
.
_convert_to_int64
)
while
y
.
owner
and
y
.
owner
.
op
in
int_cast_ops
:
y
=
y
.
owner
.
inputs
[
0
]
gpu_nll
,
gpu_sm
,
gpu_am
=
\
GpuCrossentropySoftmaxArgmax1HotWithBias
()(
gpu_x
,
as_cuda_ndarray_variable
(
b
),
as_cuda_ndarray_variable
(
cast
(
y
,
'float32'
)))
GpuCrossentropySoftmaxArgmax1HotWithBias
()(
gpu_x
,
as_cuda_ndarray_variable
(
b
),
as_cuda_ndarray_variable
(
cast
(
y
,
'float32'
)))
am_dtype
=
node
.
outputs
[
2
]
.
type
.
dtype
return
[
host_from_gpu
(
gpu_nll
),
host_from_gpu
(
gpu_sm
),
...
...
@@ -1307,7 +1328,8 @@ def local_gpu_softmax_with_bias(node):
x_on_gpu
=
x
.
owner
and
isinstance
(
x
.
owner
.
op
,
HostFromGpu
)
b_on_gpu
=
b
.
owner
and
isinstance
(
b
.
owner
.
op
,
HostFromGpu
)
if
x_on_gpu
or
b_on_gpu
:
gpu_sm
=
GpuSoftmaxWithBias
()(
as_cuda_ndarray_variable
(
x
),
as_cuda_ndarray_variable
(
b
))
gpu_sm
=
GpuSoftmaxWithBias
()(
as_cuda_ndarray_variable
(
x
),
as_cuda_ndarray_variable
(
b
))
return
[
host_from_gpu
(
gpu_sm
)]
return
False
...
...
@@ -1324,6 +1346,7 @@ def _gpu_conv_to_fftconv(node):
if
(
node
.
op
.
imshp
is
not
None
and
node
.
op
.
imshp
[
-
1
]
is
not
None
and
node
.
op
.
imshp
[
-
1
]
%
2
==
1
):
kwargs
[
'pad_last_dim'
]
=
True
# If the user supplied the full nonsymbolic image_shape and
# filter_shape in conv2d(), we can pass it on to conv2d_fft().
...
...
@@ -1337,7 +1360,8 @@ def _gpu_conv_to_fftconv(node):
(
node
.
op
.
nkern
is
not
None
)
and
(
len
(
node
.
op
.
imshp
)
==
3
)
and
(
node
.
op
.
imshp
[
0
]
is
not
None
)):
kwargs
[
'filter_shape'
]
=
(
node
.
op
.
nkern
,
node
.
op
.
imshp
[
0
])
+
node
.
op
.
kshp
kwargs
[
'filter_shape'
]
=
(
node
.
op
.
nkern
,
node
.
op
.
imshp
[
0
])
+
\
node
.
op
.
kshp
rval
=
conv2d_fft
(
node
.
inputs
[
0
],
node
.
inputs
[
1
],
**
kwargs
)
if
node
.
outputs
[
0
]
.
broadcastable
!=
rval
.
broadcastable
:
# With given shape information, conv2d_fft may return a different
...
...
@@ -1353,6 +1377,7 @@ def local_conv_fft_valid(node):
if
(
node
.
op
.
border_mode
==
'valid'
and
node
.
op
.
subsample
==
(
1
,
1
)
and
node
.
op
.
fft_opt
):
return
[
_gpu_conv_to_fftconv
(
node
)]
return
False
...
...
@@ -1363,6 +1388,7 @@ def local_conv_fft_full(node):
if
(
node
.
op
.
border_mode
==
'full'
and
node
.
op
.
subsample
==
(
1
,
1
)
and
node
.
op
.
fft_opt
):
return
[
_gpu_conv_to_fftconv
(
node
)]
return
...
...
@@ -1476,6 +1502,7 @@ def local_gpu_conv(node):
def
local_conv_gemm
(
node
):
if
(
isinstance
(
node
.
op
,
GpuConv
)
and
node
.
op
.
border_mode
in
[
'full'
,
'valid'
]):
img
,
kern
=
node
.
inputs
border_mode
=
node
.
op
.
border_mode
subsample
=
node
.
op
.
subsample
...
...
@@ -1499,7 +1526,7 @@ def local_conv_gemm(node):
# we know the kernel and output size
prod1
=
node
.
op
.
kshp
[
0
]
*
node
.
op
.
kshp
[
1
]
prod2
=
((
node
.
op
.
imshp
[
-
2
]
-
node
.
op
.
kshp
[
0
]
+
1
)
*
(
node
.
op
.
imshp
[
-
1
]
-
node
.
op
.
kshp
[
1
]
+
1
))
(
node
.
op
.
imshp
[
-
1
]
-
node
.
op
.
kshp
[
1
]
+
1
))
if
((
node
.
op
.
bsize
is
not
None
)
and
(
len
(
node
.
op
.
imshp
)
==
3
)
and
(
node
.
op
.
imshp
[
0
]
is
not
None
)):
...
...
@@ -1521,7 +1548,7 @@ def local_conv_gemm(node):
kern
=
kern
.
dimshuffle
(
1
,
0
,
2
,
3
)
# call GpuCorrMM_gradInputs
rval
=
GpuCorrMM_gradInputs
(
'valid'
,
subsample
)(
gpu_contiguous
(
kern
),
gpu_contiguous
(
img
))
gpu_contiguous
(
kern
),
gpu_contiguous
(
img
))
if
node
.
outputs
[
0
]
.
broadcastable
!=
rval
.
broadcastable
:
# With given shape information, conv2d_fft may return a different
# broadcast pattern than GpuConv. This is forbidden, so we fix it.
...
...
@@ -1599,10 +1626,12 @@ class ConvMetaOptimizer(LocalCudaMetaOptimizer):
if
((
var
in
inputs
)
and
(
shape
is
not
None
)
and
not
any
(
s
is
None
for
s
in
shape
)):
result
[
var
]
=
theano
.
shared
(
# TODO: Use var.type.filter when cuda_ndarray.filter supports non-strict casts
# var.type.filter(numpy.random.randn(*shape),
# allow_downcast=True),
# TODO: Use var.type.filter when cuda_ndarray.filter
# supports non-strict casts
# var.type.filter(numpy.random.randn(*shape),
# allow_downcast=True),
numpy
.
require
(
numpy
.
random
.
randn
(
*
shape
),
dtype
=
var
.
dtype
),
var
.
name
,
...
...
@@ -1613,10 +1642,11 @@ class ConvMetaOptimizer(LocalCudaMetaOptimizer):
# We just register all optimizers from conv_groupopt with the metaoptimizer
conv_metaopt
=
ConvMetaOptimizer
(
conv_groupopt
.
query
(
*
[
'+'
+
name
for
name
in
conv_groupopt
.
_names
])
.
opts
)
conv_groupopt
.
query
(
*
[
'+'
+
name
for
name
in
conv_groupopt
.
_names
])
.
opts
)
# Then we add some optimizers that try less obvious options
conv_metaopt
.
register
(
dnn
.
local_conv_dnn_alternative
)
# Finally, we register the metaoptimizer as the first optimizer in conv_groupopt
# Finally, we register the metaoptimizer as the first optimizer in
# conv_groupopt
conv_groupopt
.
register
(
'conv_meta'
,
conv_metaopt
,
0
)
...
...
@@ -1661,6 +1691,7 @@ def local_convgrad3d_fft(node):
return
False
if
(
isinstance
(
node
.
op
,
ConvGrad3D
)
and
(
stride_x
,
stride_y
,
stride_z
)
==
(
1
,
1
,
1
)):
# we import conv3d_fft locally to avoid pycuda warnings
from
theano.sandbox.cuda.fftconv
import
conv3d_fft
# Shuffle inputs signal from (b, 0, 1, t, ic) to (ic, b, 0, 1, t)
...
...
@@ -1747,8 +1778,8 @@ def local_convgrad3d_gemm(node):
f
=
node
.
inputs
[
3
]
f
=
gpu_contiguous
(
f
.
dimshuffle
(
0
,
4
,
1
,
2
,
3
))
rval
=
GpuCorr3dMM_gradWeights
(
subsample
=
(
sx
,
sy
,
sz
))(
x
,
f
,
shape
=
node
.
inputs
[
2
][
1
:
4
])
rval
=
GpuCorr3dMM_gradWeights
(
subsample
=
(
sx
,
sy
,
sz
))(
x
,
f
,
shape
=
node
.
inputs
[
2
][
1
:
4
])
# Shuffle from (ic, oc, 0, 1, t) to (oc, 0, 1, t, ic)
return
[
rval
.
dimshuffle
(
0
,
2
,
3
,
4
,
1
)]
...
...
@@ -1770,7 +1801,8 @@ def local_convtransp3d_gemm(node):
# Shuffle dCdH from (b, 0, 1, t, oc) to (b, oc, 0, 1, t)
f
=
node
.
inputs
[
3
]
f
=
gpu_contiguous
(
f
.
dimshuffle
(
0
,
4
,
1
,
2
,
3
))
rval
=
GpuCorr3dMM_gradInputs
(
subsample
=
(
sx
,
sy
,
sz
))(
kern
=
x
,
topgrad
=
f
)
rval
=
GpuCorr3dMM_gradInputs
(
subsample
=
(
sx
,
sy
,
sz
))(
kern
=
x
,
topgrad
=
f
)
# Shuffle from (ic, b, 0, 1, t) to (b, 0, 1, t, ic)
return
[
rval
.
dimshuffle
(
0
,
2
,
3
,
4
,
1
)
+
node
.
inputs
[
1
]]
...
...
@@ -1786,6 +1818,7 @@ import theano.tensor.signal.downsample as downsample
def
local_gpu_downsample_factor_max
(
node
):
if
(
isinstance
(
node
.
op
,
downsample
.
DownsampleFactorMax
)
and
node
.
op
.
ds
==
node
.
op
.
st
):
assert
node
.
op
.
__props__
==
(
'ds'
,
'ignore_border'
,
'st'
,
'padding'
,
'mode'
)
if
node
.
op
.
padding
!=
(
0
,
0
)
or
node
.
op
.
mode
!=
'max'
:
...
...
@@ -1801,11 +1834,13 @@ def local_gpu_downsample_factor_max(node):
def
local_gpu_downsample_factor_max_grad
(
node
):
if
(
isinstance
(
node
.
op
,
downsample
.
MaxPoolGrad
)
and
node
.
op
.
ds
==
node
.
op
.
st
):
assert
node
.
op
.
__props__
==
(
'ds'
,
'ignore_border'
,
'st'
,
'padding'
,
'mode'
)
if
(
node
.
op
.
padding
!=
(
0
,
0
)
or
node
.
op
.
mode
!=
'max'
or
node
.
op
.
st
!=
node
.
op
.
ds
):
return
x
,
z
,
gz
=
node
.
inputs
if
(
x
.
owner
and
isinstance
(
x
.
owner
.
op
,
HostFromGpu
)):
...
...
@@ -1876,7 +1911,8 @@ def local_gpu_join(node):
# print "OPT: axis_and_tensors=", axis_and_tensors
matches
=
[(
not
t
.
owner
is
None
and
isinstance
(
t
.
owner
.
op
,
HostFromGpu
))
or
matches
=
[(
t
.
owner
is
not
None
and
isinstance
(
t
.
owner
.
op
,
HostFromGpu
))
or
isinstance
(
t
,
gof
.
Constant
)
for
t
in
axis_and_tensors
[
1
:]]
# print "OPT: matches =", matches
...
...
@@ -1884,7 +1920,8 @@ def local_gpu_join(node):
if
all
(
matches
):
# the extra gpu_from_host introduced here will
# be removed by further optimizations
new_tensors
=
[
as_cuda_ndarray_variable
(
t
)
for
t
in
axis_and_tensors
[
1
:]]
new_tensors
=
[
as_cuda_ndarray_variable
(
t
)
for
t
in
axis_and_tensors
[
1
:]]
new_a_and_t
=
[
axis_and_tensors
[
0
]]
+
new_tensors
replacement_node
=
host_from_gpu
(
gpu_join
(
*
new_a_and_t
))
...
...
@@ -1941,7 +1978,6 @@ optdb.register('InplaceGpuBlasOpt',
def
get_device_type_sizes
():
"""
Returns
-------
tuple
...
...
@@ -1962,7 +1998,8 @@ def get_device_type_sizes():
del
gpu_int_size
del
t
except
Exception
as
e
:
_logger
.
warning
((
"Optimization Warning: "
_logger
.
warning
((
"Optimization Warning: "
"Got the following error, but you can ignore it. "
"This could cause less GpuElemwise fused together.
\n
"
"
%
s"
)
%
e
)
...
...
@@ -1997,7 +2034,7 @@ def max_inputs_to_GpuElemwise(node):
size_param_mandatory
=
int_size
# for numels
size_param_mandatory
+=
int_size
*
ndim
# for the shape
size_param_mandatory
+=
sum
((
gpu_ptr_size
+
int_size
*
ndim
)
for
i
in
node
.
outputs
)
for
i
in
node
.
outputs
)
nb_bytes_avail
=
argument_limit
-
size_param_mandatory
nb_bytes_per_inputs
=
(
ndim
*
int_size
)
+
gpu_ptr_size
...
...
@@ -2037,11 +2074,11 @@ def split_huge_add_or_mul(node):
# GpuElemwise fusion
gpu_local_elemwise_fusion
=
tensor
.
opt
.
local_elemwise_fusion_op
(
GpuElemwise
,
max_inputs_to_GpuElemwise
)
GpuElemwise
,
max_inputs_to_GpuElemwise
)
if
config
.
gpu
.
local_elemwise_fusion
:
_logger
.
debug
(
"enabling optimization fusion of gpu elemwise in fast_run"
)
# Must be after cpu fusion at 40, gpu at 48.5 and before AddDestroyHandler at 49.5
# Must be after cpu fusion at 40, gpu at 48.5 and before
# AddDestroyHandler at 49.5
optdb
.
register
(
'gpu_elemwise_fusion'
,
tensor
.
opt
.
FusionOptimizer
(
gpu_local_elemwise_fusion
),
49
,
'fast_run'
,
'fusion'
,
...
...
@@ -2055,7 +2092,7 @@ else:
# GpuElemwise inplace
gpu_inplace_elemwise_optimizer
=
tensor
.
opt
.
inplace_elemwise_optimizer_op
(
GpuElemwise
)
GpuElemwise
)
# DO NOT PLACE add a 'gpu' tag here! This would enable it in fast_compile.
# It still will be run in fast_run with device=gpu with the current tag.
optdb
.
register
(
'gpu_inplace_elemwise_opt'
,
gpu_inplace_elemwise_optimizer
,
75
,
...
...
@@ -2069,7 +2106,8 @@ gpu_elemwise_alloc = gof.local_optimizer([GpuElemwise])(
tensor
.
opt
.
local_elemwise_alloc_op
(
GpuElemwise
,
GpuAlloc
,
GpuDimShuffle
)
)
register_opt
()(
gpu_elemwise_alloc
)
register_opt
()(
tensor
.
opt
.
local_useless_elemwise
)
# needed by gpu_elemwise_alloc
# needed by gpu_elemwise_alloc
register_opt
()(
tensor
.
opt
.
local_useless_elemwise
)
tensor
.
opt
.
register_specialize_device
(
gpu_elemwise_alloc
)
...
...
@@ -2115,8 +2153,7 @@ def local_gpualloc(node):
new_out
.
type
.
broadcastable
):
assert
b_new
or
(
not
b_old
)
new_out
=
tensor
.
patternbroadcast
(
new_out
,
old_out
.
broadcastable
)
# if old_out.type != new_out.type:
#import pdb; pdb.set_trace()
return
[
new_out
]
...
...
@@ -2139,12 +2176,14 @@ def local_gpualloc_memset_0(node):
if
(
isinstance
(
inp
,
CudaNdarrayConstant
)
and
inp
.
data
.
size
==
1
and
(
numpy
.
asarray
(
inp
.
data
)
==
0
)
.
all
()):
new_out
=
GpuAlloc
(
memset_0
=
True
)(
*
node
.
inputs
)
old_bcast
=
node
.
outputs
[
0
]
.
type
.
broadcastable
if
new_out
.
type
.
broadcastable
!=
old_bcast
:
# check that we did not try discarding a broadcastable dimension
assert
not
any
(
b_old
and
not
b_new
for
b_old
,
b_new
in
zip
(
old_bcast
,
new_out
.
type
.
broadcastable
))
# check that we did not try discarding a broadcastable
# dimension
assert
not
any
(
b_old
and
not
b_new
for
b_old
,
b_new
in
zip
(
old_bcast
,
new_out
.
type
.
broadcastable
))
# force old broadcasting pattern; we must not change it here
new_out
=
tensor
.
patternbroadcast
(
new_out
,
old_bcast
)
return
[
new_out
]
...
...
@@ -2177,6 +2216,7 @@ def local_gpu_eye(node):
if
(
host_input
.
owner
and
isinstance
(
host_input
.
owner
.
op
,
tensor
.
Eye
)
and
host_input
.
owner
.
op
.
dtype
==
"float32"
):
return
[
gpu_eye
(
*
host_input
.
owner
.
inputs
)]
if
isinstance
(
node
.
op
,
tensor
.
Eye
)
and
node
.
op
.
dtype
==
"float32"
:
if
any
([(
i
.
owner
and
isinstance
(
i
.
owner
.
op
,
HostFromGpu
))
...
...
@@ -2188,6 +2228,7 @@ def local_gpu_eye(node):
def
safe_to_gpu
(
x
):
if
(
isinstance
(
x
.
type
,
tensor
.
TensorType
)
and
x
.
type
.
dtype
==
'float32'
):
return
as_cuda_ndarray_variable
(
x
)
else
:
return
x
...
...
@@ -2242,6 +2283,7 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None):
def
tensor_to_cuda
(
x
):
if
(
isinstance
(
x
.
type
,
tensor
.
TensorType
)
and
x
.
type
.
dtype
==
'float32'
):
y
=
CudaNdarrayType
(
broadcastable
=
x
.
type
.
broadcastable
)()
if
x
.
name
:
y
.
name
=
x
.
name
+
'[cuda]'
...
...
@@ -2264,7 +2306,8 @@ def local_gpu_extract_diagonal(node):
theano
.
tensor
.
TensorType
)):
inp
=
node
.
inputs
[
0
]
if
inp
.
owner
and
isinstance
(
inp
.
owner
.
op
,
HostFromGpu
):
return
[
host_from_gpu
(
nlinalg
.
extract_diag
(
as_cuda_ndarray_variable
(
inp
)))]
return
[
host_from_gpu
(
nlinalg
.
extract_diag
(
as_cuda_ndarray_variable
(
inp
)))]
if
isinstance
(
node
.
op
,
GpuFromHost
):
host_input
=
node
.
inputs
[
0
]
if
(
host_input
.
owner
and
...
...
@@ -2300,6 +2343,7 @@ def gpuScanOptimization(node):
isinstance
(
host_input
.
owner
.
op
,
scan_op
.
Scan
)
and
not
host_input
.
owner
.
op
.
info
[
'gpu'
]
and
len
(
host_input
.
owner
.
outputs
)
==
1
):
# Note that we are not doing the right thing here !!
# This is because the local optimizer expects only one
# output that corresponds to the input of ``node``
...
...
@@ -2353,6 +2397,7 @@ def gpuScanOptimization(node):
# scan(host_from_gpu) -> host_from_gpu(GPUscan)
if
(
type
(
node
.
op
)
==
scan_op
.
Scan
and
not
node
.
op
.
info
[
'gpu'
]):
if
any
([(
i
.
owner
and
isinstance
(
i
.
owner
.
op
,
HostFromGpu
))
for
i
in
node
.
inputs
]):
...
...
@@ -2434,7 +2479,8 @@ optdb.register('gpu_scanOp_make_inplace',
# @alpha_merge(GpuSparseBlockOuter, alpha_in=5, beta_in=?, nd=4)
# def local_merge_blocksparse_alpha(node, *inputs):
# """
# GpuElemwise{mul}(lr, GpuSparseBlockOuter) -> GpuSparseBlockOuter(..., alpha=lr)
# GpuElemwise{mul}(lr, GpuSparseBlockOuter) ->
# GpuSparseBlockOuter(..., alpha=lr)
# """
# return [gpu_sparse_block_outer(*inputs)]
...
...
@@ -2465,8 +2511,7 @@ def _clear_host_from_gpu(inputs):
return
clean_inputs
@register_meta_opt
(
SparseBlockGemv
,
[
"gpu_opt"
,
"gpu_local_optimizations"
],
0.
,
'fast_run'
,
'fast_compile'
,
'gpu'
)
@register_opt
()
@local_optimizer
([
SparseBlockGemv
,
GpuFromHost
])
def
gpu_sparse_block_gemv_opt
(
node
):
"""
...
...
@@ -2493,8 +2538,7 @@ def gpu_sparse_block_gemv_opt(node):
return
[
GpuSparseBlockGemv
(
meta_node
.
op
.
inplace
)(
*
inputs
)]
@register_meta_opt
(
SparseBlockOuter
,
[
"gpu_opt"
,
"gpu_local_optimizations"
],
0.
,
'fast_run'
,
'fast_compile'
,
'gpu'
)
@register_opt
()
@local_optimizer
([
SparseBlockOuter
,
GpuFromHost
])
def
gpu_sparse_block_outer_opt
(
node
):
"""
...
...
@@ -2522,4 +2566,36 @@ def gpu_sparse_block_outer_opt(node):
return
[
GpuSparseBlockOuter
(
meta_node
.
op
.
inplace
)(
*
inputs
)]
@local_optimizer
([
GpuSparseBlockGemv
],
inplace
=
True
)
def
local_inplace_gpu_sparse_block_gemv
(
node
):
"""
GpuSparseBlockGemv(inplace=False) -> GpuSparseBlockGemv(inplace=True)
"""
if
isinstance
(
node
.
op
,
GpuSparseBlockGemv
)
and
not
node
.
op
.
inplace
:
new_node
=
gpu_sparse_block_gemv_inplace
(
*
node
.
inputs
)
return
[
new_node
]
return
False
compile
.
optdb
.
register
(
'local_inplace_gpu_sparse_block_gemv'
,
TopoOptimizer
(
local_inplace_gpu_sparse_block_gemv
,
failure_callback
=
TopoOptimizer
.
warn_inplace
),
60
,
'fast_run'
,
'inplace'
,
'gpu'
)
# DEBUG
@local_optimizer
([
GpuSparseBlockOuter
],
inplace
=
True
)
def
local_inplace_gpu_sparse_block_outer
(
node
):
"""
GpuSparseBlockOuter(inplace=False) -> GpuSparseBlockOuter(inplace=True)
"""
if
isinstance
(
node
.
op
,
GpuSparseBlockOuter
)
and
not
node
.
op
.
inplace
:
new_node
=
gpu_sparse_block_outer_inplace
(
*
node
.
inputs
)
return
[
new_node
]
return
False
compile
.
optdb
.
register
(
'local_inplace_gpu_sparse_block_outer'
,
TopoOptimizer
(
local_inplace_gpu_sparse_block_outer
,
failure_callback
=
TopoOptimizer
.
warn_inplace
),
60
,
'fast_run'
,
'inplace'
,
'gpu'
)
# DEBUG
import
theano.sandbox.cuda.extra_ops
theano/sandbox/opt.py
浏览文件 @
aeb8c035
...
...
@@ -2,170 +2,42 @@
Optimizations addressing the ops in sandbox root directory
"""
import
bisect
import
logging
from
theano.compile
import
optdb
from
theano.gof
import
local_optimizer
,
EquilibriumDB
from
theano.tensor.opt
import
register_specialize
from
theano
import
compile
# to register the optimizer built by this file
from
theano
import
gof
from
theano.sandbox.blocksparse
import
(
SparseBlockGemv
,
SparseBlockOuter
,
sparse_block_gemv
,
sparse_block_outer
,
sparse_block_gemv_inplace
,
sparse_block_outer_inplace
,
CpuSparseBlockGemv
,
CpuSparseBlockOuter
)
# Module-level logger named after this module, so its messages can be
# filtered through the standard `logging` configuration.
_logger = logging.getLogger('theano.sandbox.opt')
def
_db_exists
(
db
,
db_name
):
"""
Tests whether the full path from `db_name[0]` down to
`db_name[-1]` exists.
Parameters
----------
db: `theano.gof.optdb.DB`
A dataset of optimisations or sub-datasets.
db_name: list or tuple of strings
Names of datasets from given one `db[db_name[0]]` down
to the dataset of interest where to register.
ex: ['level_1_dataset', 'level_2_dataset']
"""
if
len
(
db_name
)
==
1
:
return
db_name
[
0
]
in
db
.
_names
return
db_name
[
0
]
in
db
.
_names
and
_db_exists
(
db
[
db_name
[
0
]],
db_name
[
1
:])
def
_db_register
(
db
,
db_name
,
*
args
):
"""
Registers an object in last datasets given in db_name. `db_name[-1]`
is deep in the hierarchy of `db`.
Parameters
----------
db: `theano.gof.optdb.DB`
A dataset of optimisations or sub-datasets.
db_name: list or tuple of strings
Names of datasets from given one `db[db_name[0]]` down
to the dataset of interest where to register.
ex: ['level_1_dataset', 'level_2_dataset']
"""
if
len
(
db_name
)
==
0
:
return
db
.
register
(
*
args
)
return
_db_register
(
db
[
db_name
[
0
]],
db_name
[
1
:],
*
args
)
def
_db_positions
(
db
,
db_name
,
positions
=
()):
"""
Returns the list of positions of all databases from `db_name[0]`
down to `db_name[-1]`. The path is hierarchical, hence `db_name[0]`
is in `db`, `db_name[1]` is in `db[db_name[0]]`, etc.
Parameters
----------
db: `theano.gof.optdb.DB`
A dataset of optimisations or sub-datasets.
db_name: list or tuple of strings
Names of datasets from given one `db[db_name[0]]` down
to the dataset of interests.
ex: ['level_1_dataset', 'level_2_dataset']
"""
if
len
(
db_name
)
==
0
:
return
positions
db_position
=
db
.
__position__
.
get
(
db_name
[
0
],
0.
)
return
_db_positions
(
db
[
db_name
[
0
]],
db_name
[
1
:],
positions
+
(
db_position
,
))
def register_meta_opt(op_class, db_name, position, *args):
    """
    Register a given optimization under the given database name and save
    the optimization information in `op_class.registered_opts`.

    Parameters
    ----------
    op_class: `theano.gof.Op`
        A meta Op which has multiple implementations available
        for optimization.
    db_name: string, list or tuple of strings
        A string if the optimization is inserted in `theano.compile.optdb`
        directly. A list is used to insert an optimization deep inside a
        hierarchy of optimization databases.
    position: int or float
        Position of the optimisation in the target dataset.
        (Position in the deep database if not optdb)
    *args
        Arguments (tags) used to register the optimization.
    """
    if isinstance(db_name, str):
        db_name = [db_name]

    def call(local_meta_opt):
        if not _db_exists(optdb, db_name):
            # The target sub-database does not exist yet: create it under
            # its parent path. BUGFIX: the parent path of the last element
            # is db_name[:-1]; the previous db_name[:-2] skipped one level
            # and registered the new database in the wrong place.
            # NOTE(review): only the last level is created; intermediate
            # levels in `db_name` must already exist.
            # TODO: Would another default DB be better?
            _db_register(optdb, db_name[:-1], db_name[-1],
                         EquilibriumDB(), position, *args)
        _db_register(optdb, db_name, local_meta_opt.__name__,
                     local_meta_opt, *args)
        # Keep `registered_opts` sorted by database positions so that the
        # implementations are tried in priority order.
        positions = _db_positions(optdb, db_name)
        entry = (positions, local_meta_opt.__name__)
        # BUGFIX: bisect.bisect_left(a, x) takes the sorted sequence
        # first and the new item second; the arguments were swapped.
        # The search key now also uses the optimizer's name, matching
        # the tuples actually stored in `registered_opts` (comparing a
        # function object against strings would not sort correctly).
        idx = bisect.bisect_left(op_class.registered_opts, entry)
        op_class.registered_opts.insert(idx, entry)
        return local_meta_opt

    return call
@register_meta_opt(SparseBlockGemv, ["meta_cpu"], 51.0,
                   "fast_run", "fast_compile")
@local_optimizer([SparseBlockGemv])
def cpu_sparse_block_gemv_opt(node):
    """
    SparseBlockGemv -> CpuSparseBlockGemv
    """
    # Build the CPU op with the same inplace flag, then apply it to the
    # unchanged inputs.
    cpu_op = CpuSparseBlockGemv(node.op.inplace)
    return [cpu_op(*node.inputs)]
@register_meta_opt(SparseBlockOuter, ["meta_cpu"], 51.0,
                   "fast_run", "fast_compile")
@local_optimizer([SparseBlockOuter])
def cpu_sparse_block_outer_opt(node):
    """
    SparseBlockOuter -> CpuSparseBlockOuter
    """
    # Build the CPU op with the same inplace flag, then apply it to the
    # unchanged inputs.
    cpu_op = CpuSparseBlockOuter(node.op.inplace)
    return [cpu_op(*node.inputs)]
sparse_block_outer_inplace
)
@register_specialize
@local_optimizer
([
sparse_block_gemv
],
inplace
=
True
)
def
local_inplace_block_sparse_gemv
(
node
):
@gof.local_optimizer([SparseBlockGemv], inplace=True)
def local_inplace_sparse_block_gemv(node):
    """
    Replace a non-inplace SparseBlockGemv by its inplace variant.

    SparseBlockGemv(inplace=False) -> SparseBlockGemv(inplace=True)
    """
    op = node.op
    # Guard clause: only rewrite the non-inplace op.
    if not isinstance(op, SparseBlockGemv) or op.inplace:
        return False
    return [sparse_block_gemv_inplace(*node.inputs)]
# Register the inplace rewrite in the global optimizer database at
# position 60 with the 'fast_run'/'inplace' tags.
# failure_callback=gof.TopoOptimizer.warn_inplace: presumably a failed
# inplace substitution warns instead of aborting compilation —
# confirm against TopoOptimizer.warn_inplace.
compile.optdb.register('local_inplace_sparse_block_gemv',
                       gof.TopoOptimizer(
                           local_inplace_sparse_block_gemv,
                           failure_callback=gof.TopoOptimizer.warn_inplace),
                       60, 'fast_run', 'inplace')
# DEBUG
@register_specialize
@local_optimizer
([
sparse_block_outer
],
inplace
=
True
)
def
local_inplace_block_sparse_outer
(
node
):
@gof.local_optimizer([SparseBlockOuter], inplace=True)
def local_inplace_sparse_block_outer(node):
    """
    Replace a non-inplace SparseBlockOuter by its inplace variant.

    SparseBlockOuter(inplace=False) -> SparseBlockOuter(inplace=True)
    """
    op = node.op
    # Guard clause: only rewrite the non-inplace op.
    if not isinstance(op, SparseBlockOuter) or op.inplace:
        return False
    return [sparse_block_outer_inplace(*node.inputs)]
# Register the inplace rewrite in the global optimizer database at
# position 60 with the 'fast_run'/'inplace' tags.
# failure_callback=gof.TopoOptimizer.warn_inplace: presumably a failed
# inplace substitution warns instead of aborting compilation —
# confirm against TopoOptimizer.warn_inplace.
compile.optdb.register('local_inplace_sparse_block_outer',
                       gof.TopoOptimizer(
                           local_inplace_sparse_block_outer,
                           failure_callback=gof.TopoOptimizer.warn_inplace),
                       60, 'fast_run', 'inplace')
# DEBUG
theano/sandbox/tests/test_blocksparse.py
浏览文件 @
aeb8c035
...
...
@@ -11,7 +11,7 @@ from theano import tensor
import
theano.tests.unittest_tools
as
utt
from
theano.sandbox.blocksparse
import
sparse_block_dot
,
\
cpu_sparse_block_gemv
,
cpu_
sparse_block_outer
sparse_block_gemv
,
sparse_block_outer
class
BlockSparse_Gemv_and_Outer
(
unittest
.
TestCase
):
...
...
@@ -24,8 +24,8 @@ class BlockSparse_Gemv_and_Outer(unittest.TestCase):
self
.
mode
=
theano
.
compile
.
get_default_mode
()
.
excluding
(
'constant_folding'
)
self
.
gemv_op
=
cpu_
sparse_block_gemv
self
.
outer_op
=
cpu_
sparse_block_outer
self
.
gemv_op
=
sparse_block_gemv
self
.
outer_op
=
sparse_block_outer
@staticmethod
def
gemv_data
():
...
...
theano/sandbox/tests/test_opt.py
浏览文件 @
aeb8c035
import
theano
from
theano
import
tensor
from
theano.sandbox.blocksparse
import
CpuSparseBlockGemv
,
\
CpuSparseBlockOuter
,
sparse_block_dot
from
theano.sandbox.blocksparse
import
sparse_block_dot
def
test_blocksparse_
cpu
_gemv_opt
():
def
test_blocksparse_
inplace
_gemv_opt
():
b
=
tensor
.
fmatrix
()
W
=
tensor
.
ftensor4
()
h
=
tensor
.
ftensor3
()
...
...
@@ -15,10 +14,13 @@ def test_blocksparse_cpu_gemv_opt():
f
=
theano
.
function
([
W
,
h
,
iIdx
,
b
,
oIdx
],
o
)
assert
isinstance
(
f
.
maker
.
fgraph
.
toposort
()[
-
1
]
.
op
,
CpuSparseBlockGemv
)
if
theano
.
config
.
mode
==
"FAST_COMPILE"
:
assert
not
f
.
maker
.
fgraph
.
toposort
()[
-
1
]
.
op
.
inplace
else
:
assert
f
.
maker
.
fgraph
.
toposort
()[
-
1
]
.
op
.
inplace
def
test_blocksparse_
cpu
_outer_opt
():
def
test_blocksparse_
inplace
_outer_opt
():
b
=
tensor
.
fmatrix
()
W
=
tensor
.
ftensor4
()
h
=
tensor
.
ftensor3
()
...
...
@@ -32,4 +34,7 @@ def test_blocksparse_cpu_outer_opt():
f
=
theano
.
function
([
W
,
h
,
iIdx
,
b
,
oIdx
],
[
o
,
tensor
.
grad
(
o
.
sum
(),
wrt
=
W
)])
assert
isinstance
(
f
.
maker
.
fgraph
.
toposort
()[
-
1
]
.
op
,
CpuSparseBlockOuter
)
if
theano
.
config
.
mode
==
"FAST_COMPILE"
:
assert
not
f
.
maker
.
fgraph
.
toposort
()[
-
1
]
.
op
.
inplace
else
:
assert
f
.
maker
.
fgraph
.
toposort
()[
-
1
]
.
op
.
inplace
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论