Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
54fe4a7f
提交
54fe4a7f
authored
3月 14, 2016
作者:
Chiheb Trabelsi
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
opt.py has been modified in order to respect the flake8 style.
上级
1a3948cc
隐藏空白字符变更
内嵌
并排
正在显示
1 个修改的文件
包含
61 行增加
和
81 行删除
+61
-81
opt.py
theano/sandbox/cuda/opt.py
+61
-81
没有找到文件。
theano/sandbox/cuda/opt.py
浏览文件 @
54fe4a7f
...
...
@@ -10,22 +10,32 @@ import warnings
import
numpy
from
six.moves
import
reduce
,
xrange
from
.
import
dnn
import
theano
from
theano
import
scalar
as
scal
from
theano
import
config
,
tensor
,
gof
import
theano.ifelse
import
theano.tensor.signal.pool
import
theano.tensor.nnet
import
theano.tensor.nnet.neighbours
# Convolution
from
theano.tensor.nnet
import
conv
from
theano.tensor.nnet.ConvGrad3D
import
ConvGrad3D
from
theano.tensor.nnet.ConvTransp3D
import
ConvTransp3D
# Pooling
import
theano.tensor.signal.pool
as
pool
from
theano.compile
import
optdb
from
theano.gof
import
(
local_optimizer
,
EquilibriumDB
,
ProxyDB
,
Optimizer
,
TopoOptimizer
,
toolbox
)
from
theano.gof.opt
import
LocalMetaOptimizer
from
theano.sandbox.cuda.basic_ops
import
gpu_join
,
GpuJoin
from
theano.sandbox.cuda
import
as_cuda_ndarray_variable
from
theano.sandbox.cuda.basic_ops
import
(
gpu_eye
,
gpu_contiguous
,
gpu_from_host
,
host_from_gpu
,
GpuFromHost
,
HostFromGpu
,
GpuContiguous
,
GpuElemwise
,
GpuDimShuffle
,
GpuReshape
,
GpuCAReduce
,
GpuFlatten
,
gpu_flatten
,
gpu_flatten
,
GpuSubtensor
,
GpuAdvancedSubtensor1
,
GpuAdvancedIncSubtensor1
,
GpuAdvancedIncSubtensor1_dev20
,
GpuIncSubtensor
,
gpu_alloc
,
GpuAlloc
,
gpu_shape
,
GpuSplit
,
GpuAllocEmpty
)
...
...
@@ -137,8 +147,6 @@ register_opt(name='local_gpu_reshape_chain')(
# This is a partial list of CPU ops that can be in some circonstance
# moved to the GPU. This list is used by an optimization.
# Hopefully, we can keep this list up to date.
import
theano.tensor.signal.pool
import
theano.tensor.nnet.neighbours
cpu_ops_moved_to_gpu
=
[
tensor
.
blas
.
Dot22
,
tensor
.
blas
.
Dot22Scalar
,
tensor
.
blas
.
Gemm
,
tensor
.
blas
.
Gemv
,
tensor
.
blas
.
Ger
,
tensor
.
nnet
.
conv
.
ConvOp
,
...
...
@@ -630,7 +638,7 @@ def local_gpu_batched_dot(node):
if
y
.
ndim
==
2
:
y_
=
y_
.
dimshuffle
(
0
,
1
,
"x"
)
z
=
GpuBatchedDot
()(
as_cuda_ndarray_variable
(
x_
),
as_cuda_ndarray_variable
(
y_
))
as_cuda_ndarray_variable
(
y_
))
# unpad z shape
if
x
.
ndim
==
2
:
z
=
z
.
dimshuffle
(
0
,
*
range
(
2
,
z
.
ndim
))
...
...
@@ -850,8 +858,8 @@ def local_gpu_careduce(node):
if
x
.
type
==
node
.
outputs
[
0
]
.
type
:
return
[
x
]
elif
(
all
([
c
!=
"output"
and
isinstance
(
c
.
op
,
GpuFromHost
)
for
c
,
i
in
node
.
outputs
[
0
]
.
clients
])
and
x
.
owner
and
x
.
owner
.
op
.
__class__
in
for
c
,
i
in
node
.
outputs
[
0
]
.
clients
])
and
x
.
owner
and
x
.
owner
.
op
.
__class__
in
cpu_ops_moved_to_gpu
):
# It is not always good to transfer the reduction to
# the GPU when the clients are on the GPU but not the
...
...
@@ -970,7 +978,7 @@ def local_gpu_elemwise_careduce(node):
# automatically add more case, as some like trigonometic
# operation with some reduction pattern will probably result
# to slow down.
isinstance
(
node
.
inputs
[
0
]
.
owner
.
op
.
scalar_op
,
scal
.
basic
.
Sqr
)):
isinstance
(
node
.
inputs
[
0
]
.
owner
.
op
.
scalar_op
,
scal
.
basic
.
Sqr
)):
op
=
node
.
op
inp
=
node
.
inputs
[
0
]
.
owner
.
inputs
[
0
]
...
...
@@ -1023,7 +1031,8 @@ def local_gpu_flatten(node):
return
[
gpu_flatten
(
host_input
.
owner
.
inputs
[
0
],
outdim
)(
as_cuda_ndarray_variable
(
host_input
.
owner
.
inputs
[
0
]))]
if
isinstance
(
node
.
op
,
tensor
.
Flatten
):
x
,
=
node
.
inputs
x
,
shp
=
node
.
inputs
outdim
=
node
.
op
.
outdim
if
x
.
owner
and
isinstance
(
x
.
owner
.
op
,
HostFromGpu
):
outdim
=
node
.
op
.
outdim
gpu_x
,
=
x
.
owner
.
inputs
...
...
@@ -1050,15 +1059,13 @@ def local_gpu_subtensor(node):
*
coords
)]
if
isinstance
(
node
.
op
,
tensor
.
Subtensor
):
x
=
node
.
inputs
[
0
]
if
(
x
.
owner
and
isinstance
(
x
.
owner
.
op
,
HostFromGpu
)
and
x
.
dtype
==
"float32"
):
if
(
x
.
owner
and
x
.
dtype
==
"float32"
and
isinstance
(
x
.
owner
.
op
,
HostFromGpu
)):
gpu_x
=
x
.
owner
.
inputs
[
0
]
if
(
gpu_x
.
owner
and
isinstance
(
gpu_x
.
owner
.
op
,
GpuFromHost
)
and
# And it is a shared var or an input of the graph.
not
gpu_x
.
owner
.
inputs
[
0
]
.
owner
):
if
(
gpu_x
.
owner
and
# And it is a shared var or an input of the graph.
not
(
gpu_x
.
owner
.
inputs
[
0
]
.
owner
)
and
isinstance
(
gpu_x
.
owner
.
op
,
GpuFromHost
)):
if
len
(
x
.
clients
)
==
1
:
if
any
([
n
==
'output'
or
isinstance
(
n
.
op
,
GpuOp
)
...
...
@@ -1119,9 +1126,7 @@ def local_gpu_advanced_incsubtensor1(node):
'least
\'
0.6
\'
.'
,
stacklevel
=
1
)
active_device_no
=
theano
.
sandbox
.
cuda
.
active_device_number
()
compute_capability
=
device_properties
(
active_device_no
)[
'major'
]
if
(
compute_capability
<
2
or
x
.
ndim
!=
2
or
y
.
ndim
!=
2
):
if
(
compute_capability
<
2
or
y
.
ndim
!=
2
or
x
.
ndim
!=
2
):
gpu_op
=
GpuAdvancedIncSubtensor1
(
set_instead_of_inc
=
set_instead_of_inc
)
...
...
@@ -1162,9 +1167,7 @@ def local_gpu_advanced_incsubtensor1(node):
active_device_no
=
theano
.
sandbox
.
cuda
.
active_device_number
()
compute_capability
=
device_properties
(
active_device_no
)[
'major'
]
if
(
compute_capability
<
2
or
x
.
ndim
!=
2
or
y
.
ndim
!=
2
):
if
(
compute_capability
<
2
or
y
.
ndim
!=
2
or
x
.
ndim
!=
2
):
gpu_op
=
GpuAdvancedIncSubtensor1
(
set_instead_of_inc
=
set_instead_of_inc
)
else
:
...
...
@@ -1203,8 +1206,8 @@ def local_gpu_incsubtensor(node):
# Incrementing a float32 x results in a float32
# output even if y is float64, so we can downcast
# y to put it on GPU
elif
type
(
node
.
op
)
==
tensor
.
IncSubtensor
and
\
node
.
inputs
[
0
]
.
dtype
==
"float32"
:
elif
(
type
(
node
.
op
)
==
tensor
.
IncSubtensor
and
node
.
inputs
[
0
]
.
dtype
==
"float32"
)
:
x
,
y
=
node
.
inputs
[
0
:
2
]
assert
isinstance
(
x
.
type
,
tensor
.
TensorType
)
assert
isinstance
(
y
.
type
,
tensor
.
TensorType
)
...
...
@@ -1346,8 +1349,6 @@ def cast(x, dtype):
cast_op
=
theano
.
tensor
.
Elemwise
(
scal
.
Identity
(
scal
.
specific_out
(
stype
)))
return
cast_op
(
x
)
import
theano.tensor.nnet
@register_opt
()
@local_optimizer
([
tensor
.
nnet
.
CrossentropySoftmaxArgmax1HotWithBias
])
...
...
@@ -1419,18 +1420,13 @@ def local_gpu_softmax_with_bias(node):
return
False
# Convolution
from
theano.tensor.nnet
import
conv
def
_gpu_conv_to_fftconv
(
node
):
# shared helper function for local_conv_fft_valid and local_conv_fft_full.
# we import conv2d_fft locally to avoid pycuda warnings
from
theano.sandbox.cuda.fftconv
import
conv2d_fft
kwargs
=
{
'border_mode'
:
node
.
op
.
border_mode
}
if
(
node
.
op
.
imshp
is
not
None
and
node
.
op
.
imshp
[
-
1
]
is
not
None
and
node
.
op
.
imshp
[
-
1
]
%
2
==
1
):
if
(
node
.
op
.
imshp
is
not
None
and
node
.
op
.
imshp
[
-
1
]
%
2
==
1
and
node
.
op
.
imshp
[
-
1
]
is
not
None
):
kwargs
[
'pad_last_dim'
]
=
True
# If the user supplied the full nonsymbolic image_shape and
...
...
@@ -1459,9 +1455,8 @@ def _gpu_conv_to_fftconv(node):
@local_optimizer
([
GpuConv
])
def
local_conv_fft_valid
(
node
):
if
isinstance
(
node
.
op
,
GpuConv
):
if
(
node
.
op
.
border_mode
==
'valid'
and
node
.
op
.
subsample
==
(
1
,
1
)
and
node
.
op
.
fft_opt
):
if
(
node
.
op
.
border_mode
==
'valid'
and
node
.
op
.
fft_opt
and
node
.
op
.
subsample
==
(
1
,
1
)):
return
[
_gpu_conv_to_fftconv
(
node
)]
return
False
...
...
@@ -1470,9 +1465,8 @@ def local_conv_fft_valid(node):
@local_optimizer
([
GpuConv
])
def
local_conv_fft_full
(
node
):
if
isinstance
(
node
.
op
,
GpuConv
):
if
(
node
.
op
.
border_mode
==
'full'
and
node
.
op
.
subsample
==
(
1
,
1
)
and
node
.
op
.
fft_opt
):
if
(
node
.
op
.
border_mode
==
'full'
and
node
.
op
.
fft_opt
and
node
.
op
.
subsample
==
(
1
,
1
)):
return
[
_gpu_conv_to_fftconv
(
node
)]
return
...
...
@@ -1586,7 +1580,7 @@ def local_gpu_conv(node):
@local_optimizer
([
GpuConv
])
def
local_conv_gemm
(
node
):
if
(
isinstance
(
node
.
op
,
GpuConv
)
and
node
.
op
.
border_mode
in
[
'full'
,
'valid'
]):
node
.
op
.
border_mode
in
[
'full'
,
'valid'
]):
img
,
kern
=
node
.
inputs
border_mode
=
node
.
op
.
border_mode
...
...
@@ -1659,7 +1653,6 @@ conv_groupopt.register('conv_fft_full', local_conv_fft_full, 10,
'conv_fft'
)
# cuDNN is the second, but only registered if cuDNN is available.
# It can be disabled by excluding 'conv_dnn' or 'cudnn'.
from
.
import
dnn
# We can't check at import if dnn is available, so we must always
# register it. This do not cause problem as if it is not avail, the
# opt will do nothing.
...
...
@@ -1708,9 +1701,8 @@ class ConvMetaOptimizer(LocalCudaMetaOptimizer):
shapes
=
((
node
.
op
.
bsize
,)
+
node
.
op
.
imshp
,
(
node
.
op
.
nkern
,
nchannels
)
+
node
.
op
.
kshp
)
for
(
var
,
shape
)
in
zip
(
vars
,
shapes
):
if
((
var
in
inputs
)
and
(
shape
is
not
None
)
and
not
any
(
s
is
None
for
s
in
shape
)):
if
((
var
in
inputs
)
and
(
shape
is
not
None
)
and
not
any
(
s
is
None
for
s
in
shape
)):
result
[
var
]
=
theano
.
shared
(
# TODO: Use var.type.filter when cuda_ndarray.filter
...
...
@@ -1763,8 +1755,6 @@ def local_conv3d_fft(node):
gpu_optimizer
.
register
(
"conv3d_fft"
,
local_conv3d_fft
)
from
theano.tensor.nnet.ConvGrad3D
import
ConvGrad3D
@local_optimizer
([
ConvGrad3D
])
def
local_convgrad3d_fft
(
node
):
...
...
@@ -1775,7 +1765,7 @@ def local_convgrad3d_fft(node):
except
tensor
.
NotScalarConstantError
:
return
False
if
(
isinstance
(
node
.
op
,
ConvGrad3D
)
and
(
stride_x
,
stride_y
,
stride_z
)
==
(
1
,
1
,
1
)):
(
stride_x
,
stride_y
,
stride_z
)
==
(
1
,
1
,
1
)):
# we import conv3d_fft locally to avoid pycuda warnings
from
theano.sandbox.cuda.fftconv
import
conv3d_fft
...
...
@@ -1794,8 +1784,6 @@ def local_convgrad3d_fft(node):
gpu_optimizer
.
register
(
"convgrad3d_fft"
,
local_convgrad3d_fft
)
from
theano.tensor.nnet.ConvTransp3D
import
ConvTransp3D
@local_optimizer
([
ConvTransp3D
])
def
local_convtransp3d_fft
(
node
):
...
...
@@ -1806,7 +1794,7 @@ def local_convtransp3d_fft(node):
except
tensor
.
NotScalarConstantError
:
return
False
if
(
isinstance
(
node
.
op
,
ConvTransp3D
)
and
(
stride_x
,
stride_y
,
stride_z
)
==
(
1
,
1
,
1
)):
(
stride_x
,
stride_y
,
stride_z
)
==
(
1
,
1
,
1
)):
# we import conv3d_fft locally to avoid pycuda warnings
from
theano.sandbox.cuda.fftconv
import
conv3d_fft
# Shuffle filters from (oc, 0, 1, t, ic) to (ic, oc, 0, 1, t)
...
...
@@ -1894,15 +1882,11 @@ def local_convtransp3d_gemm(node):
gpu_optimizer
.
register
(
"convtransp3d_gemm"
,
local_convtransp3d_gemm
)
# Pooling
import
theano.tensor.signal.pool
as
pool
@register_opt
()
@local_optimizer
([
pool
.
Pool
])
def
local_gpu_downsample_factor_max
(
node
):
if
(
isinstance
(
node
.
op
,
pool
.
Pool
)
and
node
.
op
.
ds
==
node
.
op
.
st
):
if
(
isinstance
(
node
.
op
,
pool
.
Pool
)
and
node
.
op
.
ds
==
node
.
op
.
st
):
assert
node
.
op
.
__props__
==
(
'ds'
,
'ignore_border'
,
'st'
,
'padding'
,
'mode'
)
...
...
@@ -1917,14 +1901,12 @@ def local_gpu_downsample_factor_max(node):
@register_opt
()
@local_optimizer
([
pool
.
MaxPoolGrad
])
def
local_gpu_downsample_factor_max_grad
(
node
):
if
(
isinstance
(
node
.
op
,
pool
.
MaxPoolGrad
)
and
node
.
op
.
ds
==
node
.
op
.
st
):
if
(
isinstance
(
node
.
op
,
pool
.
MaxPoolGrad
)
and
node
.
op
.
ds
==
node
.
op
.
st
):
assert
node
.
op
.
__props__
==
(
'ds'
,
'ignore_border'
,
'st'
,
'padding'
,
'mode'
)
if
(
node
.
op
.
padding
!=
(
0
,
0
)
or
node
.
op
.
mode
!=
'max'
or
node
.
op
.
st
!=
node
.
op
.
ds
):
node
.
op
.
mode
!=
'max'
or
node
.
op
.
st
!=
node
.
op
.
ds
):
return
x
,
z
,
gz
=
node
.
inputs
...
...
@@ -1955,9 +1937,6 @@ def local_gpu_downsample_factor_max_grad_grad(node):
as_cuda_ndarray_variable
(
gx
)))]
from
theano.sandbox.cuda.basic_ops
import
gpu_join
,
GpuJoin
@register_opt
()
@local_optimizer
([
tensor
.
Join
])
def
local_gpu_join
(
node
):
...
...
@@ -2252,8 +2231,8 @@ def local_gpualloc_memset_0(node):
if
isinstance
(
node
.
op
,
GpuAlloc
)
and
not
node
.
op
.
memset_0
:
inp
=
node
.
inputs
[
0
]
if
(
isinstance
(
inp
,
CudaNdarrayConstant
)
and
inp
.
data
.
size
==
1
and
(
numpy
.
asarray
(
inp
.
data
)
==
0
)
.
all
()):
inp
.
data
.
size
==
1
and
(
numpy
.
asarray
(
inp
.
data
)
==
0
)
.
all
()):
new_out
=
GpuAlloc
(
memset_0
=
True
)(
*
node
.
inputs
)
old_bcast
=
node
.
outputs
[
0
]
.
type
.
broadcastable
...
...
@@ -2308,8 +2287,9 @@ def local_gpu_eye(node):
if
isinstance
(
node
.
op
,
GpuFromHost
):
host_input
=
node
.
inputs
[
0
]
if
(
host_input
.
owner
and
isinstance
(
host_input
.
owner
.
op
,
tensor
.
Eye
)
and
host_input
.
owner
.
op
.
dtype
==
"float32"
):
isinstance
(
host_input
.
owner
.
op
,
tensor
.
Eye
)
and
host_input
.
owner
.
op
.
dtype
==
"float32"
):
if
tensor
.
extract_constant
(
host_input
.
owner
.
inputs
[
2
])
!=
0
:
return
return
[
gpu_eye
(
*
host_input
.
owner
.
inputs
)]
...
...
@@ -2324,7 +2304,7 @@ def local_gpu_eye(node):
def
safe_to_gpu
(
x
):
if
(
isinstance
(
x
.
type
,
tensor
.
TensorType
)
and
x
.
type
.
dtype
==
'float32'
):
x
.
type
.
dtype
==
'float32'
):
return
as_cuda_ndarray_variable
(
x
)
else
:
...
...
@@ -2379,7 +2359,7 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None):
def
tensor_to_cuda
(
x
):
if
(
isinstance
(
x
.
type
,
tensor
.
TensorType
)
and
x
.
type
.
dtype
==
'float32'
):
x
.
type
.
dtype
==
'float32'
):
y
=
CudaNdarrayType
(
broadcastable
=
x
.
type
.
broadcastable
)()
if
x
.
name
:
...
...
@@ -2437,9 +2417,9 @@ def gpuScanOptimization(node):
if
isinstance
(
node
.
op
,
GpuFromHost
):
host_input
=
node
.
inputs
[
0
]
if
(
host_input
.
owner
and
isinstance
(
host_input
.
owner
.
op
,
scan_op
.
Scan
)
and
not
host_input
.
owner
.
op
.
info
[
'gpu'
]
and
len
(
host_input
.
owner
.
outputs
)
==
1
):
isinstance
(
host_input
.
owner
.
op
,
scan_op
.
Scan
)
and
not
host_input
.
owner
.
op
.
info
[
'gpu'
]
and
len
(
host_input
.
owner
.
outputs
)
==
1
):
# Note that we are not doing the right thing here !!
# This is because the local optimizer expects only one
...
...
@@ -2492,8 +2472,8 @@ def gpuScanOptimization(node):
return
_outputs
# scan(host_from_gpu) -> host_from_gpu(GPUscan)
if
(
type
(
node
.
op
)
==
scan_op
.
Scan
and
not
node
.
op
.
info
[
'gpu'
]):
if
(
type
(
node
.
op
)
==
scan_op
.
Scan
and
not
node
.
op
.
info
[
'gpu'
]):
if
any
([(
i
.
owner
and
isinstance
(
i
.
owner
.
op
,
HostFromGpu
))
for
i
in
node
.
inputs
]):
...
...
@@ -2792,7 +2772,7 @@ def local_abstractconv_gemm(node):
kern
=
kern
.
dimshuffle
(
1
,
0
,
2
,
3
)
# call GpuCorrMM_gradInputs
rval
=
GpuCorrMM_gradInputs
(
'valid'
,
subsample
)(
gpu_contiguous
(
kern
),
gpu_contiguous
(
img
))
gpu_contiguous
(
kern
),
gpu_contiguous
(
img
))
else
:
# need to flip the kernel if necessary
if
node
.
op
.
filter_flip
:
...
...
@@ -2807,11 +2787,11 @@ def local_abstractconv_gemm(node):
# GpuConv does not always store information on the batchsize and
# channels, though, so we only use what information we have.)
if
((
subsample
==
(
1
,
1
))
and
(
node
.
op
.
imshp
is
not
None
)
and
(
None
not
in
node
.
op
.
imshp
[
-
2
:])
and
(
node
.
op
.
kshp
is
not
None
)
and
(
None
not
in
node
.
op
.
kshp
)
and
border_mode
!=
"half"
):
(
node
.
op
.
imshp
is
not
None
)
and
(
None
not
in
node
.
op
.
imshp
[
-
2
:])
and
(
node
.
op
.
kshp
is
not
None
)
and
(
None
not
in
node
.
op
.
kshp
)
and
border_mode
!=
"half"
):
# we know the kernel and output size
prod1
=
node
.
op
.
kshp
[
0
]
*
node
.
op
.
kshp
[
1
]
prod2
=
((
node
.
op
.
imshp
[
-
2
]
-
node
.
op
.
kshp
[
0
]
+
1
)
*
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论