Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
54fe4a7f
提交
54fe4a7f
authored
3月 14, 2016
作者:
Chiheb Trabelsi
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
opt.py has been modified in order to respect the flake8 style.
上级
1a3948cc
隐藏空白字符变更
内嵌
并排
正在显示
1 个修改的文件
包含
61 行增加
和
81 行删除
+61
-81
opt.py
theano/sandbox/cuda/opt.py
+61
-81
没有找到文件。
theano/sandbox/cuda/opt.py
浏览文件 @
54fe4a7f
...
@@ -10,22 +10,32 @@ import warnings
...
@@ -10,22 +10,32 @@ import warnings
import
numpy
import
numpy
from
six.moves
import
reduce
,
xrange
from
six.moves
import
reduce
,
xrange
from
.
import
dnn
import
theano
import
theano
from
theano
import
scalar
as
scal
from
theano
import
scalar
as
scal
from
theano
import
config
,
tensor
,
gof
from
theano
import
config
,
tensor
,
gof
import
theano.ifelse
import
theano.ifelse
import
theano.tensor.signal.pool
import
theano.tensor.nnet
import
theano.tensor.nnet.neighbours
# Convolution
from
theano.tensor.nnet
import
conv
from
theano.tensor.nnet.ConvGrad3D
import
ConvGrad3D
from
theano.tensor.nnet.ConvTransp3D
import
ConvTransp3D
# Pooling
import
theano.tensor.signal.pool
as
pool
from
theano.compile
import
optdb
from
theano.compile
import
optdb
from
theano.gof
import
(
local_optimizer
,
EquilibriumDB
,
ProxyDB
,
from
theano.gof
import
(
local_optimizer
,
EquilibriumDB
,
ProxyDB
,
Optimizer
,
TopoOptimizer
,
toolbox
)
Optimizer
,
TopoOptimizer
,
toolbox
)
from
theano.gof.opt
import
LocalMetaOptimizer
from
theano.gof.opt
import
LocalMetaOptimizer
from
theano.sandbox.cuda.basic_ops
import
gpu_join
,
GpuJoin
from
theano.sandbox.cuda
import
as_cuda_ndarray_variable
from
theano.sandbox.cuda
import
as_cuda_ndarray_variable
from
theano.sandbox.cuda.basic_ops
import
(
from
theano.sandbox.cuda.basic_ops
import
(
gpu_eye
,
gpu_contiguous
,
gpu_eye
,
gpu_contiguous
,
gpu_from_host
,
host_from_gpu
,
GpuFromHost
,
HostFromGpu
,
gpu_from_host
,
host_from_gpu
,
GpuFromHost
,
HostFromGpu
,
GpuContiguous
,
GpuContiguous
,
GpuElemwise
,
GpuDimShuffle
,
GpuReshape
,
GpuCAReduce
,
GpuElemwise
,
GpuDimShuffle
,
GpuReshape
,
GpuCAReduce
,
GpuFlatten
,
gpu_flatten
,
gpu_flatten
,
GpuSubtensor
,
GpuAdvancedSubtensor1
,
GpuSubtensor
,
GpuAdvancedSubtensor1
,
GpuAdvancedIncSubtensor1
,
GpuAdvancedIncSubtensor1_dev20
,
GpuAdvancedIncSubtensor1
,
GpuAdvancedIncSubtensor1_dev20
,
GpuIncSubtensor
,
gpu_alloc
,
GpuAlloc
,
gpu_shape
,
GpuSplit
,
GpuAllocEmpty
)
GpuIncSubtensor
,
gpu_alloc
,
GpuAlloc
,
gpu_shape
,
GpuSplit
,
GpuAllocEmpty
)
...
@@ -137,8 +147,6 @@ register_opt(name='local_gpu_reshape_chain')(
...
@@ -137,8 +147,6 @@ register_opt(name='local_gpu_reshape_chain')(
# This is a partial list of CPU ops that can be in some circonstance
# This is a partial list of CPU ops that can be in some circonstance
# moved to the GPU. This list is used by an optimization.
# moved to the GPU. This list is used by an optimization.
# Hopefully, we can keep this list up to date.
# Hopefully, we can keep this list up to date.
import
theano.tensor.signal.pool
import
theano.tensor.nnet.neighbours
cpu_ops_moved_to_gpu
=
[
cpu_ops_moved_to_gpu
=
[
tensor
.
blas
.
Dot22
,
tensor
.
blas
.
Dot22Scalar
,
tensor
.
blas
.
Gemm
,
tensor
.
blas
.
Dot22
,
tensor
.
blas
.
Dot22Scalar
,
tensor
.
blas
.
Gemm
,
tensor
.
blas
.
Gemv
,
tensor
.
blas
.
Ger
,
tensor
.
nnet
.
conv
.
ConvOp
,
tensor
.
blas
.
Gemv
,
tensor
.
blas
.
Ger
,
tensor
.
nnet
.
conv
.
ConvOp
,
...
@@ -630,7 +638,7 @@ def local_gpu_batched_dot(node):
...
@@ -630,7 +638,7 @@ def local_gpu_batched_dot(node):
if
y
.
ndim
==
2
:
if
y
.
ndim
==
2
:
y_
=
y_
.
dimshuffle
(
0
,
1
,
"x"
)
y_
=
y_
.
dimshuffle
(
0
,
1
,
"x"
)
z
=
GpuBatchedDot
()(
as_cuda_ndarray_variable
(
x_
),
z
=
GpuBatchedDot
()(
as_cuda_ndarray_variable
(
x_
),
as_cuda_ndarray_variable
(
y_
))
as_cuda_ndarray_variable
(
y_
))
# unpad z shape
# unpad z shape
if
x
.
ndim
==
2
:
if
x
.
ndim
==
2
:
z
=
z
.
dimshuffle
(
0
,
*
range
(
2
,
z
.
ndim
))
z
=
z
.
dimshuffle
(
0
,
*
range
(
2
,
z
.
ndim
))
...
@@ -850,8 +858,8 @@ def local_gpu_careduce(node):
...
@@ -850,8 +858,8 @@ def local_gpu_careduce(node):
if
x
.
type
==
node
.
outputs
[
0
]
.
type
:
if
x
.
type
==
node
.
outputs
[
0
]
.
type
:
return
[
x
]
return
[
x
]
elif
(
all
([
c
!=
"output"
and
isinstance
(
c
.
op
,
GpuFromHost
)
elif
(
all
([
c
!=
"output"
and
isinstance
(
c
.
op
,
GpuFromHost
)
for
c
,
i
in
node
.
outputs
[
0
]
.
clients
])
for
c
,
i
in
node
.
outputs
[
0
]
.
clients
])
and
and
x
.
owner
and
x
.
owner
.
op
.
__class__
in
x
.
owner
and
x
.
owner
.
op
.
__class__
in
cpu_ops_moved_to_gpu
):
cpu_ops_moved_to_gpu
):
# It is not always good to transfer the reduction to
# It is not always good to transfer the reduction to
# the GPU when the clients are on the GPU but not the
# the GPU when the clients are on the GPU but not the
...
@@ -970,7 +978,7 @@ def local_gpu_elemwise_careduce(node):
...
@@ -970,7 +978,7 @@ def local_gpu_elemwise_careduce(node):
# automatically add more case, as some like trigonometic
# automatically add more case, as some like trigonometic
# operation with some reduction pattern will probably result
# operation with some reduction pattern will probably result
# to slow down.
# to slow down.
isinstance
(
node
.
inputs
[
0
]
.
owner
.
op
.
scalar_op
,
scal
.
basic
.
Sqr
)):
isinstance
(
node
.
inputs
[
0
]
.
owner
.
op
.
scalar_op
,
scal
.
basic
.
Sqr
)):
op
=
node
.
op
op
=
node
.
op
inp
=
node
.
inputs
[
0
]
.
owner
.
inputs
[
0
]
inp
=
node
.
inputs
[
0
]
.
owner
.
inputs
[
0
]
...
@@ -1023,7 +1031,8 @@ def local_gpu_flatten(node):
...
@@ -1023,7 +1031,8 @@ def local_gpu_flatten(node):
return
[
gpu_flatten
(
host_input
.
owner
.
inputs
[
0
],
outdim
)(
return
[
gpu_flatten
(
host_input
.
owner
.
inputs
[
0
],
outdim
)(
as_cuda_ndarray_variable
(
host_input
.
owner
.
inputs
[
0
]))]
as_cuda_ndarray_variable
(
host_input
.
owner
.
inputs
[
0
]))]
if
isinstance
(
node
.
op
,
tensor
.
Flatten
):
if
isinstance
(
node
.
op
,
tensor
.
Flatten
):
x
,
=
node
.
inputs
x
,
shp
=
node
.
inputs
outdim
=
node
.
op
.
outdim
if
x
.
owner
and
isinstance
(
x
.
owner
.
op
,
HostFromGpu
):
if
x
.
owner
and
isinstance
(
x
.
owner
.
op
,
HostFromGpu
):
outdim
=
node
.
op
.
outdim
outdim
=
node
.
op
.
outdim
gpu_x
,
=
x
.
owner
.
inputs
gpu_x
,
=
x
.
owner
.
inputs
...
@@ -1050,15 +1059,13 @@ def local_gpu_subtensor(node):
...
@@ -1050,15 +1059,13 @@ def local_gpu_subtensor(node):
*
coords
)]
*
coords
)]
if
isinstance
(
node
.
op
,
tensor
.
Subtensor
):
if
isinstance
(
node
.
op
,
tensor
.
Subtensor
):
x
=
node
.
inputs
[
0
]
x
=
node
.
inputs
[
0
]
if
(
x
.
owner
and
if
(
x
.
owner
and
x
.
dtype
==
"float32"
and
isinstance
(
x
.
owner
.
op
,
HostFromGpu
)
and
isinstance
(
x
.
owner
.
op
,
HostFromGpu
)):
x
.
dtype
==
"float32"
):
gpu_x
=
x
.
owner
.
inputs
[
0
]
gpu_x
=
x
.
owner
.
inputs
[
0
]
if
(
gpu_x
.
owner
and
if
(
gpu_x
.
owner
and
# And it is a shared var or an input of the graph.
isinstance
(
gpu_x
.
owner
.
op
,
GpuFromHost
)
and
not
(
gpu_x
.
owner
.
inputs
[
0
]
.
owner
)
and
# And it is a shared var or an input of the graph.
isinstance
(
gpu_x
.
owner
.
op
,
GpuFromHost
)):
not
gpu_x
.
owner
.
inputs
[
0
]
.
owner
):
if
len
(
x
.
clients
)
==
1
:
if
len
(
x
.
clients
)
==
1
:
if
any
([
n
==
'output'
or
isinstance
(
n
.
op
,
GpuOp
)
if
any
([
n
==
'output'
or
isinstance
(
n
.
op
,
GpuOp
)
...
@@ -1119,9 +1126,7 @@ def local_gpu_advanced_incsubtensor1(node):
...
@@ -1119,9 +1126,7 @@ def local_gpu_advanced_incsubtensor1(node):
'least
\'
0.6
\'
.'
,
stacklevel
=
1
)
'least
\'
0.6
\'
.'
,
stacklevel
=
1
)
active_device_no
=
theano
.
sandbox
.
cuda
.
active_device_number
()
active_device_no
=
theano
.
sandbox
.
cuda
.
active_device_number
()
compute_capability
=
device_properties
(
active_device_no
)[
'major'
]
compute_capability
=
device_properties
(
active_device_no
)[
'major'
]
if
(
compute_capability
<
2
or
if
(
compute_capability
<
2
or
y
.
ndim
!=
2
or
x
.
ndim
!=
2
):
x
.
ndim
!=
2
or
y
.
ndim
!=
2
):
gpu_op
=
GpuAdvancedIncSubtensor1
(
gpu_op
=
GpuAdvancedIncSubtensor1
(
set_instead_of_inc
=
set_instead_of_inc
)
set_instead_of_inc
=
set_instead_of_inc
)
...
@@ -1162,9 +1167,7 @@ def local_gpu_advanced_incsubtensor1(node):
...
@@ -1162,9 +1167,7 @@ def local_gpu_advanced_incsubtensor1(node):
active_device_no
=
theano
.
sandbox
.
cuda
.
active_device_number
()
active_device_no
=
theano
.
sandbox
.
cuda
.
active_device_number
()
compute_capability
=
device_properties
(
active_device_no
)[
'major'
]
compute_capability
=
device_properties
(
active_device_no
)[
'major'
]
if
(
compute_capability
<
2
or
if
(
compute_capability
<
2
or
y
.
ndim
!=
2
or
x
.
ndim
!=
2
):
x
.
ndim
!=
2
or
y
.
ndim
!=
2
):
gpu_op
=
GpuAdvancedIncSubtensor1
(
gpu_op
=
GpuAdvancedIncSubtensor1
(
set_instead_of_inc
=
set_instead_of_inc
)
set_instead_of_inc
=
set_instead_of_inc
)
else
:
else
:
...
@@ -1203,8 +1206,8 @@ def local_gpu_incsubtensor(node):
...
@@ -1203,8 +1206,8 @@ def local_gpu_incsubtensor(node):
# Incrementing a float32 x results in a float32
# Incrementing a float32 x results in a float32
# output even if y is float64, so we can downcast
# output even if y is float64, so we can downcast
# y to put it on GPU
# y to put it on GPU
elif
type
(
node
.
op
)
==
tensor
.
IncSubtensor
and
\
elif
(
type
(
node
.
op
)
==
tensor
.
IncSubtensor
and
node
.
inputs
[
0
]
.
dtype
==
"float32"
:
node
.
inputs
[
0
]
.
dtype
==
"float32"
)
:
x
,
y
=
node
.
inputs
[
0
:
2
]
x
,
y
=
node
.
inputs
[
0
:
2
]
assert
isinstance
(
x
.
type
,
tensor
.
TensorType
)
assert
isinstance
(
x
.
type
,
tensor
.
TensorType
)
assert
isinstance
(
y
.
type
,
tensor
.
TensorType
)
assert
isinstance
(
y
.
type
,
tensor
.
TensorType
)
...
@@ -1346,8 +1349,6 @@ def cast(x, dtype):
...
@@ -1346,8 +1349,6 @@ def cast(x, dtype):
cast_op
=
theano
.
tensor
.
Elemwise
(
scal
.
Identity
(
scal
.
specific_out
(
stype
)))
cast_op
=
theano
.
tensor
.
Elemwise
(
scal
.
Identity
(
scal
.
specific_out
(
stype
)))
return
cast_op
(
x
)
return
cast_op
(
x
)
import
theano.tensor.nnet
@register_opt
()
@register_opt
()
@local_optimizer
([
tensor
.
nnet
.
CrossentropySoftmaxArgmax1HotWithBias
])
@local_optimizer
([
tensor
.
nnet
.
CrossentropySoftmaxArgmax1HotWithBias
])
...
@@ -1419,18 +1420,13 @@ def local_gpu_softmax_with_bias(node):
...
@@ -1419,18 +1420,13 @@ def local_gpu_softmax_with_bias(node):
return
False
return
False
# Convolution
from
theano.tensor.nnet
import
conv
def
_gpu_conv_to_fftconv
(
node
):
def
_gpu_conv_to_fftconv
(
node
):
# shared helper function for local_conv_fft_valid and local_conv_fft_full.
# shared helper function for local_conv_fft_valid and local_conv_fft_full.
# we import conv2d_fft locally to avoid pycuda warnings
# we import conv2d_fft locally to avoid pycuda warnings
from
theano.sandbox.cuda.fftconv
import
conv2d_fft
from
theano.sandbox.cuda.fftconv
import
conv2d_fft
kwargs
=
{
'border_mode'
:
node
.
op
.
border_mode
}
kwargs
=
{
'border_mode'
:
node
.
op
.
border_mode
}
if
(
node
.
op
.
imshp
is
not
None
and
if
(
node
.
op
.
imshp
is
not
None
and
node
.
op
.
imshp
[
-
1
]
%
2
==
1
and
node
.
op
.
imshp
[
-
1
]
is
not
None
and
node
.
op
.
imshp
[
-
1
]
is
not
None
):
node
.
op
.
imshp
[
-
1
]
%
2
==
1
):
kwargs
[
'pad_last_dim'
]
=
True
kwargs
[
'pad_last_dim'
]
=
True
# If the user supplied the full nonsymbolic image_shape and
# If the user supplied the full nonsymbolic image_shape and
...
@@ -1459,9 +1455,8 @@ def _gpu_conv_to_fftconv(node):
...
@@ -1459,9 +1455,8 @@ def _gpu_conv_to_fftconv(node):
@local_optimizer
([
GpuConv
])
@local_optimizer
([
GpuConv
])
def
local_conv_fft_valid
(
node
):
def
local_conv_fft_valid
(
node
):
if
isinstance
(
node
.
op
,
GpuConv
):
if
isinstance
(
node
.
op
,
GpuConv
):
if
(
node
.
op
.
border_mode
==
'valid'
and
if
(
node
.
op
.
border_mode
==
'valid'
and
node
.
op
.
fft_opt
and
node
.
op
.
subsample
==
(
1
,
1
)
and
node
.
op
.
subsample
==
(
1
,
1
)):
node
.
op
.
fft_opt
):
return
[
_gpu_conv_to_fftconv
(
node
)]
return
[
_gpu_conv_to_fftconv
(
node
)]
return
False
return
False
...
@@ -1470,9 +1465,8 @@ def local_conv_fft_valid(node):
...
@@ -1470,9 +1465,8 @@ def local_conv_fft_valid(node):
@local_optimizer
([
GpuConv
])
@local_optimizer
([
GpuConv
])
def
local_conv_fft_full
(
node
):
def
local_conv_fft_full
(
node
):
if
isinstance
(
node
.
op
,
GpuConv
):
if
isinstance
(
node
.
op
,
GpuConv
):
if
(
node
.
op
.
border_mode
==
'full'
and
if
(
node
.
op
.
border_mode
==
'full'
and
node
.
op
.
fft_opt
and
node
.
op
.
subsample
==
(
1
,
1
)
and
node
.
op
.
subsample
==
(
1
,
1
)):
node
.
op
.
fft_opt
):
return
[
_gpu_conv_to_fftconv
(
node
)]
return
[
_gpu_conv_to_fftconv
(
node
)]
return
return
...
@@ -1586,7 +1580,7 @@ def local_gpu_conv(node):
...
@@ -1586,7 +1580,7 @@ def local_gpu_conv(node):
@local_optimizer
([
GpuConv
])
@local_optimizer
([
GpuConv
])
def
local_conv_gemm
(
node
):
def
local_conv_gemm
(
node
):
if
(
isinstance
(
node
.
op
,
GpuConv
)
and
if
(
isinstance
(
node
.
op
,
GpuConv
)
and
node
.
op
.
border_mode
in
[
'full'
,
'valid'
]):
node
.
op
.
border_mode
in
[
'full'
,
'valid'
]):
img
,
kern
=
node
.
inputs
img
,
kern
=
node
.
inputs
border_mode
=
node
.
op
.
border_mode
border_mode
=
node
.
op
.
border_mode
...
@@ -1659,7 +1653,6 @@ conv_groupopt.register('conv_fft_full', local_conv_fft_full, 10,
...
@@ -1659,7 +1653,6 @@ conv_groupopt.register('conv_fft_full', local_conv_fft_full, 10,
'conv_fft'
)
'conv_fft'
)
# cuDNN is the second, but only registered if cuDNN is available.
# cuDNN is the second, but only registered if cuDNN is available.
# It can be disabled by excluding 'conv_dnn' or 'cudnn'.
# It can be disabled by excluding 'conv_dnn' or 'cudnn'.
from
.
import
dnn
# We can't check at import if dnn is available, so we must always
# We can't check at import if dnn is available, so we must always
# register it. This do not cause problem as if it is not avail, the
# register it. This do not cause problem as if it is not avail, the
# opt will do nothing.
# opt will do nothing.
...
@@ -1708,9 +1701,8 @@ class ConvMetaOptimizer(LocalCudaMetaOptimizer):
...
@@ -1708,9 +1701,8 @@ class ConvMetaOptimizer(LocalCudaMetaOptimizer):
shapes
=
((
node
.
op
.
bsize
,)
+
node
.
op
.
imshp
,
shapes
=
((
node
.
op
.
bsize
,)
+
node
.
op
.
imshp
,
(
node
.
op
.
nkern
,
nchannels
)
+
node
.
op
.
kshp
)
(
node
.
op
.
nkern
,
nchannels
)
+
node
.
op
.
kshp
)
for
(
var
,
shape
)
in
zip
(
vars
,
shapes
):
for
(
var
,
shape
)
in
zip
(
vars
,
shapes
):
if
((
var
in
inputs
)
and
if
((
var
in
inputs
)
and
(
shape
is
not
None
)
and
(
shape
is
not
None
)
and
not
any
(
s
is
None
for
s
in
shape
)):
not
any
(
s
is
None
for
s
in
shape
)):
result
[
var
]
=
theano
.
shared
(
result
[
var
]
=
theano
.
shared
(
# TODO: Use var.type.filter when cuda_ndarray.filter
# TODO: Use var.type.filter when cuda_ndarray.filter
...
@@ -1763,8 +1755,6 @@ def local_conv3d_fft(node):
...
@@ -1763,8 +1755,6 @@ def local_conv3d_fft(node):
gpu_optimizer
.
register
(
"conv3d_fft"
,
local_conv3d_fft
)
gpu_optimizer
.
register
(
"conv3d_fft"
,
local_conv3d_fft
)
from
theano.tensor.nnet.ConvGrad3D
import
ConvGrad3D
@local_optimizer
([
ConvGrad3D
])
@local_optimizer
([
ConvGrad3D
])
def
local_convgrad3d_fft
(
node
):
def
local_convgrad3d_fft
(
node
):
...
@@ -1775,7 +1765,7 @@ def local_convgrad3d_fft(node):
...
@@ -1775,7 +1765,7 @@ def local_convgrad3d_fft(node):
except
tensor
.
NotScalarConstantError
:
except
tensor
.
NotScalarConstantError
:
return
False
return
False
if
(
isinstance
(
node
.
op
,
ConvGrad3D
)
and
if
(
isinstance
(
node
.
op
,
ConvGrad3D
)
and
(
stride_x
,
stride_y
,
stride_z
)
==
(
1
,
1
,
1
)):
(
stride_x
,
stride_y
,
stride_z
)
==
(
1
,
1
,
1
)):
# we import conv3d_fft locally to avoid pycuda warnings
# we import conv3d_fft locally to avoid pycuda warnings
from
theano.sandbox.cuda.fftconv
import
conv3d_fft
from
theano.sandbox.cuda.fftconv
import
conv3d_fft
...
@@ -1794,8 +1784,6 @@ def local_convgrad3d_fft(node):
...
@@ -1794,8 +1784,6 @@ def local_convgrad3d_fft(node):
gpu_optimizer
.
register
(
"convgrad3d_fft"
,
local_convgrad3d_fft
)
gpu_optimizer
.
register
(
"convgrad3d_fft"
,
local_convgrad3d_fft
)
from
theano.tensor.nnet.ConvTransp3D
import
ConvTransp3D
@local_optimizer
([
ConvTransp3D
])
@local_optimizer
([
ConvTransp3D
])
def
local_convtransp3d_fft
(
node
):
def
local_convtransp3d_fft
(
node
):
...
@@ -1806,7 +1794,7 @@ def local_convtransp3d_fft(node):
...
@@ -1806,7 +1794,7 @@ def local_convtransp3d_fft(node):
except
tensor
.
NotScalarConstantError
:
except
tensor
.
NotScalarConstantError
:
return
False
return
False
if
(
isinstance
(
node
.
op
,
ConvTransp3D
)
and
if
(
isinstance
(
node
.
op
,
ConvTransp3D
)
and
(
stride_x
,
stride_y
,
stride_z
)
==
(
1
,
1
,
1
)):
(
stride_x
,
stride_y
,
stride_z
)
==
(
1
,
1
,
1
)):
# we import conv3d_fft locally to avoid pycuda warnings
# we import conv3d_fft locally to avoid pycuda warnings
from
theano.sandbox.cuda.fftconv
import
conv3d_fft
from
theano.sandbox.cuda.fftconv
import
conv3d_fft
# Shuffle filters from (oc, 0, 1, t, ic) to (ic, oc, 0, 1, t)
# Shuffle filters from (oc, 0, 1, t, ic) to (ic, oc, 0, 1, t)
...
@@ -1894,15 +1882,11 @@ def local_convtransp3d_gemm(node):
...
@@ -1894,15 +1882,11 @@ def local_convtransp3d_gemm(node):
gpu_optimizer
.
register
(
"convtransp3d_gemm"
,
local_convtransp3d_gemm
)
gpu_optimizer
.
register
(
"convtransp3d_gemm"
,
local_convtransp3d_gemm
)
# Pooling
import
theano.tensor.signal.pool
as
pool
@register_opt
()
@register_opt
()
@local_optimizer
([
pool
.
Pool
])
@local_optimizer
([
pool
.
Pool
])
def
local_gpu_downsample_factor_max
(
node
):
def
local_gpu_downsample_factor_max
(
node
):
if
(
isinstance
(
node
.
op
,
pool
.
Pool
)
if
(
isinstance
(
node
.
op
,
pool
.
Pool
)
and
and
node
.
op
.
ds
==
node
.
op
.
st
):
node
.
op
.
ds
==
node
.
op
.
st
):
assert
node
.
op
.
__props__
==
(
'ds'
,
'ignore_border'
,
'st'
,
'padding'
,
assert
node
.
op
.
__props__
==
(
'ds'
,
'ignore_border'
,
'st'
,
'padding'
,
'mode'
)
'mode'
)
...
@@ -1917,14 +1901,12 @@ def local_gpu_downsample_factor_max(node):
...
@@ -1917,14 +1901,12 @@ def local_gpu_downsample_factor_max(node):
@register_opt
()
@register_opt
()
@local_optimizer
([
pool
.
MaxPoolGrad
])
@local_optimizer
([
pool
.
MaxPoolGrad
])
def
local_gpu_downsample_factor_max_grad
(
node
):
def
local_gpu_downsample_factor_max_grad
(
node
):
if
(
isinstance
(
node
.
op
,
pool
.
MaxPoolGrad
)
and
if
(
isinstance
(
node
.
op
,
pool
.
MaxPoolGrad
)
and
node
.
op
.
ds
==
node
.
op
.
st
):
node
.
op
.
ds
==
node
.
op
.
st
):
assert
node
.
op
.
__props__
==
(
'ds'
,
'ignore_border'
,
'st'
,
'padding'
,
assert
node
.
op
.
__props__
==
(
'ds'
,
'ignore_border'
,
'st'
,
'padding'
,
'mode'
)
'mode'
)
if
(
node
.
op
.
padding
!=
(
0
,
0
)
or
if
(
node
.
op
.
padding
!=
(
0
,
0
)
or
node
.
op
.
mode
!=
'max'
or
node
.
op
.
mode
!=
'max'
or
node
.
op
.
st
!=
node
.
op
.
ds
):
node
.
op
.
st
!=
node
.
op
.
ds
):
return
return
x
,
z
,
gz
=
node
.
inputs
x
,
z
,
gz
=
node
.
inputs
...
@@ -1955,9 +1937,6 @@ def local_gpu_downsample_factor_max_grad_grad(node):
...
@@ -1955,9 +1937,6 @@ def local_gpu_downsample_factor_max_grad_grad(node):
as_cuda_ndarray_variable
(
gx
)))]
as_cuda_ndarray_variable
(
gx
)))]
from
theano.sandbox.cuda.basic_ops
import
gpu_join
,
GpuJoin
@register_opt
()
@register_opt
()
@local_optimizer
([
tensor
.
Join
])
@local_optimizer
([
tensor
.
Join
])
def
local_gpu_join
(
node
):
def
local_gpu_join
(
node
):
...
@@ -2252,8 +2231,8 @@ def local_gpualloc_memset_0(node):
...
@@ -2252,8 +2231,8 @@ def local_gpualloc_memset_0(node):
if
isinstance
(
node
.
op
,
GpuAlloc
)
and
not
node
.
op
.
memset_0
:
if
isinstance
(
node
.
op
,
GpuAlloc
)
and
not
node
.
op
.
memset_0
:
inp
=
node
.
inputs
[
0
]
inp
=
node
.
inputs
[
0
]
if
(
isinstance
(
inp
,
CudaNdarrayConstant
)
and
if
(
isinstance
(
inp
,
CudaNdarrayConstant
)
and
inp
.
data
.
size
==
1
and
inp
.
data
.
size
==
1
and
(
numpy
.
asarray
(
inp
.
data
)
==
0
)
.
all
()):
(
numpy
.
asarray
(
inp
.
data
)
==
0
)
.
all
()):
new_out
=
GpuAlloc
(
memset_0
=
True
)(
*
node
.
inputs
)
new_out
=
GpuAlloc
(
memset_0
=
True
)(
*
node
.
inputs
)
old_bcast
=
node
.
outputs
[
0
]
.
type
.
broadcastable
old_bcast
=
node
.
outputs
[
0
]
.
type
.
broadcastable
...
@@ -2308,8 +2287,9 @@ def local_gpu_eye(node):
...
@@ -2308,8 +2287,9 @@ def local_gpu_eye(node):
if
isinstance
(
node
.
op
,
GpuFromHost
):
if
isinstance
(
node
.
op
,
GpuFromHost
):
host_input
=
node
.
inputs
[
0
]
host_input
=
node
.
inputs
[
0
]
if
(
host_input
.
owner
and
if
(
host_input
.
owner
and
isinstance
(
host_input
.
owner
.
op
,
tensor
.
Eye
)
and
isinstance
(
host_input
.
owner
.
op
,
tensor
.
Eye
)
and
host_input
.
owner
.
op
.
dtype
==
"float32"
):
host_input
.
owner
.
op
.
dtype
==
"float32"
):
if
tensor
.
extract_constant
(
host_input
.
owner
.
inputs
[
2
])
!=
0
:
if
tensor
.
extract_constant
(
host_input
.
owner
.
inputs
[
2
])
!=
0
:
return
return
return
[
gpu_eye
(
*
host_input
.
owner
.
inputs
)]
return
[
gpu_eye
(
*
host_input
.
owner
.
inputs
)]
...
@@ -2324,7 +2304,7 @@ def local_gpu_eye(node):
...
@@ -2324,7 +2304,7 @@ def local_gpu_eye(node):
def
safe_to_gpu
(
x
):
def
safe_to_gpu
(
x
):
if
(
isinstance
(
x
.
type
,
tensor
.
TensorType
)
and
if
(
isinstance
(
x
.
type
,
tensor
.
TensorType
)
and
x
.
type
.
dtype
==
'float32'
):
x
.
type
.
dtype
==
'float32'
):
return
as_cuda_ndarray_variable
(
x
)
return
as_cuda_ndarray_variable
(
x
)
else
:
else
:
...
@@ -2379,7 +2359,7 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None):
...
@@ -2379,7 +2359,7 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None):
def
tensor_to_cuda
(
x
):
def
tensor_to_cuda
(
x
):
if
(
isinstance
(
x
.
type
,
tensor
.
TensorType
)
and
if
(
isinstance
(
x
.
type
,
tensor
.
TensorType
)
and
x
.
type
.
dtype
==
'float32'
):
x
.
type
.
dtype
==
'float32'
):
y
=
CudaNdarrayType
(
broadcastable
=
x
.
type
.
broadcastable
)()
y
=
CudaNdarrayType
(
broadcastable
=
x
.
type
.
broadcastable
)()
if
x
.
name
:
if
x
.
name
:
...
@@ -2437,9 +2417,9 @@ def gpuScanOptimization(node):
...
@@ -2437,9 +2417,9 @@ def gpuScanOptimization(node):
if
isinstance
(
node
.
op
,
GpuFromHost
):
if
isinstance
(
node
.
op
,
GpuFromHost
):
host_input
=
node
.
inputs
[
0
]
host_input
=
node
.
inputs
[
0
]
if
(
host_input
.
owner
and
if
(
host_input
.
owner
and
isinstance
(
host_input
.
owner
.
op
,
scan_op
.
Scan
)
and
isinstance
(
host_input
.
owner
.
op
,
scan_op
.
Scan
)
and
not
host_input
.
owner
.
op
.
info
[
'gpu'
]
and
not
host_input
.
owner
.
op
.
info
[
'gpu'
]
and
len
(
host_input
.
owner
.
outputs
)
==
1
):
len
(
host_input
.
owner
.
outputs
)
==
1
):
# Note that we are not doing the right thing here !!
# Note that we are not doing the right thing here !!
# This is because the local optimizer expects only one
# This is because the local optimizer expects only one
...
@@ -2492,8 +2472,8 @@ def gpuScanOptimization(node):
...
@@ -2492,8 +2472,8 @@ def gpuScanOptimization(node):
return
_outputs
return
_outputs
# scan(host_from_gpu) -> host_from_gpu(GPUscan)
# scan(host_from_gpu) -> host_from_gpu(GPUscan)
if
(
type
(
node
.
op
)
==
scan_op
.
Scan
if
(
type
(
node
.
op
)
==
scan_op
.
Scan
and
and
not
node
.
op
.
info
[
'gpu'
]):
not
node
.
op
.
info
[
'gpu'
]):
if
any
([(
i
.
owner
and
isinstance
(
i
.
owner
.
op
,
HostFromGpu
))
if
any
([(
i
.
owner
and
isinstance
(
i
.
owner
.
op
,
HostFromGpu
))
for
i
in
node
.
inputs
]):
for
i
in
node
.
inputs
]):
...
@@ -2792,7 +2772,7 @@ def local_abstractconv_gemm(node):
...
@@ -2792,7 +2772,7 @@ def local_abstractconv_gemm(node):
kern
=
kern
.
dimshuffle
(
1
,
0
,
2
,
3
)
kern
=
kern
.
dimshuffle
(
1
,
0
,
2
,
3
)
# call GpuCorrMM_gradInputs
# call GpuCorrMM_gradInputs
rval
=
GpuCorrMM_gradInputs
(
'valid'
,
subsample
)(
rval
=
GpuCorrMM_gradInputs
(
'valid'
,
subsample
)(
gpu_contiguous
(
kern
),
gpu_contiguous
(
img
))
gpu_contiguous
(
kern
),
gpu_contiguous
(
img
))
else
:
else
:
# need to flip the kernel if necessary
# need to flip the kernel if necessary
if
node
.
op
.
filter_flip
:
if
node
.
op
.
filter_flip
:
...
@@ -2807,11 +2787,11 @@ def local_abstractconv_gemm(node):
...
@@ -2807,11 +2787,11 @@ def local_abstractconv_gemm(node):
# GpuConv does not always store information on the batchsize and
# GpuConv does not always store information on the batchsize and
# channels, though, so we only use what information we have.)
# channels, though, so we only use what information we have.)
if
((
subsample
==
(
1
,
1
))
and
if
((
subsample
==
(
1
,
1
))
and
(
node
.
op
.
imshp
is
not
None
)
and
(
node
.
op
.
imshp
is
not
None
)
and
(
None
not
in
node
.
op
.
imshp
[
-
2
:])
and
(
None
not
in
node
.
op
.
imshp
[
-
2
:])
and
(
node
.
op
.
kshp
is
not
None
)
and
(
node
.
op
.
kshp
is
not
None
)
and
(
None
not
in
node
.
op
.
kshp
)
and
(
None
not
in
node
.
op
.
kshp
)
and
border_mode
!=
"half"
):
border_mode
!=
"half"
):
# we know the kernel and output size
# we know the kernel and output size
prod1
=
node
.
op
.
kshp
[
0
]
*
node
.
op
.
kshp
[
1
]
prod1
=
node
.
op
.
kshp
[
0
]
*
node
.
op
.
kshp
[
1
]
prod2
=
((
node
.
op
.
imshp
[
-
2
]
-
node
.
op
.
kshp
[
0
]
+
1
)
*
prod2
=
((
node
.
op
.
imshp
[
-
2
]
-
node
.
op
.
kshp
[
0
]
+
1
)
*
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论