Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
a24fd9bb
提交
a24fd9bb
authored
7月 12, 2016
作者:
Frédéric Bastien
提交者:
GitHub
7月 12, 2016
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #4570 from Sentient07/new_graph2gpu
New graph2gpu
上级
2b371c6f
195f9b1d
隐藏空白字符变更
内嵌
并排
正在显示
22 个修改的文件
包含
843 行增加
和
332 行删除
+843
-332
ops.py
theano/compile/ops.py
+16
-0
profiling.py
theano/compile/profiling.py
+1
-1
opt.py
theano/gof/opt.py
+1
-1
basic_ops.py
theano/gpuarray/basic_ops.py
+28
-2
dnn.py
theano/gpuarray/dnn.py
+146
-90
elemwise.py
theano/gpuarray/elemwise.py
+12
-0
extra_ops.py
theano/gpuarray/extra_ops.py
+16
-15
fft.py
theano/gpuarray/fft.py
+5
-3
multinomial.py
theano/gpuarray/multinomial.py
+9
-8
neighbours.py
theano/gpuarray/neighbours.py
+6
-6
nerv.py
theano/gpuarray/nerv.py
+8
-7
opt.py
theano/gpuarray/opt.py
+504
-164
opt_util.py
theano/gpuarray/opt_util.py
+2
-3
test_abstractconv.py
theano/gpuarray/tests/test_abstractconv.py
+2
-0
test_basic_ops.py
theano/gpuarray/tests/test_basic_ops.py
+1
-1
test_opt.py
theano/gpuarray/tests/test_opt.py
+31
-7
type.py
theano/gpuarray/type.py
+2
-2
rng_mrg.py
theano/sandbox/rng_mrg.py
+15
-9
scan_utils.py
theano/scan_module/scan_utils.py
+7
-5
basic.py
theano/tensor/basic.py
+16
-7
abstract_conv.py
theano/tensor/nnet/abstract_conv.py
+12
-1
test_basic.py
theano/tensor/tests/test_basic.py
+3
-0
没有找到文件。
theano/compile/ops.py
浏览文件 @
a24fd9bb
...
@@ -402,6 +402,14 @@ class Shape_i(gof.Op):
...
@@ -402,6 +402,14 @@ class Shape_i(gof.Op):
def
infer_shape
(
self
,
node
,
input_shapes
):
def
infer_shape
(
self
,
node
,
input_shapes
):
return
[()]
return
[()]
def connection_pattern(self, node):
    """Report that the single output does not depend on the input's values.

    The gradient is taken with respect to the elements of a tensor
    variable.  Those elements do not participate in the computation of
    the shape, so they are not really part of the graph.

    Returns
    -------
    list of list of bool
        ``[[False]]``: one input, one output, not connected.
    """
    return [[False]]
def
grad
(
self
,
inp
,
grads
):
def
grad
(
self
,
inp
,
grads
):
return
[
theano
.
gradient
.
grad_not_implemented
(
return
[
theano
.
gradient
.
grad_not_implemented
(
op
=
self
,
x_pos
=
0
,
x
=
inp
[
0
],
op
=
self
,
x_pos
=
0
,
x
=
inp
[
0
],
...
@@ -455,6 +463,14 @@ def shape_i(var, i, fgraph=None):
...
@@ -455,6 +463,14 @@ def shape_i(var, i, fgraph=None):
return
var
.
shape
[
i
]
return
var
.
shape
[
i
]
def shape_i_op(i):
    """Return a ``Shape_i(i)`` op, reusing one instance per value of ``i``.

    Instances are memoized in ``shape_i_op.cache`` so repeated requests
    for the same ``i`` return the same op object instead of building a
    new one each time.
    """
    key = i
    if key not in shape_i_op.cache:
        shape_i_op.cache[key] = Shape_i(i)
    return shape_i_op.cache[key]


# Memo table for shape_i_op, keyed by ``i``.
shape_i_op.cache = {}
def
register_shape_i_c_code
(
typ
,
code
,
check_input
,
version
=
()):
def
register_shape_i_c_code
(
typ
,
code
,
check_input
,
version
=
()):
"""
"""
Tell Shape_i how to generate C code for a Theano Type.
Tell Shape_i how to generate C code for a Theano Type.
...
...
theano/compile/profiling.py
浏览文件 @
a24fd9bb
...
@@ -54,7 +54,7 @@ def _atexit_print_fn():
...
@@ -54,7 +54,7 @@ def _atexit_print_fn():
destination_file
=
open
(
config
.
profiling
.
destination
,
'w'
)
destination_file
=
open
(
config
.
profiling
.
destination
,
'w'
)
for
ps
in
_atexit_print_list
:
for
ps
in
_atexit_print_list
:
if
ps
.
fct_callcount
or
ps
.
compile_time
>
0
:
if
ps
.
fct_callcount
>=
1
or
ps
.
compile_time
>
1
:
ps
.
summary
(
file
=
destination_file
,
ps
.
summary
(
file
=
destination_file
,
n_ops_to_print
=
config
.
profiling
.
n_ops
,
n_ops_to_print
=
config
.
profiling
.
n_ops
,
n_apply_to_print
=
config
.
profiling
.
n_apply
)
n_apply_to_print
=
config
.
profiling
.
n_apply
)
...
...
theano/gof/opt.py
浏览文件 @
a24fd9bb
...
@@ -2413,7 +2413,7 @@ class EquilibriumOptimizer(NavigatorOptimizer):
...
@@ -2413,7 +2413,7 @@ class EquilibriumOptimizer(NavigatorOptimizer):
for
(
t
,
count
,
n_created
,
o
)
in
count_opt
[::
-
1
]:
for
(
t
,
count
,
n_created
,
o
)
in
count_opt
[::
-
1
]:
print
(
blanc
,
'
%.3
fs -
%
d -
%
d -
%
s'
%
(
print
(
blanc
,
'
%.3
fs -
%
d -
%
d -
%
s'
%
(
t
,
count
,
n_created
,
o
),
file
=
stream
)
t
,
count
,
n_created
,
o
),
file
=
stream
)
print
(
blanc
,
'
%.3
fs - in
%
d optimization that w
h
ere not used (display only those with a runtime > 0)'
%
(
print
(
blanc
,
'
%.3
fs - in
%
d optimization that were not used (display only those with a runtime > 0)'
%
(
not_used_time
,
len
(
not_used
)),
file
=
stream
)
not_used_time
,
len
(
not_used
)),
file
=
stream
)
not_used
.
sort
(
key
=
lambda
nu
:
(
nu
[
0
],
str
(
nu
[
1
])))
not_used
.
sort
(
key
=
lambda
nu
:
(
nu
[
0
],
str
(
nu
[
1
])))
for
(
t
,
o
)
in
not_used
[::
-
1
]:
for
(
t
,
o
)
in
not_used
[::
-
1
]:
...
...
theano/gpuarray/basic_ops.py
浏览文件 @
a24fd9bb
...
@@ -70,7 +70,7 @@ def as_gpuarray_variable(x, context_name):
...
@@ -70,7 +70,7 @@ def as_gpuarray_variable(x, context_name):
# If we couldn't deal with transfers, then maybe it's a tensor
# If we couldn't deal with transfers, then maybe it's a tensor
if
isinstance
(
x
.
type
,
tensor
.
TensorType
):
if
isinstance
(
x
.
type
,
tensor
.
TensorType
):
return
GpuFromH
ost
(
context_name
)(
x
)
return
gpu_from_h
ost
(
context_name
)(
x
)
# Try _as_GpuArrayVariable if possible
# Try _as_GpuArrayVariable if possible
if
hasattr
(
x
,
'_as_GpuArrayVariable'
):
if
hasattr
(
x
,
'_as_GpuArrayVariable'
):
...
@@ -544,7 +544,7 @@ class HostFromGpu(Op):
...
@@ -544,7 +544,7 @@ class HostFromGpu(Op):
def
grad
(
self
,
inputs
,
grads
):
def
grad
(
self
,
inputs
,
grads
):
gz
,
=
grads
gz
,
=
grads
return
[
GpuFromH
ost
(
inputs
[
0
]
.
type
.
context_name
)(
gz
)]
return
[
gpu_from_h
ost
(
inputs
[
0
]
.
type
.
context_name
)(
gz
)]
def
R_op
(
self
,
inputs
,
eval_points
):
def
R_op
(
self
,
inputs
,
eval_points
):
ev
,
=
eval_points
ev
,
=
eval_points
...
@@ -647,6 +647,14 @@ class GpuFromHost(Op):
...
@@ -647,6 +647,14 @@ class GpuFromHost(Op):
return
(
9
,)
return
(
9
,)
# Caching GpuFromHost
def gpu_from_host(ctx):
    """Return a ``GpuFromHost(ctx)`` op, reusing one instance per context.

    Instances are memoized in ``gpu_from_host.cache``, keyed by ``ctx``.
    """
    # The membership test must look at this function's own cache.  It
    # previously inspected ``gpu_alloc.cache``, which is keyed by
    # ``(ctx, memset_0)`` tuples, so the check did not reflect what had
    # actually been stored here.
    if ctx not in gpu_from_host.cache:
        gpu_from_host.cache[ctx] = GpuFromHost(ctx)
    return gpu_from_host.cache[ctx]


# Memo table for gpu_from_host, keyed by context name.
gpu_from_host.cache = {}
class
GpuToGpu
(
Op
):
class
GpuToGpu
(
Op
):
"""
"""
Transfer data between GPUs.
Transfer data between GPUs.
...
@@ -870,6 +878,15 @@ class GpuAlloc(HideC, Alloc):
...
@@ -870,6 +878,15 @@ class GpuAlloc(HideC, Alloc):
return
True
return
True
# Caching GpuAlloc
def gpu_alloc(ctx, memset_0=False):
    """Return a ``GpuAlloc(ctx, memset_0)`` op from a per-argument cache.

    Instances are memoized in ``gpu_alloc.cache`` under the key
    ``(ctx, memset_0)``, so equal arguments yield the same op object.
    """
    key = (ctx, memset_0)
    if key not in gpu_alloc.cache:
        gpu_alloc.cache[key] = GpuAlloc(ctx, memset_0)
    return gpu_alloc.cache[key]


# Memo table for gpu_alloc, keyed by ``(ctx, memset_0)``.
gpu_alloc.cache = {}
class
GpuAllocEmpty
(
HideC
,
Alloc
):
class
GpuAllocEmpty
(
HideC
,
Alloc
):
"""
"""
Allocate uninitialized memory on the GPU.
Allocate uninitialized memory on the GPU.
...
@@ -956,6 +973,14 @@ def empty_like(var):
...
@@ -956,6 +973,14 @@ def empty_like(var):
return
GpuAllocEmpty
(
var
.
type
.
dtype
,
var
.
type
.
context_name
)(
*
var
.
shape
)
return
GpuAllocEmpty
(
var
.
type
.
dtype
,
var
.
type
.
context_name
)(
*
var
.
shape
)
def gpu_alloc_empty(ctx, dtype):
    """Return a ``GpuAllocEmpty(dtype, ctx)`` op from a per-argument cache.

    Note the argument order: this helper takes ``(ctx, dtype)`` while the
    ``GpuAllocEmpty`` constructor is called with ``(dtype, ctx)``.
    Instances are memoized in ``gpu_alloc_empty.cache`` under the key
    ``(dtype, ctx)``.
    """
    key = (dtype, ctx)
    if key not in gpu_alloc_empty.cache:
        gpu_alloc_empty.cache[key] = GpuAllocEmpty(dtype, ctx)
    return gpu_alloc_empty.cache[key]


# Memo table for gpu_alloc_empty, keyed by ``(dtype, ctx)``.
gpu_alloc_empty.cache = {}
class
GpuContiguous
(
Op
):
class
GpuContiguous
(
Op
):
"""
"""
Return a C contiguous version of the input.
Return a C contiguous version of the input.
...
@@ -1031,6 +1056,7 @@ class GpuReshape(HideC, tensor.Reshape):
...
@@ -1031,6 +1056,7 @@ class GpuReshape(HideC, tensor.Reshape):
def
make_node
(
self
,
x
,
shp
):
def
make_node
(
self
,
x
,
shp
):
ctx_name
=
infer_context_name
(
x
)
ctx_name
=
infer_context_name
(
x
)
x
=
as_gpuarray_variable
(
x
,
context_name
=
ctx_name
)
x
=
as_gpuarray_variable
(
x
,
context_name
=
ctx_name
)
shp
=
tensor
.
as_tensor_variable
(
shp
)
res
=
host_from_gpu
(
x
)
.
reshape
(
shp
,
ndim
=
self
.
ndim
)
res
=
host_from_gpu
(
x
)
.
reshape
(
shp
,
ndim
=
self
.
ndim
)
otype
=
GpuArrayType
(
dtype
=
res
.
dtype
,
otype
=
GpuArrayType
(
dtype
=
res
.
dtype
,
broadcastable
=
res
.
broadcastable
,
broadcastable
=
res
.
broadcastable
,
...
...
theano/gpuarray/dnn.py
浏览文件 @
a24fd9bb
...
@@ -14,7 +14,7 @@ from theano.gof import Optimizer, local_optimizer, COp
...
@@ -14,7 +14,7 @@ from theano.gof import Optimizer, local_optimizer, COp
from
theano.gof.cmodule
import
GCC_compiler
from
theano.gof.cmodule
import
GCC_compiler
from
theano.gof.type
import
CDataType
,
Generic
from
theano.gof.type
import
CDataType
,
Generic
from
theano.compile
import
optdb
from
theano.compile
import
optdb
from
theano.compile.ops
import
shape_i
from
theano.compile.ops
import
shape_i
,
shape_i_op
from
theano.tensor.nnet
import
LogSoftmax
,
SoftmaxGrad
from
theano.tensor.nnet
import
LogSoftmax
,
SoftmaxGrad
from
theano.tensor.nnet.abstract_conv
import
(
AbstractConv2d
,
from
theano.tensor.nnet.abstract_conv
import
(
AbstractConv2d
,
AbstractConv2d_gradWeights
,
AbstractConv2d_gradWeights
,
...
@@ -23,15 +23,18 @@ from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
...
@@ -23,15 +23,18 @@ from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
from
theano.tensor.signal.pool
import
(
from
theano.tensor.signal.pool
import
(
Pool
,
MaxPoolGrad
,
AveragePoolGrad
)
Pool
,
MaxPoolGrad
,
AveragePoolGrad
)
from
.
import
pygpu
from
.
import
pygpu
from
.type
import
get_context
,
gpu_context_type
,
list_contexts
,
GpuArrayType
from
.type
import
get_context
,
gpu_context_type
,
list_contexts
from
.basic_ops
import
(
as_gpuarray_variable
,
infer_context_name
,
from
.basic_ops
import
(
as_gpuarray_variable
,
infer_context_name
,
gpu_contiguous
,
GpuAllocEmpty
,
empty_like
)
gpu_contiguous
,
gpu_alloc_empty
,
empty_like
,
GpuArrayType
)
from
.elemwise
import
GpuElemwise
from
.elemwise
import
GpuElemwise
# These don't exist in gpuarray
# These don't exist in gpuarray
# GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad
# GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad
from
.nnet
import
GpuSoftmax
from
.nnet
import
GpuSoftmax
from
.opt
import
gpu_seqopt
,
register_opt
,
conv_groupopt
,
op_lifter
from
.opt
import
(
gpu_seqopt
,
register_opt
,
conv_groupopt
,
op_lifter
,
register_opt2
)
from
.opt_util
import
alpha_merge
,
output_merge
,
inplace_allocempty
from
.opt_util
import
alpha_merge
,
output_merge
,
inplace_allocempty
from
theano.configdefaults
import
SUPPORTED_DNN_CONV_ALGO_BWD_FILTER
from
theano.configdefaults
import
SUPPORTED_DNN_CONV_ALGO_BWD_FILTER
...
@@ -94,6 +97,7 @@ def dnn_present():
...
@@ -94,6 +97,7 @@ def dnn_present():
if
config
.
dnn
.
enabled
==
"False"
:
if
config
.
dnn
.
enabled
==
"False"
:
dnn_present
.
msg
=
"Disabled by dnn.enabled flag"
dnn_present
.
msg
=
"Disabled by dnn.enabled flag"
dnn_present
.
avail
=
False
dnn_present
.
avail
=
False
return
False
if
pygpu
is
None
:
if
pygpu
is
None
:
dnn_present
.
msg
=
"PyGPU not available"
dnn_present
.
msg
=
"PyGPU not available"
...
@@ -370,6 +374,19 @@ class GpuDnnConvDesc(COp):
...
@@ -370,6 +374,19 @@ class GpuDnnConvDesc(COp):
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
return
(
super
(
GpuDnnConvDesc
,
self
)
.
c_code_cache_version
(),
version
())
return
(
super
(
GpuDnnConvDesc
,
self
)
.
c_code_cache_version
(),
version
())
def gpu_dnn_conv_desc(border_mode, subsample=(1, 1), conv_mode='conv',
                      precision="float32"):
    """Return a cached ``GpuDnnConvDesc`` op for the given parameters.

    Instances are memoized in ``gpu_dnn_conv_desc.cache`` under the key
    ``(border_mode, subsample, conv_mode, precision)``; all four values
    must therefore be hashable.
    """
    key = (border_mode, subsample, conv_mode, precision)
    if key not in gpu_dnn_conv_desc.cache:
        gpu_dnn_conv_desc.cache[key] = GpuDnnConvDesc(border_mode,
                                                      subsample,
                                                      conv_mode,
                                                      precision)
    return gpu_dnn_conv_desc.cache[key]


# Memo table for gpu_dnn_conv_desc.
gpu_dnn_conv_desc.cache = {}
# scalar constants
# scalar constants
_zero
=
constant
(
numpy
.
asarray
(
0.0
,
dtype
=
'float64'
))
_zero
=
constant
(
numpy
.
asarray
(
0.0
,
dtype
=
'float64'
))
_one
=
constant
(
numpy
.
asarray
(
1.0
,
dtype
=
'float64'
))
_one
=
constant
(
numpy
.
asarray
(
1.0
,
dtype
=
'float64'
))
...
@@ -526,8 +543,8 @@ class GpuDnnConv(DnnBase):
...
@@ -526,8 +543,8 @@ class GpuDnnConv(DnnBase):
top
=
gpu_contiguous
(
top
)
top
=
gpu_contiguous
(
top
)
d_img
=
GpuDnnConvG
radI
()(
kerns
,
top
,
empty_like
(
img
),
desc
)
d_img
=
gpu_dnn_conv_g
radI
()(
kerns
,
top
,
empty_like
(
img
),
desc
)
d_kerns
=
GpuDnnConvG
radW
()(
img
,
top
,
empty_like
(
kerns
),
desc
)
d_kerns
=
gpu_dnn_conv_g
radW
()(
img
,
top
,
empty_like
(
kerns
),
desc
)
d_alpha
=
grad_not_implemented
(
self
,
4
,
alpha
)
d_alpha
=
grad_not_implemented
(
self
,
4
,
alpha
)
d_beta
=
grad_not_implemented
(
self
,
5
,
beta
)
d_beta
=
grad_not_implemented
(
self
,
5
,
beta
)
...
@@ -564,6 +581,14 @@ class GpuDnnConv(DnnBase):
...
@@ -564,6 +581,14 @@ class GpuDnnConv(DnnBase):
return
[
shape
[
2
]]
return
[
shape
[
2
]]
def gpu_dnn_conv(algo=None, inplace=False):
    """Return a cached ``GpuDnnConv(algo, inplace)`` op.

    Instances are memoized in ``gpu_dnn_conv.cache`` under the key
    ``(algo, inplace)``.
    """
    key = (algo, inplace)
    if key not in gpu_dnn_conv.cache:
        gpu_dnn_conv.cache[key] = GpuDnnConv(algo, inplace)
    return gpu_dnn_conv.cache[key]


# Memo table for gpu_dnn_conv, keyed by ``(algo, inplace)``.
gpu_dnn_conv.cache = {}
class
GpuDnnConvGradW
(
DnnBase
):
class
GpuDnnConvGradW
(
DnnBase
):
"""
"""
...
@@ -608,8 +633,8 @@ class GpuDnnConvGradW(DnnBase):
...
@@ -608,8 +633,8 @@ class GpuDnnConvGradW(DnnBase):
kerns
=
gpu_contiguous
(
kerns
)
kerns
=
gpu_contiguous
(
kerns
)
d_img
=
GpuDnnConvG
radI
()(
kerns
,
top
,
empty_like
(
img
),
desc
)
d_img
=
gpu_dnn_conv_g
radI
()(
kerns
,
top
,
empty_like
(
img
),
desc
)
d_top
=
GpuDnnC
onv
()(
img
,
kerns
,
empty_like
(
top
),
desc
)
d_top
=
gpu_dnn_c
onv
()(
img
,
kerns
,
empty_like
(
top
),
desc
)
d_alpha
=
grad_not_implemented
(
self
,
4
,
alpha
)
d_alpha
=
grad_not_implemented
(
self
,
4
,
alpha
)
d_beta
=
grad_not_implemented
(
self
,
5
,
beta
)
d_beta
=
grad_not_implemented
(
self
,
5
,
beta
)
...
@@ -686,6 +711,14 @@ class GpuDnnConvGradW(DnnBase):
...
@@ -686,6 +711,14 @@ class GpuDnnConvGradW(DnnBase):
return
[
shape
[
2
]]
return
[
shape
[
2
]]
def gpu_dnn_conv_gradW(algo=None, inplace=False):
    """Return a cached ``GpuDnnConvGradW`` op.

    Instances are memoized in ``gpu_dnn_conv_gradW.cache`` under the key
    ``(algo, inplace)``.
    """
    key = (algo, inplace)
    if key not in gpu_dnn_conv_gradW.cache:
        # The constructor takes ``inplace`` before ``algo`` (the reverse
        # of this helper's signature), as confirmed by the keyword calls
        # ``GpuDnnConvGradW(algo=..., inplace=True)`` elsewhere in this
        # module; pass by keyword so the ordering cannot be mixed up.
        gpu_dnn_conv_gradW.cache[key] = GpuDnnConvGradW(inplace=inplace,
                                                        algo=algo)
    return gpu_dnn_conv_gradW.cache[key]


# Memo table for gpu_dnn_conv_gradW, keyed by ``(algo, inplace)``.
gpu_dnn_conv_gradW.cache = {}
class
GpuDnnConvGradI
(
DnnBase
):
class
GpuDnnConvGradI
(
DnnBase
):
"""
"""
...
@@ -741,8 +774,8 @@ class GpuDnnConvGradI(DnnBase):
...
@@ -741,8 +774,8 @@ class GpuDnnConvGradI(DnnBase):
img
=
gpu_contiguous
(
img
)
img
=
gpu_contiguous
(
img
)
d_kerns
=
GpuDnnConvG
radW
()(
img
,
top
,
empty_like
(
kerns
),
desc
)
d_kerns
=
gpu_dnn_conv_g
radW
()(
img
,
top
,
empty_like
(
kerns
),
desc
)
d_top
=
GpuDnnC
onv
()(
img
,
kerns
,
empty_like
(
top
),
desc
)
d_top
=
gpu_dnn_c
onv
()(
img
,
kerns
,
empty_like
(
top
),
desc
)
d_alpha
=
grad_not_implemented
(
self
,
4
,
alpha
)
d_alpha
=
grad_not_implemented
(
self
,
4
,
alpha
)
d_beta
=
grad_not_implemented
(
self
,
5
,
beta
)
d_beta
=
grad_not_implemented
(
self
,
5
,
beta
)
...
@@ -823,6 +856,14 @@ class GpuDnnConvGradI(DnnBase):
...
@@ -823,6 +856,14 @@ class GpuDnnConvGradI(DnnBase):
return
[
shape
[
2
]]
return
[
shape
[
2
]]
def gpu_dnn_conv_gradI(algo=None, inplace=False):
    """Return a cached ``GpuDnnConvGradI`` op.

    Instances are memoized in ``gpu_dnn_conv_gradI.cache`` under the key
    ``(algo, inplace)``.
    """
    key = (algo, inplace)
    if key not in gpu_dnn_conv_gradI.cache:
        # The constructor takes ``inplace`` before ``algo`` (the reverse
        # of this helper's signature), as confirmed by the keyword calls
        # ``GpuDnnConvGradI(algo=..., inplace=True)`` elsewhere in this
        # module; pass by keyword so the ordering cannot be mixed up.
        gpu_dnn_conv_gradI.cache[key] = GpuDnnConvGradI(inplace=inplace,
                                                        algo=algo)
    return gpu_dnn_conv_gradI.cache[key]


# Memo table for gpu_dnn_conv_gradI, keyed by ``(algo, inplace)``.
gpu_dnn_conv_gradI.cache = {}
def
dnn_conv
(
img
,
kerns
,
border_mode
=
'valid'
,
subsample
=
(
1
,
1
),
def
dnn_conv
(
img
,
kerns
,
border_mode
=
'valid'
,
subsample
=
(
1
,
1
),
conv_mode
=
'conv'
,
direction_hint
=
None
,
workmem
=
None
,
conv_mode
=
'conv'
,
direction_hint
=
None
,
workmem
=
None
,
algo
=
None
,
precision
=
None
):
algo
=
None
,
precision
=
None
):
...
@@ -896,12 +937,12 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
...
@@ -896,12 +937,12 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
kerns
=
gpu_contiguous
(
kerns
.
dimshuffle
(
1
,
0
,
2
,
3
))
kerns
=
gpu_contiguous
(
kerns
.
dimshuffle
(
1
,
0
,
2
,
3
))
shape2
=
shape_i
(
img
,
2
,
fgraph
)
-
shape_i
(
kerns
,
2
,
fgraph
)
+
1
shape2
=
shape_i
(
img
,
2
,
fgraph
)
-
shape_i
(
kerns
,
2
,
fgraph
)
+
1
shape3
=
shape_i
(
img
,
3
,
fgraph
)
-
shape_i
(
kerns
,
3
,
fgraph
)
+
1
shape3
=
shape_i
(
img
,
3
,
fgraph
)
-
shape_i
(
kerns
,
3
,
fgraph
)
+
1
out
=
GpuAllocEmpty
(
img
.
dtype
,
ctx_nam
e
)(
out
=
gpu_alloc_empty
(
ctx_name
,
dtype
=
img
.
dtyp
e
)(
shape_i
(
kerns
,
1
,
fgraph
),
shape_i
(
kerns
,
1
,
fgraph
),
shape_i
(
img
,
1
,
fgraph
),
shape2
,
shape3
)
shape_i
(
img
,
1
,
fgraph
),
shape2
,
shape3
)
desc
=
GpuDnnConvDesc
(
border_mode
=
'valid'
,
subsample
=
(
1
,
1
),
desc
=
GpuDnnConvDesc
(
border_mode
=
'valid'
,
subsample
=
(
1
,
1
),
conv_mode
=
'cross'
,
precision
=
precision
)(
out
.
shape
)
conv_mode
=
'cross'
,
precision
=
precision
)(
out
.
shape
)
conv
=
GpuDnnConvG
radW
()(
img
,
kerns
,
out
,
desc
)
conv
=
gpu_dnn_conv_g
radW
()(
img
,
kerns
,
out
,
desc
)
return
as_gpuarray_variable
(
conv
.
dimshuffle
(
1
,
0
,
2
,
3
),
ctx_name
)
return
as_gpuarray_variable
(
conv
.
dimshuffle
(
1
,
0
,
2
,
3
),
ctx_name
)
elif
(
border_mode
==
'full'
and
subsample
==
(
1
,
1
)
and
elif
(
border_mode
==
'full'
and
subsample
==
(
1
,
1
)
and
...
@@ -914,26 +955,30 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
...
@@ -914,26 +955,30 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
conv_mode
=
'cross'
if
conv_mode
==
'conv'
else
'conv'
conv_mode
=
'cross'
if
conv_mode
==
'conv'
else
'conv'
shape2
=
shape_i
(
img
,
2
,
fgraph
)
+
shape_i
(
kerns
,
2
,
fgraph
)
-
1
shape2
=
shape_i
(
img
,
2
,
fgraph
)
+
shape_i
(
kerns
,
2
,
fgraph
)
-
1
shape3
=
shape_i
(
img
,
3
,
fgraph
)
+
shape_i
(
kerns
,
3
,
fgraph
)
-
1
shape3
=
shape_i
(
img
,
3
,
fgraph
)
+
shape_i
(
kerns
,
3
,
fgraph
)
-
1
out
=
GpuAllocEmpty
(
img
.
dtype
,
ctx_nam
e
)(
shape_i
(
img
,
0
,
fgraph
),
out
=
gpu_alloc_empty
(
ctx_name
,
dtype
=
img
.
dtyp
e
)(
shape_i
(
img
,
0
,
fgraph
),
shape_i
(
kerns
,
1
,
fgraph
),
shape_i
(
kerns
,
1
,
fgraph
),
shape2
,
shape3
)
shape2
,
shape3
)
desc
=
GpuDnnConvDesc
(
border_mode
=
'valid'
,
subsample
=
(
1
,
1
),
desc
=
GpuDnnConvDesc
(
border_mode
=
'valid'
,
subsample
=
(
1
,
1
),
conv_mode
=
conv_mode
,
precision
=
precision
)(
kerns
.
shape
)
conv_mode
=
conv_mode
,
precision
=
precision
)(
kerns
.
shape
)
return
GpuDnnConvG
radI
()(
kerns
,
img
,
out
,
desc
)
return
gpu_dnn_conv_g
radI
()(
kerns
,
img
,
out
,
desc
)
# Standard case: We use GpuDnnConv with suitable padding.
# Standard case: We use GpuDnnConv with suitable padding.
# contig_version will return a gpu_contiguous copy
# contig_version will return a gpu_contiguous copy
# if the img contains negative strides
# if the img contains negative strides
img
=
gpu_contiguous
(
img
)
img
=
gpu_contiguous
(
img
)
kerns
=
gpu_contiguous
(
kerns
)
kerns
=
gpu_contiguous
(
kerns
)
desc
=
GpuDnnConvD
esc
(
border_mode
=
border_mode
,
subsample
=
subsample
,
desc
=
gpu_dnn_conv_d
esc
(
border_mode
=
border_mode
,
subsample
=
subsample
,
conv_mode
=
conv_mode
,
precision
=
precision
)(
kerns
.
shape
)
conv_mode
=
conv_mode
,
precision
=
precision
)(
kerns
.
shape
)
desc_op
=
desc
.
owner
.
op
desc_op
=
desc
.
owner
.
op
out_shp
=
GpuDnnConv
.
get_out_shape
(
img
.
shape
,
kerns
.
shape
,
# We can use Shape_i and bypass the infer_shape here as this is on
desc_op
.
border_mode
,
# the input of node and it will always be present.
desc_op
.
subsample
)
ishape
=
[
shape_i_op
(
i
)(
img
)
for
i
in
range
(
img
.
ndim
)]
out
=
GpuAllocEmpty
(
img
.
dtype
,
ctx_name
)(
*
out_shp
)
kshape
=
[
shape_i_op
(
i
)(
kerns
)
for
i
in
range
(
kerns
.
ndim
)]
return
GpuDnnConv
(
algo
=
algo
)(
img
,
kerns
,
out
,
desc
)
out_shp
=
get_conv_output_shape
(
ishape
,
kshape
,
desc_op
.
border_mode
,
desc_op
.
subsample
)
out
=
gpu_alloc_empty
(
ctx_name
,
dtype
=
img
.
dtype
)(
*
out_shp
)
return
gpu_dnn_conv
(
algo
=
algo
)(
img
,
kerns
,
out
,
desc
)
def
dnn_gradweight
(
img
,
topgrad
,
kerns_shp
,
border_mode
=
'valid'
,
def
dnn_gradweight
(
img
,
topgrad
,
kerns_shp
,
border_mode
=
'valid'
,
...
@@ -944,10 +989,10 @@ def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid',
...
@@ -944,10 +989,10 @@ def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid',
img
=
gpu_contiguous
(
img
)
img
=
gpu_contiguous
(
img
)
topgrad
=
gpu_contiguous
(
topgrad
)
topgrad
=
gpu_contiguous
(
topgrad
)
kerns_shp
=
as_tensor_variable
(
kerns_shp
)
kerns_shp
=
as_tensor_variable
(
kerns_shp
)
desc
=
GpuDnnConvD
esc
(
border_mode
=
border_mode
,
subsample
=
subsample
,
desc
=
gpu_dnn_conv_d
esc
(
border_mode
=
border_mode
,
subsample
=
subsample
,
conv_mode
=
conv_mode
)(
kerns_shp
)
conv_mode
=
conv_mode
)(
kerns_shp
)
out
=
GpuAllocEmpty
(
img
.
dtype
,
ctx_nam
e
)(
*
kerns_shp
)
out
=
gpu_alloc_empty
(
ctx_name
,
dtype
=
img
.
dtyp
e
)(
*
kerns_shp
)
return
GpuDnnConvG
radW
()(
img
,
topgrad
,
out
,
desc
)
return
gpu_dnn_conv_g
radW
()(
img
,
topgrad
,
out
,
desc
)
def
dnn_gradinput
(
kerns
,
topgrad
,
img_shp
,
border_mode
=
'valid'
,
def
dnn_gradinput
(
kerns
,
topgrad
,
img_shp
,
border_mode
=
'valid'
,
...
@@ -958,10 +1003,10 @@ def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid',
...
@@ -958,10 +1003,10 @@ def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid',
kerns
=
gpu_contiguous
(
kerns
)
kerns
=
gpu_contiguous
(
kerns
)
topgrad
=
gpu_contiguous
(
topgrad
)
topgrad
=
gpu_contiguous
(
topgrad
)
img_shp
=
as_tensor_variable
(
img_shp
)
img_shp
=
as_tensor_variable
(
img_shp
)
desc
=
GpuDnnConvD
esc
(
border_mode
=
border_mode
,
subsample
=
subsample
,
desc
=
gpu_dnn_conv_d
esc
(
border_mode
=
border_mode
,
subsample
=
subsample
,
conv_mode
=
conv_mode
)(
kerns
.
shape
)
conv_mode
=
conv_mode
)(
kerns
.
shape
)
out
=
GpuAllocEmpty
(
kerns
.
dtype
,
ctx_nam
e
)(
*
img_shp
)
out
=
gpu_alloc_empty
(
ctx_name
,
kerns
.
dtyp
e
)(
*
img_shp
)
return
GpuDnnConvG
radI
()(
kerns
,
topgrad
,
out
,
desc
)
return
gpu_dnn_conv_g
radI
()(
kerns
,
topgrad
,
out
,
desc
)
class
GpuDnnPoolDesc
(
Op
):
class
GpuDnnPoolDesc
(
Op
):
...
@@ -1382,53 +1427,59 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
...
@@ -1382,53 +1427,59 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
return
Apply
(
self
,
[
dy
,
sm
],
[
sm
.
type
()])
return
Apply
(
self
,
[
dy
,
sm
],
[
sm
.
type
()])
@local_optimizer
([
AbstractConv2d
,
AbstractConv2d_gradWeights
,
@register_opt2
([
AbstractConv2d
,
AbstractConv2d_gradWeights
,
AbstractConv2d_gradInputs
])
AbstractConv2d_gradInputs
],
'fast_compile'
,
'conv_dnn'
,
'cudnn'
)
def
local_abstractconv_cudnn
(
node
):
def
local_abstractconv_cudnn_graph
(
op
,
context_name
,
inputs
,
outputs
):
if
(
not
isinstance
(
node
.
op
,
(
AbstractConv2d
,
if
(
not
isinstance
(
op
,
(
AbstractConv2d
,
AbstractConv2d_gradWeights
,
AbstractConv2d_gradWeights
,
AbstractConv2d_gradInputs
))):
AbstractConv2d_gradInputs
))):
return
None
return
inp1
=
node
.
inputs
[
0
]
inp2
=
node
.
inputs
[
1
]
if
(
node
.
op
.
filter_dilation
!=
(
1
,
1
)):
if
(
op
.
filter_dilation
!=
(
1
,
1
)):
return
None
return
None
i
f
not
isinstance
(
inp1
.
type
,
GpuArrayType
):
i
np1
=
inputs
[
0
]
return
None
inp2
=
inputs
[
1
]
if
not
dnn_available
(
inp1
.
type
.
context_name
):
if
not
dnn_available
(
inp1
.
type
.
context_name
):
raise_no_cudnn
()
raise_no_cudnn
()
if
node
.
op
.
filter_flip
:
if
op
.
filter_flip
:
conv_mode
=
'conv'
conv_mode
=
'conv'
else
:
else
:
conv_mode
=
'cross'
conv_mode
=
'cross'
if
isinstance
(
node
.
op
,
AbstractConv2d
):
if
isinstance
(
op
,
AbstractConv2d
):
rval
=
dnn_conv
(
inp1
,
inp2
,
rval
=
dnn_conv
(
inp1
,
inp2
,
border_mode
=
node
.
op
.
border_mode
,
border_mode
=
op
.
border_mode
,
subsample
=
node
.
op
.
subsample
,
subsample
=
op
.
subsample
,
direction_hint
=
'forward!'
,
direction_hint
=
'forward!'
,
conv_mode
=
conv_mode
)
conv_mode
=
conv_mode
)
if
isinstance
(
node
.
op
,
AbstractConv2d_gradWeights
):
elif
isinstance
(
op
,
AbstractConv2d_gradWeights
):
shape
=
(
inp2
.
shape
[
1
],
inp1
.
shape
[
1
],
shape
=
(
inp2
.
shape
[
1
],
inp1
.
shape
[
1
],
node
.
inputs
[
2
][
0
],
node
.
inputs
[
2
][
1
])
inputs
[
2
][
0
],
inputs
[
2
][
1
])
rval
=
dnn_gradweight
(
inp1
,
inp2
,
shape
,
rval
=
dnn_gradweight
(
inp1
,
inp2
,
shape
,
border_mode
=
node
.
op
.
border_mode
,
border_mode
=
op
.
border_mode
,
subsample
=
node
.
op
.
subsample
,
subsample
=
op
.
subsample
,
conv_mode
=
conv_mode
)
conv_mode
=
conv_mode
)
if
isinstance
(
node
.
op
,
AbstractConv2d_gradInputs
):
elif
isinstance
(
op
,
AbstractConv2d_gradInputs
):
shape
=
(
inp2
.
shape
[
0
],
inp1
.
shape
[
1
],
shape
=
(
inp2
.
shape
[
0
],
inp1
.
shape
[
1
],
node
.
inputs
[
2
][
0
],
node
.
inputs
[
2
][
1
])
inputs
[
2
][
0
],
inputs
[
2
][
1
])
rval
=
dnn_gradinput
(
inp1
,
inp2
,
shape
,
rval
=
dnn_gradinput
(
inp1
,
inp2
,
shape
,
border_mode
=
node
.
op
.
border_mode
,
border_mode
=
op
.
border_mode
,
subsample
=
node
.
op
.
subsample
,
subsample
=
op
.
subsample
,
conv_mode
=
conv_mode
)
conv_mode
=
conv_mode
)
return
[
rval
]
return
[
rval
]
@local_optimizer([AbstractConv2d,
                  AbstractConv2d_gradWeights,
                  AbstractConv2d_gradInputs])
def local_abstractconv_cudnn(node):
    """Local optimizer wrapper around ``local_abstractconv_cudnn_graph``.

    Only applies when the node's first input is already a GPU array;
    otherwise returns ``None`` so the node is left untouched.
    """
    # Cheap type guard first: nodes that will be rejected never trigger
    # context inference.
    # NOTE(review): presumably infer_context_name can fail when no input
    # lives on a GPU -- confirm; checking the type first sidesteps that.
    if not isinstance(node.inputs[0].type, GpuArrayType):
        return
    ctx = infer_context_name(*node.inputs)
    return local_abstractconv_cudnn_graph(node.op, ctx,
                                          node.inputs, node.outputs)
conv_groupopt
.
register
(
'local_abstractconv_cudnn'
,
conv_groupopt
.
register
(
'local_abstractconv_cudnn'
,
local_abstractconv_cudnn
,
20
,
local_abstractconv_cudnn
,
20
,
'fast_compile'
,
'fast_run'
,
'fast_compile'
,
'fast_run'
,
...
@@ -1437,17 +1488,17 @@ conv_groupopt.register('local_abstractconv_cudnn',
...
@@ -1437,17 +1488,17 @@ conv_groupopt.register('local_abstractconv_cudnn',
@inplace_allocempty
(
GpuDnnConv
,
2
)
@inplace_allocempty
(
GpuDnnConv
,
2
)
def
local_dnn_conv_inplace
(
node
,
inputs
):
def
local_dnn_conv_inplace
(
node
,
inputs
):
return
[
GpuDnnC
onv
(
algo
=
node
.
op
.
algo
,
inplace
=
True
)(
*
inputs
)]
return
[
gpu_dnn_c
onv
(
algo
=
node
.
op
.
algo
,
inplace
=
True
)(
*
inputs
)]
@inplace_allocempty
(
GpuDnnConvGradW
,
2
)
@inplace_allocempty
(
GpuDnnConvGradW
,
2
)
def
local_dnn_convgw_inplace
(
node
,
inputs
):
def
local_dnn_convgw_inplace
(
node
,
inputs
):
return
[
GpuDnnConvG
radW
(
algo
=
node
.
op
.
algo
,
inplace
=
True
)(
*
inputs
)]
return
[
gpu_dnn_conv_g
radW
(
algo
=
node
.
op
.
algo
,
inplace
=
True
)(
*
inputs
)]
@inplace_allocempty
(
GpuDnnConvGradI
,
2
)
@inplace_allocempty
(
GpuDnnConvGradI
,
2
)
def
local_dnn_convgi_inplace
(
node
,
inputs
):
def
local_dnn_convgi_inplace
(
node
,
inputs
):
return
[
GpuDnnConvG
radI
(
algo
=
node
.
op
.
algo
,
inplace
=
True
)(
*
inputs
)]
return
[
gpu_dnn_conv_g
radI
(
algo
=
node
.
op
.
algo
,
inplace
=
True
)(
*
inputs
)]
optdb
.
register
(
'local_dnna_conv_inplace'
,
optdb
.
register
(
'local_dnna_conv_inplace'
,
tensor
.
opt
.
in2out
(
local_dnn_conv_inplace
,
tensor
.
opt
.
in2out
(
local_dnn_conv_inplace
,
...
@@ -1460,73 +1511,75 @@ optdb.register('local_dnna_conv_inplace',
...
@@ -1460,73 +1511,75 @@ optdb.register('local_dnna_conv_inplace',
@register_opt
(
'cudnn'
)
@register_opt
(
'cudnn'
)
@alpha_merge
(
GpuDnnConv
,
alpha_in
=
4
,
beta_in
=
5
)
@alpha_merge
(
GpuDnnConv
,
alpha_in
=
4
,
beta_in
=
5
)
def
local_dnn_conv_alpha_merge
(
node
,
*
inputs
):
def
local_dnn_conv_alpha_merge
(
node
,
*
inputs
):
return
[
GpuDnnC
onv
(
algo
=
node
.
op
.
algo
)(
*
inputs
)]
return
[
gpu_dnn_c
onv
(
algo
=
node
.
op
.
algo
)(
*
inputs
)]
@register_opt
(
'cudnn'
)
@register_opt
(
'cudnn'
)
@alpha_merge
(
GpuDnnConvGradW
,
alpha_in
=
4
,
beta_in
=
5
)
@alpha_merge
(
GpuDnnConvGradW
,
alpha_in
=
4
,
beta_in
=
5
)
def
local_dnn_convw_alpha_merge
(
node
,
*
inputs
):
def
local_dnn_convw_alpha_merge
(
node
,
*
inputs
):
return
[
GpuDnnConvG
radW
(
algo
=
node
.
op
.
algo
)(
*
inputs
)]
return
[
gpu_dnn_conv_g
radW
(
algo
=
node
.
op
.
algo
)(
*
inputs
)]
@register_opt
(
'cudnn'
)
@register_opt
(
'cudnn'
)
@alpha_merge
(
GpuDnnConvGradI
,
alpha_in
=
4
,
beta_in
=
5
)
@alpha_merge
(
GpuDnnConvGradI
,
alpha_in
=
4
,
beta_in
=
5
)
def
local_dnn_convi_alpha_merge
(
node
,
*
inputs
):
def
local_dnn_convi_alpha_merge
(
node
,
*
inputs
):
return
[
GpuDnnConvG
radI
(
algo
=
node
.
op
.
algo
)(
*
inputs
)]
return
[
gpu_dnn_conv_g
radI
(
algo
=
node
.
op
.
algo
)(
*
inputs
)]
@register_opt
(
'cudnn'
)
@register_opt
(
'cudnn'
)
@output_merge
(
GpuDnnConv
,
alpha_in
=
4
,
beta_in
=
5
,
out_in
=
2
)
@output_merge
(
GpuDnnConv
,
alpha_in
=
4
,
beta_in
=
5
,
out_in
=
2
)
def
local_dnn_conv_output_merge
(
node
,
*
inputs
):
def
local_dnn_conv_output_merge
(
node
,
*
inputs
):
inputs
=
inputs
[
0
:
2
]
+
(
gpu_contiguous
(
inputs
[
2
]),)
+
inputs
[
3
:]
inputs
=
inputs
[
0
:
2
]
+
(
gpu_contiguous
(
inputs
[
2
]),)
+
inputs
[
3
:]
return
[
GpuDnnC
onv
(
algo
=
node
.
op
.
algo
)(
*
inputs
)]
return
[
gpu_dnn_c
onv
(
algo
=
node
.
op
.
algo
)(
*
inputs
)]
@register_opt
(
'cudnn'
)
@register_opt
(
'cudnn'
)
@output_merge
(
GpuDnnConvGradW
,
alpha_in
=
4
,
beta_in
=
5
,
out_in
=
2
)
@output_merge
(
GpuDnnConvGradW
,
alpha_in
=
4
,
beta_in
=
5
,
out_in
=
2
)
def
local_dnn_convw_output_merge
(
node
,
*
inputs
):
def
local_dnn_convw_output_merge
(
node
,
*
inputs
):
inputs
=
inputs
[
0
:
2
]
+
(
gpu_contiguous
(
inputs
[
2
]),)
+
inputs
[
3
:]
inputs
=
inputs
[
0
:
2
]
+
(
gpu_contiguous
(
inputs
[
2
]),)
+
inputs
[
3
:]
return
[
GpuDnnConvG
radW
(
algo
=
node
.
op
.
algo
)(
*
inputs
)]
return
[
gpu_dnn_conv_g
radW
(
algo
=
node
.
op
.
algo
)(
*
inputs
)]
@register_opt
(
'cudnn'
)
@register_opt
(
'cudnn'
)
@output_merge
(
GpuDnnConvGradI
,
alpha_in
=
4
,
beta_in
=
5
,
out_in
=
2
)
@output_merge
(
GpuDnnConvGradI
,
alpha_in
=
4
,
beta_in
=
5
,
out_in
=
2
)
def
local_dnn_convi_output_merge
(
node
,
*
inputs
):
def
local_dnn_convi_output_merge
(
node
,
*
inputs
):
inputs
=
inputs
[
0
:
2
]
+
(
gpu_contiguous
(
inputs
[
2
]),)
+
inputs
[
3
:]
inputs
=
inputs
[
0
:
2
]
+
(
gpu_contiguous
(
inputs
[
2
]),)
+
inputs
[
3
:]
return
[
GpuDnnConvG
radI
(
algo
=
node
.
op
.
algo
)(
*
inputs
)]
return
[
gpu_dnn_conv_g
radI
(
algo
=
node
.
op
.
algo
)(
*
inputs
)]
@register_opt
(
'cudnn'
,
'fast_compile'
)
@register_opt
(
'cudnn'
,
'fast_compile'
)
@op_lifter
([
Pool
])
@op_lifter
([
Pool
])
def
local_pool_dnn_alternative
(
node
,
ctx_name
):
@register_opt2
([
Pool
],
'fast_compile'
,
'cudnn'
)
def
local_gpua_pool_dnn_alternative
(
op
,
ctx_name
,
inputs
,
outputs
):
if
not
dnn_available
(
ctx_name
):
if
not
dnn_available
(
ctx_name
):
raise_no_cudnn
()
raise_no_cudnn
()
if
not
node
.
op
.
ignore_border
:
if
not
op
.
ignore_border
:
return
return
img
,
=
node
.
inputs
img
,
=
inputs
img
=
as_gpuarray_variable
(
img
,
ctx_name
)
img
=
as_gpuarray_variable
(
img
,
ctx_name
)
ds
=
node
.
op
.
ds
ds
=
op
.
ds
stride
=
node
.
op
.
st
stride
=
op
.
st
pad
=
node
.
op
.
padding
pad
=
op
.
padding
mode
=
node
.
op
.
mode
mode
=
op
.
mode
return
dnn_pool
(
gpu_contiguous
(
img
),
ds
,
stride
=
stride
,
pad
=
pad
,
mode
=
mode
)
return
dnn_pool
(
gpu_contiguous
(
img
),
ds
,
stride
=
stride
,
pad
=
pad
,
mode
=
mode
)
@register_opt
(
'cudnn'
,
'fast_compile'
)
@register_opt
(
'cudnn'
,
'fast_compile'
)
@op_lifter
([
MaxPoolGrad
])
@op_lifter
([
MaxPoolGrad
])
def
local_pool_dnn_grad_stride
(
node
,
ctx_name
):
@register_opt2
([
MaxPoolGrad
],
'fast_compile'
,
'cudnn'
)
def
local_gpua_pool_dnn_grad_stride
(
op
,
ctx_name
,
inputs
,
outputs
):
if
not
dnn_available
(
ctx_name
):
if
not
dnn_available
(
ctx_name
):
raise_no_cudnn
()
raise_no_cudnn
()
if
not
node
.
op
.
ignore_border
:
if
not
op
.
ignore_border
:
return
return
inp
,
out
,
out_grad
=
node
.
inputs
inp
,
out
,
out_grad
=
inputs
inp
=
as_gpuarray_variable
(
inp
,
ctx_name
)
inp
=
as_gpuarray_variable
(
inp
,
ctx_name
)
out
=
as_gpuarray_variable
(
out
,
ctx_name
)
out
=
as_gpuarray_variable
(
out
,
ctx_name
)
out_grad
=
as_gpuarray_variable
(
out_grad
,
ctx_name
)
out_grad
=
as_gpuarray_variable
(
out_grad
,
ctx_name
)
ds
=
node
.
op
.
ds
ds
=
op
.
ds
st
=
node
.
op
.
st
st
=
op
.
st
pad
=
node
.
op
.
padding
pad
=
op
.
padding
mode
=
node
.
op
.
mode
mode
=
op
.
mode
return
GpuDnnPoolGrad
(
mode
=
mode
)(
gpu_contiguous
(
inp
),
return
GpuDnnPoolGrad
(
mode
=
mode
)(
gpu_contiguous
(
inp
),
gpu_contiguous
(
out
),
gpu_contiguous
(
out
),
...
@@ -1538,18 +1591,19 @@ def local_pool_dnn_grad_stride(node, ctx_name):
...
@@ -1538,18 +1591,19 @@ def local_pool_dnn_grad_stride(node, ctx_name):
@register_opt
(
'cudnn'
,
'fast_compile'
)
@register_opt
(
'cudnn'
,
'fast_compile'
)
@op_lifter
([
AveragePoolGrad
])
@op_lifter
([
AveragePoolGrad
])
def
local_avg_pool_dnn_grad_stride
(
node
,
ctx_name
):
@register_opt2
([
AveragePoolGrad
],
'fast_compile'
,
'cudnn'
)
def
local_gpua_avg_pool_dnn_grad_stride
(
op
,
ctx_name
,
inputs
,
outputs
):
if
not
dnn_available
(
ctx_name
):
if
not
dnn_available
(
ctx_name
):
raise_no_cudnn
()
raise_no_cudnn
()
if
not
node
.
op
.
ignore_border
:
if
not
op
.
ignore_border
:
return
return
inp
,
out_grad
=
node
.
inputs
inp
,
out_grad
=
inputs
inp
=
as_gpuarray_variable
(
inp
,
ctx_name
)
inp
=
as_gpuarray_variable
(
inp
,
ctx_name
)
out_grad
=
as_gpuarray_variable
(
out_grad
,
ctx_name
)
out_grad
=
as_gpuarray_variable
(
out_grad
,
ctx_name
)
ds
=
node
.
op
.
ds
ds
=
op
.
ds
st
=
node
.
op
.
st
st
=
op
.
st
pad
=
node
.
op
.
padding
pad
=
op
.
padding
mode
=
node
.
op
.
mode
mode
=
op
.
mode
cg
=
gpu_contiguous
(
out_grad
)
cg
=
gpu_contiguous
(
out_grad
)
...
@@ -1591,9 +1645,10 @@ def local_log_softmax_dnn(node):
...
@@ -1591,9 +1645,10 @@ def local_log_softmax_dnn(node):
@register_opt
(
'cudnn'
,
'fast_compile'
)
@register_opt
(
'cudnn'
,
'fast_compile'
)
@op_lifter
([
LogSoftmax
])
@op_lifter
([
LogSoftmax
])
def
local_logsoftmax_to_dnn
(
node
,
ctx_name
):
@register_opt2
([
LogSoftmax
],
'fast_compile'
,
'cudnn'
)
def
local_gpua_logsoftmax_to_dnn
(
op
,
ctx_name
,
inputs
,
outputs
):
# Transform the input in the format expected by GpuDnnSoftmax
# Transform the input in the format expected by GpuDnnSoftmax
inp
=
node
.
inputs
[
0
]
inp
=
inputs
[
0
]
if
inp
.
ndim
!=
2
:
if
inp
.
ndim
!=
2
:
return
return
if
not
dnn_available
(
ctx_name
)
or
version
(
raises
=
False
)
<
3000
:
if
not
dnn_available
(
ctx_name
)
or
version
(
raises
=
False
)
<
3000
:
...
@@ -1629,11 +1684,12 @@ gpu_seqopt.register("NoCuDNNRaise", NoCuDNNRaise(), 0, 'cudnn')
...
@@ -1629,11 +1684,12 @@ gpu_seqopt.register("NoCuDNNRaise", NoCuDNNRaise(), 0, 'cudnn')
@register_opt
(
'cudnn'
,
'fast_compile'
)
@register_opt
(
'cudnn'
,
'fast_compile'
)
@op_lifter
([
SoftmaxGrad
])
@op_lifter
([
SoftmaxGrad
])
def
local_softmax_dnn_grad
(
node
,
ctx_name
):
@register_opt2
([
SoftmaxGrad
],
'cudnn'
,
'fast_compile'
)
def
local_gpua_softmax_dnn_grad
(
op
,
ctx_name
,
inputs
,
outputs
):
if
not
dnn_available
(
ctx_name
):
if
not
dnn_available
(
ctx_name
):
raise_no_cudnn
(
"cuDNN needed for SoftmaxGrad"
)
raise_no_cudnn
(
"cuDNN needed for SoftmaxGrad"
)
ins
=
[]
ins
=
[]
for
n
in
node
.
inputs
:
for
n
in
inputs
:
n
=
as_gpuarray_variable
(
n
,
ctx_name
)
n
=
as_gpuarray_variable
(
n
,
ctx_name
)
if
n
.
ndim
!=
2
:
if
n
.
ndim
!=
2
:
return
return
...
...
theano/gpuarray/elemwise.py
浏览文件 @
a24fd9bb
...
@@ -2587,6 +2587,18 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -2587,6 +2587,18 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
return
kernels
return
kernels
# Cached factory for GpuCAReduceCuda.
def gpu_ca_reduce_cuda(scalar_op, axis=None, reduce_mask=None, dtype=None,
                       acc_dtype=None, pre_scalar_op=None):
    """
    Return a GpuCAReduceCuda op, reusing a previously built instance.

    The arguments are forwarded unchanged, in this order, to the
    ``GpuCAReduceCuda`` constructor.  One op is built per distinct
    combination of arguments and stored in ``gpu_ca_reduce_cuda.cache``;
    later calls with the same combination return that same object.

    ``axis`` and ``reduce_mask`` may be given as lists: they are converted
    to tuples for the cache key only (a list cannot be a dict key), so a
    list and the equal tuple share one cache entry.
    """
    def _hashable(value):
        # Only the key is normalised; the constructor still receives the
        # caller's original object.
        return tuple(value) if isinstance(value, list) else value

    key = (scalar_op, _hashable(axis), _hashable(reduce_mask), dtype,
           acc_dtype, pre_scalar_op)
    cache = gpu_ca_reduce_cuda.cache
    if key not in cache:
        cache[key] = GpuCAReduceCuda(scalar_op, axis, reduce_mask, dtype,
                                     acc_dtype, pre_scalar_op)
    return cache[key]
# The cache lives on the function object itself.
gpu_ca_reduce_cuda.cache = {}
class
GpuCAReduceCPY
(
GpuKernelBase
,
HideC
,
CAReduceDtype
):
class
GpuCAReduceCPY
(
GpuKernelBase
,
HideC
,
CAReduceDtype
):
"""
"""
CAReduce that reuse the python code from gpuarray.
CAReduce that reuse the python code from gpuarray.
...
...
theano/gpuarray/extra_ops.py
浏览文件 @
a24fd9bb
...
@@ -2,15 +2,14 @@ from __future__ import absolute_import, print_function, division
...
@@ -2,15 +2,14 @@ from __future__ import absolute_import, print_function, division
import
os
import
os
from
theano
import
Apply
,
Op
from
theano
import
Apply
,
Op
from
theano.tensor.extra_ops
import
CumsumOp
from
theano.tensor.extra_ops
import
CumsumOp
from
.basic_ops
import
infer_context_name
try
:
try
:
from
pygpu
import
gpuarray
from
pygpu
import
gpuarray
except
ImportError
:
except
ImportError
:
pass
pass
from
.basic_ops
import
(
as_gpuarray_variable
,
GpuKernelBase
,
Kernel
,
from
.basic_ops
import
(
as_gpuarray_variable
,
GpuKernelBase
,
Kernel
,
GpuReshape
)
infer_context_name
,
GpuFromHost
)
from
.opt
import
register_opt
,
op_lifter
,
register_opt2
from
.opt
import
register_opt
as
register_gpu_opt
,
op_lifter
class
GpuCumsum
(
GpuKernelBase
,
Op
):
class
GpuCumsum
(
GpuKernelBase
,
Op
):
...
@@ -40,7 +39,10 @@ class GpuCumsum(GpuKernelBase, Op):
...
@@ -40,7 +39,10 @@ class GpuCumsum(GpuKernelBase, Op):
def
make_node
(
self
,
x
):
def
make_node
(
self
,
x
):
assert
x
.
type
.
dtype
==
'float32'
,
"Only float32 supported for GpuCumSum"
assert
x
.
type
.
dtype
==
'float32'
,
"Only float32 supported for GpuCumSum"
x
=
as_gpuarray_variable
(
x
,
infer_context_name
(
x
))
context_name
=
infer_context_name
(
x
)
x
=
as_gpuarray_variable
(
x
,
context_name
)
if
x
.
ndim
>
GpuCumsum
.
SUPPORTED_NDIMS
:
if
x
.
ndim
>
GpuCumsum
.
SUPPORTED_NDIMS
:
raise
NotImplementedError
(
'Only cumsum on 1D, 2D and
\
raise
NotImplementedError
(
'Only cumsum on 1D, 2D and
\
...
@@ -451,24 +453,23 @@ class GpuCumsum(GpuKernelBase, Op):
...
@@ -451,24 +453,23 @@ class GpuCumsum(GpuKernelBase, Op):
return
super
(
GpuCumsum
,
self
)
.
c_support_code_struct
(
node
,
nodename
)
+
code
return
super
(
GpuCumsum
,
self
)
.
c_support_code_struct
(
node
,
nodename
)
+
code
@register_opt
(
'fast_compile'
)
@op_lifter
([
CumsumOp
])
@op_lifter
([
CumsumOp
])
def
use_gpu_cumsumop
(
node
,
ctx_name
):
@register_opt2
([
CumsumOp
],
'fast_compile'
)
if
node
.
inputs
[
0
]
.
dtype
==
'float32'
:
def
local_gpua_cumsumop
(
op
,
ctx_name
,
inputs
,
outputs
)
:
axis
=
node
.
op
.
axis
if
inputs
[
0
]
.
dtype
==
'float32'
:
x
=
node
.
inputs
[
0
]
axis
=
op
.
axis
x
=
inputs
[
0
]
if
axis
is
not
None
and
x
.
ndim
>
GpuCumsum
.
SUPPORTED_NDIMS
:
if
axis
is
not
None
and
x
.
ndim
>
GpuCumsum
.
SUPPORTED_NDIMS
:
return
None
return
None
if
axis
is
None
and
x
.
ndim
>
1
:
x
=
as_gpuarray_variable
(
x
,
ctx_name
)
x
=
x
.
flatten
()
x
=
GpuFromHost
(
ctx_name
)(
x
)
if
axis
is
None
and
x
.
ndim
>
1
:
x
=
GpuReshape
(
1
)(
x
,
(
-
1
,))
# ``gpu_cumsum`` assume array has been flattened if needed.
# ``gpu_cumsum`` assume array has been flattened if needed.
if
axis
is
None
:
if
axis
is
None
:
axis
=
0
axis
=
0
return
GpuCumsum
(
axis
)(
x
)
return
GpuCumsum
(
axis
)(
x
)
register_gpu_opt
()(
use_gpu_cumsumop
)
theano/gpuarray/fft.py
浏览文件 @
a24fd9bb
...
@@ -9,7 +9,7 @@ from theano.gradient import DisconnectedType
...
@@ -9,7 +9,7 @@ from theano.gradient import DisconnectedType
from
theano.gpuarray
import
(
basic_ops
,
GpuArrayType
)
from
theano.gpuarray
import
(
basic_ops
,
GpuArrayType
)
import
theano.tensor.fft
import
theano.tensor.fft
from
.opt
import
register_opt
,
op_lifter
from
.opt
import
register_opt
,
op_lifter
,
register_opt2
try
:
try
:
import
pygpu
import
pygpu
...
@@ -373,10 +373,12 @@ def _unitary(norm):
...
@@ -373,10 +373,12 @@ def _unitary(norm):
if
scikits_cuda_available
:
if
scikits_cuda_available
:
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
theano
.
tensor
.
fft
.
RFFTOp
])
@op_lifter
([
theano
.
tensor
.
fft
.
RFFTOp
])
def
local_curfft_op
(
node
,
context_name
):
@register_opt2
([
theano
.
tensor
.
fft
.
RFFTOp
],
'fast_compile'
)
def
local_gpua_curfft_op
(
op
,
ctx_name
,
inputs
,
outputs
):
return
curfft_op
return
curfft_op
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
theano
.
tensor
.
fft
.
IRFFTOp
])
@op_lifter
([
theano
.
tensor
.
fft
.
IRFFTOp
])
def
local_cuirfft_op
(
node
,
context_name
):
@register_opt2
([
theano
.
tensor
.
fft
.
IRFFTOp
],
'fast_compile'
)
def
local_gpua_cuirfft_op
(
op
,
ctx_name
,
inputs
,
outputs
):
return
cuirfft_op
return
cuirfft_op
theano/gpuarray/multinomial.py
浏览文件 @
a24fd9bb
...
@@ -14,7 +14,7 @@ from theano.gof import Op
...
@@ -14,7 +14,7 @@ from theano.gof import Op
from
theano.tensor
import
NotScalarConstantError
,
get_scalar_constant_value
from
theano.tensor
import
NotScalarConstantError
,
get_scalar_constant_value
from
theano
import
gpuarray
from
theano
import
gpuarray
from
.basic_ops
import
as_gpuarray_variable
,
infer_context_name
from
.basic_ops
import
as_gpuarray_variable
,
infer_context_name
from
.opt
import
register_opt
,
op_lifter
from
.opt
import
register_opt
,
op_lifter
,
register_opt2
from
.type
import
GpuArrayType
from
.type
import
GpuArrayType
...
@@ -227,23 +227,24 @@ KERNEL void k_multi_warp_multinomial(
...
@@ -227,23 +227,24 @@ KERNEL void k_multi_warp_multinomial(
return
(
1
,)
return
(
1
,)
@register_opt
()
@register_opt
(
'fast_compile'
)
@op_lifter
([
theano
.
sandbox
.
multinomial
.
MultinomialFromUniform
])
@op_lifter
([
theano
.
sandbox
.
multinomial
.
MultinomialFromUniform
])
def
local_gpua_multinomial
(
node
,
context_name
):
@register_opt2
([
theano
.
sandbox
.
multinomial
.
MultinomialFromUniform
],
'fast_compile'
)
def
local_gpua_multinomial
(
op
,
context_name
,
inputs
,
outputs
):
# TODO : need description for function
# TODO : need description for function
if
len
(
node
.
inputs
)
==
2
:
if
len
(
inputs
)
==
2
:
p
,
u
=
node
.
inputs
p
,
u
=
inputs
n_samples
=
1
n_samples
=
1
else
:
else
:
p
,
u
,
n_samples
=
node
.
inputs
p
,
u
,
n_samples
=
inputs
try
:
try
:
if
get_scalar_constant_value
(
n_samples
)
!=
1
:
if
get_scalar_constant_value
(
n_samples
)
!=
1
:
return
None
return
None
except
NotScalarConstantError
:
except
NotScalarConstantError
:
return
None
return
None
m
,
=
node
.
outputs
m
,
=
outputs
if
(
p
.
dtype
==
u
.
dtype
==
m
.
dtype
==
'float32'
):
if
(
p
.
dtype
==
u
.
dtype
==
m
.
dtype
==
'float32'
):
gpu_op
=
GPUAMultinomialFromUniform
(
node
.
op
.
odtype
)
gpu_op
=
GPUAMultinomialFromUniform
(
op
.
odtype
)
return
gpuarray
.
elemwise
.
GpuDimShuffle
([
False
,
False
],
[
1
,
0
])(
return
gpuarray
.
elemwise
.
GpuDimShuffle
([
False
,
False
],
[
1
,
0
])(
gpu_op
(
p
,
u
))
gpu_op
(
p
,
u
))
theano/gpuarray/neighbours.py
浏览文件 @
a24fd9bb
...
@@ -13,7 +13,7 @@ except ImportError:
...
@@ -13,7 +13,7 @@ except ImportError:
from
.basic_ops
import
(
as_gpuarray_variable
,
GpuKernelBase
,
Kernel
,
from
.basic_ops
import
(
as_gpuarray_variable
,
GpuKernelBase
,
Kernel
,
infer_context_name
)
infer_context_name
)
from
.opt
import
register_opt
as
register_gpu_opt
,
op_lifter
from
.opt
import
register_opt
2
,
op_lifter
,
register_opt
from
.type
import
GpuArrayType
from
.type
import
GpuArrayType
...
@@ -468,9 +468,9 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
...
@@ -468,9 +468,9 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
Op
.
perform
(
self
,
node
,
inp
,
out
,
ctx
)
Op
.
perform
(
self
,
node
,
inp
,
out
,
ctx
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
Images2Neibs
])
@op_lifter
([
Images2Neibs
])
def
use_gpu_images2neibs
(
node
,
context_name
):
@register_opt2
([
Images2Neibs
],
'fast_compile'
)
if
node
.
op
.
mode
in
[
'valid'
,
'ignore_borders'
,
'wrap_centered'
]:
def
local_gpua_images2neibs
(
op
,
context_name
,
inputs
,
outputs
):
return
GpuImages2Neibs
(
node
.
op
.
mode
)
if
op
.
mode
in
[
'valid'
,
'ignore_borders'
,
'wrap_centered'
]:
return
GpuImages2Neibs
(
op
.
mode
)
register_gpu_opt
()(
use_gpu_images2neibs
)
theano/gpuarray/nerv.py
浏览文件 @
a24fd9bb
...
@@ -10,7 +10,7 @@ from theano.scalar import as_scalar, constant
...
@@ -10,7 +10,7 @@ from theano.scalar import as_scalar, constant
from
.
import
opt
from
.
import
opt
from
.basic_ops
import
(
as_gpuarray_variable
,
GpuAllocEmpty
,
from
.basic_ops
import
(
as_gpuarray_variable
,
GpuAllocEmpty
,
infer_context_name
)
infer_context_name
,
gpu_alloc_empty
)
from
.type
import
gpu_context_type
from
.type
import
gpu_context_type
from
.opt_util
import
alpha_merge
,
output_merge
from
.opt_util
import
alpha_merge
,
output_merge
...
@@ -147,17 +147,18 @@ if (GpuKernel_init(&k_%(name)s, c->ctx, 1, &bcode, &sz,
...
@@ -147,17 +147,18 @@ if (GpuKernel_init(&k_%(name)s, c->ctx, 1, &bcode, &sz,
return
'
\n
'
.
join
(
codel
)
return
'
\n
'
.
join
(
codel
)
@opt.register_opt
()
@opt.register_opt
(
'fast_compile'
)
@opt.op_lifter
([
tensor
.
Dot
])
@opt.op_lifter
([
tensor
.
Dot
])
def
local_dot_to_gemm16
(
node
,
ctx_name
):
@opt.register_opt2
([
tensor
.
Dot
],
'fast_compile'
)
def
local_gpua_dot_to_gemm16
(
op
,
ctx_name
,
inputs
,
outputs
):
if
nerv
is
None
:
if
nerv
is
None
:
return
return
A
=
node
.
inputs
[
0
]
A
=
inputs
[
0
]
B
=
node
.
inputs
[
1
]
B
=
inputs
[
1
]
if
(
A
.
ndim
==
2
and
B
.
ndim
==
2
and
if
(
A
.
ndim
==
2
and
B
.
ndim
==
2
and
A
.
dtype
==
'float16'
and
B
.
dtype
==
'float16'
):
A
.
dtype
==
'float16'
and
B
.
dtype
==
'float16'
):
fgraph
=
node
.
inputs
[
0
]
.
fgraph
fgraph
=
getattr
(
outputs
[
0
],
'fgraph'
,
None
)
C
=
GpuAllocEmpty
(
dtype
=
'float16'
,
context_name
=
ctx_name
)(
C
=
gpu_alloc_empty
(
ctx_name
,
dtype
=
'float16'
)(
shape_i
(
A
,
0
,
fgraph
),
shape_i
(
B
,
1
,
fgraph
))
shape_i
(
A
,
0
,
fgraph
),
shape_i
(
B
,
1
,
fgraph
))
return
Gemm16
()(
C
,
1.0
,
A
,
B
,
0.0
)
return
Gemm16
()(
C
,
1.0
,
A
,
B
,
0.0
)
...
...
theano/gpuarray/opt.py
浏览文件 @
a24fd9bb
...
@@ -3,6 +3,8 @@ import copy
...
@@ -3,6 +3,8 @@ import copy
import
numpy
import
numpy
import
logging
import
logging
import
pdb
import
pdb
import
time
from
six
import
iteritems
from
six.moves
import
xrange
from
six.moves
import
xrange
import
theano
import
theano
...
@@ -10,9 +12,11 @@ from theano import tensor, scalar, gof, config
...
@@ -10,9 +12,11 @@ from theano import tensor, scalar, gof, config
from
theano.compile
import
optdb
from
theano.compile
import
optdb
from
theano.compile.ops
import
shape_i
from
theano.compile.ops
import
shape_i
from
theano.gof
import
(
local_optimizer
,
EquilibriumDB
,
TopoOptimizer
,
from
theano.gof
import
(
local_optimizer
,
EquilibriumDB
,
TopoOptimizer
,
SequenceDB
,
Optimizer
,
toolbox
)
SequenceDB
,
Optimizer
,
DB
,
toolbox
,
graph
)
from
theano.gof.opt
import
NavigatorOptimizer
from
theano.gof.optdb
import
LocalGroupDB
from
theano.gof.optdb
import
LocalGroupDB
from
theano.ifelse
import
IfElse
from
theano.ifelse
import
IfElse
from
theano.misc.ordered_set
import
OrderedSet
from
theano.scalar.basic
import
Scalar
,
Pow
,
Cast
from
theano.scalar.basic
import
Scalar
,
Pow
,
Cast
from
theano.scan_module
import
scan_utils
,
scan_op
,
scan_opt
from
theano.scan_module
import
scan_utils
,
scan_op
,
scan_opt
...
@@ -32,7 +36,7 @@ from .basic_ops import (as_gpuarray_variable, infer_context_name,
...
@@ -32,7 +36,7 @@ from .basic_ops import (as_gpuarray_variable, infer_context_name,
HostFromGpu
,
GpuFromHost
,
HostFromGpu
,
GpuFromHost
,
GpuSplit
,
GpuContiguous
,
gpu_contiguous
,
GpuSplit
,
GpuContiguous
,
gpu_contiguous
,
GpuAlloc
,
GpuAllocEmpty
,
GpuReshape
,
GpuAlloc
,
GpuAllocEmpty
,
GpuReshape
,
GpuEye
,
gpu_join
,
GpuJoin
)
GpuEye
,
gpu_join
,
GpuJoin
,
gpu_alloc_empty
,
gpu_alloc
,
gpu_from_host
)
from
.blas
import
(
gpu_dot22
,
GpuGemm
,
GpuGer
,
GpuGemmBatch
,
from
.blas
import
(
gpu_dot22
,
GpuGemm
,
GpuGer
,
GpuGemmBatch
,
gpugemm_no_inplace
,
gpugemm_inplace
,
gpugemmbatch_no_inplace
,
gpugemm_no_inplace
,
gpugemm_inplace
,
gpugemmbatch_no_inplace
,
gpugemv_no_inplace
,
gpugemv_inplace
)
gpugemv_no_inplace
,
gpugemv_inplace
)
...
@@ -44,7 +48,7 @@ from .nnet import (gpu_crossentropy_softmax_1hot_with_bias_dx,
...
@@ -44,7 +48,7 @@ from .nnet import (gpu_crossentropy_softmax_1hot_with_bias_dx,
gpu_softmax_with_bias
,
gpu_softmax
)
gpu_softmax_with_bias
,
gpu_softmax
)
from
.elemwise
import
(
GpuElemwise
,
GpuDimShuffle
,
GpuCAReduceCuda
,
from
.elemwise
import
(
GpuElemwise
,
GpuDimShuffle
,
GpuCAReduceCuda
,
GpuCAReduceCPY
)
GpuCAReduceCPY
,
gpu_ca_reduce_cuda
)
from
.subtensor
import
(
GpuIncSubtensor
,
GpuSubtensor
,
from
.subtensor
import
(
GpuIncSubtensor
,
GpuSubtensor
,
GpuAdvancedSubtensor1
,
GpuAdvancedSubtensor1
,
GpuAdvancedIncSubtensor1
,
GpuAdvancedIncSubtensor1
,
...
@@ -57,12 +61,31 @@ _logger = logging.getLogger("theano.gpuarray.opt")
...
@@ -57,12 +61,31 @@ _logger = logging.getLogger("theano.gpuarray.opt")
gpu_optimizer
=
EquilibriumDB
()
gpu_optimizer
=
EquilibriumDB
()
gpu_cut_copies
=
EquilibriumDB
()
gpu_cut_copies
=
EquilibriumDB
()
# This database is not run as an EquilibriumOptimizer; it only holds the
# local optimizers with the "tracks" that GraphToGPUDB needs.
gpu_optimizer2 = EquilibriumDB()


class GraphToGPUDB(DB):
    """
    Database entry that produces a GraphToGPU optimizer on query.

    The local optimizers are looked up in ``gpu_optimizer2`` with the
    tags given to ``query``, so the selection follows the optimizer
    flag's value.
    """

    def query(self, *tags, **kwtags):
        """
        Query ``gpu_optimizer2`` and wrap the result in a GraphToGPU.

        ``tags`` and ``kwtags`` are passed through untouched.
        """
        selected = gpu_optimizer2.query(*tags, **kwtags)
        lopts_all = selected.local_optimizers_all
        lopts_map = selected.local_optimizers_map
        return GraphToGPU(lopts_all, lopts_map)
gpu_seqopt
=
SequenceDB
()
gpu_seqopt
=
SequenceDB
()
# Don't register this right now
# Don't register this right now
conv_groupopt
=
LocalGroupDB
()
conv_groupopt
=
LocalGroupDB
()
conv_groupopt
.
__name__
=
"gpua_conv_opts"
conv_groupopt
.
__name__
=
"gpua_conv_opts"
gpu_seqopt
.
register
(
'gpuarray_graph_optimization'
,
GraphToGPUDB
(),
-
0.5
,
'fast_compile'
,
'fast_run'
,
'gpuarray'
)
gpu_seqopt
.
register
(
'gpuarray_local_optimiziations'
,
gpu_optimizer
,
1
,
gpu_seqopt
.
register
(
'gpuarray_local_optimiziations'
,
gpu_optimizer
,
1
,
'fast_compile'
,
'fast_run'
,
'gpuarray'
)
'fast_compile'
,
'fast_run'
,
'gpuarray'
)
gpu_seqopt
.
register
(
'gpuarray_cut_transfers'
,
gpu_cut_copies
,
2
,
gpu_seqopt
.
register
(
'gpuarray_cut_transfers'
,
gpu_cut_copies
,
2
,
...
@@ -82,6 +105,28 @@ def register_opt(*tags, **kwargs):
...
@@ -82,6 +105,28 @@ def register_opt(*tags, **kwargs):
return
f
return
f
def register_opt2(tracks, *tags, **kwargs):
    """
    Decorator for the new GraphToGPU optimizer.

    Takes an extra parameter (the tracked Op) compared to the
    register_opt decorator.

    Parameters
    ----------
    tracks : List of Op class Or Op instance or None
        The Node's Op to which optimization is being applied.

    tags : String
        The optimization tag to which the optimizer will be registered.
        'fast_run' and 'gpuarray' are always added.

    name : String, keyword only, optional
        Name under which the optimizer is registered in
        ``gpu_optimizer2``.  Defaults to the decorated function's
        ``__name__``.

    Returns
    -------
    A decorator that registers the function and returns it unchanged, so
    that further decorators can be stacked on the plain function.
    """
    def f(local_opt):
        # Pop with a default: other keyword arguments without 'name' must
        # not raise KeyError here.
        name = kwargs.pop('name', None) or local_opt.__name__
        opt = theano.gof.local_optimizer(tracks)(local_opt)
        gpu_optimizer2.register(name, opt, 'fast_run', 'gpuarray', *tags)
        return local_opt
    return f
def
register_inplace
(
*
tags
,
**
kwargs
):
def
register_inplace
(
*
tags
,
**
kwargs
):
def
f
(
local_opt
):
def
f
(
local_opt
):
name
=
(
kwargs
and
kwargs
.
pop
(
'name'
))
or
local_opt
.
__name__
name
=
(
kwargs
and
kwargs
.
pop
(
'name'
))
or
local_opt
.
__name__
...
@@ -102,7 +147,7 @@ gpu_optimizer.register('local_remove_all_assert',
...
@@ -102,7 +147,7 @@ gpu_optimizer.register('local_remove_all_assert',
def
safe_to_gpu
(
x
,
ctx_name
):
def
safe_to_gpu
(
x
,
ctx_name
):
if
isinstance
(
x
.
type
,
tensor
.
TensorType
):
if
isinstance
(
x
.
type
,
tensor
.
TensorType
):
return
GpuFromH
ost
(
ctx_name
)(
x
)
return
gpu_from_h
ost
(
ctx_name
)(
x
)
else
:
else
:
return
x
return
x
...
@@ -135,6 +180,7 @@ def op_lifter(OP, cuda_only=False):
...
@@ -135,6 +180,7 @@ def op_lifter(OP, cuda_only=False):
context_name
=
i
.
owner
.
inputs
[
0
]
.
type
.
context_name
context_name
=
i
.
owner
.
inputs
[
0
]
.
type
.
context_name
replace
=
True
replace
=
True
break
break
if
not
replace
:
if
not
replace
:
# We replace if *all* clients are on the GPU
# We replace if *all* clients are on the GPU
clients
=
[
c
for
o
in
node
.
outputs
for
c
in
o
.
clients
]
clients
=
[
c
for
o
in
node
.
outputs
for
c
in
o
.
clients
]
...
@@ -158,7 +204,9 @@ def op_lifter(OP, cuda_only=False):
...
@@ -158,7 +204,9 @@ def op_lifter(OP, cuda_only=False):
# the context was derived from the outputs
# the context was derived from the outputs
for
i
in
node
.
inputs
:
for
i
in
node
.
inputs
:
i
.
tag
.
context_name
=
context_name
i
.
tag
.
context_name
=
context_name
new_op
=
maker
(
node
,
context_name
)
new_op
=
maker
(
node
.
op
,
context_name
,
node
.
inputs
,
node
.
outputs
)
# This is needed as sometimes new_op inherits from OP.
# This is needed as sometimes new_op inherits from OP.
if
new_op
and
new_op
!=
node
.
op
:
if
new_op
and
new_op
!=
node
.
op
:
if
isinstance
(
new_op
,
theano
.
Op
):
if
isinstance
(
new_op
,
theano
.
Op
):
...
@@ -195,9 +243,13 @@ class InputToGpuOptimizer(Optimizer):
...
@@ -195,9 +243,13 @@ class InputToGpuOptimizer(Optimizer):
target
=
getattr
(
input
.
tag
,
'target'
,
None
)
target
=
getattr
(
input
.
tag
,
'target'
,
None
)
if
target
==
'cpu'
:
if
target
==
'cpu'
:
continue
continue
# Do not move *int* scalar to the GPU.
if
(
isinstance
(
input
.
type
,
tensor
.
TensorType
)
and
input
.
ndim
==
0
and
'int'
in
input
.
dtype
):
continue
try
:
try
:
new_input
=
host_from_gpu
(
GpuFromH
ost
(
target
)(
input
))
new_input
=
host_from_gpu
(
gpu_from_h
ost
(
target
)(
input
))
fgraph
.
replace_validate
(
input
,
new_input
,
fgraph
.
replace_validate
(
input
,
new_input
,
"InputToGpuOptimizer"
)
"InputToGpuOptimizer"
)
except
TypeError
:
except
TypeError
:
...
@@ -215,6 +267,222 @@ gpu_seqopt.register('InputToGpuArrayOptimizer', InputToGpuOptimizer(),
...
@@ -215,6 +267,222 @@ gpu_seqopt.register('InputToGpuArrayOptimizer', InputToGpuOptimizer(),
0
,
'fast_run'
,
'fast_compile'
,
'merge'
)
0
,
'fast_run'
,
'fast_compile'
,
'merge'
)
class GraphToGPU(NavigatorOptimizer):
    """
    Transfer the graph as a whole to GPU instead of transferring node by node.

    Parameters
    ----------
    local_optimizers_all : List or SortedSet
        The local optimizations to apply to a node.
    local_optimizers_map : Dict
        Dictionary object containing the mapping of Op to list of
        LocalOptimizers.

    """

    def __init__(self, local_optimizers_all, local_optimizers_map):
        self.local_optimizers_all = local_optimizers_all
        self.local_optimizers_map = local_optimizers_map

    def add_requirements(self, fgraph):
        """Attach the ReplaceValidate feature that ``apply`` relies on."""
        fgraph.attach_feature(toolbox.ReplaceValidate())

    def apply(self, fgraph):
        """
        Rebuild ``fgraph`` with its nodes lifted to the GPU where possible.

        The graph is walked once in topological order.  ``mapping``
        associates every variable of the old graph with its counterpart
        in the new one.  Finally the outputs of ``fgraph`` are replaced
        by the new outputs, transferred back to the host when their type
        changed.

        Returns
        -------
        tuple
            ``(self, toposort_timing, time_opts, node_created,
            process_count)``; the three dicts are keyed by local
            optimizer.  This is the profile consumed by
            ``print_profile`` and ``merge_profile``.
        """
        mapping = {}
        time_opts = {}
        node_created = {}
        process_count = {}
        t_topo = time.time()
        topo = fgraph.toposort()
        time_topo = time.time()
        toposort_timing = time_topo - t_topo

        # Building a new graph
        # Iterating through inputs of graph
        target = infer_context_name(*fgraph.inputs)
        for i in fgraph.inputs:
            # Do not move *int* scalar to the GPU.
            if (isinstance(i.type, tensor.TensorType) and
                    (i.ndim > 0 or 'int' not in i.dtype)):
                mapping[i] = i.transfer(getattr(i.tag, 'target', target))
            else:
                mapping[i] = i
        for i in fgraph.variables:
            if isinstance(i, theano.Constant):
                mapping[i] = i

        # Make sure every optimizer that may be tried has profile entries.
        for node in topo:
            for lopt in (self.local_optimizers_map.get(node.op, []) +
                         self.local_optimizers_map.get(type(node.op), []) +
                         self.local_optimizers_all):
                process_count.setdefault(lopt, 0)
                time_opts.setdefault(lopt, 0)
                node_created.setdefault(lopt, 0)

        for node in topo:

            if isinstance(node.op, HostFromGpu):
                # Skip the transfer: use the GPU variable directly.
                mapping[node.outputs[0]] = mapping[node.inputs[0]]
                continue

            # Move only if any of the inputs are on the GPU.
            move_to_GPU = False

            context_name = None
            for i in [mapping[i] for i in node.inputs]:
                if isinstance(i.type, GpuArrayType):
                    context_name = i.type.context_name
                    move_to_GPU = True
                    break
            if (not move_to_GPU and
                    isinstance(node.op, (theano.tensor.Alloc,
                                         theano.tensor.AllocEmpty,
                                         theano.tensor.basic.Eye))):
                # If the Alloc[Empty] have a client that will be moved
                # to the GPU, we should move the Alloc* on the GPU.

                # We approximate this by supposing that if we have an
                # optimization for one of the clients op, then we will
                # move the client to the GPU.
                for c, _ in node.outputs[0].clients:
                    if (c != 'output' and
                        (self.local_optimizers_map.get(c.op, []) +
                         self.local_optimizers_map.get(type(c.op), []))):
                        move_to_GPU = True
            new_ops = None
            # Apply the lifter
            if move_to_GPU:
                for lopt in (self.local_optimizers_map.get(node.op, []) +
                             self.local_optimizers_map.get(type(node.op), []) +
                             self.local_optimizers_all):
                    t_opt = time.time()
                    new_ops = lopt.transform(node.op, context_name,
                                             [mapping[i] for i in node.inputs],
                                             node.outputs)
                    t_opt2 = time.time()
                    time_opts[lopt] += t_opt2 - t_opt

                    # The first optimizer that returns something wins.
                    if new_ops:
                        process_count[lopt] += 1
                        break
            outputs = []

            if isinstance(new_ops, theano.Op):
                outputs = new_ops(*[mapping[i] for i in node.inputs],
                                  return_list=True)
            elif not new_ops:
                # Nothing lifted this node: clone it on the new inputs.
                newnode = node.clone_with_new_inputs(
                    [mapping.get(i) for i in node.inputs])
                outputs = newnode.outputs
            elif isinstance(new_ops, (tuple, list)):
                outputs = new_ops
            elif isinstance(new_ops, theano.Variable):
                outputs = [new_ops]

            if new_ops:
                # ``lopt`` is the optimizer on which the loop above broke.
                node_created[lopt] += len(
                    graph.ops([mapping[i] for i in node.inputs], outputs))

            # Checked before the loop so that an empty ``outputs`` is
            # caught too (zip would silently yield nothing).
            assert len(outputs) == len(node.outputs)
            for new_o, old_o in zip(outputs, node.outputs):
                mapping[old_o] = new_o

        new_nodes = []
        for o in fgraph.outputs:
            new_o = mapping[o]
            if new_o.type != o.type:
                assert isinstance(o.type, tensor.TensorType)
                assert isinstance(new_o.type, GpuArrayType)

                # This condition is needed in the case one input is an
                # output of the graph. Without this, it would
                # introduce cycle as we don't replace correctly that
                # case. It would also add extra transfer to/from the
                # gpu.
                if (new_o.owner and
                        isinstance(new_o.owner.op, GpuFromHost) and
                        new_o.owner.inputs[0].type == o.type):
                    new_o = new_o.owner.inputs[0]
                else:
                    new_o = safe_to_cpu(new_o)
            new_nodes.append(new_o)
        fgraph.replace_all_validate(zip(fgraph.outputs, new_nodes),
                                    reason=self.__class__.__name__)

        return (self, toposort_timing, time_opts, node_created, process_count)

    @staticmethod
    def print_profile(stream, prof, level=0):
        """
        Write a report of the profile ``prof`` to ``stream``.

        ``prof`` is a tuple as returned by ``apply``; ``level`` controls
        the indentation prefix of every line.
        """
        (opt, toposort_timing, time_opts, node_created, process_count) = prof
        blanc = (' ' * level)
        print(blanc, "GraphToGPUOptimizer", end=' ', file=stream)

        print(blanc, getattr(opt, "name",
                             getattr(opt, "__name__", "")), file=stream)

        print(blanc, " time io_toposort %.3fs" % toposort_timing, file=stream)

        # dict.iteritems() does not exist on Python 3.
        s = sum(time_opts.values())
        print(blanc, "Total time taken by local optimizers %.3fs " % s,
              file=stream)

        count_opt = []
        not_used = []
        not_used_time = 0

        for o, count in iteritems(process_count):
            if count > 0:
                count_opt.append((time_opts[o], count,
                                  node_created[o], o))
            else:
                not_used.append((time_opts[o], o))
                not_used_time += time_opts[o]

        if count_opt:
            print(blanc,
                  ' times - times applied - Node created - name:',
                  file=stream)
            # Break ties on the optimizer's string form, as done for
            # ``not_used`` below: optimizer objects are not orderable on
            # Python 3.
            count_opt.sort(key=lambda c: (c[0], c[1], c[2], str(c[3])))
            for (t, count, n_created, o) in count_opt[::-1]:
                print(blanc, ' %.3fs - %d - %d - %s' % (
                    t, count, n_created, o), file=stream)
            print(blanc, ' %.3fs - in %d optimization that were not used (display only those with a runtime > 0)' % (
                not_used_time, len(not_used)), file=stream)
            not_used.sort(key=lambda nu: (nu[0], str(nu[1])))
            for (t, o) in not_used[::-1]:
                if t > 0:
                    # Skip opt that have 0 times, they probably wasn't even tried.
                    print(blanc + " ", ' %.3fs - %s' % (t, o), file=stream)
            print(file=stream)

    @staticmethod
    def merge_profile(prof1, prof2):
        """
        Combine two profiles returned by ``apply`` into a new one.

        Timings and counters are added; the optimizer of the result is a
        new GraphToGPU built from the union of both optimizer sets.
        Neither input profile is modified.
        """
        # (opt, toposort_timing, time_opts, node_created, process_count) = prof1
        local_optimizers = OrderedSet(prof1[0].local_optimizers_all).union(
            prof2[0].local_optimizers_all)

        def merge_dict(d1, d2):
            """
            merge 2 dicts by adding the values.
            """
            d = d1.copy()
            for k, v in iteritems(d2):
                if k in d:
                    # Not ``+=``: the copy is shallow, so in-place
                    # addition would extend a list still owned by d1.
                    d[k] = d[k] + v
                else:
                    d[k] = v
            return d

        local_optimizers_map = merge_dict(prof1[0].local_optimizers_map,
                                          prof2[0].local_optimizers_map)
        new_opt = GraphToGPU(local_optimizers, local_optimizers_map)
        toposort_timing = prof1[1] + prof2[1]
        time_opts = merge_dict(prof1[2], prof2[2])
        node_created = merge_dict(prof1[3], prof2[3])
        process_count = merge_dict(prof1[4], prof2[4])
        return (new_opt, toposort_timing, time_opts, node_created,
                process_count)
@local_optimizer
([
GpuFromHost
,
GpuToGpu
,
HostFromGpu
])
@local_optimizer
([
GpuFromHost
,
GpuToGpu
,
HostFromGpu
])
def
local_cut_gpu_transfers
(
node
):
def
local_cut_gpu_transfers
(
node
):
# gpu[ab] -> host -> gpub
# gpu[ab] -> host -> gpub
...
@@ -273,7 +541,7 @@ optdb['canonicalize'].register('local_cut_gpua_host_gpua',
...
@@ -273,7 +541,7 @@ optdb['canonicalize'].register('local_cut_gpua_host_gpua',
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@local_optimizer
([
tensor
.
Alloc
])
@local_optimizer
([
tensor
.
Alloc
])
def
local_gpuaalloc2
(
node
):
def
local_gpua
_
alloc2
(
node
):
"""
"""
Join(axis, {Alloc or HostFromGPU}, ...) -> Join(axis, GpuAlloc, Alloc, ...)
Join(axis, {Alloc or HostFromGPU}, ...) -> Join(axis, GpuAlloc, Alloc, ...)
...
@@ -292,22 +560,23 @@ def local_gpuaalloc2(node):
...
@@ -292,22 +560,23 @@ def local_gpuaalloc2(node):
i
.
owner
.
op
in
[
host_from_gpu
,
tensor
.
alloc
]
i
.
owner
.
op
in
[
host_from_gpu
,
tensor
.
alloc
]
for
i
in
c
.
inputs
[
1
:])
for
i
in
c
.
inputs
[
1
:])
for
c
,
idx
in
node
.
outputs
[
0
]
.
clients
)):
for
c
,
idx
in
node
.
outputs
[
0
]
.
clients
)):
return
[
host_from_gpu
(
GpuA
lloc
(
None
)(
*
node
.
inputs
))]
return
[
host_from_gpu
(
gpu_a
lloc
(
None
)(
*
node
.
inputs
))]
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
Alloc
])
@op_lifter
([
tensor
.
Alloc
])
def
local_gpuaalloc
(
node
,
context_name
):
@register_opt2
([
tensor
.
Alloc
],
'fast_compile'
)
return
GpuAlloc
(
context_name
)(
*
node
.
inputs
)
def
local_gpua_alloc
(
op
,
context_name
,
inputs
,
outputs
):
return
gpu_alloc
(
context_name
)
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
AllocEmpty
])
@op_lifter
([
tensor
.
AllocEmpty
])
def
local_gpuaallocempty
(
node
,
context_name
):
@register_opt2
([
tensor
.
AllocEmpty
],
'fast_compile'
)
def
local_gpua_alloc_empty
(
op
,
context_name
,
inputs
,
outputs
):
# We use _props_dict() to make sure that the GPU op know all the
# We use _props_dict() to make sure that the GPU op know all the
# CPU op props.
# CPU op props.
return
GpuAllocEmpty
(
context_name
=
context_name
,
return
gpu_alloc_empty
(
context_name
,
**
op
.
_props_dict
())
**
node
.
op
.
_props_dict
())(
*
node
.
inputs
)
@register_opt
()
@register_opt
()
...
@@ -318,7 +587,7 @@ def local_gpualloc_memset_0(node):
...
@@ -318,7 +587,7 @@ def local_gpualloc_memset_0(node):
if
(
isinstance
(
inp
,
GpuArrayConstant
)
and
if
(
isinstance
(
inp
,
GpuArrayConstant
)
and
inp
.
data
.
size
==
1
and
inp
.
data
.
size
==
1
and
(
numpy
.
asarray
(
inp
.
data
)
==
0
)
.
all
()):
(
numpy
.
asarray
(
inp
.
data
)
==
0
)
.
all
()):
new_op
=
GpuA
lloc
(
node
.
op
.
context_name
,
memset_0
=
True
)
new_op
=
gpu_a
lloc
(
node
.
op
.
context_name
,
memset_0
=
True
)
return
[
new_op
(
*
node
.
inputs
)]
return
[
new_op
(
*
node
.
inputs
)]
...
@@ -328,8 +597,8 @@ def local_gpua_alloc_empty_to_zeros(node):
...
@@ -328,8 +597,8 @@ def local_gpua_alloc_empty_to_zeros(node):
if
isinstance
(
node
.
op
,
GpuAllocEmpty
):
if
isinstance
(
node
.
op
,
GpuAllocEmpty
):
context_name
=
infer_context_name
(
*
node
.
inputs
)
context_name
=
infer_context_name
(
*
node
.
inputs
)
z
=
numpy
.
asarray
(
0
,
dtype
=
node
.
outputs
[
0
]
.
dtype
)
z
=
numpy
.
asarray
(
0
,
dtype
=
node
.
outputs
[
0
]
.
dtype
)
return
[
GpuAlloc
(
)(
as_gpuarray_variable
(
z
,
context_name
),
return
[
gpu_alloc
(
context_name
)(
as_gpuarray_variable
(
z
,
context_name
),
*
node
.
inputs
)]
*
node
.
inputs
)]
optdb
.
register
(
'local_gpua_alloc_empty_to_zeros'
,
optdb
.
register
(
'local_gpua_alloc_empty_to_zeros'
,
theano
.
tensor
.
opt
.
in2out
(
local_gpua_alloc_empty_to_zeros
),
theano
.
tensor
.
opt
.
in2out
(
local_gpua_alloc_empty_to_zeros
),
# After move to gpu and merge2, before inplace.
# After move to gpu and merge2, before inplace.
...
@@ -352,14 +621,15 @@ def local_gpu_contiguous_gpu_contiguous(node):
...
@@ -352,14 +621,15 @@ def local_gpu_contiguous_gpu_contiguous(node):
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
extra_ops
.
CpuContiguous
])
@op_lifter
([
tensor
.
extra_ops
.
CpuContiguous
])
def
local_gpu_contiguous
(
node
,
context_name
):
@register_opt2
([
tensor
.
extra_ops
.
CpuContiguous
],
'fast_compile'
)
def
local_gpua_contiguous
(
op
,
context_name
,
inputs
,
outputs
):
return
gpu_contiguous
return
gpu_contiguous
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
Reshape
])
@op_lifter
([
tensor
.
Reshape
])
def
local_gpureshape
(
node
,
context_name
):
@register_opt2
([
tensor
.
Reshape
],
'fast_compile'
)
op
=
node
.
op
def
local_gpua_reshape
(
op
,
context_name
,
inputs
,
outputs
):
name
=
op
.
name
name
=
op
.
name
if
name
:
if
name
:
name
=
'Gpu'
+
name
name
=
'Gpu'
+
name
...
@@ -369,32 +639,33 @@ def local_gpureshape(node, context_name):
...
@@ -369,32 +639,33 @@ def local_gpureshape(node, context_name):
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
Rebroadcast
])
@op_lifter
([
tensor
.
Rebroadcast
])
def
local_gpu_rebroadcast
(
node
,
context_name
):
@register_opt2
([
tensor
.
Rebroadcast
],
'fast_compile'
)
return
node
.
op
(
as_gpuarray_variable
(
node
.
inputs
[
0
],
context_name
))
def
local_gpua_rebroadcast
(
op
,
context_name
,
inputs
,
outputs
):
return
op
(
as_gpuarray_variable
(
inputs
[
0
],
context_name
))
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
Flatten
])
@op_lifter
([
tensor
.
Flatten
])
def
local_gpuflatten
(
node
,
context_name
):
@register_opt2
([
tensor
.
Flatten
],
'fast_compile'
)
op
=
node
.
op
def
local_gpua_flatten
(
op
,
context_name
,
inputs
,
outputs
):
shp
=
[]
shp
=
[]
if
op
.
outdim
!=
1
:
if
op
.
outdim
!=
1
:
shp
=
[
node
.
inputs
[
0
]
.
shape
[
i
]
for
i
in
range
(
op
.
outdim
-
1
)]
shp
=
[
inputs
[
0
]
.
shape
[
i
]
for
i
in
range
(
op
.
outdim
-
1
)]
shp
+=
[
-
1
]
shp
+=
[
-
1
]
res
=
GpuReshape
(
op
.
outdim
,
None
)
res
=
GpuReshape
(
op
.
outdim
,
None
)
o
=
res
(
node
.
inputs
[
0
],
theano
.
tensor
.
as_tensor_variable
(
shp
))
o
=
res
(
inputs
[
0
],
theano
.
tensor
.
as_tensor_variable
(
shp
))
return
o
return
o
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
Elemwise
])
@op_lifter
([
tensor
.
Elemwise
])
def
local_gpu_elemwise
(
node
,
context_name
):
@register_opt2
([
tensor
.
Elemwise
],
'fast_compile'
)
op
=
node
.
op
def
local_gpua_elemwise
(
op
,
context_name
,
inputs
,
outputs
):
scal_op
=
op
.
scalar_op
scal_op
=
op
.
scalar_op
name
=
op
.
name
name
=
op
.
name
if
name
:
if
name
:
name
=
'Gpu'
+
name
name
=
'Gpu'
+
name
if
len
(
node
.
outputs
)
>
1
:
if
len
(
outputs
)
>
1
:
return
return
res
=
GpuElemwise
(
scal_op
,
name
=
name
,
res
=
GpuElemwise
(
scal_op
,
name
=
name
,
inplace_pattern
=
copy
.
copy
(
op
.
inplace_pattern
),
inplace_pattern
=
copy
.
copy
(
op
.
inplace_pattern
),
...
@@ -407,13 +678,13 @@ def local_gpu_elemwise(node, context_name):
...
@@ -407,13 +678,13 @@ def local_gpu_elemwise(node, context_name):
# Only transfer the computation on the gpu if the output dtype is
# Only transfer the computation on the gpu if the output dtype is
# floating point. Else, give up on the transfer to the gpu.
# floating point. Else, give up on the transfer to the gpu.
out_dtype
=
node
.
outputs
[
0
]
.
dtype
out_dtype
=
outputs
[
0
]
.
dtype
if
out_dtype
not
in
[
'float16'
,
'float32'
,
'float64'
]:
if
out_dtype
not
in
[
'float16'
,
'float32'
,
'float64'
]:
return
return
# Transfer the inputs on the GPU and cast them to the right dtype.
# Transfer the inputs on the GPU and cast them to the right dtype.
new_inputs
=
[]
new_inputs
=
[]
for
inp
in
node
.
inputs
:
for
inp
in
inputs
:
if
inp
.
dtype
!=
out_dtype
:
if
inp
.
dtype
!=
out_dtype
:
gpu_cast_op
=
GpuElemwise
(
Cast
(
Scalar
(
out_dtype
)))
gpu_cast_op
=
GpuElemwise
(
Cast
(
Scalar
(
out_dtype
)))
new_inputs
.
append
(
gpu_cast_op
(
as_gpuarray_variable
(
inp
,
context_name
)))
new_inputs
.
append
(
gpu_cast_op
(
as_gpuarray_variable
(
inp
,
context_name
)))
...
@@ -423,8 +694,7 @@ def local_gpu_elemwise(node, context_name):
...
@@ -423,8 +694,7 @@ def local_gpu_elemwise(node, context_name):
# Perform the exponent on the gpu and transfer the output back to the
# Perform the exponent on the gpu and transfer the output back to the
# cpu.
# cpu.
gpu_output
=
res
(
*
new_inputs
)
gpu_output
=
res
(
*
new_inputs
)
cpu_output
=
host_from_gpu
(
gpu_output
)
return
[
gpu_output
]
return
[
cpu_output
]
else
:
else
:
return
res
return
res
...
@@ -461,29 +731,41 @@ optdb.register('gpua_inplace_opt', inplace_gpu_elemwise_opt, 75,
...
@@ -461,29 +731,41 @@ optdb.register('gpua_inplace_opt', inplace_gpu_elemwise_opt, 75,
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
DimShuffle
])
@op_lifter
([
tensor
.
DimShuffle
])
def
local_gpua_dimshuffle
(
node
,
context_name
):
@register_opt2
([
tensor
.
DimShuffle
],
'fast_compile'
)
return
GpuDimShuffle
(
node
.
op
.
input_broadcastable
,
def
local_gpua_dimshuffle
(
op
,
context_name
,
inputs
,
outputs
):
node
.
op
.
new_order
)
return
GpuDimShuffle
(
op
.
input_broadcastable
,
op
.
new_order
)
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
SpecifyShape
])
@op_lifter
([
tensor
.
SpecifyShape
])
def
local_gpua_specifyShape
(
node
,
context_name
):
@register_opt2
([
tensor
.
SpecifyShape
],
'fast_compile'
)
if
isinstance
(
node
.
inputs
[
0
]
.
type
,
GpuArrayType
):
def
local_gpua_specifyShape
(
op
,
context_name
,
inputs
,
outputs
):
if
isinstance
(
inputs
[
0
]
.
type
,
GpuArrayType
):
return
return
inp
=
[
as_gpuarray_variable
(
node
.
inputs
[
0
],
context_name
)]
return
local_gpua_specifyShape_graph
(
op
,
context_name
,
inputs
,
outputs
)
inp
+=
node
.
inputs
[
1
:]
@register_opt2
([
tensor
.
SpecifyShape
],
'fast_compile'
)
def
local_gpua_specifyShape_graph
(
op
,
context_name
,
inputs
,
outputs
):
inp
=
[
as_gpuarray_variable
(
inputs
[
0
],
context_name
)]
inp
+=
inputs
[
1
:]
return
tensor
.
specify_shape
(
*
inp
)
return
tensor
.
specify_shape
(
*
inp
)
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
theano
.
compile
.
ops
.
Shape
])
@op_lifter
([
theano
.
compile
.
ops
.
Shape
])
def
local_gpua_shape
(
node
,
context_name
):
def
local_gpua_shape
(
op
,
context_name
,
inputs
,
outputs
):
# op_lifter will call this opt too frequently as the output is
# op_lifter will call this opt too frequently as the output is
# always on the CPU.
# always on the CPU.
if
isinstance
(
node
.
inputs
[
0
]
.
type
,
GpuArrayType
):
if
isinstance
(
inputs
[
0
]
.
type
,
GpuArrayType
):
return
return
return
[
as_gpuarray_variable
(
node
.
inputs
[
0
],
context_name
)
.
shape
]
return
local_gpua_shape_graph
(
op
,
context_name
,
inputs
,
outputs
)
@register_opt2
([
tensor
.
compile
.
ops
.
Shape
],
'fast_compile'
)
def
local_gpua_shape_graph
(
op
,
context_name
,
inputs
,
outputs
):
return
[
as_gpuarray_variable
(
inputs
[
0
],
context_name
)
.
shape
]
def
gpu_print_wrapper
(
op
,
cnda
):
def
gpu_print_wrapper
(
op
,
cnda
):
...
@@ -492,11 +774,12 @@ def gpu_print_wrapper(op, cnda):
...
@@ -492,11 +774,12 @@ def gpu_print_wrapper(op, cnda):
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
printing
.
Print
])
@op_lifter
([
tensor
.
printing
.
Print
])
def
local_gpu_print_op
(
node
,
context_name
):
@register_opt2
([
tensor
.
printing
.
Print
],
'fast_compile'
)
x
,
=
node
.
inputs
def
local_gpua_print_op
(
op
,
context_name
,
inputs
,
outputs
):
x
,
=
inputs
gpu_x
=
as_gpuarray_variable
(
x
,
context_name
=
context_name
)
gpu_x
=
as_gpuarray_variable
(
x
,
context_name
=
context_name
)
new_op
=
node
.
op
.
__class__
(
global_fn
=
gpu_print_wrapper
)
new_op
=
op
.
__class__
(
global_fn
=
gpu_print_wrapper
)
new_op
.
old_op
=
node
.
op
new_op
.
old_op
=
op
return
new_op
(
gpu_x
)
return
new_op
(
gpu_x
)
...
@@ -570,28 +853,30 @@ def local_gpu_pdbbreakpoint_op(node):
...
@@ -570,28 +853,30 @@ def local_gpu_pdbbreakpoint_op(node):
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
IfElse
])
@op_lifter
([
IfElse
])
def
local_gpua_lazy_ifelse
(
node
,
context_name
):
@register_opt2
([
IfElse
],
'fast_compile'
)
if
node
.
op
.
gpu
:
def
local_gpua_lazy_ifelse
(
op
,
context_name
,
inputs
,
outputs
):
if
op
.
gpu
:
return
return
c
=
node
.
inputs
[
0
]
c
=
inputs
[
0
]
inps
=
[]
inps
=
[]
for
v
in
node
.
inputs
[
1
:]:
for
v
in
inputs
[
1
:]:
if
isinstance
(
v
.
type
,
(
tensor
.
TensorType
,
GpuArrayType
)
):
if
isinstance
(
v
.
type
,
tensor
.
TensorType
):
inps
.
append
(
as_gpuarray_variable
(
v
,
context_name
))
inps
.
append
(
as_gpuarray_variable
(
v
,
context_name
))
else
:
else
:
inps
.
append
(
v
)
inps
.
append
(
v
)
return
IfElse
(
node
.
op
.
n_outs
,
gpu
=
True
)(
c
,
*
inps
,
return_list
=
True
)
return
IfElse
(
op
.
n_outs
,
gpu
=
True
)(
c
,
*
inps
,
return_list
=
True
)
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
Join
])
@op_lifter
([
tensor
.
Join
])
def
local_gpua_join
(
node
,
context_name
):
@register_opt2
([
tensor
.
Join
],
'fast_compile'
)
def
local_gpua_join
(
op
,
context_name
,
inputs
,
outputs
):
return
gpu_join
return
gpu_join
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@local_optimizer
([
GpuJoin
])
@local_optimizer
([
GpuJoin
])
def
local_gpuajoin_1
(
node
):
def
local_gpua
_
join_1
(
node
):
# join of a single element
# join of a single element
if
(
isinstance
(
node
.
op
,
GpuJoin
)
and
if
(
isinstance
(
node
.
op
,
GpuJoin
)
and
len
(
node
.
inputs
)
==
2
):
len
(
node
.
inputs
)
==
2
):
...
@@ -600,14 +885,16 @@ def local_gpuajoin_1(node):
...
@@ -600,14 +885,16 @@ def local_gpuajoin_1(node):
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
Split
])
@op_lifter
([
tensor
.
Split
])
def
local_gpua_split
(
node
,
context_name
):
@register_opt2
([
tensor
.
Split
],
'fast_compile'
)
return
GpuSplit
(
node
.
op
.
len_splits
)
def
local_gpua_split
(
op
,
context_name
,
inputs
,
outputs
):
# TODO use props
return
GpuSplit
(
op
.
len_splits
)
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
Subtensor
])
@op_lifter
([
tensor
.
Subtensor
])
def
local_gpua_subtensor
(
node
,
context_name
):
def
local_gpua_subtensor
(
op
,
context_name
,
inputs
,
outputs
):
x
=
node
.
inputs
[
0
]
x
=
inputs
[
0
]
if
(
x
.
owner
and
isinstance
(
x
.
owner
.
op
,
HostFromGpu
)):
if
(
x
.
owner
and
isinstance
(
x
.
owner
.
op
,
HostFromGpu
)):
gpu_x
=
x
.
owner
.
inputs
[
0
]
gpu_x
=
x
.
owner
.
inputs
[
0
]
if
(
gpu_x
.
owner
and
if
(
gpu_x
.
owner
and
...
@@ -617,41 +904,68 @@ def local_gpua_subtensor(node, context_name):
...
@@ -617,41 +904,68 @@ def local_gpua_subtensor(node, context_name):
if
len
(
x
.
clients
)
==
1
:
if
len
(
x
.
clients
)
==
1
:
if
any
([
n
==
'output'
or
any
([
isinstance
(
v
.
type
,
GpuArrayType
)
if
any
([
n
==
'output'
or
any
([
isinstance
(
v
.
type
,
GpuArrayType
)
for
v
in
n
.
inputs
+
n
.
outputs
])
for
v
in
n
.
inputs
+
n
.
outputs
])
for
n
,
_
in
node
.
outputs
[
0
]
.
clients
]):
for
n
,
_
in
outputs
[
0
]
.
clients
]):
return
return
else
:
else
:
return
[
host_from_gpu
(
gpu_x
.
owner
.
op
(
node
.
outputs
[
0
]))]
return
[
host_from_gpu
(
gpu_x
.
owner
.
op
(
outputs
[
0
]))]
return
GpuSubtensor
(
node
.
op
.
idx_list
)
return
GpuSubtensor
(
op
.
idx_list
)
@register_opt2
([
tensor
.
Subtensor
],
'fast_compile'
)
def
local_gpua_subtensor_graph
(
op
,
context_name
,
inputs
,
outputs
):
# We need different code as the condition is different as inputs
# aren't the same.
x
=
inputs
[
0
]
# We don't want to move the subtensor to the GPU if the inputs is
# on the CPU and the only client of the CPU node is this
# subtensor. This allow to have a smaller transfer.
if
(
x
.
owner
and
isinstance
(
x
.
owner
.
op
,
GpuFromHost
)):
cpu_x
=
x
.
owner
.
inputs
[
0
]
# And it is a shared var or an input of the graph.
# and is used by only 1 node.
# x is in the new graph, so we can't tests its number of clients.
if
not
cpu_x
.
owner
and
len
(
cpu_x
.
clients
)
==
1
:
c
=
outputs
[
0
]
.
clients
# If the subtensor have only 1 client, do it on the CPU.
# We let the other optimization to take care to move the
# next node or not.
if
len
(
c
)
==
1
:
return
return
GpuSubtensor
(
op
.
idx_list
)
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
IncSubtensor
])
@op_lifter
([
tensor
.
IncSubtensor
])
def
local_gpua_incsubtensor
(
node
,
context_name
):
@register_opt2
([
tensor
.
IncSubtensor
],
'fast_compile'
)
op
=
GpuIncSubtensor
(
node
.
op
.
idx_list
,
node
.
op
.
inplace
,
def
local_gpua_inc_subtensor
(
op
,
context_name
,
inputs
,
outputs
):
node
.
op
.
set_instead_of_inc
,
op
=
GpuIncSubtensor
(
op
.
idx_list
,
op
.
inplace
,
node
.
op
.
destroyhandler_tolerate_aliased
)
op
.
set_instead_of_inc
,
ret
=
op
(
*
node
.
inputs
)
op
.
destroyhandler_tolerate_aliased
)
val
=
getattr
(
node
.
outputs
[
0
]
.
tag
,
'nan_guard_mode_check'
,
True
)
ret
=
op
(
*
inputs
)
val
=
getattr
(
outputs
[
0
]
.
tag
,
'nan_guard_mode_check'
,
True
)
ret
.
tag
.
nan_guard_mode_check
=
val
ret
.
tag
.
nan_guard_mode_check
=
val
return
ret
return
ret
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
AdvancedSubtensor1
])
@op_lifter
([
tensor
.
AdvancedSubtensor1
])
def
local_gpua_advanced_subtensor
(
node
,
context_name
):
@register_opt2
([
tensor
.
AdvancedSubtensor1
],
'fast_compile'
)
def
local_gpua_advanced_subtensor
(
op
,
context_name
,
inputs
,
outputs
):
return
GpuAdvancedSubtensor1
()
return
GpuAdvancedSubtensor1
()
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
AdvancedIncSubtensor1
])
@op_lifter
([
tensor
.
AdvancedIncSubtensor1
])
def
local_gpua_advanced_incsubtensor
(
node
,
context_name
):
@register_opt2
([
tensor
.
AdvancedIncSubtensor1
],
'fast_compile'
)
def
local_gpua_advanced_incsubtensor
(
op
,
context_name
,
inputs
,
outputs
):
context
=
get_context
(
context_name
)
context
=
get_context
(
context_name
)
# This is disabled on non-cuda contexts
# This is disabled on non-cuda contexts
if
context
.
kind
!=
b
'cuda'
:
if
context
.
kind
!=
b
'cuda'
:
return
None
return
None
x
,
y
,
ilist
=
node
.
inputs
x
,
y
,
ilist
=
inputs
# Gpu Ops needs both inputs to have the same dtype
# Gpu Ops needs both inputs to have the same dtype
if
(
x
.
type
.
dtype
!=
y
.
type
.
dtype
):
if
(
x
.
type
.
dtype
!=
y
.
type
.
dtype
):
...
@@ -661,7 +975,7 @@ def local_gpua_advanced_incsubtensor(node, context_name):
...
@@ -661,7 +975,7 @@ def local_gpua_advanced_incsubtensor(node, context_name):
if
y
.
type
.
dtype
!=
dtype
:
if
y
.
type
.
dtype
!=
dtype
:
y
=
tensor
.
cast
(
y
,
dtype
)
y
=
tensor
.
cast
(
y
,
dtype
)
set_instead_of_inc
=
node
.
op
.
set_instead_of_inc
set_instead_of_inc
=
op
.
set_instead_of_inc
compute_capability
=
int
(
context
.
bin_id
[
-
2
])
compute_capability
=
int
(
context
.
bin_id
[
-
2
])
...
@@ -684,29 +998,31 @@ def local_advincsub1_gpua_inplace(node):
...
@@ -684,29 +998,31 @@ def local_advincsub1_gpua_inplace(node):
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
CAReduce
,
tensor
.
Sum
,
tensor
.
elemwise
.
Prod
])
@op_lifter
([
tensor
.
CAReduce
,
tensor
.
Sum
,
tensor
.
elemwise
.
Prod
])
def
local_gpua_careduce
(
node
,
context_name
):
@register_opt2
([
tensor
.
CAReduce
,
tensor
.
Sum
,
tensor
.
elemwise
.
Prod
],
'fast_compile'
)
if
isinstance
(
node
.
op
.
scalar_op
,
(
scalar
.
Add
,
scalar
.
Mul
,
def
local_gpua_careduce
(
op
,
context_name
,
inputs
,
outputs
):
scalar
.
Maximum
,
scalar
.
Minimum
)):
if
isinstance
(
op
.
scalar_op
,
(
scalar
.
Add
,
scalar
.
Mul
,
scalar
.
Maximum
,
scalar
.
Minimum
)):
ctx
=
get_context
(
context_name
)
ctx
=
get_context
(
context_name
)
if
ctx
.
kind
==
b
'opencl'
:
if
ctx
.
kind
==
b
'opencl'
:
op
=
GpuCAReduceCPY
op
2
=
GpuCAReduceCPY
if
node
.
op
.
scalar_op
not
in
[
scalar
.
add
,
scalar
.
mul
]:
if
op
.
scalar_op
not
in
[
scalar
.
add
,
scalar
.
mul
]:
# We don't support yet all reduction with cpy code.
# We don't support yet all reduction with cpy code.
return
return
elif
ctx
.
kind
==
b
'cuda'
:
elif
ctx
.
kind
==
b
'cuda'
:
op
=
GpuCAReduceCuda
op
2
=
GpuCAReduceCuda
else
:
else
:
return
False
return
False
x
,
=
node
.
inputs
x
,
=
inputs
greduce
=
op
(
greduce
=
op
2
(
node
.
op
.
scalar_op
,
axis
=
node
.
op
.
axis
,
op
.
scalar_op
,
axis
=
op
.
axis
,
dtype
=
getattr
(
node
.
op
,
'dtype'
,
None
),
dtype
=
getattr
(
op
,
'dtype'
,
None
),
acc_dtype
=
getattr
(
node
.
op
,
'acc_dtype'
,
None
))
acc_dtype
=
getattr
(
op
,
'acc_dtype'
,
None
))
gvar
=
greduce
(
x
)
gvar
=
greduce
(
x
)
# We need to have the make node called, otherwise the mask can
# We need to have the make node called, otherwise the mask can
# be None
# be None
if
(
op
is
GpuCAReduceCPY
or
if
(
op
2
is
GpuCAReduceCPY
or
gvar
.
owner
.
op
.
supports_c_code
([
gvar
.
owner
.
op
.
supports_c_code
([
as_gpuarray_variable
(
x
,
context_name
)])):
as_gpuarray_variable
(
x
,
context_name
)])):
return
greduce
return
greduce
...
@@ -717,11 +1033,11 @@ def local_gpua_careduce(node, context_name):
...
@@ -717,11 +1033,11 @@ def local_gpua_careduce(node, context_name):
# to make them a single dimension, do the reduction, and
# to make them a single dimension, do the reduction, and
# then reshape to get them back.
# then reshape to get them back.
if
node
.
op
.
axis
is
None
:
if
op
.
axis
is
None
:
reduce_mask
=
[
1
]
*
x
.
type
.
ndim
reduce_mask
=
[
1
]
*
x
.
type
.
ndim
else
:
else
:
reduce_mask
=
[
0
]
*
x
.
type
.
ndim
reduce_mask
=
[
0
]
*
x
.
type
.
ndim
for
a
in
node
.
op
.
axis
:
for
a
in
op
.
axis
:
assert
reduce_mask
[
a
]
==
0
assert
reduce_mask
[
a
]
==
0
reduce_mask
[
a
]
=
1
reduce_mask
[
a
]
=
1
...
@@ -737,11 +1053,11 @@ def local_gpua_careduce(node, context_name):
...
@@ -737,11 +1053,11 @@ def local_gpua_careduce(node, context_name):
for
idx
,
m
in
enumerate
(
new_mask
):
for
idx
,
m
in
enumerate
(
new_mask
):
if
m
==
1
:
if
m
==
1
:
new_axis
.
append
(
idx
)
new_axis
.
append
(
idx
)
greduce
=
op
(
greduce
=
op
2
(
node
.
op
.
scalar_op
,
op
.
scalar_op
,
axis
=
new_axis
,
reduce_mask
=
new_mask
,
axis
=
new_axis
,
reduce_mask
=
new_mask
,
dtype
=
getattr
(
node
.
op
,
'dtype'
,
None
),
dtype
=
getattr
(
op
,
'dtype'
,
None
),
acc_dtype
=
getattr
(
node
.
op
,
'acc_dtype'
,
None
))
acc_dtype
=
getattr
(
op
,
'acc_dtype'
,
None
))
reshaped_x
=
x
.
reshape
(
tensor
.
stack
(
new_in_shp
))
reshaped_x
=
x
.
reshape
(
tensor
.
stack
(
new_in_shp
))
gpu_reshaped_x
=
as_gpuarray_variable
(
reshaped_x
,
context_name
)
gpu_reshaped_x
=
as_gpuarray_variable
(
reshaped_x
,
context_name
)
...
@@ -750,16 +1066,15 @@ def local_gpua_careduce(node, context_name):
...
@@ -750,16 +1066,15 @@ def local_gpua_careduce(node, context_name):
# be None
# be None
reshaped_gpu_inputs
=
[
gpu_reshaped_x
]
reshaped_gpu_inputs
=
[
gpu_reshaped_x
]
if
greduce
.
supports_c_code
(
reshaped_gpu_inputs
):
if
greduce
.
supports_c_code
(
reshaped_gpu_inputs
):
reduce_reshaped_x
=
host_from_gpu
(
reduce_reshaped_x
=
greduce
(
gpu_reshaped_x
)
greduce
(
gpu_reshaped_x
))
if
reduce_reshaped_x
.
ndim
!=
node
.
outputs
[
0
]
.
ndim
:
if
reduce_reshaped_x
.
ndim
!=
outputs
[
0
]
.
ndim
:
out_shp
=
[]
out_shp
=
[]
for
i
in
range
(
x
.
ndim
):
for
i
in
range
(
x
.
ndim
):
if
i
not
in
node
.
op
.
axis
:
if
i
not
in
op
.
axis
:
out_shp
.
append
(
shape_i
(
x
,
i
))
out_shp
.
append
(
shape_i
(
x
,
i
))
unreshaped_reduce
=
reduce_reshaped_x
.
reshape
(
unreshaped_reduce
=
GpuReshape
(
len
(
out_shp
))(
reduce_reshaped_x
,
tensor
.
stack
(
out_shp
))
tensor
.
stack
(
out_shp
))
else
:
else
:
unreshaped_reduce
=
reduce_reshaped_x
unreshaped_reduce
=
reduce_reshaped_x
return
[
unreshaped_reduce
]
return
[
unreshaped_reduce
]
...
@@ -767,8 +1082,9 @@ def local_gpua_careduce(node, context_name):
...
@@ -767,8 +1082,9 @@ def local_gpua_careduce(node, context_name):
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
blas
.
Gemv
,
tensor
.
blas_c
.
CGemv
])
@op_lifter
([
tensor
.
blas
.
Gemv
,
tensor
.
blas_c
.
CGemv
])
def
local_gpua_gemv
(
node
,
context_name
):
@register_opt2
([
tensor
.
blas
.
Gemv
],
'fast_compile'
)
if
node
.
op
.
inplace
:
def
local_gpua_gemv
(
op
,
context_name
,
inputs
,
outputs
):
if
op
.
inplace
:
return
gpugemv_inplace
return
gpugemv_inplace
else
:
else
:
return
gpugemv_no_inplace
return
gpugemv_no_inplace
...
@@ -776,8 +1092,9 @@ def local_gpua_gemv(node, context_name):
...
@@ -776,8 +1092,9 @@ def local_gpua_gemv(node, context_name):
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
blas
.
Gemm
])
@op_lifter
([
tensor
.
blas
.
Gemm
])
def
local_gpua_gemm
(
node
,
context_name
):
@register_opt2
([
tensor
.
blas
.
Gemm
],
'fast_compile'
)
if
node
.
op
.
inplace
:
def
local_gpua_gemm
(
op
,
context_name
,
inputs
,
outputs
):
if
op
.
inplace
:
return
gpugemm_inplace
return
gpugemm_inplace
else
:
else
:
return
gpugemm_no_inplace
return
gpugemm_no_inplace
...
@@ -785,27 +1102,29 @@ def local_gpua_gemm(node, context_name):
...
@@ -785,27 +1102,29 @@ def local_gpua_gemm(node, context_name):
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
blas
.
BatchedDot
])
@op_lifter
([
tensor
.
blas
.
BatchedDot
])
def
local_gpua_gemmbatch
(
node
,
context_name
):
@register_opt2
([
tensor
.
blas
.
BatchedDot
],
'fast_compile'
)
a
,
b
=
node
.
inputs
def
local_gpua_gemmbatch
(
op
,
context_name
,
inputs
,
outputs
):
a
,
b
=
inputs
c
=
tensor
.
AllocEmpty
(
a
.
dtype
)(
a
.
shape
[
0
],
a
.
shape
[
1
],
b
.
shape
[
2
])
c
=
tensor
.
AllocEmpty
(
a
.
dtype
)(
a
.
shape
[
0
],
a
.
shape
[
1
],
b
.
shape
[
2
])
return
gpugemmbatch_no_inplace
(
c
,
1.0
,
a
,
b
,
0.0
)
return
gpugemmbatch_no_inplace
(
c
,
1.0
,
a
,
b
,
0.0
)
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
basic
.
Dot
])
@op_lifter
([
tensor
.
basic
.
Dot
])
def
local_gpua_hgemm
(
node
,
context_name
):
@register_opt2
([
tensor
.
basic
.
Dot
],
'fast_compile'
)
def
local_gpua_hgemm
(
op
,
context_name
,
inputs
,
outputs
):
from
theano.sandbox.cuda
import
nvcc_compiler
from
theano.sandbox.cuda
import
nvcc_compiler
if
nvcc_compiler
.
nvcc_version
<
'7.5'
:
if
nvcc_compiler
.
nvcc_version
<
'7.5'
:
_logger
.
warning
(
"Not performing dot of float16 on the GPU since "
_logger
.
warning
(
"Not performing dot of float16 on the GPU since "
"cuda 7.5 is not available. Updating could speed up "
"cuda 7.5 is not available. Updating could speed up "
"your code."
)
"your code."
)
return
return
A
=
node
.
inputs
[
0
]
A
=
inputs
[
0
]
B
=
node
.
inputs
[
1
]
B
=
inputs
[
1
]
if
(
A
.
ndim
==
2
and
B
.
ndim
==
2
and
if
(
A
.
ndim
==
2
and
B
.
ndim
==
2
and
A
.
dtype
==
'float16'
and
B
.
dtype
==
'float16'
):
A
.
dtype
==
'float16'
and
B
.
dtype
==
'float16'
):
fgraph
=
node
.
in
puts
[
0
]
.
fgraph
fgraph
=
out
puts
[
0
]
.
fgraph
C
=
GpuAllocEmpty
(
dtype
=
'float16'
,
context_name
=
context_name
)(
C
=
gpu_alloc_empty
(
context_name
,
dtype
=
'float16'
)(
shape_i
(
A
,
0
,
fgraph
),
shape_i
(
A
,
0
,
fgraph
),
shape_i
(
B
,
1
,
fgraph
))
shape_i
(
B
,
1
,
fgraph
))
return
gpugemm_no_inplace
(
C
,
1.0
,
A
,
B
,
0.0
)
return
gpugemm_no_inplace
(
C
,
1.0
,
A
,
B
,
0.0
)
...
@@ -813,95 +1132,106 @@ def local_gpua_hgemm(node, context_name):
...
@@ -813,95 +1132,106 @@ def local_gpua_hgemm(node, context_name):
@register_opt
()
@register_opt
()
@alpha_merge
(
GpuGemm
,
alpha_in
=
1
,
beta_in
=
4
)
@alpha_merge
(
GpuGemm
,
alpha_in
=
1
,
beta_in
=
4
)
def
local_gpuagemm_alpha_merge
(
node
,
*
inputs
):
def
local_gpua
_
gemm_alpha_merge
(
node
,
*
inputs
):
return
[
gpugemm_no_inplace
(
*
inputs
)]
return
[
gpugemm_no_inplace
(
*
inputs
)]
@register_opt
()
@register_opt
()
@output_merge
(
GpuGemm
,
alpha_in
=
1
,
beta_in
=
4
,
out_in
=
0
)
@output_merge
(
GpuGemm
,
alpha_in
=
1
,
beta_in
=
4
,
out_in
=
0
)
def
local_gpuagemm_output_merge
(
node
,
*
inputs
):
def
local_gpua
_
gemm_output_merge
(
node
,
*
inputs
):
return
[
gpugemm_no_inplace
(
*
inputs
)]
return
[
gpugemm_no_inplace
(
*
inputs
)]
@register_opt
()
@register_opt
()
@alpha_merge
(
GpuGemmBatch
,
alpha_in
=
1
,
beta_in
=
4
)
@alpha_merge
(
GpuGemmBatch
,
alpha_in
=
1
,
beta_in
=
4
)
def
local_gpuagemmbatch_alpha_merge
(
node
,
*
inputs
):
def
local_gpua
_
gemmbatch_alpha_merge
(
node
,
*
inputs
):
return
[
gpugemmbatch_no_inplace
(
*
inputs
)]
return
[
gpugemmbatch_no_inplace
(
*
inputs
)]
@register_opt
()
@register_opt
()
@output_merge
(
GpuGemmBatch
,
alpha_in
=
1
,
beta_in
=
4
,
out_in
=
0
)
@output_merge
(
GpuGemmBatch
,
alpha_in
=
1
,
beta_in
=
4
,
out_in
=
0
)
def
local_gpuagemmbatch_output_merge
(
node
,
*
inputs
):
def
local_gpua
_
gemmbatch_output_merge
(
node
,
*
inputs
):
return
[
gpugemmbatch_no_inplace
(
*
inputs
)]
return
[
gpugemmbatch_no_inplace
(
*
inputs
)]
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
blas
.
Ger
,
tensor
.
blas_c
.
CGer
,
tensor
.
blas_scipy
.
ScipyGer
])
@op_lifter
([
tensor
.
blas
.
Ger
,
tensor
.
blas_c
.
CGer
,
tensor
.
blas_scipy
.
ScipyGer
])
def
local_gpua_ger
(
node
,
context_name
):
@register_opt2
([
tensor
.
blas
.
Ger
,
tensor
.
blas_c
.
CGer
,
tensor
.
blas_scipy
.
ScipyGer
],
'fast_compile'
)
return
GpuGer
(
inplace
=
node
.
op
.
destructive
)
def
local_gpua_ger
(
op
,
context_name
,
inputs
,
outputs
):
return
GpuGer
(
inplace
=
op
.
destructive
)
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
blas
.
Dot22
])
@op_lifter
([
tensor
.
blas
.
Dot22
])
def
local_gpua_dot22
(
node
,
context_name
):
@register_opt2
([
tensor
.
blas
.
Dot22
],
'fast_compile'
)
def
local_gpua_dot22
(
op
,
context_name
,
inputs
,
outputs
):
return
gpu_dot22
return
gpu_dot22
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
blas
.
Dot22Scalar
])
@op_lifter
([
tensor
.
blas
.
Dot22Scalar
])
def
local_gpua_dot22scalar
(
node
,
context_name
):
@register_opt2
([
tensor
.
blas
.
Dot22Scalar
],
'fast_compile'
)
x
,
y
,
a
=
node
.
inputs
def
local_gpua_dot22scalar
(
op
,
context_name
,
inputs
,
outputs
):
x
,
y
,
a
=
inputs
x
=
as_gpuarray_variable
(
x
,
context_name
)
x
=
as_gpuarray_variable
(
x
,
context_name
)
y
=
as_gpuarray_variable
(
y
,
context_name
)
y
=
as_gpuarray_variable
(
y
,
context_name
)
z
=
GpuAllocEmpty
(
x
.
dtype
,
context_nam
e
)(
x
.
shape
[
0
],
y
.
shape
[
1
])
z
=
gpu_alloc_empty
(
context_name
,
dtype
=
x
.
dtyp
e
)(
x
.
shape
[
0
],
y
.
shape
[
1
])
return
[
gpugemm_no_inplace
(
z
,
a
,
x
,
y
,
0
)]
return
[
gpugemm_no_inplace
(
z
,
a
,
x
,
y
,
0
)]
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
basic
.
Eye
])
@op_lifter
([
tensor
.
basic
.
Eye
])
def
local_gpua_eye
(
node
,
context_name
):
@register_opt2
([
tensor
.
basic
.
Eye
],
'fast_compile'
)
return
GpuEye
(
dtype
=
node
.
op
.
dtype
,
context_name
=
context_name
)
def
local_gpua_eye
(
op
,
context_name
,
inputs
,
outputs
):
return
GpuEye
(
dtype
=
op
.
dtype
,
context_name
=
context_name
)
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
nnet
.
CrossentropySoftmaxArgmax1HotWithBias
],
cuda_only
=
True
)
@op_lifter
([
tensor
.
nnet
.
CrossentropySoftmaxArgmax1HotWithBias
],
cuda_only
=
True
)
def
local_gpua_crossentropysoftmaxargmax1hotwithbias
(
node
,
context_name
):
@register_opt2
([
tensor
.
nnet
.
CrossentropySoftmaxArgmax1HotWithBias
],
'fast_compile'
)
def
local_gpua_crossentropysoftmaxargmax1hotwithbias
(
op
,
context_name
,
inputs
,
outputs
):
return
gpu_crossentropy_softmax_argmax_1hot_with_bias
return
gpu_crossentropy_softmax_argmax_1hot_with_bias
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
nnet
.
CrossentropySoftmax1HotWithBiasDx
],
cuda_only
=
True
)
@op_lifter
([
tensor
.
nnet
.
CrossentropySoftmax1HotWithBiasDx
],
cuda_only
=
True
)
def
local_gpua_crossentropysoftmax1hotwithbiasdx
(
node
,
context_name
):
@register_opt2
([
tensor
.
nnet
.
CrossentropySoftmax1HotWithBiasDx
],
'fast_compile'
)
def
local_gpua_crossentropysoftmax1hotwithbiasdx
(
op
,
context_name
,
inputs
,
outputs
):
return
gpu_crossentropy_softmax_1hot_with_bias_dx
return
gpu_crossentropy_softmax_1hot_with_bias_dx
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
nnet
.
Softmax
],
cuda_only
=
True
)
@op_lifter
([
tensor
.
nnet
.
Softmax
],
cuda_only
=
True
)
def
local_gpua_softmax
(
node
,
context_name
):
@register_opt2
([
tensor
.
nnet
.
Softmax
],
'fast_compile'
)
def
local_gpua_softmax
(
op
,
context_name
,
inputs
,
outputs
):
return
gpu_softmax
return
gpu_softmax
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
tensor
.
nnet
.
SoftmaxWithBias
],
cuda_only
=
True
)
@op_lifter
([
tensor
.
nnet
.
SoftmaxWithBias
],
cuda_only
=
True
)
def
local_gpua_softmaxwithbias
(
node
,
context_name
):
@register_opt2
([
tensor
.
nnet
.
SoftmaxWithBias
],
'fast_compile'
)
def
local_gpua_softmaxwithbias
(
op
,
context_name
,
inputs
,
outputs
):
return
gpu_softmax_with_bias
return
gpu_softmax_with_bias
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
theano
.
tensor
.
opt
.
Assert
])
@op_lifter
([
theano
.
tensor
.
opt
.
Assert
])
def
local_assert
(
node
,
context_name
):
def
local_gpua_assert
(
op
,
context_name
,
inputs
,
outputs
):
if
isinstance
(
inputs
[
0
]
.
type
,
GpuArrayType
):
# Check if input nodes are already on the GPU
if
isinstance
(
node
.
inputs
[
0
]
.
type
,
GpuArrayType
):
return
return
return
[
host_from_gpu
(
node
.
op
(
as_gpuarray_variable
(
node
.
inputs
[
0
],
return
local_gpua_assert_graph
(
op
,
context_name
,
inputs
,
outputs
)
context_name
),
*
node
.
inputs
[
1
:]))]
@register_opt2
([
theano
.
tensor
.
opt
.
Assert
],
'fast_compile'
)
def
local_gpua_assert_graph
(
op
,
context_name
,
inputs
,
outputs
):
return
[
op
(
as_gpuarray_variable
(
inputs
[
0
],
context_name
),
*
inputs
[
1
:])]
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
ConvOp
])
@op_lifter
([
ConvOp
])
def
local_error_convop
(
node
,
context_name
):
@register_opt2
([
ConvOp
],
'fast_compile'
)
def
local_gpua_error_convop
(
op
,
context_name
,
inputs
,
outputs
):
assert
False
,
"""
assert
False
,
"""
ConvOp does not work with the gpuarray backend.
ConvOp does not work with the gpuarray backend.
...
@@ -912,8 +1242,9 @@ theano.tensor.nnet.conv2d()
...
@@ -912,8 +1242,9 @@ theano.tensor.nnet.conv2d()
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
SparseBlockGemv
])
@op_lifter
([
SparseBlockGemv
])
def
local_lift_sparseblockgemv
(
node
,
context_name
):
@register_opt2
([
SparseBlockGemv
],
'fast_compile'
)
if
node
.
op
.
inplace
:
def
local_gpua_sparseblockgemv
(
op
,
context_name
,
inputs
,
outputs
):
if
op
.
inplace
:
return
gpu_sparse_block_gemv_inplace
return
gpu_sparse_block_gemv_inplace
else
:
else
:
return
gpu_sparse_block_gemv
return
gpu_sparse_block_gemv
...
@@ -921,8 +1252,9 @@ def local_lift_sparseblockgemv(node, context_name):
...
@@ -921,8 +1252,9 @@ def local_lift_sparseblockgemv(node, context_name):
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
@op_lifter
([
SparseBlockOuter
])
@op_lifter
([
SparseBlockOuter
])
def
local_lift_sparseblockouter
(
node
,
context_name
):
@register_opt2
([
SparseBlockOuter
],
'fast_compile'
)
if
node
.
op
.
inplace
:
def
local_gpua_sparseblockouter
(
op
,
context_name
,
inputs
,
outputs
):
if
op
.
inplace
:
return
gpu_sparse_block_outer_inplace
return
gpu_sparse_block_outer_inplace
else
:
else
:
return
gpu_sparse_block_outer
return
gpu_sparse_block_outer
...
@@ -943,20 +1275,27 @@ def local_inplace_sparseblockouter(node):
...
@@ -943,20 +1275,27 @@ def local_inplace_sparseblockouter(node):
# This deals with any abstract convs that have a transfer somewhere
# This deals with any abstract convs that have a transfer somewhere
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
,
'conv_dnn'
,
'cudnn'
)
@op_lifter
([
AbstractConv2d
,
@op_lifter
([
AbstractConv2d
,
AbstractConv2d_gradWeights
,
AbstractConv2d_gradWeights
,
AbstractConv2d_gradInputs
])
AbstractConv2d_gradInputs
])
def
local_
lift_abstractconv2d
(
node
,
context_name
):
def
local_
gpua_abstractconv2d
(
op
,
context_name
,
inputs
,
outputs
):
if
isinstance
(
node
.
outputs
[
0
]
.
type
,
GpuArrayType
):
if
isinstance
(
outputs
[
0
]
.
type
,
GpuArrayType
):
# Don't handle this node here, it's already on the GPU.
# Don't handle this node here, it's already on the GPU.
return
return
inps
=
list
(
node
.
inputs
)
return
local_gpua_lift_abstractconv2d_graph
(
op
,
context_name
,
inputs
,
outputs
)
inps
[
0
]
=
as_gpuarray_variable
(
node
.
inputs
[
0
],
@register_opt2
([
AbstractConv2d
,
AbstractConv2d_gradWeights
,
AbstractConv2d_gradInputs
],
'fast_compile'
,
'conv_dnn'
,
'cudnn'
)
def
local_gpua_lift_abstractconv2d_graph
(
op
,
context_name
,
inputs
,
outputs
):
inps
=
list
(
inputs
)
inps
[
0
]
=
as_gpuarray_variable
(
inputs
[
0
],
context_name
=
context_name
)
context_name
=
context_name
)
inps
[
1
]
=
as_gpuarray_variable
(
node
.
inputs
[
1
],
inps
[
1
]
=
as_gpuarray_variable
(
inputs
[
1
],
context_name
=
context_name
)
context_name
=
context_name
)
return
[
node
.
op
(
*
inps
)]
return
[
op
(
*
inps
)]
# Register this here so that it goes after the abstract lifting
# Register this here so that it goes after the abstract lifting
register_opt
(
'fast_compile'
)(
conv_groupopt
)
register_opt
(
'fast_compile'
)(
conv_groupopt
)
...
@@ -980,10 +1319,10 @@ def local_gpu_elemwise_careduce(node):
...
@@ -980,10 +1319,10 @@ def local_gpu_elemwise_careduce(node):
isinstance
(
node
.
inputs
[
0
]
.
owner
.
op
.
scalar_op
,
scalar
.
basic
.
Sqr
)):
isinstance
(
node
.
inputs
[
0
]
.
owner
.
op
.
scalar_op
,
scalar
.
basic
.
Sqr
)):
op
=
node
.
op
op
=
node
.
op
inp
=
node
.
inputs
[
0
]
.
owner
.
inputs
[
0
]
inp
=
node
.
inputs
[
0
]
.
owner
.
inputs
[
0
]
return
[
GpuCAReduceC
uda
(
scalar_op
=
op
.
scalar_op
,
return
[
gpu_ca_reduce_c
uda
(
scalar_op
=
op
.
scalar_op
,
axis
=
op
.
axis
,
axis
=
op
.
axis
,
reduce_mask
=
op
.
reduce_mask
,
reduce_mask
=
op
.
reduce_mask
,
pre_scalar_op
=
scalar
.
basic
.
sqr
)(
inp
)]
pre_scalar_op
=
scalar
.
basic
.
sqr
)(
inp
)]
@local_optimizer
(
None
)
@local_optimizer
(
None
)
...
@@ -1063,35 +1402,36 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None):
...
@@ -1063,35 +1402,36 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None):
@register_opt
(
'scan'
,
'fast_compile'
)
@register_opt
(
'scan'
,
'fast_compile'
)
@op_lifter
([
scan_op
.
Scan
])
@op_lifter
([
scan_op
.
Scan
])
def
local_scan_to_gpua
(
node
,
context_name
):
@register_opt2
([
scan_op
.
Scan
],
'fast_compile'
)
info
=
copy
.
deepcopy
(
node
.
op
.
info
)
def
local_gpua_scan_to_gpua
(
op
,
context_name
,
inputs
,
outputs
):
info
=
copy
.
deepcopy
(
op
.
info
)
if
info
.
get
(
'gpua'
,
False
):
if
info
.
get
(
'gpua'
,
False
):
return
return
info
[
'gpua'
]
=
True
info
[
'gpua'
]
=
True
nw_ins
=
[
node
.
inputs
[
0
]]
nw_ins
=
[
inputs
[
0
]]
e
=
(
1
+
e
=
(
1
+
node
.
op
.
n_seqs
+
op
.
n_seqs
+
node
.
op
.
n_mit_mot
+
op
.
n_mit_mot
+
node
.
op
.
n_mit_sot
+
op
.
n_mit_sot
+
node
.
op
.
n_sit_sot
+
op
.
n_sit_sot
+
node
.
op
.
n_shared_outs
)
op
.
n_shared_outs
)
nw_ins
+=
[
safe_to_gpu
(
x
,
context_name
)
for
x
in
node
.
inputs
[
1
:
e
]]
nw_ins
+=
[
safe_to_gpu
(
x
,
context_name
)
for
x
in
inputs
[
1
:
e
]]
b
=
e
b
=
e
e
=
e
+
node
.
op
.
n_nit_sot
e
=
e
+
op
.
n_nit_sot
nw_ins
+=
node
.
inputs
[
b
:
e
]
nw_ins
+=
inputs
[
b
:
e
]
nw_ins
+=
[
safe_to_gpu
(
x
,
context_name
)
for
x
in
node
.
inputs
[
e
:]]
nw_ins
+=
[
safe_to_gpu
(
x
,
context_name
)
for
x
in
inputs
[
e
:]]
scan_ins
=
[
tensor_to_gpu
(
x
,
context_name
)
for
x
in
node
.
op
.
inputs
]
scan_ins
=
[
tensor_to_gpu
(
x
,
context_name
)
for
x
in
op
.
inputs
]
# The inner output corresponding to the looping condition should not be
# The inner output corresponding to the looping condition should not be
# moved to the gpu
# moved to the gpu
if
node
.
op
.
info
[
'as_while'
]:
if
op
.
info
[
'as_while'
]:
scan_outs
=
[
safe_to_gpu
(
x
,
context_name
)
for
x
in
node
.
op
.
outputs
[:
-
1
]]
scan_outs
=
[
safe_to_gpu
(
x
,
context_name
)
for
x
in
op
.
outputs
[:
-
1
]]
scan_outs
+=
[
node
.
op
.
outputs
[
-
1
]]
scan_outs
+=
[
op
.
outputs
[
-
1
]]
else
:
else
:
scan_outs
=
[
safe_to_gpu
(
x
,
context_name
)
for
x
in
node
.
op
.
outputs
]
scan_outs
=
[
safe_to_gpu
(
x
,
context_name
)
for
x
in
op
.
outputs
]
scan_outs
=
scan_utils
.
clone
(
scan_outs
=
scan_utils
.
clone
(
scan_outs
,
scan_outs
,
replace
=
list
(
zip
(
node
.
op
.
inputs
,
replace
=
list
(
zip
(
op
.
inputs
,
(
safe_to_cpu
(
x
)
for
x
in
scan_ins
))))
(
safe_to_cpu
(
x
)
for
x
in
scan_ins
))))
# We need to construct the hash here, because scan
# We need to construct the hash here, because scan
...
...
theano/gpuarray/opt_util.py
浏览文件 @
a24fd9bb
...
@@ -8,7 +8,7 @@ from theano.gof import local_optimizer
...
@@ -8,7 +8,7 @@ from theano.gof import local_optimizer
from
theano.tensor
import
(
DimShuffle
,
get_scalar_constant_value
,
from
theano.tensor
import
(
DimShuffle
,
get_scalar_constant_value
,
NotScalarConstantError
)
NotScalarConstantError
)
from
.basic_ops
import
GpuFromHost
,
HostFromGpu
,
GpuAllocEmpty
from
.basic_ops
import
GpuFromHost
,
HostFromGpu
,
GpuAllocEmpty
,
gpu_alloc_empty
from
.elemwise
import
GpuDimShuffle
,
GpuElemwise
from
.elemwise
import
GpuDimShuffle
,
GpuElemwise
_one
=
scal
.
constant
(
numpy
.
asarray
(
1.0
,
dtype
=
'float32'
))
_one
=
scal
.
constant
(
numpy
.
asarray
(
1.0
,
dtype
=
'float32'
))
...
@@ -324,8 +324,7 @@ def inplace_allocempty(op, idx):
...
@@ -324,8 +324,7 @@ def inplace_allocempty(op, idx):
if
(
alloc
.
owner
and
if
(
alloc
.
owner
and
isinstance
(
alloc
.
owner
.
op
,
GpuAllocEmpty
)
and
isinstance
(
alloc
.
owner
.
op
,
GpuAllocEmpty
)
and
len
(
alloc
.
clients
)
>
1
):
len
(
alloc
.
clients
)
>
1
):
alloc_op
=
GpuAllocEmpty
(
alloc
.
owner
.
op
.
dtype
,
alloc_op
=
gpu_alloc_empty
(
alloc
.
owner
.
op
.
context_name
,
dtype
=
alloc
.
owner
.
op
.
dtype
)
alloc
.
owner
.
op
.
context_name
)
inputs
[
idx
]
=
alloc_op
(
*
alloc
.
owner
.
inputs
)
inputs
[
idx
]
=
alloc_op
(
*
alloc
.
owner
.
inputs
)
return
maker
(
node
,
inputs
)
return
maker
(
node
,
inputs
)
return
opt
return
opt
...
...
theano/gpuarray/tests/test_abstractconv.py
浏览文件 @
a24fd9bb
...
@@ -26,9 +26,11 @@ class TestDnnConv2d(test_abstract_conv.BaseTestConv2d):
...
@@ -26,9 +26,11 @@ class TestDnnConv2d(test_abstract_conv.BaseTestConv2d):
if
not
dnn_available
(
test_ctx_name
):
if
not
dnn_available
(
test_ctx_name
):
raise
SkipTest
(
dnn_available
.
msg
)
raise
SkipTest
(
dnn_available
.
msg
)
mode
=
mode_with_gpu
mode
=
mode_with_gpu
if
fd
!=
(
1
,
1
):
if
fd
!=
(
1
,
1
):
raise
SkipTest
(
"Doesn't have CUDNN implementation"
)
raise
SkipTest
(
"Doesn't have CUDNN implementation"
)
o
=
self
.
get_output_shape
(
i
,
f
,
s
,
b
,
fd
)
o
=
self
.
get_output_shape
(
i
,
f
,
s
,
b
,
fd
)
self
.
run_fwd
(
inputs_shape
=
i
,
filters_shape
=
f
,
subsample
=
s
,
self
.
run_fwd
(
inputs_shape
=
i
,
filters_shape
=
f
,
subsample
=
s
,
verify_grad
=
True
,
mode
=
mode
,
verify_grad
=
True
,
mode
=
mode
,
provide_shape
=
provide_shape
,
border_mode
=
b
,
provide_shape
=
provide_shape
,
border_mode
=
b
,
...
...
theano/gpuarray/tests/test_basic_ops.py
浏览文件 @
a24fd9bb
...
@@ -396,7 +396,7 @@ def test_gpueye():
...
@@ -396,7 +396,7 @@ def test_gpueye():
k_symb
=
numpy
.
asarray
(
0
)
k_symb
=
numpy
.
asarray
(
0
)
out
=
T
.
eye
(
N_symb
,
M_symb
,
k_symb
,
dtype
=
dtype
)
out
=
T
.
eye
(
N_symb
,
M_symb
,
k_symb
,
dtype
=
dtype
)
f
=
theano
.
function
([
N_symb
,
M_symb
],
f
=
theano
.
function
([
N_symb
,
M_symb
],
out
,
T
.
stack
(
out
)
,
mode
=
mode_with_gpu
)
mode
=
mode_with_gpu
)
result
=
numpy
.
asarray
(
f
(
N
,
M
))
result
=
numpy
.
asarray
(
f
(
N
,
M
))
assert
numpy
.
allclose
(
result
,
numpy
.
eye
(
N
,
M_
,
dtype
=
dtype
))
assert
numpy
.
allclose
(
result
,
numpy
.
eye
(
N
,
M_
,
dtype
=
dtype
))
...
...
theano/gpuarray/tests/test_opt.py
浏览文件 @
a24fd9bb
...
@@ -138,11 +138,21 @@ def test_local_gpualloc_memset_0():
...
@@ -138,11 +138,21 @@ def test_local_gpualloc_memset_0():
ones
=
numpy
.
ones
((
2
,),
dtype
=
'float32'
)
ones
=
numpy
.
ones
((
2
,),
dtype
=
'float32'
)
# Test with 0 from CPU op.
# Test with 0 from CPU op.
# Should not be transfered as the only client is the output
a
=
tensor
.
alloc
(
z
,
i
)
a
=
tensor
.
alloc
(
z
,
i
)
f
=
theano
.
function
([
i
],
a
,
mode
=
mode_with_gpu
)
f
=
theano
.
function
([
i
],
a
,
mode
=
mode_with_gpu
)
topo
=
f
.
maker
.
fgraph
.
toposort
()
topo
=
f
.
maker
.
fgraph
.
toposort
()
assert
len
(
topo
)
==
2
assert
len
(
topo
)
==
1
assert
isinstance
(
topo
[
0
]
.
op
,
GpuAlloc
)
and
topo
[
0
]
.
op
.
memset_0
assert
isinstance
(
topo
[
0
]
.
op
,
theano
.
tensor
.
Alloc
)
assert
(
numpy
.
asarray
(
f
(
6
))
==
0
)
.
all
()
# Test with 0 from CPU op.
# Should be transfered as it is used by another op.
a
=
tensor
.
alloc
(
z
,
i
)
f
=
theano
.
function
([
i
],
a
.
cumsum
(),
mode
=
mode_with_gpu
)
topo
=
f
.
maker
.
fgraph
.
toposort
()
assert
len
(
topo
)
==
3
assert
isinstance
(
topo
[
0
]
.
op
,
GpuAlloc
)
assert
(
numpy
.
asarray
(
f
(
6
))
==
0
)
.
all
()
assert
(
numpy
.
asarray
(
f
(
6
))
==
0
)
.
all
()
# Test with 0
# Test with 0
...
@@ -177,19 +187,30 @@ def test_local_gpualloc_empty():
...
@@ -177,19 +187,30 @@ def test_local_gpualloc_empty():
ii
=
theano
.
tensor
.
iscalar
()
ii
=
theano
.
tensor
.
iscalar
()
# Test with vector
# Test with vector
# Should not be moved as the only client is the output
a
=
tensor
.
AllocEmpty
(
'float32'
)(
i
)
a
=
tensor
.
AllocEmpty
(
'float32'
)(
i
)
f
=
theano
.
function
([
i
],
a
,
mode
=
mode_with_gpu
)
f
=
theano
.
function
([
i
],
a
,
mode
=
mode_with_gpu
)
topo
=
f
.
maker
.
fgraph
.
toposort
()
topo
=
f
.
maker
.
fgraph
.
toposort
()
assert
len
(
topo
)
==
2
assert
len
(
topo
)
==
1
assert
isinstance
(
topo
[
0
]
.
op
,
theano
.
tensor
.
AllocEmpty
)
# This return not initilized data, so we can only check the shape
assert
f
(
3
)
.
shape
==
(
3
,)
# Test with vector
# Should be moved
a
=
tensor
.
AllocEmpty
(
'float32'
)(
i
)
f
=
theano
.
function
([
i
],
a
.
cumsum
(),
mode
=
mode_with_gpu
)
topo
=
f
.
maker
.
fgraph
.
toposort
()
assert
len
(
topo
)
==
3
assert
isinstance
(
topo
[
0
]
.
op
,
GpuAllocEmpty
)
assert
isinstance
(
topo
[
0
]
.
op
,
GpuAllocEmpty
)
# This return not initilized data, so we can only check the shape
# This return not initilized data, so we can only check the shape
assert
f
(
3
)
.
shape
==
(
3
,)
assert
f
(
3
)
.
shape
==
(
3
,)
# Test with matrix
# Test with matrix
a
=
tensor
.
AllocEmpty
(
'float32'
)(
i
,
ii
)
a
=
tensor
.
AllocEmpty
(
'float32'
)(
i
,
ii
)
f
=
theano
.
function
([
i
,
ii
],
a
,
mode
=
mode_with_gpu
)
f
=
theano
.
function
([
i
,
ii
],
a
.
cumsum
(
axis
=
0
)
,
mode
=
mode_with_gpu
)
topo
=
f
.
maker
.
fgraph
.
toposort
()
topo
=
f
.
maker
.
fgraph
.
toposort
()
assert
len
(
topo
)
==
2
assert
len
(
topo
)
==
3
assert
isinstance
(
topo
[
0
]
.
op
,
GpuAllocEmpty
)
assert
isinstance
(
topo
[
0
]
.
op
,
GpuAllocEmpty
)
# This return not initilized data, so we can only check the shape
# This return not initilized data, so we can only check the shape
assert
f
(
3
,
4
)
.
shape
==
(
3
,
4
)
assert
f
(
3
,
4
)
.
shape
==
(
3
,
4
)
...
@@ -334,7 +355,10 @@ def test_local_gpu_subtensor():
...
@@ -334,7 +355,10 @@ def test_local_gpu_subtensor():
topo
=
f
.
maker
.
fgraph
.
toposort
()
topo
=
f
.
maker
.
fgraph
.
toposort
()
assert
any
([
type
(
node
.
op
)
is
tensor
.
Subtensor
for
node
in
topo
])
assert
any
([
type
(
node
.
op
)
is
tensor
.
Subtensor
for
node
in
topo
])
assert
not
any
([
isinstance
(
node
.
op
,
GpuSubtensor
)
for
node
in
topo
])
assert
not
any
([
isinstance
(
node
.
op
,
GpuSubtensor
)
for
node
in
topo
])
assert
any
([
isinstance
(
node
.
op
,
GpuElemwise
)
for
node
in
topo
])
# Our optimizer isn't smart enough to move to the GPU Elemwise.
# If it where just a little bit smarter, it could wrongly move it to the GPU.
# If it where super smart, it would know it should not move it to the GPU.
assert
any
([
isinstance
(
node
.
op
,
tensor
.
Elemwise
)
for
node
in
topo
])
def
test_local_gpu_elemwise
():
def
test_local_gpu_elemwise
():
...
@@ -427,7 +451,7 @@ def test_local_assert_no_cpu_op():
...
@@ -427,7 +451,7 @@ def test_local_assert_no_cpu_op():
out
=
theano
.
tensor
.
tanh
(
ms
)
.
dot
(
ms
.
T
)
out
=
theano
.
tensor
.
tanh
(
ms
)
.
dot
(
ms
.
T
)
mode_local_assert
=
mode_with_gpu
.
including
(
"assert_no_cpu_op"
)
mode_local_assert
=
mode_with_gpu
.
including
(
"assert_no_cpu_op"
)
mode_local_assert
=
mode_local_assert
.
excluding
(
"local_gpu_elemwise"
)
mode_local_assert
=
mode_local_assert
.
excluding
(
"local_gpu
a
_elemwise"
)
old
=
theano
.
config
.
assert_no_cpu_op
old
=
theano
.
config
.
assert_no_cpu_op
old2
=
theano
.
config
.
on_opt_error
old2
=
theano
.
config
.
on_opt_error
...
...
theano/gpuarray/type.py
浏览文件 @
a24fd9bb
...
@@ -233,7 +233,7 @@ class GpuArrayType(Type):
...
@@ -233,7 +233,7 @@ class GpuArrayType(Type):
return
data
return
data
def
filter_variable
(
self
,
other
,
allow_convert
=
True
):
def
filter_variable
(
self
,
other
,
allow_convert
=
True
):
from
theano.gpuarray
import
GpuFromH
ost
from
theano.gpuarray
.basic_ops
import
gpu_from_h
ost
if
hasattr
(
other
,
'_as_GpuArrayVariable'
):
if
hasattr
(
other
,
'_as_GpuArrayVariable'
):
other
=
other
.
_as_GpuArrayVariable
(
self
.
context_name
)
other
=
other
.
_as_GpuArrayVariable
(
self
.
context_name
)
...
@@ -265,7 +265,7 @@ class GpuArrayType(Type):
...
@@ -265,7 +265,7 @@ class GpuArrayType(Type):
str
(
self
.
broadcastable
)))
str
(
self
.
broadcastable
)))
other
=
other2
other
=
other2
return
GpuFromH
ost
(
self
.
context_name
)(
other
)
return
gpu_from_h
ost
(
self
.
context_name
)(
other
)
@staticmethod
@staticmethod
def
values_eq
(
a
,
b
,
force_same_dtype
=
True
):
def
values_eq
(
a
,
b
,
force_same_dtype
=
True
):
...
...
theano/sandbox/rng_mrg.py
浏览文件 @
a24fd9bb
...
@@ -24,10 +24,11 @@ from . import multinomial
...
@@ -24,10 +24,11 @@ from . import multinomial
import
theano.sandbox.cuda
import
theano.sandbox.cuda
from
theano.sandbox.cuda
import
GpuOp
from
theano.sandbox.cuda
import
GpuOp
from
theano.gpuarray.basic_ops
import
GpuKernelBase
,
Kernel
from
theano.gpuarray.basic_ops
import
GpuKernelBase
,
Kernel
,
infer_context_name
from
theano.gpuarray.type
import
GpuArrayType
from
theano.gpuarray.type
import
GpuArrayType
from
theano.gpuarray.fp16_help
import
write_w
from
theano.gpuarray.fp16_help
import
write_w
from
theano.gpuarray.opt
import
(
register_opt
as
register_gpua
,
from
theano.gpuarray.opt
import
(
register_opt
as
register_gpua
,
register_opt2
,
host_from_gpu
as
host_from_gpua
)
host_from_gpu
as
host_from_gpua
)
if
theano
.
sandbox
.
cuda
.
cuda_available
:
if
theano
.
sandbox
.
cuda
.
cuda_available
:
from
theano.sandbox.cuda
import
(
CudaNdarrayType
,
from
theano.sandbox.cuda
import
(
CudaNdarrayType
,
...
@@ -1551,17 +1552,22 @@ class MRG_RandomStreams(object):
...
@@ -1551,17 +1552,22 @@ class MRG_RandomStreams(object):
return
final_samples
return
final_samples
@register_opt2
([
mrg_uniform
],
'fast_compile'
)
def
local_gpua_mrg_graph
(
op
,
context_name
,
inputs
,
outputs
):
if
(
type
(
op
)
==
mrg_uniform
and
isinstance
(
inputs
[
0
]
.
type
,
GpuArrayType
)):
outs
=
GPUA_mrg_uniform
.
new
(
inputs
[
0
],
op
.
output_type
.
ndim
,
op
.
output_type
.
dtype
,
inputs
[
1
])
return
[
outs
[
0
],
host_from_gpua
(
outs
[
1
])]
@register_gpua
(
'fast_compile'
)
@register_gpua
(
'fast_compile'
)
@local_optimizer
([
mrg_uniform
])
@local_optimizer
([
mrg_uniform
])
def
local_gpua_mrg
(
node
):
def
local_gpua_mrg
(
node
):
# TODO : need description for function
context_name
=
infer_context_name
(
*
node
.
inputs
)
if
(
type
(
node
.
op
)
==
mrg_uniform
and
return
local_gpua_mrg_graph
(
node
.
op
,
context_name
,
node
.
inputs
,
node
.
outputs
)
isinstance
(
node
.
inputs
[
0
]
.
type
,
GpuArrayType
)):
outs
=
GPUA_mrg_uniform
.
new
(
node
.
inputs
[
0
],
node
.
op
.
output_type
.
ndim
,
node
.
op
.
output_type
.
dtype
,
node
.
inputs
[
1
])
return
[
outs
[
0
],
host_from_gpua
(
outs
[
1
])]
MRG_RNGs
=
(
mrg_uniform
,
GPU_mrg_uniform
,
GPUA_mrg_uniform
)
MRG_RNGs
=
(
mrg_uniform
,
GPU_mrg_uniform
,
GPUA_mrg_uniform
)
...
...
theano/scan_module/scan_utils.py
浏览文件 @
a24fd9bb
...
@@ -152,13 +152,15 @@ def traverse(out, x, x_copy, d, visited=None):
...
@@ -152,13 +152,15 @@ def traverse(out, x, x_copy, d, visited=None):
return
d
return
d
visited
.
add
(
out
)
visited
.
add
(
out
)
from
theano.sandbox
import
cuda
from
theano.sandbox
import
cuda
from
theano
import
gpuarray
from
theano.gpuarray.basic_ops
import
gpu_from_host
,
host_from_gpu
from
theano.gpuarray
import
pygpu_activated
from
theano.gpuarray.type
import
GpuArrayType
if
out
==
x
:
if
out
==
x
:
if
isinstance
(
x
.
type
,
cuda
.
CudaNdarrayType
):
if
isinstance
(
x
.
type
,
cuda
.
CudaNdarrayType
):
d
[
out
]
=
cuda
.
gpu_from_host
(
x_copy
)
d
[
out
]
=
cuda
.
gpu_from_host
(
x_copy
)
else
:
else
:
assert
isinstance
(
x
.
type
,
gpuarray
.
GpuArrayType
)
assert
isinstance
(
x
.
type
,
GpuArrayType
)
d
[
out
]
=
gpu
array
.
GpuFromH
ost
(
x
.
type
.
context_name
)(
x_copy
)
d
[
out
]
=
gpu
_from_h
ost
(
x
.
type
.
context_name
)(
x_copy
)
return
d
return
d
elif
out
.
owner
is
None
:
elif
out
.
owner
is
None
:
return
d
return
d
...
@@ -167,8 +169,8 @@ def traverse(out, x, x_copy, d, visited=None):
...
@@ -167,8 +169,8 @@ def traverse(out, x, x_copy, d, visited=None):
out
.
owner
.
inputs
==
[
x
]):
out
.
owner
.
inputs
==
[
x
]):
d
[
out
]
=
tensor
.
as_tensor_variable
(
x_copy
)
d
[
out
]
=
tensor
.
as_tensor_variable
(
x_copy
)
return
d
return
d
elif
(
gpuarray
.
pygpu_activated
and
elif
(
pygpu_activated
and
out
.
owner
.
op
==
gpuarray
.
host_from_gpu
and
out
.
owner
.
op
==
host_from_gpu
and
out
.
owner
.
inputs
==
[
x
]):
out
.
owner
.
inputs
==
[
x
]):
d
[
out
]
=
tensor
.
as_tensor_variable
(
x_copy
)
d
[
out
]
=
tensor
.
as_tensor_variable
(
x_copy
)
return
d
return
d
...
...
theano/tensor/basic.py
浏览文件 @
a24fd9bb
...
@@ -630,9 +630,15 @@ def get_scalar_constant_value(orig_v, elemwise=True,
...
@@ -630,9 +630,15 @@ def get_scalar_constant_value(orig_v, elemwise=True,
v
=
v
.
owner
.
inputs
[
0
]
v
=
v
.
owner
.
inputs
[
0
]
continue
continue
elif
isinstance
(
v
.
owner
.
op
,
theano
.
compile
.
ops
.
Shape_i
):
elif
isinstance
(
v
.
owner
.
op
,
theano
.
compile
.
ops
.
Shape_i
):
if
isinstance
(
v
.
owner
.
inputs
[
0
],
Constant
):
i
=
v
.
owner
.
op
.
i
return
numpy
.
asarray
(
inp
=
v
.
owner
.
inputs
[
0
]
v
.
owner
.
inputs
[
0
]
.
data
.
shape
[
v
.
owner
.
op
.
i
])
if
isinstance
(
inp
,
Constant
):
return
numpy
.
asarray
(
inp
.
data
.
shape
[
i
])
# The shape of a broadcastable dimension is 1
if
(
hasattr
(
inp
.
type
,
'broadcastable'
)
and
inp
.
type
.
broadcastable
[
i
]):
return
numpy
.
asarray
(
1
)
# Don't act as the constant_folding optimization here as this
# Don't act as the constant_folding optimization here as this
# fct is used too early in the optimization phase. This would
# fct is used too early in the optimization phase. This would
# mess with the stabilization optimization and be too slow.
# mess with the stabilization optimization and be too slow.
...
@@ -2690,15 +2696,18 @@ class Alloc(gof.Op):
...
@@ -2690,15 +2696,18 @@ class Alloc(gof.Op):
sh
=
[
as_tensor_variable
(
s
)
for
s
in
shape
]
sh
=
[
as_tensor_variable
(
s
)
for
s
in
shape
]
bcast
=
[]
bcast
=
[]
for
i
,
s
in
enumerate
(
sh
):
for
i
,
s
in
enumerate
(
sh
):
if
config
.
exception_verbosity
==
'high'
:
def
err_str
():
s_as_str
=
'
\n
'
+
min_informative_str
(
s
)
if
config
.
exception_verbosity
==
'high'
:
else
:
return
'
\n
'
+
min_informative_str
(
s
)
s_as_str
=
str
(
s
)
else
:
return
str
(
s
)
if
s
.
type
.
dtype
[:
3
]
not
in
(
'int'
,
'uin'
):
if
s
.
type
.
dtype
[:
3
]
not
in
(
'int'
,
'uin'
):
s_as_str
=
err_str
()
raise
TypeError
(
'Shape arguments to Alloc must be integers, '
raise
TypeError
(
'Shape arguments to Alloc must be integers, '
'but argument
%
s is not for apply node:
%
s'
%
'but argument
%
s is not for apply node:
%
s'
%
(
i
,
s_as_str
))
(
i
,
s_as_str
))
if
s
.
ndim
!=
0
:
if
s
.
ndim
!=
0
:
s_as_str
=
err_str
()
raise
TypeError
(
raise
TypeError
(
"Each shape dimension to Alloc must be a scalar, "
,
"Each shape dimension to Alloc must be a scalar, "
,
'but dimension
%
s have
%
d dimensions for apply node:
%
s'
%
'but dimension
%
s have
%
d dimensions for apply node:
%
s'
%
...
...
theano/tensor/nnet/abstract_conv.py
浏览文件 @
a24fd9bb
...
@@ -66,8 +66,10 @@ def get_conv_output_shape(image_shape, kernel_shape,
...
@@ -66,8 +66,10 @@ def get_conv_output_shape(image_shape, kernel_shape,
"""
"""
bsize
,
imshp
=
image_shape
[
0
],
image_shape
[
2
:]
bsize
,
imshp
=
image_shape
[
0
],
image_shape
[
2
:]
nkern
,
kshp
=
kernel_shape
[
0
],
kernel_shape
[
2
:]
nkern
,
kshp
=
kernel_shape
[
0
],
kernel_shape
[
2
:]
if
filter_dilation
is
None
:
if
filter_dilation
is
None
:
filter_dilation
=
numpy
.
ones
(
len
(
subsample
),
dtype
=
'int'
)
filter_dilation
=
numpy
.
ones
(
len
(
subsample
),
dtype
=
'int'
)
if
isinstance
(
border_mode
,
tuple
):
if
isinstance
(
border_mode
,
tuple
):
out_shp
=
tuple
(
get_conv_shape_1axis
(
out_shp
=
tuple
(
get_conv_shape_1axis
(
imshp
[
i
],
kshp
[
i
],
border_mode
[
i
],
imshp
[
i
],
kshp
[
i
],
border_mode
[
i
],
...
@@ -121,7 +123,16 @@ def get_conv_shape_1axis(image_shape, kernel_shape, border_mode,
...
@@ -121,7 +123,16 @@ def get_conv_shape_1axis(image_shape, kernel_shape, border_mode,
pad
=
border_mode
pad
=
border_mode
if
pad
<
0
:
if
pad
<
0
:
raise
ValueError
(
"border_mode must be >= 0"
)
raise
ValueError
(
"border_mode must be >= 0"
)
out_shp
=
(
image_shape
+
2
*
pad
-
dil_kernel_shape
)
//
subsample
+
1
# In case of symbolic shape, we want to build the smallest graph
# (image_shape + 2 * pad - dil_kernel_shape) // subsample + 1
if
pad
==
0
:
out_shp
=
(
image_shape
-
dil_kernel_shape
)
else
:
out_shp
=
(
image_shape
+
2
*
pad
-
dil_kernel_shape
)
if
subsample
!=
1
:
out_shp
=
out_shp
//
subsample
out_shp
=
out_shp
+
1
return
out_shp
return
out_shp
...
...
theano/tensor/tests/test_basic.py
浏览文件 @
a24fd9bb
...
@@ -7003,6 +7003,9 @@ class T_get_scalar_constant_value(unittest.TestCase):
...
@@ -7003,6 +7003,9 @@ class T_get_scalar_constant_value(unittest.TestCase):
assert
get_scalar_constant_value
(
s
)
==
3
assert
get_scalar_constant_value
(
s
)
==
3
s
=
opt
.
Shape_i
(
1
)(
c
)
s
=
opt
.
Shape_i
(
1
)(
c
)
assert
get_scalar_constant_value
(
s
)
==
4
assert
get_scalar_constant_value
(
s
)
==
4
d
=
theano
.
shared
(
numpy
.
random
.
randn
(
1
,
1
),
broadcastable
=
(
True
,
True
))
f
=
theano
.
tensor
.
basic
.
ScalarFromTensor
()(
opt
.
Shape_i
(
0
)(
d
))
assert
get_scalar_constant_value
(
f
)
==
1
def
test_elemwise
(
self
):
def
test_elemwise
(
self
):
# We test only for a few elemwise, the list of all supported
# We test only for a few elemwise, the list of all supported
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论