Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
cc93c290
提交
cc93c290
authored
2月 20, 2017
作者:
Frédéric Bastien
提交者:
GitHub
2月 20, 2017
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #5559 from nouiz/gpuarray_elemwise
[CRASH] Fix crash of GpuElemwise that have too many inputs
上级
f3844589
a14dcfad
隐藏空白字符变更
内嵌
并排
正在显示
5 个修改的文件
包含
130 行增加
和
19 行删除
+130
-19
elemwise.py
theano/gpuarray/elemwise.py
+49
-0
opt.py
theano/gpuarray/opt.py
+27
-15
test_elemwise.py
theano/gpuarray/tests/test_elemwise.py
+1
-1
test_opt.py
theano/gpuarray/tests/test_opt.py
+46
-1
opt.py
theano/tensor/opt.py
+7
-2
没有找到文件。
theano/gpuarray/elemwise.py
浏览文件 @
cc93c290
...
@@ -41,6 +41,48 @@ def get_scal(dt):
...
@@ -41,6 +41,48 @@ def get_scal(dt):
return
scalar
.
get_scalar_type
(
dt
)
return
scalar
.
get_scalar_type
(
dt
)
def max_inputs_to_GpuElemwise(node_or_outputs):
    """
    Return how many inputs a single GpuElemwise kernel call can take.

    `node_or_outputs` is either an `Apply` node (its `outputs` are used)
    or directly a sequence of output variables.  The number of dimensions
    is read from the first output.

    The computation is a byte budget: a fixed total size for the kernel
    arguments, minus what the bookkeeping values and the outputs consume,
    divided by the cost of one more argument.
    """
    if isinstance(node_or_outputs, Apply):
        outputs = node_or_outputs.outputs
    else:
        outputs = node_or_outputs

    n_out = len(outputs)
    ndim = outputs[0].type.ndim

    # Pointers take 8 bytes.  Integers do too: even with call32 the
    # interface does not change, and shapes, strides and offsets are
    # passed as 64-bit values.
    ptr_size = 8
    int_size = 8

    # Total budget for kernel arguments; the limit is taken from CUDA
    # for now.
    nb_bytes_total = 4096

    # Always present, whatever the number of arguments:
    # the total number of elements plus one shape entry per dimension.
    fixed_size = int_size * (1 + ndim)

    # Every argument (input or output) carries one pointer, one offset
    # and one stride per dimension.  The strides are counted even for
    # tensors that end up contiguous, because the code for the
    # non-contiguous case still has to be generated.
    param_size = ptr_size + int_size * (1 + ndim)

    # Whatever is left after the fixed part and the outputs is shared
    # between the inputs.
    nb_bytes_for_inputs = nb_bytes_total - fixed_size - n_out * param_size
    return nb_bytes_for_inputs // param_size
class
GpuElemwise
(
HideC
,
Elemwise
):
class
GpuElemwise
(
HideC
,
Elemwise
):
"""
"""
Elemwise on the GPU.
Elemwise on the GPU.
...
@@ -57,6 +99,9 @@ class GpuElemwise(HideC, Elemwise):
...
@@ -57,6 +99,9 @@ class GpuElemwise(HideC, Elemwise):
items
=
str
(
sorted
(
self
.
inplace_pattern
.
items
()))
items
=
str
(
sorted
(
self
.
inplace_pattern
.
items
()))
return
"GpuElemwise{
%
s}
%
s<gpuarray>"
%
(
self
.
scalar_op
,
items
)
return
"GpuElemwise{
%
s}
%
s<gpuarray>"
%
(
self
.
scalar_op
,
items
)
def max_inputs(self, node_or_outputs):
    """
    Return the maximum number of inputs for `node_or_outputs`.

    Thin wrapper that forwards to the module-level
    `max_inputs_to_GpuElemwise`; `self` is not consulted.
    """
    limit = max_inputs_to_GpuElemwise(node_or_outputs)
    return limit
def
make_node
(
self
,
*
inputs
):
def
make_node
(
self
,
*
inputs
):
ctx_name
=
infer_context_name
(
*
inputs
)
ctx_name
=
infer_context_name
(
*
inputs
)
inputs
=
[
as_gpuarray_variable
(
i
,
ctx_name
)
for
i
in
inputs
]
inputs
=
[
as_gpuarray_variable
(
i
,
ctx_name
)
for
i
in
inputs
]
...
@@ -69,6 +114,10 @@ class GpuElemwise(HideC, Elemwise):
...
@@ -69,6 +114,10 @@ class GpuElemwise(HideC, Elemwise):
if
len
(
outputs
)
>
1
:
if
len
(
outputs
)
>
1
:
raise
NotImplementedError
()
raise
NotImplementedError
()
if
len
(
inputs
)
>
max_inputs_to_GpuElemwise
(
outputs
):
raise
NotImplementedError
(
"Can not make this GpuElemwise with that much inputs"
)
# Try to generate the kernel to catch SupportCodeErrors
# Try to generate the kernel to catch SupportCodeErrors
scal_ins
=
[
get_scal
(
i
.
dtype
)
for
i
in
inputs
]
scal_ins
=
[
get_scal
(
i
.
dtype
)
for
i
in
inputs
]
fake_node
=
self
.
scalar_op
.
make_node
(
*
[
i
()
for
i
in
scal_ins
])
fake_node
=
self
.
scalar_op
.
make_node
(
*
[
i
()
for
i
in
scal_ins
])
...
...
theano/gpuarray/opt.py
浏览文件 @
cc93c290
...
@@ -63,7 +63,8 @@ from .nnet import (gpu_crossentropy_softmax_1hot_with_bias_dx,
...
@@ -63,7 +63,8 @@ from .nnet import (gpu_crossentropy_softmax_1hot_with_bias_dx,
gpu_softmax_with_bias
,
gpu_softmax
)
gpu_softmax_with_bias
,
gpu_softmax
)
from
.elemwise
import
(
GpuElemwise
,
GpuDimShuffle
,
GpuCAReduceCuda
,
from
.elemwise
import
(
GpuElemwise
,
GpuDimShuffle
,
GpuCAReduceCuda
,
GpuCAReduceCPY
,
gpu_ca_reduce_cuda
,
gpu_erfinv
,
gpu_erfcinv
)
GpuCAReduceCPY
,
gpu_ca_reduce_cuda
,
gpu_erfinv
,
gpu_erfcinv
,
max_inputs_to_GpuElemwise
)
from
.subtensor
import
(
GpuIncSubtensor
,
GpuSubtensor
,
from
.subtensor
import
(
GpuIncSubtensor
,
GpuSubtensor
,
GpuAdvancedSubtensor
,
GpuAdvancedSubtensor
,
GpuAdvancedSubtensor1
,
GpuAdvancedSubtensor1
,
...
@@ -752,26 +753,37 @@ def local_gpua_elemwise(op, context_name, inputs, outputs):
...
@@ -752,26 +753,37 @@ def local_gpua_elemwise(op, context_name, inputs, outputs):
# cpu.
# cpu.
gpu_output
=
res
(
*
new_inputs
)
gpu_output
=
res
(
*
new_inputs
)
return
[
gpu_output
]
return
[
gpu_output
]
elif
op
.
scalar_op
in
(
scalar
.
add
,
scalar
.
mul
):
max_nb_inputs
=
max_inputs_to_GpuElemwise
(
outputs
)
if
max_nb_inputs
>
1
:
while
len
(
inputs
)
>
max_nb_inputs
:
inputs
=
inputs
[:
-
max_nb_inputs
]
+
[
res
(
*
inputs
[
-
max_nb_inputs
:])]
return
res
(
*
inputs
)
else
:
else
:
return
res
return
res
def
max_inputs_to_GpuElemwise
(
node
):
def
split_huge_add_or_mul
(
node
):
ptr_size
=
8
"""
int_size
=
4
For add and mul, it can happen that we have too many inputs.
That will make nvcc fail compilation of our current code.
# we take the limit from CUDA for now
We don't want node in the graph that can't execute
argument_limit
=
232
as this breaks DebugMode.
ndim
=
node
.
inputs
[
0
]
.
type
.
ndim
# number of elements and shape
size_param_mandatory
=
(
int_size
*
(
ndim
+
1
))
+
\
(
ptr_size
+
int_size
*
ndim
)
*
len
(
node
.
outputs
)
nb_bytes_avail
=
argument_limit
-
size_param_mandatory
This should not happen for other GpuElemwise as there is only the fusion
nb_bytes_per_input
=
ptr_size
+
ndim
*
int_size
that can generate an op with too many inputs, and it checks for that.
max_nb_inputs
=
nb_bytes_avail
//
nb_bytes_per_input
return
max_nb_inputs
"""
if
node
.
op
.
scalar_op
in
(
scalar
.
add
,
scalar
.
mul
):
max_nb_inputs
=
max_inputs_to_GpuElemwise
(
node
)
if
max_nb_inputs
<=
1
and
len
(
node
.
inputs
)
>
1
:
return
False
while
len
(
node
.
inputs
)
>
max_nb_inputs
:
inner_op
=
[]
for
i
in
range
(
0
,
len
(
node
.
inputs
),
max_nb_inputs
):
inner_op
.
append
(
node
.
op
(
*
node
.
inputs
[
i
:
i
+
max_nb_inputs
]))
node
=
node
.
op
(
*
inner_op
)
.
owner
return
node
gpu_local_elemwise_fusion
=
tensor
.
opt
.
local_elemwise_fusion_op
(
gpu_local_elemwise_fusion
=
tensor
.
opt
.
local_elemwise_fusion_op
(
GpuElemwise
,
GpuElemwise
,
...
...
theano/gpuarray/tests/test_elemwise.py
浏览文件 @
cc93c290
...
@@ -18,7 +18,7 @@ from ..type import GpuArrayType, get_context
...
@@ -18,7 +18,7 @@ from ..type import GpuArrayType, get_context
from
pygpu
import
ndgpuarray
as
gpuarray
from
pygpu
import
ndgpuarray
as
gpuarray
# This is ac
ut
ally a test for GpuElemwise
# This is ac
tu
ally a test for GpuElemwise
class
test_gpu_Broadcast
(
test_elemwise
.
test_Broadcast
):
class
test_gpu_Broadcast
(
test_elemwise
.
test_Broadcast
):
cop
=
GpuElemwise
cop
=
GpuElemwise
ctype
=
GpuArrayType
ctype
=
GpuArrayType
...
...
theano/gpuarray/tests/test_opt.py
浏览文件 @
cc93c290
...
@@ -19,7 +19,7 @@ from ..elemwise import GpuCAReduceCuda, GpuCAReduceCPY, GpuElemwise
...
@@ -19,7 +19,7 @@ from ..elemwise import GpuCAReduceCuda, GpuCAReduceCPY, GpuElemwise
from
..subtensor
import
GpuSubtensor
from
..subtensor
import
GpuSubtensor
from
..linalg
import
GpuCusolverSolve
,
cusolver_available
from
..linalg
import
GpuCusolverSolve
,
cusolver_available
from
.config
import
mode_with_gpu
,
test_ctx_name
,
SkipTest
from
.config
import
mode_with_gpu
,
mode_without_gpu
,
test_ctx_name
,
SkipTest
def
test_local_assert
():
def
test_local_assert
():
...
@@ -448,6 +448,51 @@ def test_local_gpu_elemwise():
...
@@ -448,6 +448,51 @@ def test_local_gpu_elemwise():
utt
.
assert_allclose
(
out
[
1
],
a_v
[::
2
]
*
c_v
[::
2
])
utt
.
assert_allclose
(
out
[
1
],
a_v
[::
2
]
*
c_v
[::
2
])
def test_many_arg_elemwise():
    """
    Check that add and mul Elemwise ops with many arguments work on the GPU.

    For each op and each tested number of dimensions, 75 random float32
    arrays are combined under `mode_with_gpu` and `mode_without_gpu`, and
    the two results are compared.  Each mode is compiled twice, excluding
    "local_gpu_elemwise_1" and then "local_gpu_elemwise_0", so that each
    of the two optimizations is exercised without the other.
    """
    # this test checks whether the + and * elemwise ops can handle
    # extremely large numbers of arguments on gpu

    rng = np.random.RandomState([1, 2, 3])

    for num_args in [75]:
        for op_to_test in [theano.tensor.add, theano.tensor.mul]:
            for nb_dim in [2, 3, 4, 5, 7]:
                # One random extent per dimension, shared by every argument.
                shapes = [rng.randint(1, 5) for i in range(nb_dim)]
                args = [np.cast['float32'](rng.randn(*shapes))
                        for arg in range(0, num_args)]

                symb_args = [theano.tensor.TensorType('float32',
                                                      (False,) * nb_dim)()
                             for arg in range(0, num_args)]

                # Collects one result per mode: GPU first, then CPU.
                outputs = []
                for mode in [mode_with_gpu, mode_without_gpu]:
                    # test the optimization local_gpu_elemwise_0
                    f = theano.function(
                        symb_args, op_to_test(*symb_args),
                        mode=mode.excluding("local_gpu_elemwise_1"))
                    outputs.append(f(*args))
                    # assert that the test was done on the gpu.
                    if mode is mode_with_gpu:
                        assert any([isinstance(node.op, GpuElemwise)
                                    for node in f.maker.fgraph.apply_nodes])

                    # test the optimization local_gpu_elemwise_1
                    f = theano.function(
                        symb_args,
                        GpuFromHost(test_ctx_name)(op_to_test(*symb_args)),
                        mode=mode.excluding("local_gpu_elemwise_0"))
                    out = f(*args)
                    # assert that the test was done on the gpu.
                    if mode is mode_with_gpu:
                        assert any([isinstance(node.op, GpuElemwise)
                                    for node in f.maker.fgraph.apply_nodes])
                    # Both compilations for this mode must agree.
                    utt.assert_allclose(out, outputs[-1])

                # Exactly two entries are expected here, one per mode.
                results_gpu, results_cpu = outputs

                utt.assert_allclose(results_gpu, results_cpu)
def
test_local_lift_abstractconv_gpu_shape
():
def
test_local_lift_abstractconv_gpu_shape
():
prev
=
theano
.
config
.
on_opt_error
prev
=
theano
.
config
.
on_opt_error
try
:
try
:
...
...
theano/tensor/opt.py
浏览文件 @
cc93c290
...
@@ -7347,18 +7347,23 @@ def local_add_mul_fusion(node):
...
@@ -7347,18 +7347,23 @@ def local_add_mul_fusion(node):
s_op
=
node
.
op
.
scalar_op
.
__class__
s_op
=
node
.
op
.
scalar_op
.
__class__
new_inp
=
[]
new_inp
=
[]
fused
=
False
fused
=
False
nb_inputs
=
len
(
node
.
inputs
)
max_inputs
=
float
(
'inf'
)
if
hasattr
(
node
.
op
,
'max_inputs'
):
max_inputs
=
node
.
op
.
max_inputs
(
node
)
for
inp
in
node
.
inputs
:
for
inp
in
node
.
inputs
:
if
(
inp
.
owner
and
if
(
inp
.
owner
and
isinstance
(
inp
.
owner
.
op
,
Elemwise
)
and
isinstance
(
inp
.
owner
.
op
,
Elemwise
)
and
isinstance
(
inp
.
owner
.
op
.
scalar_op
,
s_op
)
and
isinstance
(
inp
.
owner
.
op
.
scalar_op
,
s_op
)
and
# Do not duplicate the operation.
# Do not duplicate the operation.
len
(
inp
.
clients
)
==
1
):
len
(
inp
.
clients
)
==
1
and
(
nb_inputs
+
len
(
inp
.
owner
.
inputs
)
-
1
)
<=
max_inputs
):
new_inp
.
extend
(
inp
.
owner
.
inputs
)
new_inp
.
extend
(
inp
.
owner
.
inputs
)
fused
=
True
fused
=
True
else
:
else
:
new_inp
.
append
(
inp
)
new_inp
.
append
(
inp
)
# We ca not compare the number of inputs as Mul and Add could have
# We ca
n
not compare the number of inputs as Mul and Add could have
# 0 or 1 inputs in some corner cases.
# 0 or 1 inputs in some corner cases.
if
fused
:
if
fused
:
output
=
node
.
op
(
*
new_inp
)
output
=
node
.
op
(
*
new_inp
)
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论