Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
e752fc3d
提交
e752fc3d
authored
8月 04, 2024
作者:
Ricardo Vieira
提交者:
Ricardo Vieira
8月 21, 2024
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
CAReduce loop reordering C-impl
上级
00a8a883
隐藏空白字符变更
内嵌
并排
正在显示
3 个修改的文件
包含
418 行增加
和
188 行删除
+418
-188
elemwise.py
pytensor/tensor/elemwise.py
+101
-101
elemwise_cgen.py
pytensor/tensor/elemwise_cgen.py
+299
-74
math.py
pytensor/tensor/math.py
+18
-13
没有找到文件。
pytensor/tensor/elemwise.py
浏览文件 @
e752fc3d
from
copy
import
copy
from
copy
import
copy
from
textwrap
import
dedent
import
numpy
as
np
import
numpy
as
np
from
numpy.core.numeric
import
normalize_axis_tuple
from
numpy.core.numeric
import
normalize_axis_tuple
...
@@ -1448,15 +1449,16 @@ class CAReduce(COp):
...
@@ -1448,15 +1449,16 @@ class CAReduce(COp):
return
((),)
return
((),)
return
([
ishape
[
i
]
for
i
in
range
(
node
.
inputs
[
0
]
.
type
.
ndim
)
if
i
not
in
axis
],)
return
([
ishape
[
i
]
for
i
in
range
(
node
.
inputs
[
0
]
.
type
.
ndim
)
if
i
not
in
axis
],)
def
_c_all
(
self
,
node
,
name
,
inames
,
onames
,
sub
):
def
_c_all
(
self
,
node
,
name
,
input_names
,
output_names
,
sub
):
input
=
node
.
inputs
[
0
]
[
inp
]
=
node
.
inputs
output
=
node
.
outputs
[
0
]
[
out
]
=
node
.
outputs
ndim
=
inp
.
type
.
ndim
iname
=
inames
[
0
]
[
inp_name
]
=
input_names
oname
=
onames
[
0
]
[
out_name
]
=
output_names
i
dtype
=
input
.
type
.
dtype_specs
()[
1
]
i
np_dtype
=
inp
.
type
.
dtype_specs
()[
1
]
o
dtype
=
outp
ut
.
type
.
dtype_specs
()[
1
]
o
ut_dtype
=
o
ut
.
type
.
dtype_specs
()[
1
]
acc_dtype
=
getattr
(
self
,
"acc_dtype"
,
None
)
acc_dtype
=
getattr
(
self
,
"acc_dtype"
,
None
)
...
@@ -1464,100 +1466,97 @@ class CAReduce(COp):
...
@@ -1464,100 +1466,97 @@ class CAReduce(COp):
if
acc_dtype
==
"float16"
:
if
acc_dtype
==
"float16"
:
raise
MethodNotDefined
(
"no c_code for float16"
)
raise
MethodNotDefined
(
"no c_code for float16"
)
acc_type
=
TensorType
(
shape
=
node
.
outputs
[
0
]
.
type
.
shape
,
dtype
=
acc_dtype
)
acc_type
=
TensorType
(
shape
=
node
.
outputs
[
0
]
.
type
.
shape
,
dtype
=
acc_dtype
)
adtype
=
acc_type
.
dtype_specs
()[
1
]
a
cc_
dtype
=
acc_type
.
dtype_specs
()[
1
]
else
:
else
:
a
dtype
=
o
dtype
a
cc_dtype
=
out_
dtype
axis
=
self
.
axis
axis
=
self
.
axis
if
axis
is
None
:
if
axis
is
None
:
axis
=
list
(
range
(
inp
ut
.
type
.
ndim
))
axis
=
list
(
range
(
inp
.
type
.
ndim
))
if
len
(
axis
)
==
0
:
if
len
(
axis
)
==
0
:
# This is just an Elemwise cast operation
# The acc_dtype is never a downcast compared to the input dtype
# The acc_dtype is never a downcast compared to the input dtype
# So we just need a cast to the output dtype.
# So we just need a cast to the output dtype.
var
=
pytensor
.
tensor
.
basic
.
cast
(
inp
ut
,
node
.
outputs
[
0
]
.
dtype
)
var
=
pytensor
.
tensor
.
basic
.
cast
(
inp
,
node
.
outputs
[
0
]
.
dtype
)
if
var
is
inp
ut
:
if
var
is
inp
:
var
=
Elemwise
(
scalar_identity
)(
inp
ut
)
var
=
Elemwise
(
scalar_identity
)(
inp
)
assert
var
.
dtype
==
node
.
outputs
[
0
]
.
dtype
assert
var
.
dtype
==
node
.
outputs
[
0
]
.
dtype
return
var
.
owner
.
op
.
_c_all
(
var
.
owner
,
name
,
inames
,
onames
,
sub
)
return
var
.
owner
.
op
.
_c_all
(
var
.
owner
,
name
,
input_names
,
output_names
,
sub
)
order1
=
[
i
for
i
in
range
(
input
.
type
.
ndim
)
if
i
not
in
axis
]
order
=
order1
+
list
(
axis
)
nnested
=
len
(
order1
)
inp_dims
=
list
(
range
(
ndim
))
non_reduced_dims
=
[
i
for
i
in
inp_dims
if
i
not
in
axis
]
counter
=
iter
(
range
(
ndim
))
acc_dims
=
[
"x"
if
i
in
axis
else
next
(
counter
)
for
i
in
range
(
ndim
)]
sub
=
dict
(
sub
)
sub
=
sub
.
copy
()
for
i
,
(
input
,
iname
)
in
enumerate
(
zip
(
node
.
inputs
,
inames
)):
sub
[
"lv0"
]
=
inp_name
sub
[
f
"lv{i}"
]
=
iname
sub
[
"lv1"
]
=
out_name
sub
[
"olv"
]
=
out_name
decl
=
""
if
acc_dtype
!=
out_dtype
:
if
adtype
!=
odtype
:
# Create an accumulator variable different from the output
# Create an accumulator variable different from the output
aname
=
"acc"
acc_name
=
"acc"
decl
=
acc_type
.
c_declare
(
aname
,
sub
)
setup
=
acc_type
.
c_declare
(
acc_name
,
sub
)
+
acc_type
.
c_init
(
acc_name
,
sub
)
decl
+=
acc_type
.
c_init
(
aname
,
sub
)
else
:
else
:
# the output is the accumulator variable
# the output is the accumulator variable
aname
=
oname
acc_name
=
out_name
setup
=
""
decl
+=
cgen
.
make_declare
([
order
],
[
idtype
],
sub
)
checks
=
cgen
.
make_checks
([
order
],
[
idtype
],
sub
)
# Define strides of input array
setup
+=
cgen
.
make_declare
(
alloc
=
""
[
inp_dims
],
[
inp_dtype
],
sub
,
compute_stride_jump
=
False
i
+=
1
)
+
cgen
.
make_checks
([
inp_dims
],
[
inp_dtype
],
sub
,
compute_stride_jump
=
False
)
sub
[
f
"lv{i}"
]
=
oname
sub
[
"olv"
]
=
oname
# Define strides of output array and allocate it
out_sub
=
sub
|
{
"lv0"
:
out_name
}
# Allocate output buffer
alloc
=
(
alloc
+=
cgen
.
make_declare
(
cgen
.
make_declare
(
[
list
(
range
(
nnested
))
+
[
"x"
]
*
len
(
axis
)],
[
odtype
],
dict
(
sub
,
lv0
=
oname
)
[
acc_dims
],
[
out_dtype
],
out_sub
,
compute_stride_jump
=
False
)
)
alloc
+=
cgen
.
make_alloc
([
order1
],
odtype
,
sub
)
+
cgen
.
make_alloc
([
non_reduced_dims
],
out_dtype
,
sub
)
alloc
+=
cgen
.
make_checks
(
+
cgen
.
make_checks
(
[
list
(
range
(
nnested
))
+
[
"x"
]
*
len
(
axis
)],
[
odtype
],
dict
(
sub
,
lv0
=
oname
)
[
acc_dims
],
[
out_dtype
],
out_sub
,
compute_stride_jump
=
False
)
)
)
if
a
dtype
!=
o
dtype
:
if
a
cc_dtype
!=
out_
dtype
:
#
Allocate accumulation buffer
#
Define strides of accumulation buffer and allocate it
sub
[
f
"lv{i}"
]
=
a
name
sub
[
"lv1"
]
=
acc_
name
sub
[
"olv"
]
=
aname
sub
[
"olv"
]
=
a
cc_
name
alloc
+=
cgen
.
make_declare
(
acc_sub
=
sub
|
{
"lv0"
:
acc_name
}
[
list
(
range
(
nnested
))
+
[
"x"
]
*
len
(
axis
)],
alloc
+=
(
[
adtype
],
cgen
.
make_declare
(
dict
(
sub
,
lv0
=
aname
),
[
acc_dims
],
[
acc_dtype
],
acc_sub
,
compute_stride_jump
=
False
)
)
alloc
+=
cgen
.
make_alloc
([
order1
],
adtype
,
sub
)
+
cgen
.
make_alloc
([
non_reduced_dims
],
acc_dtype
,
sub
)
alloc
+=
cgen
.
make_checks
(
+
cgen
.
make_checks
(
[
list
(
range
(
nnested
))
+
[
"x"
]
*
len
(
axis
)],
[
acc_dims
],
[
acc_dtype
],
acc_sub
,
compute_stride_jump
=
False
[
adtype
],
)
dict
(
sub
,
lv0
=
aname
),
)
)
identity
=
self
.
scalar_op
.
identity
identity
=
self
.
scalar_op
.
identity
if
np
.
isposinf
(
identity
):
if
np
.
isposinf
(
identity
):
if
inp
ut
.
type
.
dtype
in
(
"float32"
,
"float64"
):
if
inp
.
type
.
dtype
in
(
"float32"
,
"float64"
):
identity
=
"__builtin_inf()"
identity
=
"__builtin_inf()"
elif
inp
ut
.
type
.
dtype
.
startswith
(
"uint"
)
or
input
.
type
.
dtype
==
"bool"
:
elif
inp
.
type
.
dtype
.
startswith
(
"uint"
)
or
inp
.
type
.
dtype
==
"bool"
:
identity
=
"1"
identity
=
"1"
else
:
else
:
identity
=
"NPY_MAX_"
+
str
(
inp
ut
.
type
.
dtype
)
.
upper
()
identity
=
"NPY_MAX_"
+
str
(
inp
.
type
.
dtype
)
.
upper
()
elif
np
.
isneginf
(
identity
):
elif
np
.
isneginf
(
identity
):
if
inp
ut
.
type
.
dtype
in
(
"float32"
,
"float64"
):
if
inp
.
type
.
dtype
in
(
"float32"
,
"float64"
):
identity
=
"-__builtin_inf()"
identity
=
"-__builtin_inf()"
elif
inp
ut
.
type
.
dtype
.
startswith
(
"uint"
)
or
input
.
type
.
dtype
==
"bool"
:
elif
inp
.
type
.
dtype
.
startswith
(
"uint"
)
or
inp
.
type
.
dtype
==
"bool"
:
identity
=
"0"
identity
=
"0"
else
:
else
:
identity
=
"NPY_MIN_"
+
str
(
inp
ut
.
type
.
dtype
)
.
upper
()
identity
=
"NPY_MIN_"
+
str
(
inp
.
type
.
dtype
)
.
upper
()
elif
identity
is
None
:
elif
identity
is
None
:
raise
TypeError
(
f
"The {self.scalar_op} does not define an identity."
)
raise
TypeError
(
f
"The {self.scalar_op} does not define an identity."
)
task0_decl
=
f
"{adtype}& {aname}_i = *{aname}_iter;
\n
{aname}_i = {identity};"
initial_value
=
f
"{acc_name}_i = {identity};"
task1_decl
=
f
"{idtype}& {inames[0]}_i = *{inames[0]}_iter;
\n
"
task1_code
=
self
.
scalar_op
.
c_code
(
inner_task
=
self
.
scalar_op
.
c_code
(
Apply
(
Apply
(
self
.
scalar_op
,
self
.
scalar_op
,
[
[
...
@@ -1570,44 +1569,45 @@ class CAReduce(COp):
...
@@ -1570,44 +1569,45 @@ class CAReduce(COp):
],
],
),
),
None
,
None
,
[
f
"{a
name}_i"
,
f
"{inames[0]
}_i"
],
[
f
"{a
cc_name}_i"
,
f
"{inp_name
}_i"
],
[
f
"{aname}_i"
],
[
f
"{a
cc_
name}_i"
],
sub
,
sub
,
)
)
code1
=
f
"""
{{
{task1_decl}
{task1_code}
}}
"""
if
node
.
inputs
[
0
]
.
type
.
ndim
:
if
out
.
type
.
ndim
==
0
:
if
len
(
axis
)
==
1
:
# Simple case where everything is reduced, no need for loop ordering
all_code
=
[(
""
,
""
)]
*
nnested
+
[(
task0_decl
,
code1
),
""
]
loop
=
cgen
.
make_complete_loop_careduce
(
else
:
inp_var
=
inp_name
,
all_code
=
(
acc_var
=
acc_name
,
[(
""
,
""
)]
*
nnested
inp_dtype
=
inp_dtype
,
+
[(
task0_decl
,
""
)]
acc_dtype
=
acc_dtype
,
+
[(
""
,
""
)]
*
(
len
(
axis
)
-
2
)
initial_value
=
initial_value
,
+
[(
""
,
code1
),
""
]
inner_task
=
inner_task
,
)
fail_code
=
sub
[
"fail"
],
)
else
:
else
:
all_code
=
[
task0_decl
+
code1
]
loop
=
cgen
.
make_reordered_loop_careduce
(
loop
=
cgen
.
make_loop_careduce
(
inp_var
=
inp_name
,
[
order
,
list
(
range
(
nnested
))
+
[
"x"
]
*
len
(
axis
)],
acc_var
=
acc_name
,
[
idtype
,
adtype
],
inp_dtype
=
inp_dtype
,
all_code
,
acc_dtype
=
acc_dtype
,
sub
,
inp_ndim
=
ndim
,
)
reduction_axes
=
axis
,
initial_value
=
initial_value
,
inner_task
=
inner_task
,
)
end
=
""
if
acc_dtype
!=
out_dtype
:
if
adtype
!=
odtype
:
cast
=
dedent
(
end
=
f
"""
f
"""
PyArray_CopyInto({oname}, {aname});
PyArray_CopyInto({out_name}, {acc_name});
"""
{acc_type.c_cleanup(acc_name, sub)}
end
+=
acc_type
.
c_cleanup
(
aname
,
sub
)
"""
)
else
:
cast
=
""
return
decl
,
checks
,
alloc
,
loop
,
end
return
setup
,
alloc
,
loop
,
cast
def
c_code
(
self
,
node
,
name
,
inames
,
onames
,
sub
):
def
c_code
(
self
,
node
,
name
,
inames
,
onames
,
sub
):
code
=
"
\n
"
.
join
(
self
.
_c_all
(
node
,
name
,
inames
,
onames
,
sub
))
code
=
"
\n
"
.
join
(
self
.
_c_all
(
node
,
name
,
inames
,
onames
,
sub
))
...
@@ -1619,7 +1619,7 @@ class CAReduce(COp):
...
@@ -1619,7 +1619,7 @@ class CAReduce(COp):
def
c_code_cache_version_apply
(
self
,
node
):
def
c_code_cache_version_apply
(
self
,
node
):
# the version corresponding to the c code in this Op
# the version corresponding to the c code in this Op
version
=
[
9
]
version
=
[
10
]
# now we insert versions for the ops on which we depend...
# now we insert versions for the ops on which we depend...
scalar_node
=
Apply
(
scalar_node
=
Apply
(
...
...
pytensor/tensor/elemwise_cgen.py
浏览文件 @
e752fc3d
from
collections.abc
import
Sequence
from
textwrap
import
dedent
,
indent
from
textwrap
import
dedent
,
indent
from
pytensor.configdefaults
import
config
from
pytensor.configdefaults
import
config
def
make_declare
(
loop_orders
,
dtypes
,
sub
):
def
make_declare
(
loop_orders
,
dtypes
,
sub
,
compute_stride_jump
=
True
):
"""
"""
Produce code to declare all necessary variables.
Produce code to declare all necessary variables.
...
@@ -20,13 +21,11 @@ def make_declare(loop_orders, dtypes, sub):
...
@@ -20,13 +21,11 @@ def make_declare(loop_orders, dtypes, sub):
# the number of elements in that dimension,
# the number of elements in that dimension,
# the stride in that dimension,
# the stride in that dimension,
# and the jump from an iteration to the next
# and the jump from an iteration to the next
decl
+=
f
"""
decl
+=
f
"npy_intp {var}_n{value};
\n
ssize_t {var}_stride{value};
\n
"
npy_intp {var}_n{value};
if
compute_stride_jump
:
ssize_t {var}_stride{value};
decl
+=
f
"int {var}_jump{value}_{j};
\n
"
int {var}_jump{value}_{j};
"""
el
se
:
el
if
compute_stride_jump
:
# if the dimension is broadcasted, we only need
# if the dimension is broadcasted, we only need
# the jump (arbitrary length and stride = 0)
# the jump (arbitrary length and stride = 0)
decl
+=
f
"int {var}_jump{value}_{j};
\n
"
decl
+=
f
"int {var}_jump{value}_{j};
\n
"
...
@@ -34,7 +33,7 @@ def make_declare(loop_orders, dtypes, sub):
...
@@ -34,7 +33,7 @@ def make_declare(loop_orders, dtypes, sub):
return
decl
return
decl
def
make_checks
(
loop_orders
,
dtypes
,
sub
):
def
make_checks
(
loop_orders
,
dtypes
,
sub
,
compute_stride_jump
=
True
):
init
=
""
init
=
""
for
i
,
(
loop_order
,
dtype
)
in
enumerate
(
zip
(
loop_orders
,
dtypes
)):
for
i
,
(
loop_order
,
dtype
)
in
enumerate
(
zip
(
loop_orders
,
dtypes
)):
var
=
sub
[
f
"lv{i}"
]
var
=
sub
[
f
"lv{i}"
]
...
@@ -67,13 +66,13 @@ def make_checks(loop_orders, dtypes, sub):
...
@@ -67,13 +66,13 @@ def make_checks(loop_orders, dtypes, sub):
# Initialize the variables associated to the jth loop
# Initialize the variables associated to the jth loop
# jump = stride - adjust
# jump = stride - adjust
jump
=
f
"({var}_stride{index}) - ({adjust})"
jump
=
f
"({var}_stride{index}) - ({adjust})"
init
+=
f
"""
init
+=
f
"{var}_n{index} = PyArray_DIMS({var})[{index}];
\n
"
{var}_n{index} = PyArray_DIMS({var})[{index}];
init
+=
f
"{var}_stride{index} = PyArray_STRIDES({var})[{index}] / sizeof({dtype});
\n
"
{var}_stride{index} = PyArray_STRIDES({var})[{index}] / sizeof({dtype});
if
compute_stride_jump
:
{var}_jump{index}_{j} = {jump};
init
+=
f
"{var}_jump{index}_{j} = {jump};
\n
"
"""
adjust
=
f
"{var}_n{index}*{var}_stride{index}"
adjust
=
f
"{var}_n{index}*{var}_stride{index}"
else
:
elif
compute_stride_jump
:
jump
=
f
"-({adjust})"
jump
=
f
"-({adjust})"
init
+=
f
"{var}_jump{index}_{j} = {jump};
\n
"
init
+=
f
"{var}_jump{index}_{j} = {jump};
\n
"
adjust
=
"0"
adjust
=
"0"
...
@@ -460,72 +459,298 @@ def make_reordered_loop(
...
@@ -460,72 +459,298 @@ def make_reordered_loop(
################
################
def
make_loop_careduce
(
loop_orders
,
dtypes
,
loop_tasks
,
sub
):
def
make_complete_loop_careduce
(
inp_var
:
str
,
acc_var
:
str
,
inp_dtype
:
str
,
acc_dtype
:
str
,
initial_value
:
str
,
inner_task
:
str
,
fail_code
,
)
->
str
:
"""Generate C code for a complete reduction loop.
The generated code for a float64 input variable `inp` and accumulation variable `acc` looks like:
.. code-block:: C
{
NpyIter* iter;
NpyIter_IterNextFunc *iternext;
char** data_ptr;
npy_intp* stride_ptr,* innersize_ptr;
// Special case for empty inputs
if (PyArray_SIZE(inp) == 0) {
npy_float64 &acc_i = *(npy_float64*)(PyArray_DATA(acc));
acc_i = 0;
}else{
iter = NpyIter_New(inp,
NPY_ITER_READONLY| NPY_ITER_EXTERNAL_LOOP| NPY_ITER_REFS_OK,
NPY_KEEPORDER,
NPY_NO_CASTING,
NULL);
iternext = NpyIter_GetIterNext(iter, NULL);
if (iternext == NULL) {
NpyIter_Deallocate(iter);
{ fail }
}
data_ptr = NpyIter_GetDataPtrArray(iter);
stride_ptr = NpyIter_GetInnerStrideArray(iter);
innersize_ptr = NpyIter_GetInnerLoopSizePtr(iter);
npy_float64 acc_i;
acc_i = 0;
do {
char* data = *data_ptr;
npy_intp stride = *stride_ptr;
npy_intp count = *innersize_ptr;
while(count--) {
npy_float64 inp_i = *((npy_float64*)data);
acc_i = acc_i + inp_i;
data += stride;
}
} while(iternext(iter));
NpyIter_Deallocate(iter);
*(npy_float64*)(PyArray_DATA(acc)) = acc_i;
}
}
"""
"""
Make a nested loop over several arrays and associate specific code
return
dedent
(
to each level of nesting.
f
"""
{{
NpyIter* iter;
NpyIter_IterNextFunc *iternext;
char** data_ptr;
npy_intp* stride_ptr,* innersize_ptr;
// Special case for empty inputs
if (PyArray_SIZE({inp_var}) == 0) {{
{acc_dtype} &{acc_var}_i = *({acc_dtype}*)(PyArray_DATA({acc_var}));
{initial_value}
}}else{{
iter = NpyIter_New({inp_var},
NPY_ITER_READONLY| NPY_ITER_EXTERNAL_LOOP| NPY_ITER_REFS_OK,
NPY_KEEPORDER,
NPY_NO_CASTING,
NULL);
iternext = NpyIter_GetIterNext(iter, NULL);
if (iternext == NULL) {{
NpyIter_Deallocate(iter);
{fail_code}
}}
Parameters
data_ptr = NpyIter_GetDataPtrArray(iter);
----------
stride_ptr = NpyIter_GetInnerStrideArray(iter);
loop_orders : list of N tuples of length M
innersize_ptr = NpyIter_GetInnerLoopSizePtr(iter);
Each value of each tuple can be either the index of a dimension to
loop over or the letter 'x' which means there is no looping to be done
over that variable at that point (in other words we broadcast
over that dimension). If an entry is an integer, it will become
an alias of the entry of that rank.
loop_tasks : list of M+1 pieces of code
The ith loop_task is a pair of strings, the first
string is code to be executed before the ith loop starts, the second
one contains code to be executed just before going to the next element
of the ith dimension.
The last element if loop_tasks is a single string, containing code
to be executed at the very end.
sub: dictionary
Maps 'lv#' to a suitable variable name.
The 'lvi' variable corresponds to the ith element of loop_orders.
"""
{acc_dtype} {acc_var}_i;
{initial_value}
def
loop_over
(
preloop
,
code
,
indices
,
i
):
do {{
iterv
=
f
"ITER_{int(i)}"
char* data = *data_ptr;
update
=
""
npy_intp stride = *stride_ptr;
suitable_n
=
"1"
npy_intp count = *innersize_ptr;
for
j
,
index
in
enumerate
(
indices
):
var
=
sub
[
f
"lv{int(j)}"
]
while(count--) {{
update
+=
f
"{var}_iter += {var}_jump{index}_{i};
\n
"
{inp_dtype} {inp_var}_i = *(({inp_dtype}*)data);
if
index
!=
"x"
:
{inner_task}
suitable_n
=
f
"{var}_n{index}"
data += stride;
return
f
"""
}}
{preloop}
}} while(iternext(iter));
for (int {iterv} = {suitable_n}; {iterv}; {iterv}--) {{
{code}
NpyIter_Deallocate(iter);
{update}
*({acc_dtype}*)(PyArray_DATA({acc_var})) = {acc_var}_i;
}}
}}
}}
"""
"""
)
preloops
=
{}
for
i
,
(
loop_order
,
dtype
)
in
enumerate
(
zip
(
loop_orders
,
dtypes
)):
for
j
,
index
in
enumerate
(
loop_order
):
if
index
!=
"x"
:
preloops
.
setdefault
(
j
,
""
)
preloops
[
j
]
+=
(
f
"
%
(lv{i})s_iter = ({dtype}*)(PyArray_DATA(
%
(lv{i})s));
\n
"
)
%
sub
break
else
:
# all broadcastable
preloops
.
setdefault
(
0
,
""
)
preloops
[
0
]
+=
(
f
"
%
(lv{i})s_iter = ({dtype}*)(PyArray_DATA(
%
(lv{i})s));
\n
"
)
%
sub
if
len
(
loop_tasks
)
==
1
:
def
make_reordered_loop_careduce
(
s
=
preloops
.
get
(
0
,
""
)
inp_var
:
str
,
else
:
acc_var
:
str
,
s
=
""
inp_dtype
:
str
,
for
i
,
(
pre_task
,
task
),
indices
in
reversed
(
acc_dtype
:
str
,
list
(
zip
(
range
(
len
(
loop_tasks
)
-
1
),
loop_tasks
,
list
(
zip
(
*
loop_orders
))))
inp_ndim
:
int
,
):
reduction_axes
:
Sequence
[
int
],
s
=
loop_over
(
preloops
.
get
(
i
,
""
)
+
pre_task
,
s
+
task
,
indices
,
i
)
initial_value
:
str
,
inner_task
:
str
,
)
->
str
:
"""Generate C code for a partial reduction loop, reordering for optimal memory access of the input variable.
The generated code for a sum along the last axis of a 2D float64 input variable `inp`
in an accumulation variable `acc` looks like:
.. code-block:: C
{
// Special case for empty inputs
if (PyArray_SIZE(inp) == 0) {
acc_iter = (npy_float64*)(PyArray_DATA(acc));
int n = PyArray_SIZE(acc);
for(int i = 0; i < n; i++)
{
npy_float64 &acc_i = acc_iter[i];
acc_i = 0;
}
} else {
std::vector< std::pair<int, int> > loops(2);
std::vector< std::pair<int, int> >::iterator loops_it = loops.begin();
loops_it->first = abs(PyArray_STRIDES(inp)[0]);
loops_it->second = 0;
++loops_it;
loops_it->first = abs(PyArray_STRIDES(inp)[1]);
loops_it->second = 1;
++loops_it;
std::sort(loops.rbegin(), loops.rend());
int dim_lengths[2] = {inp_n0, inp_n1};
int inp_strides[2] = {inp_stride0, inp_stride1};
int acc_strides[2] = {acc_stride0, 0};
bool reduction_axes[2] = {0, 1};
loops_it = loops.begin();
int dim_length_0 = dim_lengths[loops_it->second];
int is_reduction_axis_0 = reduction_axes[loops_it->second];
int inp_stride_0 = inp_strides[loops_it->second];
int acc_stride_0 = acc_strides[loops_it->second];
++loops_it;
int dim_length_1 = dim_lengths[loops_it->second];
int is_reduction_axis_1 = reduction_axes[loops_it->second];
int inp_stride_1 = inp_strides[loops_it->second];
int acc_stride_1 = acc_strides[loops_it->second];
++loops_it;
inp_iter = (npy_float64*)(PyArray_DATA(inp));
acc_iter = (npy_float64*)(PyArray_DATA(acc));
for(int iter_0 = 0; iter_0<dim_length_0; iter_0++){
for(int iter_1 = 0; iter_1<dim_length_1; iter_1++){
npy_float64 &inp_i = *(inp_iter + inp_stride_1*iter_1 + inp_stride_0*iter_0);
npy_float64 &acc_i = *(acc_iter + acc_stride_1*iter_1 + acc_stride_0*iter_0);
if((!is_reduction_axis_0 || iter_0 == 0) && (!is_reduction_axis_1 || iter_1 == 0))
{
acc_i = 0;
}
{acc_i = acc_i + inp_i;}
}
}
}
s
+=
loop_tasks
[
-
1
]
"""
return
f
"{{{s}}}"
empty_case
=
dedent
(
f
"""
// Special case for empty inputs
if (PyArray_SIZE({inp_var}) == 0) {{
{acc_var}_iter = ({acc_dtype}*)(PyArray_DATA({acc_var}));
int n = PyArray_SIZE({acc_var});
for(int i = 0; i < n; i++)
{{
{acc_dtype} &{acc_var}_i = {acc_var}_iter[i];
{initial_value}
}}
}} else {{
"""
)
# The loops are ordered by (decreasing) absolute values of inp_var's strides.
# The first element of each pair is the absolute value of the stride
# The second element corresponds to the index in the initial loop order
order_loops
=
dedent
(
f
"""
std::vector< std::pair<int, int> > loops({inp_ndim});
std::vector< std::pair<int, int> >::iterator loops_it = loops.begin();
"""
)
# Fill the loop vector with the appropriate <stride, index> pairs
for
i
in
range
(
inp_ndim
):
order_loops
+=
dedent
(
f
"""
loops_it->first = abs(PyArray_STRIDES({inp_var})[{i}]);
loops_it->second = {i};
++loops_it;"""
)
# We sort in decreasing order so that the outermost loop (loop 0)
# has the largest stride, and the innermost loop has the smallest stride.
order_loops
+=
"
\n
std::sort(loops.rbegin(), loops.rend());
\n
"
# Sort shape and strides to match the new order that was computed by sorting the loop vector.
counter
=
iter
(
range
(
inp_ndim
))
unsorted_vars
=
dedent
(
f
"""
int dim_lengths[{inp_ndim}] = {{{','.join(f'{inp_var}_n{i}' for i in range(inp_ndim))}}};
int inp_strides[{inp_ndim}] = {{{','.join(f'{inp_var}_stride{i}' for i in range(inp_ndim))}}};
int acc_strides[{inp_ndim}] = {{{','.join("0" if i in reduction_axes else f'{acc_var}_stride{next(counter)}'for i in range(inp_ndim))}}};
bool reduction_axes[{inp_ndim}] = {{{', '.join("1" if i in reduction_axes else "0" for i in range(inp_ndim))}}};
\n
"""
)
sorted_vars
=
"loops_it = loops.begin();"
for
i
in
range
(
inp_ndim
):
sorted_vars
+=
dedent
(
f
"""
int dim_length_{i} = dim_lengths[loops_it->second];
int is_reduction_axis_{i} = reduction_axes[loops_it->second];
int {inp_var}_stride_{i} = inp_strides[loops_it->second];
int {acc_var}_stride_{i} = acc_strides[loops_it->second];
++loops_it;
"""
)
declare_iter
=
dedent
(
f
"""
{inp_var}_iter = ({inp_dtype}*)(PyArray_DATA({inp_var}));
{acc_var}_iter = ({acc_dtype}*)(PyArray_DATA({acc_var}));
"""
)
pointer_update
=
""
for
var
,
dtype
in
((
inp_var
,
inp_dtype
),
(
acc_var
,
acc_dtype
)):
pointer_update
+=
f
"{dtype} &{var}_i = *({var}_iter"
for
i
in
reversed
(
tuple
(
range
(
inp_ndim
))):
iter_var
=
f
"iter_{i}"
pointer_update
+=
f
" + {var}_stride_{i}*{iter_var}"
pointer_update
+=
");
\n
"
# Set initial value in first iteration of each output
# This happens on the first iteration of every reduction axis
initial_iteration
=
" && "
.
join
(
f
"(!is_reduction_axis_{i} || iter_{i} == 0)"
for
i
in
range
(
inp_ndim
)
)
set_initial_value
=
dedent
(
f
"""
if({initial_iteration})
{{
{initial_value}
}}
"""
)
# We do the pointer update, set the initial value and run the inner task in the innermost loop
loop
=
"
\n\n
"
.
join
((
pointer_update
,
set_initial_value
,
f
"{{{inner_task}}}"
))
# Create outer loops recursively
for
i
in
reversed
(
range
(
inp_ndim
)):
iter_var
=
f
"iter_{i}"
dim_length
=
f
"dim_length_{i}"
loop
=
dedent
(
f
"""
for(int {iter_var} = 0; {iter_var}<{dim_length}; {iter_var}++){{
{loop}
}}
"""
)
non_empty_case
=
"
\n
"
.
join
(
(
order_loops
,
unsorted_vars
,
sorted_vars
,
declare_iter
,
loop
)
)
code
=
"
\n
"
.
join
((
empty_case
,
non_empty_case
,
"}"
))
return
f
"{{
\n
{code}
\n
}}
\n
"
pytensor/tensor/math.py
浏览文件 @
e752fc3d
import
builtins
import
builtins
import
warnings
import
warnings
from
collections.abc
import
Sequence
from
collections.abc
import
Sequence
from
textwrap
import
dedent
from
typing
import
TYPE_CHECKING
,
Optional
from
typing
import
TYPE_CHECKING
,
Optional
import
numpy
as
np
import
numpy
as
np
...
@@ -361,12 +362,14 @@ class FixedOpCAReduce(CAReduce):
...
@@ -361,12 +362,14 @@ class FixedOpCAReduce(CAReduce):
class
NonZeroDimsCAReduce
(
FixedOpCAReduce
):
class
NonZeroDimsCAReduce
(
FixedOpCAReduce
):
def
_c_all
(
self
,
node
,
name
,
inames
,
onames
,
sub
):
def
_c_all
(
self
,
node
,
name
,
input_names
,
output_names
,
sub
):
decl
,
checks
,
alloc
,
loop
,
end
=
super
()
.
_c_all
(
node
,
name
,
inames
,
onames
,
sub
)
setup
,
alloc
,
loop
,
cast
=
super
()
.
_c_all
(
node
,
name
,
input_names
,
output_names
,
sub
)
# We add an additional check for zero-sized dimensions (This seems like
# We add an additional check for zero-sized dimensions (This seems like
# something that could be enabled in `elemwise_cgen.make_checks`.)
# something that could be enabled in `elemwise_cgen.make_checks`.)
iname
=
inames
[
0
]
[
iname
]
=
input_names
axis
=
self
.
axis
axis
=
self
.
axis
if
axis
is
None
:
if
axis
is
None
:
...
@@ -378,17 +381,19 @@ class NonZeroDimsCAReduce(FixedOpCAReduce):
...
@@ -378,17 +381,19 @@ class NonZeroDimsCAReduce(FixedOpCAReduce):
pattern_
=
str
(
pattern
)[
1
:
-
1
]
pattern_
=
str
(
pattern
)[
1
:
-
1
]
decl
+=
f
"""int tosum[]={{{pattern_}}};"""
setup
=
f
"int tosum[]={{{pattern_}}};"
+
setup
alloc
+=
f
"""
alloc
+=
dedent
(
for(int i=0;i<PyArray_NDIM({iname});i++){{
f
"""
if(PyArray_DIMS({iname})[i]==0 && tosum[i]
){{
for(int i=0;i<PyArray_NDIM({iname});i++
){{
PyErr_Format(PyExc_ValueError,
if(PyArray_DIMS({iname})[i]==0 && tosum[i]){{
"Input of CAReduce{{{node.op.scalar_op}}} has zero-size on axis
%%
d",i);
PyErr_Format(PyExc_ValueError,
{sub["fail"]}
;
"Input of CAReduce{{{node.op.scalar_op}}} has zero-size on axis
%%
d",i)
;
}}
{sub["fail"]};
}}
}}
"""
}}
return
decl
,
checks
,
alloc
,
loop
,
end
"""
)
return
setup
,
alloc
,
loop
,
cast
class
Max
(
NonZeroDimsCAReduce
):
class
Max
(
NonZeroDimsCAReduce
):
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论