Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
5fa5c9ba
提交
5fa5c9ba
authored
5月 07, 2025
作者:
ricardoV94
提交者:
Ricardo Vieira
5月 09, 2025
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Speedup python implementation of Blockwise
上级
51cda52b
显示空白字符变更
内嵌
并排
正在显示
4 个修改的文件
包含
195 行增加
和
72 行删除
+195
-72
op.py
pytensor/graph/op.py
+15
-2
op.py
pytensor/link/c/op.py
+10
-4
blockwise.py
pytensor/tensor/blockwise.py
+148
-65
test_blockwise.py
tests/tensor/test_blockwise.py
+22
-1
没有找到文件。
pytensor/graph/op.py
浏览文件 @
5fa5c9ba
...
@@ -502,7 +502,7 @@ class Op(MetaObject):
...
@@ -502,7 +502,7 @@ class Op(MetaObject):
self
,
self
,
node
:
Apply
,
node
:
Apply
,
storage_map
:
StorageMapType
,
storage_map
:
StorageMapType
,
compute_map
:
ComputeMapType
,
compute_map
:
ComputeMapType
|
None
,
no_recycling
:
list
[
Variable
],
no_recycling
:
list
[
Variable
],
debug
:
bool
=
False
,
debug
:
bool
=
False
,
)
->
ThunkType
:
)
->
ThunkType
:
...
@@ -513,13 +513,26 @@ class Op(MetaObject):
...
@@ -513,13 +513,26 @@ class Op(MetaObject):
"""
"""
node_input_storage
=
[
storage_map
[
r
]
for
r
in
node
.
inputs
]
node_input_storage
=
[
storage_map
[
r
]
for
r
in
node
.
inputs
]
node_output_storage
=
[
storage_map
[
r
]
for
r
in
node
.
outputs
]
node_output_storage
=
[
storage_map
[
r
]
for
r
in
node
.
outputs
]
node_compute_map
=
[
compute_map
[
r
]
for
r
in
node
.
outputs
]
if
debug
and
hasattr
(
self
,
"debug_perform"
):
if
debug
and
hasattr
(
self
,
"debug_perform"
):
p
=
node
.
op
.
debug_perform
p
=
node
.
op
.
debug_perform
else
:
else
:
p
=
node
.
op
.
perform
p
=
node
.
op
.
perform
if
compute_map
is
None
:
@is_thunk_type
def
rval
(
p
=
p
,
i
=
node_input_storage
,
o
=
node_output_storage
,
n
=
node
,
):
return
p
(
n
,
[
x
[
0
]
for
x
in
i
],
o
)
else
:
node_compute_map
=
[
compute_map
[
r
]
for
r
in
node
.
outputs
]
@is_thunk_type
@is_thunk_type
def
rval
(
def
rval
(
p
=
p
,
p
=
p
,
...
...
pytensor/link/c/op.py
浏览文件 @
5fa5c9ba
...
@@ -39,7 +39,7 @@ class COp(Op, CLinkerOp):
...
@@ -39,7 +39,7 @@ class COp(Op, CLinkerOp):
self
,
self
,
node
:
Apply
,
node
:
Apply
,
storage_map
:
StorageMapType
,
storage_map
:
StorageMapType
,
compute_map
:
ComputeMapType
,
compute_map
:
ComputeMapType
|
None
,
no_recycling
:
Collection
[
Variable
],
no_recycling
:
Collection
[
Variable
],
)
->
CThunkWrapperType
:
)
->
CThunkWrapperType
:
"""Create a thunk for a C implementation.
"""Create a thunk for a C implementation.
...
@@ -86,11 +86,17 @@ class COp(Op, CLinkerOp):
...
@@ -86,11 +86,17 @@ class COp(Op, CLinkerOp):
)
)
thunk
,
node_input_filters
,
node_output_filters
=
outputs
thunk
,
node_input_filters
,
node_output_filters
=
outputs
if
compute_map
is
None
:
rval
=
is_cthunk_wrapper_type
(
thunk
)
else
:
cm_entries
=
[
compute_map
[
o
]
for
o
in
node
.
outputs
]
@is_cthunk_wrapper_type
@is_cthunk_wrapper_type
def
rval
(
):
def
rval
(
thunk
=
thunk
,
cm_entries
=
cm_entries
):
thunk
()
thunk
()
for
o
in
node
.
output
s
:
for
entry
in
cm_entrie
s
:
compute_map
[
o
]
[
0
]
=
True
entry
[
0
]
=
True
rval
.
thunk
=
thunk
rval
.
thunk
=
thunk
rval
.
cthunk
=
thunk
.
cthunk
rval
.
cthunk
=
thunk
.
cthunk
...
...
pytensor/tensor/blockwise.py
浏览文件 @
5fa5c9ba
from
collections.abc
import
Sequence
from
collections.abc
import
Callable
,
Sequence
from
typing
import
Any
,
cast
from
typing
import
Any
,
cast
import
numpy
as
np
import
numpy
as
np
from
numpy
import
broadcast_shapes
,
empty
from
pytensor
import
config
from
pytensor
import
config
from
pytensor.compile.builders
import
OpFromGraph
from
pytensor.compile.builders
import
OpFromGraph
...
@@ -22,12 +23,111 @@ from pytensor.tensor.type import TensorType, tensor
...
@@ -22,12 +23,111 @@ from pytensor.tensor.type import TensorType, tensor
from
pytensor.tensor.utils
import
(
from
pytensor.tensor.utils
import
(
_parse_gufunc_signature
,
_parse_gufunc_signature
,
broadcast_static_dim_lengths
,
broadcast_static_dim_lengths
,
faster_broadcast_to
,
faster_ndindex
,
import_func_from_string
,
import_func_from_string
,
safe_signature
,
safe_signature
,
)
)
from
pytensor.tensor.variable
import
TensorVariable
from
pytensor.tensor.variable
import
TensorVariable
def _vectorize_node_perform(
    core_node: Apply,
    batch_bcast_patterns: Sequence[tuple[bool, ...]],
    batch_ndim: int,
    impl: str | None,
) -> Callable:
    """Creates a vectorized `perform` function for a given core node.

    Similar behavior of np.vectorize, but specialized for PyTensor Blockwise Op.

    A single thunk is built for ``core_node`` (with ``None`` passed as the
    compute map and an empty ``no_recycling`` list) and is then called once
    per batch index, reusing the same one-element storage cells.

    Parameters
    ----------
    core_node
        Node whose op provides the ``make_thunk`` that is looped over.
    batch_bcast_patterns
        One broadcastable pattern per input, covering the leading
        ``batch_ndim`` dimensions. Only consulted when there is more than
        one input.
    batch_ndim
        Number of leading dimensions of every input that are batch dimensions.
    impl
        Forwarded unchanged to ``core_node.op.make_thunk``.

    Returns
    -------
    Callable
        ``vectorized_perform(*args)``, which returns a tuple with one array
        per core output, each of shape ``batch_shape + core_output_shape``.

    Raises
    ------
    NotImplementedError
        Raised by the returned function when the batch shape yields no indices.
    ValueError
        Raised by the returned function (through
        ``_check_runtime_broadcast_core``) when several inputs are given and
        runtime broadcasting is detected.
    """
    storage_map = {var: [None] for var in core_node.inputs + core_node.outputs}
    core_thunk = core_node.op.make_thunk(core_node, storage_map, None, [], impl=impl)
    single_in = len(core_node.inputs) == 1
    core_input_storage = [storage_map[inp] for inp in core_node.inputs]
    core_output_storage = [storage_map[out] for out in core_node.outputs]
    core_storage = core_input_storage + core_output_storage

    # Everything the inner function needs is bound as a default argument so
    # that it is a fast local lookup inside the loop.
    def vectorized_perform(
        *args,
        batch_bcast_patterns=batch_bcast_patterns,
        batch_ndim=batch_ndim,
        single_in=single_in,
        core_thunk=core_thunk,
        core_input_storage=core_input_storage,
        core_output_storage=core_output_storage,
        core_storage=core_storage,
    ):
        try:
            if single_in:
                # Nothing to broadcast against with a single input
                batch_shape = args[0].shape[:batch_ndim]
            else:
                _check_runtime_broadcast_core(args, batch_bcast_patterns, batch_ndim)
                batch_shape = broadcast_shapes(
                    *(arg.shape[:batch_ndim] for arg in args)
                )
                args = list(args)
                for i, arg in enumerate(args):
                    if arg.shape[:batch_ndim] != batch_shape:
                        args[i] = faster_broadcast_to(
                            arg, batch_shape + arg.shape[batch_ndim:]
                        )

            ndindex_iterator = faster_ndindex(batch_shape)
            # Call once to get the output shapes
            try:
                # TODO: Pass core shape as input like BlockwiseWithCoreShape does?
                index0 = next(ndindex_iterator)
            except StopIteration:
                raise NotImplementedError("vectorize with zero size not implemented")
            else:
                for core_input, arg in zip(core_input_storage, args):
                    core_input[0] = np.asarray(arg[index0])
                core_thunk()
                outputs = tuple(
                    empty(
                        batch_shape + core_output[0].shape,
                        dtype=core_output[0].dtype,
                    )
                    for core_output in core_output_storage
                )
                for output, core_output in zip(outputs, core_output_storage):
                    output[index0] = core_output[0]

            # Remaining batch indices reuse the already allocated outputs
            for index in ndindex_iterator:
                for core_input, arg in zip(core_input_storage, args):
                    core_input[0] = np.asarray(arg[index])
                core_thunk()
                for output, core_output in zip(outputs, core_output_storage):
                    output[index] = core_output[0]

            return outputs
        finally:
            # Clear storage, also when the core thunk (or a check above) raised,
            # so the shared cells never keep the last inputs/outputs alive.
            for core_val in core_storage:
                core_val[0] = None

    return vectorized_perform
def
_check_runtime_broadcast_core
(
numerical_inputs
,
batch_bcast_patterns
,
batch_ndim
):
# strict=None because we are in a hot loop
# We zip together the dimension lengths of each input and their broadcast patterns
for
dim_lengths_and_bcast
in
zip
(
*
[
zip
(
input
.
shape
[:
batch_ndim
],
batch_bcast_pattern
)
for
input
,
batch_bcast_pattern
in
zip
(
numerical_inputs
,
batch_bcast_patterns
)
],
):
# If for any dimension where an entry has dim_length != 1,
# and another a dim_length of 1 and broadcastable=False, we have runtime broadcasting.
if
(
any
(
d
!=
1
for
d
,
_
in
dim_lengths_and_bcast
)
and
(
1
,
False
)
in
dim_lengths_and_bcast
):
raise
ValueError
(
"Runtime broadcasting not allowed. "
"At least one input has a distinct batch dimension length of 1, but was not marked as broadcastable.
\n
"
"If broadcasting was intended, use `specify_broadcastable` on the relevant input."
)
class
Blockwise
(
Op
):
class
Blockwise
(
Op
):
"""Generalizes a core `Op` to work with batched dimensions.
"""Generalizes a core `Op` to work with batched dimensions.
...
@@ -308,7 +408,7 @@ class Blockwise(Op):
...
@@ -308,7 +408,7 @@ class Blockwise(Op):
return
rval
return
rval
def
_create_node_gufunc
(
self
,
node
)
->
Non
e
:
def
_create_node_gufunc
(
self
,
node
:
Apply
,
impl
)
->
Callabl
e
:
"""Define (or retrieve) the node gufunc used in `perform`.
"""Define (or retrieve) the node gufunc used in `perform`.
If the Blockwise or core_op have a `gufunc_spec`, the relevant numpy or scipy gufunc is used directly.
If the Blockwise or core_op have a `gufunc_spec`, the relevant numpy or scipy gufunc is used directly.
...
@@ -316,83 +416,66 @@ class Blockwise(Op):
...
@@ -316,83 +416,66 @@ class Blockwise(Op):
The gufunc is stored in the tag of the node.
The gufunc is stored in the tag of the node.
"""
"""
gufunc_spec
=
self
.
gufunc_spec
or
getattr
(
self
.
core_op
,
"gufunc_spec"
,
None
)
batch_ndim
=
self
.
batch_ndim
(
node
)
batch_bcast_patterns
=
[
if
gufunc_spec
is
not
None
:
inp
.
type
.
broadcastable
[:
batch_ndim
]
for
inp
in
node
.
inputs
gufunc
=
import_func_from_string
(
gufunc_spec
[
0
])
]
if
gufunc
is
None
:
if
(
gufunc_spec
:
=
self
.
gufunc_spec
or
getattr
(
self
.
core_op
,
"gufunc_spec"
,
None
)
)
is
not
None
:
core_func
=
import_func_from_string
(
gufunc_spec
[
0
])
if
core_func
is
None
:
raise
ValueError
(
f
"Could not import gufunc {gufunc_spec[0]} for {self}"
)
raise
ValueError
(
f
"Could not import gufunc {gufunc_spec[0]} for {self}"
)
else
:
if
len
(
node
.
outputs
)
==
1
:
# Wrap core_op perform method in numpy vectorize
n_outs
=
len
(
self
.
outputs_sig
)
def
gufunc
(
core_node
=
self
.
_create_dummy_core_node
(
node
.
inputs
)
*
inputs
,
inner_outputs_storage
=
[[
None
]
for
_
in
range
(
n_outs
)]
batch_bcast_patterns
=
batch_bcast_patterns
,
batch_ndim
=
batch_ndim
,
def
core_func
(
*
inner_inputs
,
core_node
=
core_node
,
inner_outputs_storage
=
inner_outputs_storage
,
):
):
self
.
core_op
.
perform
(
_check_runtime_broadcast_core
(
core_node
,
inputs
,
batch_bcast_patterns
,
batch_ndim
[
np
.
asarray
(
inp
)
for
inp
in
inner_inputs
],
inner_outputs_storage
,
)
)
return
(
core_func
(
*
inputs
),)
if
n_outs
==
1
:
return
inner_outputs_storage
[
0
][
0
]
else
:
else
:
return
tuple
(
r
[
0
]
for
r
in
inner_outputs_storage
)
gufunc
=
np
.
vectorize
(
core_func
,
signature
=
self
.
signature
)
def
gufunc
(
*
inputs
,
batch_bcast_patterns
=
batch_bcast_patterns
,
batch_ndim
=
batch_ndim
,
):
_check_runtime_broadcast_core
(
inputs
,
batch_bcast_patterns
,
batch_ndim
)
return
core_func
(
*
inputs
)
else
:
core_node
=
self
.
_create_dummy_core_node
(
node
.
inputs
)
# type: ignore
gufunc
=
_vectorize_node_perform
(
core_node
,
batch_bcast_patterns
=
batch_bcast_patterns
,
batch_ndim
=
self
.
batch_ndim
(
node
),
impl
=
impl
,
)
node
.
tag
.
gufunc
=
gufunc
return
gufunc
def
_check_runtime_broadcast
(
self
,
node
,
inputs
):
def
_check_runtime_broadcast
(
self
,
node
,
inputs
):
batch_ndim
=
self
.
batch_ndim
(
node
)
batch_ndim
=
self
.
batch_ndim
(
node
)
batch_bcast
=
[
pt_inp
.
type
.
broadcastable
[:
batch_ndim
]
for
pt_inp
in
node
.
inputs
]
_check_runtime_broadcast_core
(
inputs
,
batch_bcast
,
batch_ndim
)
# strict=False because we are in a hot loop
def
prepare_node
(
self
,
node
,
storage_map
,
compute_map
,
impl
=
None
):
for
dims_and_bcast
in
zip
(
node
.
tag
.
gufunc
=
self
.
_create_node_gufunc
(
node
,
impl
=
impl
)
*
[
zip
(
input
.
shape
[:
batch_ndim
],
sinput
.
type
.
broadcastable
[:
batch_ndim
],
strict
=
False
,
)
for
input
,
sinput
in
zip
(
inputs
,
node
.
inputs
,
strict
=
False
)
],
strict
=
False
,
):
if
any
(
d
!=
1
for
d
,
_
in
dims_and_bcast
)
and
(
1
,
False
)
in
dims_and_bcast
:
raise
ValueError
(
"Runtime broadcasting not allowed. "
"At least one input has a distinct batch dimension length of 1, but was not marked as broadcastable.
\n
"
"If broadcasting was intended, use `specify_broadcastable` on the relevant input."
)
def
perform
(
self
,
node
,
inputs
,
output_storage
):
def
perform
(
self
,
node
,
inputs
,
output_storage
):
gufunc
=
getattr
(
node
.
tag
,
"gufunc"
,
None
)
try
:
if
gufunc
is
None
:
# Cache it once per node
self
.
_create_node_gufunc
(
node
)
gufunc
=
node
.
tag
.
gufunc
gufunc
=
node
.
tag
.
gufunc
except
AttributeError
:
self
.
_check_runtime_broadcast
(
node
,
inputs
)
gufunc
=
node
.
tag
.
gufunc
=
self
.
_create_node_gufunc
(
node
,
impl
=
None
)
for
out_storage
,
result
in
zip
(
output_storage
,
gufunc
(
*
inputs
)):
res
=
gufunc
(
*
inputs
)
out_storage
[
0
]
=
result
if
not
isinstance
(
res
,
tuple
):
res
=
(
res
,)
# strict=False because we are in a hot loop
for
node_out
,
out_storage
,
r
in
zip
(
node
.
outputs
,
output_storage
,
res
,
strict
=
False
):
out_dtype
=
getattr
(
node_out
,
"dtype"
,
None
)
if
out_dtype
and
out_dtype
!=
r
.
dtype
:
r
=
np
.
asarray
(
r
,
dtype
=
out_dtype
)
out_storage
[
0
]
=
r
def
__str__
(
self
):
def
__str__
(
self
):
if
self
.
name
is
None
:
if
self
.
name
is
None
:
...
...
tests/tensor/test_blockwise.py
浏览文件 @
5fa5c9ba
...
@@ -12,10 +12,11 @@ from pytensor.gradient import grad
...
@@ -12,10 +12,11 @@ from pytensor.gradient import grad
from
pytensor.graph
import
Apply
,
Op
from
pytensor.graph
import
Apply
,
Op
from
pytensor.graph.replace
import
vectorize_node
from
pytensor.graph.replace
import
vectorize_node
from
pytensor.raise_op
import
assert_op
from
pytensor.raise_op
import
assert_op
from
pytensor.tensor
import
diagonal
,
log
,
ones_like
,
scalar
,
tensor
,
vector
from
pytensor.tensor
import
diagonal
,
dmatrix
,
log
,
ones_like
,
scalar
,
tensor
,
vector
from
pytensor.tensor.blockwise
import
Blockwise
,
vectorize_node_fallback
from
pytensor.tensor.blockwise
import
Blockwise
,
vectorize_node_fallback
from
pytensor.tensor.nlinalg
import
MatrixInverse
from
pytensor.tensor.nlinalg
import
MatrixInverse
from
pytensor.tensor.rewriting.blas
import
specialize_matmul_to_batched_dot
from
pytensor.tensor.rewriting.blas
import
specialize_matmul_to_batched_dot
from
pytensor.tensor.signal
import
convolve1d
from
pytensor.tensor.slinalg
import
(
from
pytensor.tensor.slinalg
import
(
Cholesky
,
Cholesky
,
Solve
,
Solve
,
...
@@ -484,6 +485,26 @@ def test_batched_mvnormal_logp_and_dlogp(mu_batch_shape, cov_batch_shape, benchm
...
@@ -484,6 +485,26 @@ def test_batched_mvnormal_logp_and_dlogp(mu_batch_shape, cov_batch_shape, benchm
benchmark
(
fn
,
*
test_values
)
benchmark
(
fn
,
*
test_values
)
def test_small_blockwise_performance(benchmark):
    """Benchmark a compiled row-wise ``convolve1d`` on two small matrices.

    Before benchmarking, the test checks that the compiled graph's output is
    produced by a `Blockwise` op and that the result matches applying
    ``np.convolve`` to each pair of rows.
    """
    lhs = dmatrix(shape=(7, 128))
    rhs = dmatrix(shape=(7, 20))
    conv_fn = pytensor.function(
        [lhs, rhs],
        convolve1d(lhs, rhs, mode="valid"),
        trust_input=True,
    )
    output_op = conv_fn.maker.fgraph.outputs[0].owner.op
    assert isinstance(output_op, Blockwise)

    # Draw order (lhs first, then rhs) matters for reproducibility of the seed
    rng = np.random.default_rng(495)
    lhs_test = rng.normal(size=lhs.type.shape)
    rhs_test = rng.normal(size=rhs.type.shape)

    expected = [
        np.convolve(lhs_row, rhs_row, mode="valid")
        for lhs_row, rhs_row in zip(lhs_test, rhs_test)
    ]
    np.testing.assert_allclose(conv_fn(lhs_test, rhs_test), expected)

    benchmark(conv_fn, lhs_test, rhs_test)
def
test_cop_with_params
():
def
test_cop_with_params
():
matrix_assert
=
Blockwise
(
core_op
=
assert_op
,
signature
=
"(x1,x2),()->(x1,x2)"
)
matrix_assert
=
Blockwise
(
core_op
=
assert_op
,
signature
=
"(x1,x2),()->(x1,x2)"
)
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论