提交 fcbd4a34 作者: Arnaud Bergeron

Enable fusion of GpuElemwise ops.

上级 2bb9365e
......@@ -95,6 +95,19 @@ class GpuElemwise(Op):
sub=dict(fail='return;'))
res.tag.kcode = kcode
# Translate types for scalar composite ops (except complex).
support_code = """
#define npy_float64 ga_double
#define npy_float32 ga_float
#define npy_uint8 ga_ubyte
#define npy_int8 ga_byte
#define npy_uint16 ga_ushort
#define npy_int16 ga_short
#define npy_uint32 ga_uint
#define npy_int32 ga_int
#define npy_uint64 ga_ulong
#define npy_int64 ga_long
"""
try:
code = self.scalar_op.c_support_code_apply(fake_node, 'kcode')
if code:
......
......@@ -142,3 +142,27 @@ def local_gpu_elemwise(node):
return [host_from_gpu(gpu_elemwise)]
else:
return False
def max_inputs_to_GpuElemwise(node):
    """Return the maximum number of inputs a GpuElemwise kernel can take.

    The count is derived from a fixed kernel-argument byte budget: the
    bytes consumed by the mandatory parameters and by every output are
    subtracted first, and whatever remains is divided by the cost of a
    single input.

    :param node: apply node to inspect.  ``node.inputs[0].type.ndim`` gives
        the number of dimensions used for every argument, and
        ``len(node.outputs)`` gives the number of outputs.
    :return: the number of inputs that fit in the budget (an int).  The
        value is not clamped: when the mandatory parameters alone exceed
        the budget (e.g. a single output with ``ndim >= 28``) the floor
        division yields a negative number.
    """
    ptr_size = 8
    int_size = 4

    # we take the limit from CUDA for now
    argument_limit = 232
    ndim = node.inputs[0].type.ndim

    # Paid once per kernel: number of elements and shape, i.e. one int
    # plus one int per dimension.
    size_param_mandatory = int_size * (ndim + 1)
    # Paid once per output: one pointer plus one int per dimension.
    size_param_mandatory += (ptr_size + int_size * ndim) * len(node.outputs)

    nb_bytes_avail = argument_limit - size_param_mandatory
    # Each input costs the same as an output: one pointer plus one int
    # per dimension.
    nb_bytes_per_input = ptr_size + ndim * int_size

    # Floor division: only whole inputs count.
    max_nb_inputs = nb_bytes_avail // nb_bytes_per_input
    return max_nb_inputs
# Local fusion rewriter specialised for GpuElemwise; the second argument is
# the callback that bounds how many inputs a fused node may have.
gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op(
    GpuElemwise, max_inputs_to_GpuElemwise)

# Register the fusion pass in optdb under the name 'gpu_elemwise_fusion'.
# NOTE(review): 71.00 is presumably the ordering position within optdb and
# the trailing strings are the tags that select the pass -- confirm against
# the optdb.register documentation.
optdb.register(
    'gpu_elemwise_fusion',
    tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion),
    71.00,
    'fast_run', 'fusion', 'local_elemwise_fusion', 'gpu')
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论