提交 fe87d02b authored 作者: Frederic Bastien's avatar Frederic Bastien

Move copy/adapt max_inputs_to_GpuElemwise and split_huge_add_or_mul

上级 0dc3913f
...@@ -41,6 +41,27 @@ def get_scal(dt): ...@@ -41,6 +41,27 @@ def get_scal(dt):
return scalar.get_scalar_type(dt) return scalar.get_scalar_type(dt)
def max_inputs_to_GpuElemwise(node_or_outputs):
    """
    Return the maximum number of inputs a GpuElemwise can accept.

    Parameters
    ----------
    node_or_outputs
        Either an Apply node, in which case its ``outputs`` are used, or
        directly a sequence of output variables.  The number of dimensions
        is read from the first output.

    Returns
    -------
    int
        How many inputs fit in the argument space that remains once the
        mandatory parameters and the outputs have been accounted for.

    """
    outs = (node_or_outputs.outputs
            if isinstance(node_or_outputs, Apply)
            else node_or_outputs)
    rank = outs[0].type.ndim

    # Sizes, in bytes, used for a pointer argument and an int argument.
    pointer_bytes, int_bytes = 8, 4

    # Each array argument (input or output) is budgeted as one pointer
    # plus ``rank`` ints.
    bytes_per_array = pointer_bytes + int_bytes * rank

    # Mandatory part: number of elements and shape (``rank + 1`` ints),
    # followed by one array argument per output.
    fixed_bytes = int_bytes * (rank + 1) + bytes_per_array * len(outs)

    # we take the limit from CUDA for now
    budget = 232

    return (budget - fixed_bytes) // bytes_per_array
class GpuElemwise(HideC, Elemwise): class GpuElemwise(HideC, Elemwise):
""" """
Elemwise on the GPU. Elemwise on the GPU.
...@@ -57,6 +78,9 @@ class GpuElemwise(HideC, Elemwise): ...@@ -57,6 +78,9 @@ class GpuElemwise(HideC, Elemwise):
items = str(sorted(self.inplace_pattern.items())) items = str(sorted(self.inplace_pattern.items()))
return "GpuElemwise{%s}%s<gpuarray>" % (self.scalar_op, items) return "GpuElemwise{%s}%s<gpuarray>" % (self.scalar_op, items)
def max_inputs(self, node_or_outputs):
    """
    Return the maximum number of inputs usable for `node_or_outputs`.

    Forwards to the module-level ``max_inputs_to_GpuElemwise``; no
    instance state is read.

    """
    limit = max_inputs_to_GpuElemwise(node_or_outputs)
    return limit
def make_node(self, *inputs): def make_node(self, *inputs):
ctx_name = infer_context_name(*inputs) ctx_name = infer_context_name(*inputs)
inputs = [as_gpuarray_variable(i, ctx_name) for i in inputs] inputs = [as_gpuarray_variable(i, ctx_name) for i in inputs]
......
...@@ -63,7 +63,8 @@ from .nnet import (gpu_crossentropy_softmax_1hot_with_bias_dx, ...@@ -63,7 +63,8 @@ from .nnet import (gpu_crossentropy_softmax_1hot_with_bias_dx,
gpu_softmax_with_bias, gpu_softmax) gpu_softmax_with_bias, gpu_softmax)
from .elemwise import (GpuElemwise, GpuDimShuffle, GpuCAReduceCuda, from .elemwise import (GpuElemwise, GpuDimShuffle, GpuCAReduceCuda,
GpuCAReduceCPY, gpu_ca_reduce_cuda, gpu_erfinv, gpu_erfcinv) GpuCAReduceCPY, gpu_ca_reduce_cuda, gpu_erfinv, gpu_erfcinv,
max_inputs_to_GpuElemwise)
from .subtensor import (GpuIncSubtensor, GpuSubtensor, from .subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedSubtensor, GpuAdvancedSubtensor,
GpuAdvancedSubtensor1, GpuAdvancedSubtensor1,
...@@ -752,26 +753,38 @@ def local_gpua_elemwise(op, context_name, inputs, outputs): ...@@ -752,26 +753,38 @@ def local_gpua_elemwise(op, context_name, inputs, outputs):
# cpu. # cpu.
gpu_output = res(*new_inputs) gpu_output = res(*new_inputs)
return [gpu_output] return [gpu_output]
elif op.scalar_op in (scalar.add, scalar.mul):
max_nb_inputs = max_inputs_to_GpuElemwise(outputs)
while len(inputs) > max_nb_inputs:
inputs = inputs[:-max_nb_inputs] + [res(*inputs[-max_nb_inputs:])]
return res(*inputs)
else: else:
return res return res
def split_huge_add_or_mul(node):
    """
    Split an add/mul node that has too many inputs into nested nodes.

    For add and mul, it can happen that a node has too many inputs.
    That makes nvcc fail to compile our current code.  We don't want
    nodes in the graph that can't be executed, as this breaks DebugMode.

    This should not happen for other GpuElemwise, as only the fusion
    can generate an op with too many inputs, and it checks for that.

    Returns ``False`` when the node has several inputs but at most one
    would fit; otherwise returns the node, rebuilt if it had to be split.

    """
    # Only add and mul are split; any other node is handed back as-is.
    if node.op.scalar_op not in (scal.add, scal.mul):
        return node

    limit = max_inputs_to_GpuElemwise(node)
    if limit <= 1 and len(node.inputs) > 1:
        return False

    # Group the inputs into chunks of at most ``limit``, apply the op to
    # each chunk, then combine the partial results.  Repeat while the
    # combining node itself still has too many inputs.
    while len(node.inputs) > limit:
        partials = [node.op(*node.inputs[start: start + limit])
                    for start in xrange(0, len(node.inputs), limit)]
        node = node.op(*partials).owner
    return node
gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op( gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op(
GpuElemwise, GpuElemwise,
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论