提交 cc93c290 authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #5559 from nouiz/gpuarray_elemwise

[CRASH] Fix crash of GpuElemwise that have too many inputs
...@@ -41,6 +41,48 @@ def get_scal(dt): ...@@ -41,6 +41,48 @@ def get_scal(dt):
return scalar.get_scalar_type(dt) return scalar.get_scalar_type(dt)
def max_inputs_to_GpuElemwise(node_or_outputs):
    """
    Return how many inputs fit in one GpuElemwise kernel call.

    The kernel arguments are budgeted against a fixed 4096-byte limit
    (the CUDA limit is used for now). The budget is spent on:

    - a fixed part: the total number of elements plus one shape entry
      per dimension;
    - one slot per argument (each output and each input): a pointer,
      an offset and one stride per dimension.

    Parameters
    ----------
    node_or_outputs
        Either an Apply node, in which case its ``outputs`` are used,
        or directly a sequence of output variables. The number of
        dimensions is read from the type of the first output.

    Returns
    -------
    int
        The number of inputs that still fit once the fixed part and
        the outputs are accounted for. The value is not clamped, so it
        can be zero or negative when the outputs alone exhaust the
        budget.
    """
    outputs = (node_or_outputs.outputs
               if isinstance(node_or_outputs, Apply)
               else node_or_outputs)
    nb_outputs = len(outputs)
    nb_dims = outputs[0].type.ndim

    pointer_bytes = 8
    # Even with call32 the interface does not change: shapes, strides
    # and offsets are still passed as 64-bit (8 bytes) integers.
    integer_bytes = 8
    # The limit is taken from CUDA for now.
    budget_bytes = 4096

    # Always present, whatever the number of arguments:
    # the total number of elements and the shape on each dimension.
    shared_bytes = integer_bytes * (1 + nb_dims)

    # Needed for every argument (input or output): a pointer, an offset
    # and one stride per dimension. Code for the non-contiguous case is
    # generated even if the tensor ends up being contiguous.
    per_argument_bytes = pointer_bytes + integer_bytes * (1 + nb_dims)

    # What remains after the outputs is divided among the inputs.
    left_for_inputs = (budget_bytes - shared_bytes -
                       per_argument_bytes * nb_outputs)
    return left_for_inputs // per_argument_bytes
class GpuElemwise(HideC, Elemwise): class GpuElemwise(HideC, Elemwise):
""" """
Elemwise on the GPU. Elemwise on the GPU.
...@@ -57,6 +99,9 @@ class GpuElemwise(HideC, Elemwise): ...@@ -57,6 +99,9 @@ class GpuElemwise(HideC, Elemwise):
items = str(sorted(self.inplace_pattern.items())) items = str(sorted(self.inplace_pattern.items()))
return "GpuElemwise{%s}%s<gpuarray>" % (self.scalar_op, items) return "GpuElemwise{%s}%s<gpuarray>" % (self.scalar_op, items)
def max_inputs(self, node_or_outputs):
    """
    Return the maximum number of inputs for this op.

    This simply forwards `node_or_outputs` unchanged to
    max_inputs_to_GpuElemwise and returns its result.
    """
    return max_inputs_to_GpuElemwise(node_or_outputs)
def make_node(self, *inputs): def make_node(self, *inputs):
ctx_name = infer_context_name(*inputs) ctx_name = infer_context_name(*inputs)
inputs = [as_gpuarray_variable(i, ctx_name) for i in inputs] inputs = [as_gpuarray_variable(i, ctx_name) for i in inputs]
...@@ -69,6 +114,10 @@ class GpuElemwise(HideC, Elemwise): ...@@ -69,6 +114,10 @@ class GpuElemwise(HideC, Elemwise):
if len(outputs) > 1: if len(outputs) > 1:
raise NotImplementedError() raise NotImplementedError()
if len(inputs) > max_inputs_to_GpuElemwise(outputs):
raise NotImplementedError(
"Can not make this GpuElemwise with that much inputs")
# Try to generate the kernel to catch SupportCodeErrors # Try to generate the kernel to catch SupportCodeErrors
scal_ins = [get_scal(i.dtype) for i in inputs] scal_ins = [get_scal(i.dtype) for i in inputs]
fake_node = self.scalar_op.make_node(*[i() for i in scal_ins]) fake_node = self.scalar_op.make_node(*[i() for i in scal_ins])
......
...@@ -63,7 +63,8 @@ from .nnet import (gpu_crossentropy_softmax_1hot_with_bias_dx, ...@@ -63,7 +63,8 @@ from .nnet import (gpu_crossentropy_softmax_1hot_with_bias_dx,
gpu_softmax_with_bias, gpu_softmax) gpu_softmax_with_bias, gpu_softmax)
from .elemwise import (GpuElemwise, GpuDimShuffle, GpuCAReduceCuda, from .elemwise import (GpuElemwise, GpuDimShuffle, GpuCAReduceCuda,
GpuCAReduceCPY, gpu_ca_reduce_cuda, gpu_erfinv, gpu_erfcinv) GpuCAReduceCPY, gpu_ca_reduce_cuda, gpu_erfinv, gpu_erfcinv,
max_inputs_to_GpuElemwise)
from .subtensor import (GpuIncSubtensor, GpuSubtensor, from .subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedSubtensor, GpuAdvancedSubtensor,
GpuAdvancedSubtensor1, GpuAdvancedSubtensor1,
...@@ -752,26 +753,37 @@ def local_gpua_elemwise(op, context_name, inputs, outputs): ...@@ -752,26 +753,37 @@ def local_gpua_elemwise(op, context_name, inputs, outputs):
# cpu. # cpu.
gpu_output = res(*new_inputs) gpu_output = res(*new_inputs)
return [gpu_output] return [gpu_output]
elif op.scalar_op in (scalar.add, scalar.mul):
max_nb_inputs = max_inputs_to_GpuElemwise(outputs)
if max_nb_inputs > 1:
while len(inputs) > max_nb_inputs:
inputs = inputs[:-max_nb_inputs] + [res(*inputs[-max_nb_inputs:])]
return res(*inputs)
else: else:
return res return res
def max_inputs_to_GpuElemwise(node): def split_huge_add_or_mul(node):
ptr_size = 8 """
int_size = 4 For add and mul, it can happen that we have too many inputs.
That will make nvcc fail compilation of our current code.
# we take the limit from CUDA for now We don't want nodes in the graph that can't execute,
argument_limit = 232 as this breaks DebugMode.
ndim = node.inputs[0].type.ndim
# number of elements and shape
size_param_mandatory = (int_size * (ndim + 1)) + \
(ptr_size + int_size * ndim) * len(node.outputs)
nb_bytes_avail = argument_limit - size_param_mandatory This should not happen for other GpuElemwise, as there is only the fusion
nb_bytes_per_input = ptr_size + ndim * int_size that can generate ops with too many inputs, and it checks for that.
max_nb_inputs = nb_bytes_avail // nb_bytes_per_input
return max_nb_inputs """
if node.op.scalar_op in (scalar.add, scalar.mul):
max_nb_inputs = max_inputs_to_GpuElemwise(node)
if max_nb_inputs <= 1 and len(node.inputs) > 1:
return False
while len(node.inputs) > max_nb_inputs:
inner_op = []
for i in range(0, len(node.inputs), max_nb_inputs):
inner_op.append(node.op(*node.inputs[i: i + max_nb_inputs]))
node = node.op(*inner_op).owner
return node
gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op( gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op(
GpuElemwise, GpuElemwise,
......
...@@ -18,7 +18,7 @@ from ..type import GpuArrayType, get_context ...@@ -18,7 +18,7 @@ from ..type import GpuArrayType, get_context
from pygpu import ndgpuarray as gpuarray from pygpu import ndgpuarray as gpuarray
# This is acutally a test for GpuElemwise # This is actually a test for GpuElemwise
class test_gpu_Broadcast(test_elemwise.test_Broadcast): class test_gpu_Broadcast(test_elemwise.test_Broadcast):
cop = GpuElemwise cop = GpuElemwise
ctype = GpuArrayType ctype = GpuArrayType
......
...@@ -19,7 +19,7 @@ from ..elemwise import GpuCAReduceCuda, GpuCAReduceCPY, GpuElemwise ...@@ -19,7 +19,7 @@ from ..elemwise import GpuCAReduceCuda, GpuCAReduceCPY, GpuElemwise
from ..subtensor import GpuSubtensor from ..subtensor import GpuSubtensor
from ..linalg import GpuCusolverSolve, cusolver_available from ..linalg import GpuCusolverSolve, cusolver_available
from .config import mode_with_gpu, test_ctx_name, SkipTest from .config import mode_with_gpu, mode_without_gpu, test_ctx_name, SkipTest
def test_local_assert(): def test_local_assert():
...@@ -448,6 +448,51 @@ def test_local_gpu_elemwise(): ...@@ -448,6 +448,51 @@ def test_local_gpu_elemwise():
utt.assert_allclose(out[1], a_v[::2] * c_v[::2]) utt.assert_allclose(out[1], a_v[::2] * c_v[::2])
def test_many_arg_elemwise():
    # This test checks whether the + and * elemwise ops can handle
    # extremely large numbers of arguments on the gpu.  Each graph is
    # compiled once per lifting optimization (the other one being
    # excluded) and the gpu results are compared with the cpu ones.
    rng = np.random.RandomState([1, 2, 3])

    for num_args in [75]:
        for op_to_test in [theano.tensor.add, theano.tensor.mul]:
            for nb_dim in [2, 3, 4, 5, 7]:
                shapes = [rng.randint(1, 5) for _ in range(nb_dim)]
                # ndarray.astype is used instead of np.cast, which was
                # removed from NumPy 2.0; the values drawn are the same.
                args = [rng.randn(*shapes).astype('float32')
                        for _ in range(num_args)]

                symb_args = [theano.tensor.TensorType('float32',
                                                      (False,) * nb_dim)()
                             for _ in range(num_args)]

                outputs = []
                for mode in [mode_with_gpu, mode_without_gpu]:
                    # test the optimization local_gpu_elemwise_0
                    f = theano.function(
                        symb_args, op_to_test(*symb_args),
                        mode=mode.excluding("local_gpu_elemwise_1"))
                    outputs.append(f(*args))
                    # assert that the test was done on the gpu.
                    if mode is mode_with_gpu:
                        assert any(isinstance(node.op, GpuElemwise)
                                   for node in f.maker.fgraph.apply_nodes)

                    # test the optimization local_gpu_elemwise_1
                    f = theano.function(
                        symb_args,
                        GpuFromHost(test_ctx_name)(op_to_test(*symb_args)),
                        mode=mode.excluding("local_gpu_elemwise_0"))
                    out = f(*args)
                    # assert that the test was done on the gpu.
                    if mode is mode_with_gpu:
                        assert any(isinstance(node.op, GpuElemwise)
                                   for node in f.maker.fgraph.apply_nodes)
                    # Both optimizations must agree for the same mode.
                    utt.assert_allclose(out, outputs[-1])

                # outputs holds one result per mode, in loop order.
                results_gpu, results_cpu = outputs

                utt.assert_allclose(results_gpu, results_cpu)
def test_local_lift_abstractconv_gpu_shape(): def test_local_lift_abstractconv_gpu_shape():
prev = theano.config.on_opt_error prev = theano.config.on_opt_error
try: try:
......
...@@ -7347,18 +7347,23 @@ def local_add_mul_fusion(node): ...@@ -7347,18 +7347,23 @@ def local_add_mul_fusion(node):
s_op = node.op.scalar_op.__class__ s_op = node.op.scalar_op.__class__
new_inp = [] new_inp = []
fused = False fused = False
nb_inputs = len(node.inputs)
max_inputs = float('inf')
if hasattr(node.op, 'max_inputs'):
max_inputs = node.op.max_inputs(node)
for inp in node.inputs: for inp in node.inputs:
if (inp.owner and if (inp.owner and
isinstance(inp.owner.op, Elemwise) and isinstance(inp.owner.op, Elemwise) and
isinstance(inp.owner.op.scalar_op, s_op) and isinstance(inp.owner.op.scalar_op, s_op) and
# Do not duplicate the operation. # Do not duplicate the operation.
len(inp.clients) == 1): len(inp.clients) == 1 and
(nb_inputs + len(inp.owner.inputs) - 1) <= max_inputs):
new_inp.extend(inp.owner.inputs) new_inp.extend(inp.owner.inputs)
fused = True fused = True
else: else:
new_inp.append(inp) new_inp.append(inp)
# We ca not compare the number of inputs as Mul and Add could have # We can not compare the number of inputs as Mul and Add could have
# 0 or 1 inputs in some corner cases. # 0 or 1 inputs in some corner cases.
if fused: if fused:
output = node.op(*new_inp) output = node.op(*new_inp)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论