Merge pull request #5559 from nouiz/gpuarray_elemwise

[CRASH] Fix crash of GpuElemwise that have too many inputs

Merge pull request #5559 from nouiz/gpuarray_elemwise
cc93c290 · Frédéric Bastien · GitHub · f3844589 · a14dcfad · cc93c290
--- a/theano/gpuarray/elemwise.py
+++ b/theano/gpuarray/elemwise.py
@@ -41,6 +41,48 @@ def get_scal(dt):
    return scalar.get_scalar_type(dt)


+def max_inputs_to_GpuElemwise(node_or_outputs):
+    """
+    Compute the maximum number of inputs that fit in a kernel call.
+    """
+    if isinstance(node_or_outputs, Apply):
+        outputs = node_or_outputs.outputs
+    else:
+        outputs = node_or_outputs
+
+    n_out = len(outputs)
+    ndim = outputs[0].type.ndim
+
+    ptr_size = 8
+    # Even with call32, the interface does not change: shapes,
+    # strides, and offsets are still passed as 64-bit (8-byte) integers.
+    int_size = 8
+
+    # we take the limit from CUDA for now
+    nb_bytes_total = 4096
+
+    # Regardless of the number of arguments, we have:
+    # - The total number of elements (int)
+    # - The shape (int) on each dimension
+    fixed_size = int_size + int_size * ndim
+
+    # Each argument (input or output) has:
+    # - 1 pointer (ptr)
+    # - 1 offset (int)
+    # - 1 stride (int) per dimension
+    # Even if the tensor ends up being contiguous, code for the
+    # non-contiguous case still needs to be generated.
+    param_size = ptr_size + int_size + int_size * ndim
+
+    # Remaining for inputs
+    nb_bytes_for_inputs = nb_bytes_total - fixed_size - param_size * n_out
+
+    # Maximum number of inputs
+    max_nb_inputs = nb_bytes_for_inputs // param_size
+
+    return max_nb_inputs
+
+
 class GpuElemwise(HideC, Elemwise):
    """
    Elemwise on the GPU.
@@ -57,6 +99,9 @@ class GpuElemwise(HideC, Elemwise):
        items = str(sorted(self.inplace_pattern.items()))
        return "GpuElemwise{%s}%s<gpuarray>" % (self.scalar_op, items)

+    def max_inputs(self, node_or_outputs):
+        return max_inputs_to_GpuElemwise(node_or_outputs)
+
    def make_node(self, *inputs):
        ctx_name = infer_context_name(*inputs)
        inputs = [as_gpuarray_variable(i, ctx_name) for i in inputs]
@@ -69,6 +114,10 @@ class GpuElemwise(HideC, Elemwise):
        if len(outputs) > 1:
            raise NotImplementedError()

+        if len(inputs) > max_inputs_to_GpuElemwise(outputs):
+            raise NotImplementedError(
+                "Can not make this GpuElemwise with that much inputs")
+
        # Try to generate the kernel to catch SupportCodeErrors
        scal_ins = [get_scal(i.dtype) for i in inputs]
        fake_node = self.scalar_op.make_node(*[i() for i in scal_ins])

--- a/theano/gpuarray/opt.py
+++ b/theano/gpuarray/opt.py
@@ -63,7 +63,8 @@ from .nnet import (gpu_crossentropy_softmax_1hot_with_bias_dx,
                   gpu_softmax_with_bias, gpu_softmax)

 from .elemwise import (GpuElemwise, GpuDimShuffle, GpuCAReduceCuda,
-                       GpuCAReduceCPY, gpu_ca_reduce_cuda, gpu_erfinv, gpu_erfcinv)
+                       GpuCAReduceCPY, gpu_ca_reduce_cuda, gpu_erfinv, gpu_erfcinv,
+                       max_inputs_to_GpuElemwise)
 from .subtensor import (GpuIncSubtensor, GpuSubtensor,
                        GpuAdvancedSubtensor,
                        GpuAdvancedSubtensor1,
@@ -752,26 +753,37 @@ def local_gpua_elemwise(op, context_name, inputs, outputs):
        # cpu.
        gpu_output = res(*new_inputs)
        return [gpu_output]
+    elif op.scalar_op in (scalar.add, scalar.mul):
+        max_nb_inputs = max_inputs_to_GpuElemwise(outputs)
+        if max_nb_inputs > 1:
+            while len(inputs) > max_nb_inputs:
+                inputs = inputs[:-max_nb_inputs] + [res(*inputs[-max_nb_inputs:])]
+        return res(*inputs)
    else:
        return res


-def max_inputs_to_GpuElemwise(node):
-    ptr_size = 8
-    int_size = 4
-
-    # we take the limit from CUDA for now
-    argument_limit = 232
-    ndim = node.inputs[0].type.ndim
-    # number of elements and shape
-    size_param_mandatory = (int_size * (ndim + 1)) + \
-        (ptr_size + int_size * ndim) * len(node.outputs)
+def split_huge_add_or_mul(node):
+    """
+    For add and mul, it can happen that we have too many inputs.
+    That will make nvcc fail compilation of our current code.
+    We don't want nodes in the graph that can't execute,
+    as this breaks DebugMode.

-    nb_bytes_avail = argument_limit - size_param_mandatory
-    nb_bytes_per_input = ptr_size + ndim * int_size
-    max_nb_inputs = nb_bytes_avail // nb_bytes_per_input
+    This should not happen for other GpuElemwise ops, as only the
+    fusion can generate ops with too many inputs, and it checks for that.

-    return max_nb_inputs
+    """
+    if node.op.scalar_op in (scalar.add, scalar.mul):
+        max_nb_inputs = max_inputs_to_GpuElemwise(node)
+        if max_nb_inputs <= 1 and len(node.inputs) > 1:
+            return False
+        while len(node.inputs) > max_nb_inputs:
+            inner_op = []
+            for i in range(0, len(node.inputs), max_nb_inputs):
+                inner_op.append(node.op(*node.inputs[i: i + max_nb_inputs]))
+            node = node.op(*inner_op).owner
+    return node

 gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op(
    GpuElemwise,

--- a/theano/gpuarray/tests/test_elemwise.py
+++ b/theano/gpuarray/tests/test_elemwise.py
@@ -18,7 +18,7 @@ from ..type import GpuArrayType, get_context
 from pygpu import ndgpuarray as gpuarray


-# This is acutally a test for GpuElemwise
+# This is actually a test for GpuElemwise
 class test_gpu_Broadcast(test_elemwise.test_Broadcast):
    cop = GpuElemwise
    ctype = GpuArrayType

--- a/theano/gpuarray/tests/test_opt.py
+++ b/theano/gpuarray/tests/test_opt.py
@@ -19,7 +19,7 @@ from ..elemwise import GpuCAReduceCuda, GpuCAReduceCPY, GpuElemwise
 from ..subtensor import GpuSubtensor
 from ..linalg import GpuCusolverSolve, cusolver_available

-from .config import mode_with_gpu, test_ctx_name, SkipTest
+from .config import mode_with_gpu, mode_without_gpu, test_ctx_name, SkipTest


 def test_local_assert():
@@ -448,6 +448,51 @@ def test_local_gpu_elemwise():
    utt.assert_allclose(out[1], a_v[::2] * c_v[::2])


+def test_many_arg_elemwise():
+    # this test checks whether the + and * elemwise ops can handle
+    # extremely large numbers of arguments on gpu
+    rng = np.random.RandomState([1, 2, 3])
+
+    for num_args in [75]:
+        for op_to_test in [theano.tensor.add, theano.tensor.mul]:
+            for nb_dim in [2, 3, 4, 5, 7]:
+                shapes = [rng.randint(1, 5) for i in range(nb_dim)]
+                args = [np.cast['float32'](rng.randn(*shapes))
+                        for arg in range(0, num_args)]
+
+                symb_args = [theano.tensor.TensorType('float32',
+                                                      (False,) * nb_dim)()
+                             for arg in range(0, num_args)]
+
+                outputs = []
+                for mode in [mode_with_gpu, mode_without_gpu]:
+                    # test the optimization local_gpu_elemwise_0
+                    f = theano.function(
+                        symb_args, op_to_test(*symb_args),
+                        mode=mode.excluding("local_gpu_elemwise_1"))
+                    outputs.append(f(*args))
+                    # assert that the test was done on the gpu.
+                    if mode is mode_with_gpu:
+                        assert any([isinstance(node.op, GpuElemwise)
+                                    for node in f.maker.fgraph.apply_nodes])
+
+                    # test the optimization local_gpu_elemwise_1
+                    f = theano.function(
+                        symb_args,
+                        GpuFromHost(test_ctx_name)(op_to_test(*symb_args)),
+                        mode=mode.excluding("local_gpu_elemwise_0"))
+                    out = f(*args)
+                    # assert that the test was done on the gpu.
+                    if mode is mode_with_gpu:
+                        assert any([isinstance(node.op, GpuElemwise)
+                                    for node in f.maker.fgraph.apply_nodes])
+                    utt.assert_allclose(out, outputs[-1])
+
+                results_gpu, results_cpu = outputs
+
+                utt.assert_allclose(results_gpu, results_cpu)
+
+
 def test_local_lift_abstractconv_gpu_shape():
    prev = theano.config.on_opt_error
    try:

--- a/theano/tensor/opt.py
+++ b/theano/tensor/opt.py
@@ -7347,18 +7347,23 @@ def local_add_mul_fusion(node):
    s_op = node.op.scalar_op.__class__
    new_inp = []
    fused = False
+    nb_inputs = len(node.inputs)
+    max_inputs = float('inf')
+    if hasattr(node.op, 'max_inputs'):
+        max_inputs = node.op.max_inputs(node)
    for inp in node.inputs:
        if (inp.owner and
                isinstance(inp.owner.op, Elemwise) and
                isinstance(inp.owner.op.scalar_op, s_op) and
                # Do not duplicate the operation.
-                len(inp.clients) == 1):
+                len(inp.clients) == 1 and
+                (nb_inputs + len(inp.owner.inputs) - 1) <= max_inputs):
            new_inp.extend(inp.owner.inputs)
            fused = True
        else:
            new_inp.append(inp)

-    # We ca not compare the number of inputs as Mul and Add could have
+    # We can not compare the number of inputs as Mul and Add could have
    # 0 or 1 inputs in some corner cases.
    if fused:
        output = node.op(*new_inp)