Commit 89aac420 authored by Xavier Bouthillier, committed by GitHub

Merge pull request #5852 from Faruk-Ahmed/split_elemwise_addmul

Adapt the local_gpu_elemwise optimization of the new gpuarray back-end to avoid overflowing the number of inputs with Elemwise<add,mul>. The current optimization was already splitting the inputs, but it was not using the method split_huge_add_or_mul because of the new gpuarray lifter signature (see comment https://github.com/Theano/Theano/pull/5852#discussion_r114145523). The unit test for a large number of inputs was invalid because it was testing the old back-end (theano.sandbox.cuda). It is now adapted to the gpuarray lifter optimization function. The number of settings tested is reduced to lower the computation time, while still making sure we test at least one case with no number-of-inputs overflow and at least one case with a number-of-inputs overflow. split_huge_add_or_mul() is made more general so it can be reused if any case like Elemwise<add,mul> occurs elsewhere.
...@@ -753,36 +753,46 @@ def local_gpua_elemwise(op, context_name, inputs, outputs): ...@@ -753,36 +753,46 @@ def local_gpua_elemwise(op, context_name, inputs, outputs):
gpu_output = res(*new_inputs) gpu_output = res(*new_inputs)
return [gpu_output] return [gpu_output]
elif op.scalar_op in (scalar.add, scalar.mul): elif op.scalar_op in (scalar.add, scalar.mul):
max_nb_inputs = max_inputs_to_GpuElemwise(outputs) try:
if max_nb_inputs > 1: return [split_inputs(inputs, max_inputs_to_GpuElemwise(outputs), res)]
while len(inputs) > max_nb_inputs: except ValueError:
inputs = inputs[:-max_nb_inputs] + [res(*inputs[-max_nb_inputs:])] return False
return res(*inputs)
else: else:
return res return res
def split_inputs(inputs, max_nb_inputs, op):
    """
    Rebuild ``op(*inputs)`` as a tree of nodes, each taking at most
    ``max_nb_inputs`` inputs.

    For some ops like add and mul, a large number of inputs can make nvcc
    fail compilation of our current code. We don't want a node in the graph
    that can't execute, as this breaks DebugMode.

    This should not happen for other GpuElemwise, as only the fusion
    optimization can generate ops with too many inputs, and it checks
    for that.

    Parameters
    ----------
    inputs : list of theano variables.
        List of inputs to the node.
    max_nb_inputs : int
        Maximum number of inputs the node can handle without
        compilation failure.
    op : Theano operator instance.
        Operator that should be used to rebuild the computation graph with
        a smaller number of inputs per node.

    Raises
    ------
    ValueError
        If a split is needed (more than one input) but no node can take
        more than one input (``max_nb_inputs <= 1``), so splitting can
        never terminate.
    """
    if max_nb_inputs <= 1 and len(inputs) > 1:
        raise ValueError("Can not split nodes because inputs' dimensionality and/or"
                         " number of outputs is too large")

    # Repeatedly group inputs into chunks of max_nb_inputs, replacing each
    # chunk by one intermediate node, until a single node can take them all.
    while len(inputs) > max_nb_inputs:
        inner_ops = []
        for i in range(0, len(inputs), max_nb_inputs):
            inner_ops.append(op(*inputs[i: i + max_nb_inputs]))
        inputs = inner_ops

    return op(*inputs)
gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op( gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op(
GpuElemwise, GpuElemwise,
......
...@@ -15,7 +15,8 @@ from ..type import GpuArrayType, gpuarray_shared_constructor, get_context ...@@ -15,7 +15,8 @@ from ..type import GpuArrayType, gpuarray_shared_constructor, get_context
from ..basic_ops import ( from ..basic_ops import (
GpuAlloc, GpuAllocEmpty, GpuReshape, GpuFromHost, host_from_gpu) GpuAlloc, GpuAllocEmpty, GpuReshape, GpuFromHost, host_from_gpu)
from ..blas import GpuGemm from ..blas import GpuGemm
from ..elemwise import GpuCAReduceCuda, GpuCAReduceCPY, GpuElemwise from ..elemwise import (
GpuCAReduceCuda, GpuCAReduceCPY, GpuElemwise, Elemwise, max_inputs_to_GpuElemwise)
from ..subtensor import GpuSubtensor from ..subtensor import GpuSubtensor
from ..linalg import GpuCusolverSolve, cusolver_available, GpuCholesky from ..linalg import GpuCusolverSolve, cusolver_available, GpuCholesky
...@@ -460,13 +461,14 @@ def test_local_gpu_elemwise(): ...@@ -460,13 +461,14 @@ def test_local_gpu_elemwise():
def test_many_arg_elemwise(): def test_many_arg_elemwise():
# this test checks whether the + and * elemwise ops can handle # This test checks whether the + and * elemwise ops can handle
# extremely large numbers of arguments on gpu # extremely large numbers of arguments on gpu.
rng = np.random.RandomState([1, 2, 3])
for num_args in [75]: rng = np.random.RandomState([1, 2, 3])
nb_of_inputs_overflows = []
for num_args in [64]:
for op_to_test in [theano.tensor.add, theano.tensor.mul]: for op_to_test in [theano.tensor.add, theano.tensor.mul]:
for nb_dim in [2, 3, 4, 5, 7]: for nb_dim in [2, 8]:
shapes = [rng.randint(1, 5) for i in range(nb_dim)] shapes = [rng.randint(1, 5) for i in range(nb_dim)]
args = [np.cast['float32'](rng.randn(*shapes)) args = [np.cast['float32'](rng.randn(*shapes))
for arg in range(0, num_args)] for arg in range(0, num_args)]
...@@ -477,32 +479,30 @@ def test_many_arg_elemwise(): ...@@ -477,32 +479,30 @@ def test_many_arg_elemwise():
outputs = [] outputs = []
for mode in [mode_with_gpu, mode_without_gpu]: for mode in [mode_with_gpu, mode_without_gpu]:
# test the optijmization local_gpu_elemwise_0 # test the optimization local_gpua_elemwise
f = theano.function( output = op_to_test(*symb_args)
symb_args, op_to_test(*symb_args), f = theano.function(symb_args, output, mode=mode)
mode=mode.excluding("local_gpu_elemwise_1"))
outputs.append(f(*args)) outputs.append(f(*args))
# assert that the test was done on the gpu. # assert that the test was done on the gpu.
if mode is mode_with_gpu: if mode is mode_with_gpu:
assert any([isinstance(node.op, GpuElemwise) nb_of_inputs_overflows.append(
for node in f.maker.fgraph.apply_nodes]) max_inputs_to_GpuElemwise(output.owner) - num_args)
nodelst = [node for node in f.maker.fgraph.apply_nodes]
# test the optijmization local_gpu_elemwise_1 assert any(isinstance(node.op, GpuElemwise)
f = theano.function( for node in nodelst)
symb_args, assert not any(isinstance(node.op, Elemwise)
GpuFromHost(test_ctx_name)(op_to_test(*symb_args)), for node in nodelst
mode=mode.excluding("local_gpu_elemwise_0")) if not isinstance(node.op, GpuElemwise))
out = f(*args)
# assert that the test was done on the gpu.
if mode is mode_with_gpu:
assert any([isinstance(node.op, GpuElemwise)
for node in f.maker.fgraph.apply_nodes])
utt.assert_allclose(out, outputs[-1])
results_gpu, results_cpu = outputs results_gpu, results_cpu = outputs
utt.assert_allclose(results_gpu, results_cpu) utt.assert_allclose(results_gpu, results_cpu)
# Make sure we test at least one case with no number of inputs overflow
assert any(overflow >= 0 for overflow in nb_of_inputs_overflows)
# Make sure we test at least one case with number of inputs overflow
assert any(overflow < 0 for overflow in nb_of_inputs_overflows)
def test_not_useless_scalar_gpuelemwise(): def test_not_useless_scalar_gpuelemwise():
# We don't want to move elemwise on scalar on the GPU when the # We don't want to move elemwise on scalar on the GPU when the
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论