fix conflict

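Updates and fixes: local_gpua_elemwise now delegates splitting of huge add/mul nodes to split_huge_add_or_mul (new optional `op` argument); test_many_arg_elemwise covers more argument counts and asserts that no non-GPU Elemwise remains in the GPU graph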

fix conflict
06b6fcb7 · Faruk Ahmed · Faruk Ahmed · a5c029dc · 06b6fcb7 · 06b6fcb7
--- a/theano/gpuarray/opt.py
+++ b/theano/gpuarray/opt.py
@@ -751,16 +751,12 @@ def local_gpua_elemwise(op, context_name, inputs, outputs):
        gpu_output = res(*new_inputs)
        return [gpu_output]
    elif op.scalar_op in (scalar.add, scalar.mul):
-        max_nb_inputs = max_inputs_to_GpuElemwise(outputs)
+        return split_huge_add_or_mul(outputs[0].owner, res).outputs
-        if max_nb_inputs > 1:
-            while len(inputs) > max_nb_inputs:
-                inputs = inputs[:-max_nb_inputs] + [res(*inputs[-max_nb_inputs:])]
-        return res(*inputs)
    else:
        return res
-def split_huge_add_or_mul(node):
+def split_huge_add_or_mul(node, op=None):
    """
    For add and mul, it can happen that we have too much input
    That will make nvcc fail compilation of our current code.
@@ -771,16 +767,19 @@ def split_huge_add_or_mul(node):
    that can generate op with too much input and it check for that.
    """
+    if op is None:
+        op = node.op
    if node.op.scalar_op in (scalar.add, scalar.mul):
        max_nb_inputs = max_inputs_to_GpuElemwise(node)
        if max_nb_inputs <= 1 and len(node.inputs) > 1:
            return False
-        while len(node.inputs) > max_nb_inputs:
+        else:
-            inner_op = []
+            while len(node.inputs) > max_nb_inputs:
-            for i in range(0, len(node.inputs), max_nb_inputs):
+                inner_op = []
-                inner_op.append(node.op(*node.inputs[i: i + max_nb_inputs]))
+                for i in range(0, len(node.inputs), max_nb_inputs):
-            node = node.op(*inner_op).owner
+                    inner_op.append(op(*node.inputs[i: i + max_nb_inputs]))
-    return node
+                node = node.op(*inner_op).owner
+    return op(*node.inputs).owner
 gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op(
    GpuElemwise,

--- a/theano/gpuarray/tests/test_opt.py
+++ b/theano/gpuarray/tests/test_opt.py
@@ -15,7 +15,8 @@ from ..type import GpuArrayType, gpuarray_shared_constructor, get_context
 from ..basic_ops import (
    GpuAlloc, GpuAllocEmpty, GpuReshape, GpuFromHost, host_from_gpu)
 from ..blas import GpuGemm
-from ..elemwise import GpuCAReduceCuda, GpuCAReduceCPY, GpuElemwise
+from ..elemwise import (GpuCAReduceCuda, GpuCAReduceCPY, GpuElemwise,
+    Elemwise, max_inputs_to_GpuElemwise)
 from ..subtensor import GpuSubtensor
 from ..linalg import GpuCusolverSolve, cusolver_available
@@ -450,14 +451,15 @@ def test_local_gpu_elemwise():
 def test_many_arg_elemwise():
-    # this test checks whether the + and * elemwise ops can handle
+    # This test checks whether the + and * elemwise ops can handle
-    # extremely large numbers of arguments on gpu
+    # extremely large numbers of arguments on gpu.
    rng = np.random.RandomState([1, 2, 3])
-    for num_args in [75]:
+    for num_args in [32, 64, 128]:
        for op_to_test in [theano.tensor.add, theano.tensor.mul]:
-            for nb_dim in [2, 3, 4, 5, 7]:
+            for nb_dim in [2, 4, 8]:
-                shapes = [rng.randint(1, 5) for i in range(nb_dim)]
+                shapes = [rng.randint(1, int(32 / nb_dim)) for i in range(nb_dim)]
                args = [np.cast['float32'](rng.randn(*shapes))
                        for arg in range(0, num_args)]
@@ -467,30 +469,20 @@ def test_many_arg_elemwise():
                outputs = []
                for mode in [mode_with_gpu, mode_without_gpu]:
-                    # test the optijmization local_gpu_elemwise_0
+                    # test the optimization local_gpua_elemwise
                    f = theano.function(
-                        symb_args, op_to_test(*symb_args),
+                        symb_args, op_to_test(*symb_args))
-                        mode=mode.excluding("local_gpu_elemwise_1"))
                    outputs.append(f(*args))
-                    # assert that the test was done on the gpu.
-                    if mode is mode_with_gpu:
-                        assert any([isinstance(node.op, GpuElemwise)
-                                    for node in f.maker.fgraph.apply_nodes])
-                    # test the optijmization local_gpu_elemwise_1
-                    f = theano.function(
-                        symb_args,
-                        GpuFromHost(test_ctx_name)(op_to_test(*symb_args)),
-                        mode=mode.excluding("local_gpu_elemwise_0"))
-                    out = f(*args)
                    # assert that the test was done on the gpu.
                    if mode is mode_with_gpu:
-                        assert any([isinstance(node.op, GpuElemwise)
+                        nodelst = [node for node in f.maker.fgraph.apply_nodes]
-                                    for node in f.maker.fgraph.apply_nodes])
+                        assert any(isinstance(node.op, GpuElemwise)
-                    utt.assert_allclose(out, outputs[-1])
+                                   for node in nodelst)
+                        assert not any(isinstance(node.op, Elemwise)
+                                       for node in nodelst
+                                       if not isinstance(node.op, GpuElemwise))
                results_gpu, results_cpu = outputs
                utt.assert_allclose(results_gpu, results_cpu)