generalizing split; using new version for huge add/mul

13bc34f1 · Faruk Ahmed · e17dcd35 · 13bc34f1
--- a/theano/gpuarray/opt.py
+++ b/theano/gpuarray/opt.py
@@ -751,38 +751,27 @@ def local_gpua_elemwise(op, context_name, inputs, outputs):
        gpu_output = res(*new_inputs)
        return [gpu_output]
    elif op.scalar_op in (scalar.add, scalar.mul):
-        return split_huge_add_or_mul(outputs[0].owner, res).outputs
+        try:
+            return [split_inputs(inputs, max_inputs_to_GpuElemwise(outputs), res)]
+        except ValueError:
+            return False
    else:
        return res


-def split_huge_add_or_mul(node, op=None):
-    """
-    For add and mul, it can happen that we have too much input
-    That will make nvcc fail compilation of our current code.
-    We don't want node in the graph that can't execute
-    as this break DebugMode.
+def split_inputs(inputs, max_nb_inputs, op):
+    if max_nb_inputs <= 1 and len(inputs) > 1:
+        ValueError("Can not split nodes because inputs' dimensionality and/or \
+                    number of outputs is too large")

-    This should not happen for other GpuElemwise as their is only the fusion
-    that can generate op with too much input and it check for that.
+    while len(inputs) > max_nb_inputs:
+        inner_ops = []
+        for i in range(0, len(inputs), max_nb_inputs):
+            inner_ops.append(op(*inputs[i: i + max_nb_inputs]))
+        inputs = inner_ops
+
+    return op(*inputs)

-    """
-    if op is None:
-        op = node.op
-    if node.op.scalar_op in (scalar.add, scalar.mul):
-        max_nb_inputs = max_inputs_to_GpuElemwise(node)
-        if max_nb_inputs <= 1 and len(node.inputs) > 1:
-            return False
-        else:
-            while len(node.inputs) > max_nb_inputs:
-                inner_op = []
-                for i in range(0, len(node.inputs), max_nb_inputs):
-                    inner_op.append(op(*node.inputs[i: i + max_nb_inputs]))
-                # Reuse node.op because op(*inner_op) could fail if there is
-                # still too many inputs
-                node = node.op(*inner_op).owner
-    # Apply op() to make sure the returned node is op and not node.op
-    return op(*node.inputs).owner

 gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op(
    GpuElemwise,