提交 93dfe094 authored 作者: Frederic Bastien's avatar Frederic Bastien

Refactor fusion and make it fuse one more case.

There was duplicated code to test the maximum number of parameters to a GpuElemwise op. When fusing a GpuElemwise with new inputs, we now check that we don't exceed the maximum number of inputs for that op. In the past, we made the check afterwards, and there were cases where we could still have fused in some of the inputs.
上级 8a45c933
Trunk since last release
------
* Sparse type is now supported by the shape op and the ShapeFeature optimizer work correctly with them.
* Fuse GpuElemwise more often (in the case where there are too many inputs, such that fusing all of them would exceed the 256-byte limit on parameters to a GPU function).
Theano 0.3 (2010-11-23) Theano 0.3 (2010-11-23)
----------------------- -----------------------
......
...@@ -729,8 +729,47 @@ optdb.register('InplaceGpuBlasOpt', ...@@ -729,8 +729,47 @@ optdb.register('InplaceGpuBlasOpt',
max_use_ratio=5), max_use_ratio=5),
70.0, 'fast_run', 'inplace') 70.0, 'fast_run', 'inplace')
def max_inputs_to_GpuElemwise(node):
    """Return the maximum number of inputs this GpuElemwise Apply node can accept.

    This is needed because there is currently a hard limit of 256 bytes for
    the formal parameter list of a GPU kernel function.  We measure the
    number of bytes of parameters that are always passed to the kernel,
    then compute how many inputs fit in the remaining budget.

    :param node: an Apply node whose inputs/outputs carry a ``type.ndim``
        (a GpuElemwise candidate).
    :return: the maximum number of inputs (int).
    """
    # TODO: detect the size of a GPU pointer and of a C int for the target
    # architecture instead of hard-coding 8 bytes for both.
    int_size = 8
    ptr_size = 8
    # NOTE(review): an earlier version used 240 with the note that 16 bytes
    # are used for block and thread coordinates etc. -- confirm which limit
    # the target CUDA version actually enforces.
    argument_limit = 256
    # Mandatory parameters: the element count ...
    size_param_mandatory = int_size
    # ... the shape of the first input ...
    size_param_mandatory += int_size * node.inputs[0].type.ndim
    # ... and, per output, a data pointer plus its strides.
    size_param_mandatory += sum((ptr_size + int_size * i.type.ndim)
                                for i in node.outputs)
    nb_bytes_avail = argument_limit - size_param_mandatory
    # Each extra input costs a data pointer plus one int per dimension.
    nb_bytes_per_input = (node.inputs[0].ndim * int_size) + ptr_size
    max_nb_inputs = nb_bytes_avail // nb_bytes_per_input
    return max_nb_inputs
def split_huge_add_or_mul(node):
    """Split an add/mul GpuElemwise node that has too many inputs.

    With add and mul it can happen that a node accumulates more inputs than
    nvcc can compile into a single kernel (the GPU parameter-size limit).
    We don't want nodes in the graph that can't execute, as that breaks
    DebugMode, so we rebuild the computation as a tree of smaller nodes.

    This should not happen for other GpuElemwise ops: only the fusion
    optimization can generate an op with too many inputs, and it already
    checks for that.

    :param node: an Apply node of a GpuElemwise op.
    :return: an equivalent Apply node whose input count respects the limit.
    """
    if node.op.scalar_op in (scal.add, scal.mul):
        max_nb_inputs = max_inputs_to_GpuElemwise(node)
        # Repeatedly group the inputs into chunks that fit, until the top
        # node itself respects the limit.
        while len(node.inputs) > max_nb_inputs:
            grouped = [
                node.op(*node.inputs[pos:pos + max_nb_inputs])
                for pos in range(0, len(node.inputs), max_nb_inputs)
            ]
            node = node.op(*grouped).owner
    return node
#GpuElemwise fusion #GpuElemwise fusion
gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op(GpuElemwise) gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op(GpuElemwise, max_inputs_to_GpuElemwise)
if config.gpu.local_elemwise_fusion: if config.gpu.local_elemwise_fusion:
_logger.debug("enabling optimization fusion of gpu elemwise in fast_run") _logger.debug("enabling optimization fusion of gpu elemwise in fast_run")
compile.optdb.register('gpu_elemwise_fusion', tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion), 71.00, 'fast_run', 'fusion', 'local_elemwise_fusion') compile.optdb.register('gpu_elemwise_fusion', tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion), 71.00, 'fast_run', 'fusion', 'local_elemwise_fusion')
...@@ -775,42 +814,3 @@ def local_gpualloc(node): ...@@ -775,42 +814,3 @@ def local_gpualloc(node):
#if old_out.type != new_out.type: #if old_out.type != new_out.type:
#import pdb; pdb.set_trace() #import pdb; pdb.set_trace()
return [new_out] return [new_out]
def max_inputs_to_GpuElemwise(node):
    """
    Return the maximum number of inputs this Apply node to a GpuElemwise
    can accept.

    This is needed as there is currently a limit of 256 bytes of parameters
    for a GPU function.  This measures the number of bytes of parameters we
    put in our GPU function and computes the maximum number of inputs that
    respects the 256-byte limit.
    """
    # TODO: detect the size of a GPU pointer and of a C int for the target
    # architecture instead of hard-coding 8 bytes for both.
    int_size = 8
    ptr_size = 8
    argument_limit = 256 # it was 240, with this note: 16 bytes are used for block and thread coords etc.
    size_param_mandatory = int_size #for numels
    size_param_mandatory += int_size * node.inputs[0].type.ndim # for the shape
    # Per output: a data pointer plus one int per dimension (strides).
    size_param_mandatory += sum((ptr_size + int_size * i.type.ndim) for i in node.outputs)
    nb_bytes_avail = argument_limit-size_param_mandatory
    # Each extra input costs a data pointer plus one int per dimension.
    nb_bytes_per_inputs = (node.inputs[0].ndim*int_size)+ptr_size
    max_nb_inputs = nb_bytes_avail//nb_bytes_per_inputs
    return max_nb_inputs
def split_huge_add_or_mul(node):
    """
    For add and mul GpuElemwise nodes, it can happen that we have too many
    inputs; that will make nvcc fail to compile our current code.  We don't
    want nodes in the graph that can't execute, as that breaks DebugMode.

    This should not happen for other GpuElemwise ops, as only the fusion
    optimization can generate an op with too many inputs, and it checks
    for that.
    """
    if node.op.scalar_op in (scal.add, scal.mul):
        max_nb_inputs = max_inputs_to_GpuElemwise(node)
        # Repeatedly group inputs into chunks that fit, until the top node
        # itself respects the limit.
        while len(node.inputs)>max_nb_inputs:
            inner_op = []
            for i in range(0,len(node.inputs),max_nb_inputs):
                inner_op.append(node.op(*node.inputs[i:i+max_nb_inputs]))
            node = node.op(*inner_op).owner
    return node
...@@ -155,6 +155,27 @@ def test_print_op(): ...@@ -155,6 +155,27 @@ def test_print_op():
assert topo[3].op == cuda.host_from_gpu assert topo[3].op == cuda.host_from_gpu
f(numpy.random.random((5,5)).astype('float32')) f(numpy.random.random((5,5)).astype('float32'))
def test_huge_elemwise_fusion():
    """Test that GpuElemwise fusion works correctly.

    We check that we fuse one node with only part of its inputs when there
    are too many inputs, as fusing all of them would exceed the 256-byte
    limit on GPU kernel parameters.
    """
    shape = (3, 4, 5, 6)
    # Renamed from `vars`, which shadowed the builtin.
    vs = [tensor.tanh(tensor.ftensor4()) for x in range(10)]
    f = pfunc(vs, [vs[0] - vs[1] - vs[2] - vs[3] - vs[4] - vs[5] - vs[6]],
              mode=mode_with_gpu)
    topo = f.maker.env.toposort()
    #theano.printing.debugprint(f)
    #for i, node in enumerate(topo):
    #    print >> sys.stdout, i, node
    assert len(topo) == 10
    # Only two fused GpuElemwise nodes: the fusion had to stop before
    # absorbing all inputs.
    assert sum([isinstance(node.op, cuda.GpuElemwise) for node in topo]) == 2
    assert isinstance(topo[7].op.scalar_op, theano.scalar.basic.Composite)
    assert isinstance(topo[8].op.scalar_op, theano.scalar.basic.Composite)
    # Let DebugMode catch errors by actually executing the function.
    gen = lambda: theano._asarray(numpy.random.rand(*shape), dtype='float32')
    f(gen(), gen(), gen(), gen(), gen(), gen(), gen(), gen(), gen(), gen())
def test_elemwise_fusion(): def test_elemwise_fusion():
""" Test the the GpuElemwise fusion work correctly""" """ Test the the GpuElemwise fusion work correctly"""
shape = (3,4) shape = (3,4)
......
...@@ -2919,9 +2919,13 @@ for i in range(1,len(p64)): print i, 64[i]-p64[i-1] ...@@ -2919,9 +2919,13 @@ for i in range(1,len(p64)): print i, 64[i]-p64[i-1]
# ############### # ###############
# # Loop fusion # # # Loop fusion #
# ############### # ###############
def local_elemwise_fusion_op(OP): def local_elemwise_fusion_op(OP, max_input_fct = lambda node: 1024):
""" """
We parametrise it to make it work for Elemwise and GpuElemwise op. We parametrise it to make it work for Elemwise and GpuElemwise op.
:param OP: GpuElemwise or Elemwise class (the one that we want to fuse)
:param max_input_fct: a fct that return the maximum number of input that this elemwise can take(usefull for the GpuElemwise)
""" """
def local_fuse(node): def local_fuse(node):
""" """
...@@ -2951,16 +2955,24 @@ def local_elemwise_fusion_op(OP): ...@@ -2951,16 +2955,24 @@ def local_elemwise_fusion_op(OP):
if not isinstance(node.op, OP): if not isinstance(node.op, OP):
return False return False
nb_elemwise=0
inputs=[]#inputs of the new Elemwise op. inputs=[]#inputs of the new Elemwise op.
s_inputs = []#inputs of the new scalar op. s_inputs = []#inputs of the new scalar op.
s_g=[]#graph of scalar, what will by done in the inner loop. s_g=[]#graph of scalar, what will by done in the inner loop.
# There is a hard limit of 256 bytes for the formal argument list to a GPU kernel function.
max_nb_input = max_input_fct(node)
#print len(node.inputs),max_nb_input
new_nb_input = len(node.inputs)
for i in node.inputs: for i in node.inputs:
do_fusion = False do_fusion = False
catch = False catch = False
tmp_input=[]#used to remove duplicate input. tmp_input=[]#used to remove duplicate input.
tmp_scalar=[] tmp_scalar=[]
if i.owner and isinstance(i.owner.op, OP) and len(i.clients)==1: if ((new_nb_input+1)<=max_nb_input
and i.owner
and isinstance(i.owner.op, OP)
and len(i.clients)==1):
#if the scalar_op don't have a c implementation, we skip its fusion to allow the fusion of the other ops. #if the scalar_op don't have a c implementation, we skip its fusion to allow the fusion of the other ops.
do_fusion=True do_fusion=True
try: try:
...@@ -2988,7 +3000,7 @@ def local_elemwise_fusion_op(OP): ...@@ -2988,7 +3000,7 @@ def local_elemwise_fusion_op(OP):
if do_fusion: if do_fusion:
#we should not put duplicate input into s_inputs and inputs #we should not put duplicate input into s_inputs and inputs
nb_elemwise+=1 new_nb_input+=1
inputs.extend(tmp_input) inputs.extend(tmp_input)
s_inputs.extend(tmp_scalar) s_inputs.extend(tmp_scalar)
s_g.append(s_op) s_g.append(s_op)
...@@ -3002,7 +3014,7 @@ def local_elemwise_fusion_op(OP): ...@@ -3002,7 +3014,7 @@ def local_elemwise_fusion_op(OP):
s_g.append(s) s_g.append(s)
#if no inputs have are an elemwise, there is nothing to fuse. #if no inputs have are an elemwise, there is nothing to fuse.
if nb_elemwise==0: if new_nb_input==len(node.inputs):
# print "local_elemwise_fusion: no elemwise in inputs. Nothing to fuse." # print "local_elemwise_fusion: no elemwise in inputs. Nothing to fuse."
return False return False
...@@ -3029,22 +3041,9 @@ def local_elemwise_fusion_op(OP): ...@@ -3029,22 +3041,9 @@ def local_elemwise_fusion_op(OP):
assert len(n.outputs)==1 assert len(n.outputs)==1
assert node.outputs[0].dtype==n.outputs[0].dtype assert node.outputs[0].dtype==n.outputs[0].dtype
# There is a hard limit of 256 bytes for the formal argument list to a GPU kernel function. if len(n.inputs)>max_nb_input:
# Here, we estimate how many bytes the new Op will need, and abort if it needs too much. _logger.info('loop fusion failed because Op would exceed kernel argument limit.')
if OP != T.Elemwise: return False
argument_limit = 240 # 16 bytes are used for block and thread coords etc.
#TODO: read in from architecture to make this 4 or 8
int_size = 8
ptr_size = 8
argument_size = int_size #for numels
argument_size += int_size * inputs[0].type.ndim # for the shape
argument_size += sum((ptr_size + int_size * i.type.ndim) for i in n.inputs)
argument_size += sum((ptr_size + int_size * i.type.ndim) for i in n.outputs)
if argument_size >= argument_limit:
_logger.info('loop fusion failed because Op would exceed kernel argument limit.')
return False
# print "local_elemwise_fusion: FUSED",nb_elemwise+1,"elemwise!"
#we fuse as many that we can at the same time to make debug mode faster #we fuse as many that we can at the same time to make debug mode faster
#debug mode will be faster as it won't test all intermediate step. #debug mode will be faster as it won't test all intermediate step.
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论