Adjusted the limits of loop_fusion to be suitable for 64-bit architectures

d17d67bd · James Bergstra · 722802b5 · d17d67bd
--- a/theano/tensor/opt.py
+++ b/theano/tensor/opt.py
@@ -1927,16 +1927,16 @@ def local_elemwise_fusion(node):
    # There is a hard limit of 256 bytes for the formal argument list to a GPU kernel function.
    # Here, we estimate how many bytes the new Op will need, and abort if it needs too much.
    if True:
-        argument_limit = 200  # 256 didn't work, but a lower number did... so something funny
-        # is going on 
-        int_size = 4
-        ptr_size = 4
-        argument_size = 4 #for numels
+        argument_limit = 240  # 16 bytes are used for block and thread coords etc.
+        # TODO: read the int and pointer sizes from the target architecture (4 or 8 bytes) instead of hard-coding 8
+        int_size = 8
+        ptr_size = 8
+        argument_size = int_size #for numels
        argument_size += int_size *  inputs[0].type.ndim # for the shape
        argument_size += sum((ptr_size + int_size * i.type.ndim) for i in n.inputs)
        argument_size += sum((ptr_size + int_size * i.type.ndim) for i in n.outputs)
        if argument_size >= argument_limit:
-            _logger.warning('loop fusion failed because Op would exceed kernel argument limit.')
+            _logger.info('loop fusion failed because Op would exceed kernel argument limit.')
            return False

 #    print "local_elemwise_fusion: FUSED",nb_elemwise+1,"elemwise!"