提交 923a23b1 authored 作者: Frederic's avatar Frederic

Fix crash when we generated a GPU function with too many parameters.

This is a strange case, as we respect the 256-byte limit!
上级 a62957f5
......@@ -943,9 +943,17 @@ def get_device_type_sizes():
def max_inputs_to_GpuElemwise(node):
"""
return the maximum number of input this Apply node to an GpuElemwise can accept.
This is needed as currently their is a limit of 256 bytes of paramter for the gpu function.
This mesure the number of paramter we put in our gpu function and compute the maximum number of inputs that respect the 256 bytes limits.
return the maximum number of inputs this GpuElemwise Apply node can
accept.
This is needed as currently there is a limit of 256 bytes of
parameters for the gpu function on devices of compute capability
1.x. There is a 4k-byte limit on devices of compute capability
2.x (not used).
This measures the number of parameters we put in our gpu function
and computes the maximum number of inputs that respects the 256-byte
limit.
"""
type_sizes = get_device_type_sizes()
int_size = type_sizes['int_size']
......@@ -961,6 +969,11 @@ def max_inputs_to_GpuElemwise(node):
nb_bytes_avail = argument_limit - size_param_mandatory
nb_bytes_per_inputs = (ndim*int_size) + gpu_ptr_size
max_nb_inputs = nb_bytes_avail // nb_bytes_per_inputs
# There is a case where this algorithm doesn't work. Is this related to
# the order of parameters to the gpu function?
if node.inputs[0].type.ndim==1 and max_nb_inputs>14:
return 14
return max_nb_inputs
def split_huge_add_or_mul(node):
......
......@@ -193,6 +193,41 @@ def test_huge_elemwise_fusion():
gen = lambda : theano._asarray(numpy.random.rand(*shape), dtype='float32')
f(gen(),gen(),gen(),gen(),gen(),gen(),gen(),gen(),gen(),gen())
def gen(shape):
    """Return a float32 ndarray of the given shape, filled with uniform randoms in [0, 1)."""
    values = numpy.random.rand(*shape)
    return theano._asarray(values, dtype='float32')
max_var = 16 #excluded
for shape in [(2,),
(2,2),
(2,2,2),
(2,2,2,2),
(2,2,2,2,2), # 5d
(2,2,2,2,2,2),
# (2,2,2,2,2,2,2),
# (2,2,2,2,2,2,2,2),
# (2,2,2,1,1,1,1,2,2), # 9d
]:
vals = [cuda.shared_constructor(gen(shape)) for x in range(max_var)]
for use_tan in [True, False]:
if use_tan:
vars = [tensor.tanh(x) for x in vals]
else:
vars = vals
for nb_var in range(1, max_var):
out = reduce(lambda x, y: x+y, vars[:nb_var])
if not isinstance(out.type, CudaNdarrayType):
out = cuda.gpu_from_host(out)
f = pfunc([], [out], mode=mode_with_gpu)
topo = f.maker.env.toposort()
#print shape, nb_var, use_tan, len(topo)
assert (sum([isinstance(node.op, cuda.GpuElemwise) for node in topo]) == len(topo) or
(nb_var == 1 and use_tan == False))
assert sum([isinstance(node.op, tensor.Elemwise) for node in topo]) == 0
#let debugmode catch errors
f()
def test_elemwise_fusion():
""" Test the the GpuElemwise fusion work correctly"""
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论