提交 42ffa21a authored 作者: Pascal Lamblin's avatar Pascal Lamblin

Update param size limit, correct size computation

上级 b0886305
...@@ -42,24 +42,44 @@ def get_scal(dt): ...
def max_inputs_to_GpuElemwise(node_or_outputs):
    """
    Compute the maximum number of inputs that fit in a kernel call.

    Parameters
    ----------
    node_or_outputs
        Either an Apply-like node (anything exposing an ``.outputs``
        attribute) or a sequence of output variables.  Each output must
        expose ``output.type.ndim``.

    Returns
    -------
    int
        The maximum number of input arguments whose kernel parameters
        (pointer, offset, per-dimension strides) still fit in the
        parameter-space budget after accounting for the fixed parameters
        and the outputs.
    """
    # Duck-typed equivalent of `isinstance(node_or_outputs, Apply)`:
    # an Apply node carries its outputs in `.outputs`, a plain sequence
    # of variables does not.
    if hasattr(node_or_outputs, 'outputs'):
        outputs = node_or_outputs.outputs
    else:
        outputs = node_or_outputs
    n_out = len(outputs)
    ndim = outputs[0].type.ndim

    ptr_size = 8
    # Even with call32, the interface does not change, and shapes,
    # strides, and offset are passed as 64-bits (8 bytes)
    int_size = 8

    # we take the limit from CUDA for now (4 KB of kernel parameters)
    nb_bytes_total = 4096

    # Regardless of the number of arguments, we have:
    # - The total number of elements (int)
    # - The shape (int) on each dimension
    fixed_size = int_size + int_size * ndim

    # Each argument (input or output) has:
    # - 1 pointer (ptr)
    # - 1 offset (int)
    # - 1 stride (int) per dimension
    # Even if the tensor ends up being contiguous, code for the
    # non-contiguous case still needs to be generated.
    param_size = ptr_size + int_size + int_size * ndim

    # Remaining budget once the fixed parameters and all outputs
    # are accounted for.
    nb_bytes_for_inputs = nb_bytes_total - fixed_size - param_size * n_out

    # Maximum number of inputs that still fit.
    max_nb_inputs = nb_bytes_for_inputs // param_size

    return max_nb_inputs
......
...@@ -477,14 +477,13 @@ def test_Gpujoin_inplace(): ...@@ -477,14 +477,13 @@ def test_Gpujoin_inplace():
def test_many_arg_elemwise(): def test_many_arg_elemwise():
"""this test checks whether the + and * elemwise ops can handle extremely large numbers of # this test checks whether the + and * elemwise ops can handle
arguments on gpu # extremely large numbers of arguments on gpu
i.e., it is a test of the optimization theano/sandbox/cuda/opt.py:local_gpu_huge_add_or_mul """
rng = np.random.RandomState([1, 2, 3]) rng = np.random.RandomState([1, 2, 3])
for num_args in [25]: for num_args in [75]:
for op_to_test in [theano.tensor.add, theano.tensor.mul]: for op_to_test in [theano.tensor.add, theano.tensor.mul]:
for nb_dim in [2, 3, 4, 5]: for nb_dim in [2, 3, 4, 5, 7]:
shapes = [rng.randint(1, 5) for i in range(nb_dim)] shapes = [rng.randint(1, 5) for i in range(nb_dim)]
args = [np.cast['float32'](rng.randn(*shapes)) args = [np.cast['float32'](rng.randn(*shapes))
for arg in range(0, num_args)] for arg in range(0, num_args)]
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论