Make the splitting of too-large GpuElemwise nodes depend on the number of dimensions of…

Make the splitting of too-large GpuElemwise nodes depend on the number of dimensions of the inputs. Test this too.

Make the splitting of too-large GpuElemwise nodes depend on the number of dimensions of…
eba7d742 · Frederic Bastien · db238d77 · eba7d742 · eba7d742
--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -774,13 +774,25 @@ def local_gpu_huge_add_or_mul(node):
    The CUDA c compiler limits the number of arguments to 256 bytes' worth or something.
    """
    if isinstance(node.op, GpuElemwise) and node.op.scalar_op in (scal.add, scal.mul):
-        if len(node.inputs)>10:
+        #TODO: detect the size of gpu pointer and c int.
-            # TODO: look up how arguments are passed to the GpuElemwise function
+        int_size = 8
-            #   and figure out how many arguments can fit in 256 bytes.
+        ptr_size = 8
-            #   this will depend on the number of dimensions in each argument.
-            #   The current heuristic to chop at 10 prevents crashing in the
+        argument_limit = 256  # 16 bytes are used for block and thread coords etc.
-            #   pylearn/algorithms/tests/test_mcRBM feature extractor.
+        size_param_mandatory = int_size #for numels
-            return [node.op(
+        size_param_mandatory += int_size *  node.inputs[0].type.ndim # for the shape#node.outputs[0].ndim+1+node.inputs[0].ndim+1
-                    node.op(*node.inputs[:10]),
+        size_param_mandatory += sum((ptr_size + int_size * i.type.ndim) for i in node.outputs)
-                    node.op(*node.inputs[10:]))]
+        nb_bytes_avail = argument_limit-size_param_mandatory
+        nb_bytes_per_inputs = (node.inputs[0].ndim*int_size)+ptr_size
+        max_nb_inputs = nb_bytes_avail//nb_bytes_per_inputs
+        #print "max_nb_inputs",max_nb_inputs
+        if len(node.inputs)>max_nb_inputs: 
+            inner_op = []
+            #we split the input in one call to the optimization
+            #if this generates too many splits, another call to this optimization
+            #will fix that.
+            for i in range(0,len(node.inputs),max_nb_inputs):
+                inner_op.append(node.op(*node.inputs[i:i+max_nb_inputs]))
+            return [node.op(*inner_op)]
--- a/theano/sandbox/cuda/tests/test_basic_ops.py
+++ b/theano/sandbox/cuda/tests/test_basic_ops.py
@@ -759,27 +759,25 @@ def test_many_arg_elemwise():
    rng = numpy.random.RandomState( [1,2,3])
    for num_args in [25]:
-        rows = rng.randint(1,5)
-        cols = rng.randint(1,5)
        for op_to_test in [ theano.tensor.add, theano.tensor.mul ]:
-            args = [ numpy.cast['float32'](rng.randn(rows,cols)) for arg in xrange(0,num_args) ]
+            for nb_dim in [2,3,4,5]:
-            symb_args = [ theano.tensor.fmatrix() for arg in xrange(0,num_args) ]            
+                shapes = [rng.randint(1,5) for i in range(nb_dim)]
+                args = [ numpy.cast['float32'](rng.randn(*shapes)) for arg in xrange(0,num_args) ]
+                symb_args = [ theano.tensor.TensorType('float32', (False,)*nb_dim)() for arg in xrange(0,num_args) ]            
-            outputs = []
+                outputs = []
-            for mode in [ mode_with_gpu, mode_without_gpu ]:
+                for mode in [ mode_with_gpu, mode_without_gpu ]:
-                f = theano.function( symb_args, op_to_test(*symb_args), mode = mode )
+                    f = theano.function( symb_args, op_to_test(*symb_args), mode = mode )
-                #theano.printing.debugprint(f)
+                    outputs.append( f( * args) )
-                outputs.append( f( * args) )
+                    #assert that the test was done on the gpu.
-                #assert that the test was done on the gpu.
+                    if mode is mode_with_gpu:
-                if mode is mode_with_gpu:
+                        assert any([isinstance(node.op, cuda.GpuElemwise) for node in f.maker.env.nodes])
-                    assert any([isinstance(node.op, cuda.GpuElemwise) for node in f.maker.env.nodes])
-            results_gpu, results_cpu = outputs
+                results_gpu, results_cpu = outputs
-            assert numpy.allclose(results_gpu, results_cpu)
+                assert numpy.allclose(results_gpu, results_cpu)