提交 eba7d742 authored 作者: Frederic Bastien's avatar Frederic Bastien

Make the splitting of oversized GpuElemwise nodes take into account the number of dimensions of…

Make the splitting of oversized GpuElemwise nodes take into account the number of dimensions of the inputs. Test this too.
上级 db238d77
...@@ -774,13 +774,25 @@ def local_gpu_huge_add_or_mul(node): ...@@ -774,13 +774,25 @@ def local_gpu_huge_add_or_mul(node):
The CUDA c compiler limits the number of arguments to 256 bytes' worth or something. The CUDA c compiler limits the number of arguments to 256 bytes' worth or something.
""" """
if isinstance(node.op, GpuElemwise) and node.op.scalar_op in (scal.add, scal.mul): if isinstance(node.op, GpuElemwise) and node.op.scalar_op in (scal.add, scal.mul):
if len(node.inputs)>10: #TODO: detect the size of a GPU pointer and of a C int.
# TODO: look up how arguments are passed to the GpuElemwise function int_size = 8
# and figure out how many arguments can fit in 256 bytes. ptr_size = 8
# this will depend on the number of dimensions in each argument.
# The current heuristic to chop at 10 prevents crashing in the argument_limit = 256 # 16 bytes are used for block and thread coords etc.
# pylearn/algorithms/tests/test_mcRBM feature extractor. size_param_mandatory = int_size #for numels
return [node.op( size_param_mandatory += int_size * node.inputs[0].type.ndim # for the shape#node.outputs[0].ndim+1+node.inputs[0].ndim+1
node.op(*node.inputs[:10]), size_param_mandatory += sum((ptr_size + int_size * i.type.ndim) for i in node.outputs)
node.op(*node.inputs[10:]))] nb_bytes_avail = argument_limit-size_param_mandatory
nb_bytes_per_inputs = (node.inputs[0].ndim*int_size)+ptr_size
max_nb_inputs = nb_bytes_avail//nb_bytes_per_inputs
#print "max_nb_inputs",max_nb_inputs
if len(node.inputs)>max_nb_inputs:
inner_op = []
#We split the inputs in a single call to this optimization.
#If this generates too many pieces, another call to this optimization
#will fix that.
for i in range(0,len(node.inputs),max_nb_inputs):
inner_op.append(node.op(*node.inputs[i:i+max_nb_inputs]))
return [node.op(*inner_op)]
...@@ -759,27 +759,25 @@ def test_many_arg_elemwise(): ...@@ -759,27 +759,25 @@ def test_many_arg_elemwise():
rng = numpy.random.RandomState( [1,2,3]) rng = numpy.random.RandomState( [1,2,3])
for num_args in [25]: for num_args in [25]:
rows = rng.randint(1,5)
cols = rng.randint(1,5)
for op_to_test in [ theano.tensor.add, theano.tensor.mul ]: for op_to_test in [ theano.tensor.add, theano.tensor.mul ]:
args = [ numpy.cast['float32'](rng.randn(rows,cols)) for arg in xrange(0,num_args) ] for nb_dim in [2,3,4,5]:
symb_args = [ theano.tensor.fmatrix() for arg in xrange(0,num_args) ] shapes = [rng.randint(1,5) for i in range(nb_dim)]
args = [ numpy.cast['float32'](rng.randn(*shapes)) for arg in xrange(0,num_args) ]
symb_args = [ theano.tensor.TensorType('float32', (False,)*nb_dim)() for arg in xrange(0,num_args) ]
outputs = [] outputs = []
for mode in [ mode_with_gpu, mode_without_gpu ]: for mode in [ mode_with_gpu, mode_without_gpu ]:
f = theano.function( symb_args, op_to_test(*symb_args), mode = mode ) f = theano.function( symb_args, op_to_test(*symb_args), mode = mode )
#theano.printing.debugprint(f) outputs.append( f( * args) )
outputs.append( f( * args) ) #assert that the test was done on the gpu.
#assert that the test was done on the gpu. if mode is mode_with_gpu:
if mode is mode_with_gpu: assert any([isinstance(node.op, cuda.GpuElemwise) for node in f.maker.env.nodes])
assert any([isinstance(node.op, cuda.GpuElemwise) for node in f.maker.env.nodes])
results_gpu, results_cpu = outputs results_gpu, results_cpu = outputs
assert numpy.allclose(results_gpu, results_cpu) assert numpy.allclose(results_gpu, results_cpu)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论