提交 42ffa21a authored 作者: Pascal Lamblin's avatar Pascal Lamblin

Update param size limit, correct size computation

上级 b0886305
...@@ -42,24 +42,44 @@ def get_scal(dt): ...
def max_inputs_to_GpuElemwise(node_or_outputs):
    """
    Compute the maximum number of inputs that fit in a kernel call.

    Parameters
    ----------
    node_or_outputs
        Either an Apply-like node (anything exposing an ``.outputs``
        attribute) or a sequence of output variables.  Each output must
        expose ``output.type.ndim``.

    Returns
    -------
    int
        The maximum number of input arguments whose kernel parameters
        (pointer, offset, per-dimension strides) still fit in the
        parameter-space budget after accounting for the fixed parameters
        and the outputs.
    """
    # Duck-typed equivalent of `isinstance(node_or_outputs, Apply)`:
    # an Apply node carries its outputs in `.outputs`, a plain sequence
    # of variables does not.
    if hasattr(node_or_outputs, 'outputs'):
        outputs = node_or_outputs.outputs
    else:
        outputs = node_or_outputs
    n_out = len(outputs)
    ndim = outputs[0].type.ndim

    ptr_size = 8
    # Even with call32, the interface does not change, and shapes,
    # strides, and offset are passed as 64-bits (8 bytes)
    int_size = 8

    # we take the limit from CUDA for now (4 KB of kernel parameters)
    nb_bytes_total = 4096

    # Regardless of the number of arguments, we have:
    # - The total number of elements (int)
    # - The shape (int) on each dimension
    fixed_size = int_size + int_size * ndim

    # Each argument (input or output) has:
    # - 1 pointer (ptr)
    # - 1 offset (int)
    # - 1 stride (int) per dimension
    # Even if the tensor ends up being contiguous, code for the
    # non-contiguous case still needs to be generated.
    param_size = ptr_size + int_size + int_size * ndim

    # Remaining budget once the fixed parameters and all outputs
    # are accounted for.
    nb_bytes_for_inputs = nb_bytes_total - fixed_size - param_size * n_out

    # Maximum number of inputs that still fit.
    max_nb_inputs = nb_bytes_for_inputs // param_size

    return max_nb_inputs
......
...@@ -477,14 +477,13 @@ def test_Gpujoin_inplace(): ...@@ -477,14 +477,13 @@ def test_Gpujoin_inplace():
def test_many_arg_elemwise(): def test_many_arg_elemwise():
"""this test checks whether the + and * elemwise ops can handle extremely large numbers of # this test checks whether the + and * elemwise ops can handle
arguments on gpu # extremely large numbers of arguments on gpu
i.e., it is a test of the optimization theano/sandbox/cuda/opt.py:local_gpu_huge_add_or_mul """
rng = np.random.RandomState([1, 2, 3]) rng = np.random.RandomState([1, 2, 3])
for num_args in [25]: for num_args in [75]:
for op_to_test in [theano.tensor.add, theano.tensor.mul]: for op_to_test in [theano.tensor.add, theano.tensor.mul]:
for nb_dim in [2, 3, 4, 5]: for nb_dim in [2, 3, 4, 5, 7]:
shapes = [rng.randint(1, 5) for i in range(nb_dim)] shapes = [rng.randint(1, 5) for i in range(nb_dim)]
args = [np.cast['float32'](rng.randn(*shapes)) args = [np.cast['float32'](rng.randn(*shapes))
for arg in range(0, num_args)] for arg in range(0, num_args)]
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论