Commit 89aac420 authored by Xavier Bouthillier, committed by GitHub

Merge pull request #5852 from Faruk-Ahmed/split_elemwise_addmul

Adapt the local_gpu_elemwise optimization of the new gpuarray back-end to avoid overflowing the number of inputs with Elemwise<add,mul>. The current optimization was already splitting the inputs, but it was not using the method split_huge_add_or_mul because of the new gpuarray lifter signature (see comment https://github.com/Theano/Theano/pull/5852#discussion_r114145523). The unit test for a large number of inputs was invalid because it was testing the old back-end (theano.sandbox.cuda). It is now adapted to the gpuarray lifter optimization function. The number of settings tested is reduced to lower the computation time, while still making sure we test at least one case with no number-of-inputs overflow and at least one case with a number-of-inputs overflow. split_huge_add_or_mul() is made more general so it can be reused if any case like Elemwise<add,mul> occurs elsewhere.
...@@ -753,36 +753,46 @@ def local_gpua_elemwise(op, context_name, inputs, outputs): ...@@ -753,36 +753,46 @@ def local_gpua_elemwise(op, context_name, inputs, outputs):
gpu_output = res(*new_inputs) gpu_output = res(*new_inputs)
return [gpu_output] return [gpu_output]
elif op.scalar_op in (scalar.add, scalar.mul): elif op.scalar_op in (scalar.add, scalar.mul):
max_nb_inputs = max_inputs_to_GpuElemwise(outputs) try:
if max_nb_inputs > 1: return [split_inputs(inputs, max_inputs_to_GpuElemwise(outputs), res)]
while len(inputs) > max_nb_inputs: except ValueError:
inputs = inputs[:-max_nb_inputs] + [res(*inputs[-max_nb_inputs:])] return False
return res(*inputs)
else: else:
return res return res
def split_inputs(inputs, max_nb_inputs, op):
    """
    Rebuild ``op(*inputs)`` as a tree of nodes, each taking at most
    ``max_nb_inputs`` inputs.

    For some ops like add and mul, a large number of inputs can make nvcc
    fail compilation of our current code. We don't want a node in the graph
    that can't execute, as this breaks DebugMode.

    This should not happen for other GpuElemwise, as only the fusion
    optimization can generate ops with too many inputs, and it checks
    for that.

    Parameters
    ----------
    inputs : list of theano variables.
        List of inputs to the node.
    max_nb_inputs : int
        Maximum number of inputs the node can handle without
        compilation failure.
    op : Theano operator instance.
        Operator that should be used to rebuild the computation graph with
        a smaller number of inputs per node.

    Raises
    ------
    ValueError
        If a split is needed (more than one input) but no node can take
        more than one input (``max_nb_inputs <= 1``), so splitting can
        never terminate.
    """
    if max_nb_inputs <= 1 and len(inputs) > 1:
        raise ValueError("Can not split nodes because inputs' dimensionality and/or"
                         " number of outputs is too large")

    # Repeatedly group inputs into chunks of max_nb_inputs, replacing each
    # chunk by one intermediate node, until a single node can take them all.
    while len(inputs) > max_nb_inputs:
        inner_ops = []
        for i in range(0, len(inputs), max_nb_inputs):
            inner_ops.append(op(*inputs[i: i + max_nb_inputs]))
        inputs = inner_ops

    return op(*inputs)
gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op( gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op(
GpuElemwise, GpuElemwise,
......
...@@ -15,7 +15,8 @@ from ..type import GpuArrayType, gpuarray_shared_constructor, get_context ...@@ -15,7 +15,8 @@ from ..type import GpuArrayType, gpuarray_shared_constructor, get_context
from ..basic_ops import ( from ..basic_ops import (
GpuAlloc, GpuAllocEmpty, GpuReshape, GpuFromHost, host_from_gpu) GpuAlloc, GpuAllocEmpty, GpuReshape, GpuFromHost, host_from_gpu)
from ..blas import GpuGemm from ..blas import GpuGemm
from ..elemwise import GpuCAReduceCuda, GpuCAReduceCPY, GpuElemwise from ..elemwise import (
GpuCAReduceCuda, GpuCAReduceCPY, GpuElemwise, Elemwise, max_inputs_to_GpuElemwise)
from ..subtensor import GpuSubtensor from ..subtensor import GpuSubtensor
from ..linalg import GpuCusolverSolve, cusolver_available, GpuCholesky from ..linalg import GpuCusolverSolve, cusolver_available, GpuCholesky
...@@ -460,13 +461,14 @@ def test_local_gpu_elemwise(): ...@@ -460,13 +461,14 @@ def test_local_gpu_elemwise():
def test_many_arg_elemwise(): def test_many_arg_elemwise():
# this test checks whether the + and * elemwise ops can handle # This test checks whether the + and * elemwise ops can handle
# extremely large numbers of arguments on gpu # extremely large numbers of arguments on gpu.
rng = np.random.RandomState([1, 2, 3])
for num_args in [75]: rng = np.random.RandomState([1, 2, 3])
nb_of_inputs_overflows = []
for num_args in [64]:
for op_to_test in [theano.tensor.add, theano.tensor.mul]: for op_to_test in [theano.tensor.add, theano.tensor.mul]:
for nb_dim in [2, 3, 4, 5, 7]: for nb_dim in [2, 8]:
shapes = [rng.randint(1, 5) for i in range(nb_dim)] shapes = [rng.randint(1, 5) for i in range(nb_dim)]
args = [np.cast['float32'](rng.randn(*shapes)) args = [np.cast['float32'](rng.randn(*shapes))
for arg in range(0, num_args)] for arg in range(0, num_args)]
...@@ -477,32 +479,30 @@ def test_many_arg_elemwise(): ...@@ -477,32 +479,30 @@ def test_many_arg_elemwise():
outputs = [] outputs = []
for mode in [mode_with_gpu, mode_without_gpu]: for mode in [mode_with_gpu, mode_without_gpu]:
# test the optijmization local_gpu_elemwise_0 # test the optimization local_gpua_elemwise
f = theano.function( output = op_to_test(*symb_args)
symb_args, op_to_test(*symb_args), f = theano.function(symb_args, output, mode=mode)
mode=mode.excluding("local_gpu_elemwise_1"))
outputs.append(f(*args)) outputs.append(f(*args))
# assert that the test was done on the gpu. # assert that the test was done on the gpu.
if mode is mode_with_gpu: if mode is mode_with_gpu:
assert any([isinstance(node.op, GpuElemwise) nb_of_inputs_overflows.append(
for node in f.maker.fgraph.apply_nodes]) max_inputs_to_GpuElemwise(output.owner) - num_args)
nodelst = [node for node in f.maker.fgraph.apply_nodes]
# test the optijmization local_gpu_elemwise_1 assert any(isinstance(node.op, GpuElemwise)
f = theano.function( for node in nodelst)
symb_args, assert not any(isinstance(node.op, Elemwise)
GpuFromHost(test_ctx_name)(op_to_test(*symb_args)), for node in nodelst
mode=mode.excluding("local_gpu_elemwise_0")) if not isinstance(node.op, GpuElemwise))
out = f(*args)
# assert that the test was done on the gpu.
if mode is mode_with_gpu:
assert any([isinstance(node.op, GpuElemwise)
for node in f.maker.fgraph.apply_nodes])
utt.assert_allclose(out, outputs[-1])
results_gpu, results_cpu = outputs results_gpu, results_cpu = outputs
utt.assert_allclose(results_gpu, results_cpu) utt.assert_allclose(results_gpu, results_cpu)
# Make sure we test at least one case with no number of inputs overflow
assert any(overflow >= 0 for overflow in nb_of_inputs_overflows)
# Make sure we test at least one case with number of inputs overflow
assert any(overflow < 0 for overflow in nb_of_inputs_overflows)
def test_not_useless_scalar_gpuelemwise(): def test_not_useless_scalar_gpuelemwise():
# We don't want to move elemwise on scalar on the GPU when the # We don't want to move elemwise on scalar on the GPU when the
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论