提交 06b6fcb7 作者: Faruk Ahmed 提交者: Faruk Ahmed

fix conflict

updates updates update fixes
上级 a5c029dc
...@@ -751,16 +751,12 @@ def local_gpua_elemwise(op, context_name, inputs, outputs): ...@@ -751,16 +751,12 @@ def local_gpua_elemwise(op, context_name, inputs, outputs):
gpu_output = res(*new_inputs) gpu_output = res(*new_inputs)
return [gpu_output] return [gpu_output]
elif op.scalar_op in (scalar.add, scalar.mul): elif op.scalar_op in (scalar.add, scalar.mul):
max_nb_inputs = max_inputs_to_GpuElemwise(outputs) return split_huge_add_or_mul(outputs[0].owner, res).outputs
if max_nb_inputs > 1:
while len(inputs) > max_nb_inputs:
inputs = inputs[:-max_nb_inputs] + [res(*inputs[-max_nb_inputs:])]
return res(*inputs)
else: else:
return res return res
def split_huge_add_or_mul(node): def split_huge_add_or_mul(node, op=None):
""" """
For add and mul, it can happen that we have too much input For add and mul, it can happen that we have too much input
That will make nvcc fail compilation of our current code. That will make nvcc fail compilation of our current code.
...@@ -771,16 +767,19 @@ def split_huge_add_or_mul(node): ...@@ -771,16 +767,19 @@ def split_huge_add_or_mul(node):
that can generate op with too much input and it check for that. that can generate op with too much input and it check for that.
""" """
if op is None:
op = node.op
if node.op.scalar_op in (scalar.add, scalar.mul): if node.op.scalar_op in (scalar.add, scalar.mul):
max_nb_inputs = max_inputs_to_GpuElemwise(node) max_nb_inputs = max_inputs_to_GpuElemwise(node)
if max_nb_inputs <= 1 and len(node.inputs) > 1: if max_nb_inputs <= 1 and len(node.inputs) > 1:
return False return False
while len(node.inputs) > max_nb_inputs: else:
inner_op = [] while len(node.inputs) > max_nb_inputs:
for i in range(0, len(node.inputs), max_nb_inputs): inner_op = []
inner_op.append(node.op(*node.inputs[i: i + max_nb_inputs])) for i in range(0, len(node.inputs), max_nb_inputs):
node = node.op(*inner_op).owner inner_op.append(op(*node.inputs[i: i + max_nb_inputs]))
return node node = node.op(*inner_op).owner
return op(*node.inputs).owner
gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op( gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op(
GpuElemwise, GpuElemwise,
......
...@@ -15,7 +15,8 @@ from ..type import GpuArrayType, gpuarray_shared_constructor, get_context ...@@ -15,7 +15,8 @@ from ..type import GpuArrayType, gpuarray_shared_constructor, get_context
from ..basic_ops import ( from ..basic_ops import (
GpuAlloc, GpuAllocEmpty, GpuReshape, GpuFromHost, host_from_gpu) GpuAlloc, GpuAllocEmpty, GpuReshape, GpuFromHost, host_from_gpu)
from ..blas import GpuGemm from ..blas import GpuGemm
from ..elemwise import GpuCAReduceCuda, GpuCAReduceCPY, GpuElemwise from ..elemwise import (GpuCAReduceCuda, GpuCAReduceCPY, GpuElemwise,
Elemwise, max_inputs_to_GpuElemwise)
from ..subtensor import GpuSubtensor from ..subtensor import GpuSubtensor
from ..linalg import GpuCusolverSolve, cusolver_available from ..linalg import GpuCusolverSolve, cusolver_available
...@@ -450,14 +451,15 @@ def test_local_gpu_elemwise(): ...@@ -450,14 +451,15 @@ def test_local_gpu_elemwise():
def test_many_arg_elemwise(): def test_many_arg_elemwise():
# this test checks whether the + and * elemwise ops can handle # This test checks whether the + and * elemwise ops can handle
# extremely large numbers of arguments on gpu # extremely large numbers of arguments on gpu.
rng = np.random.RandomState([1, 2, 3]) rng = np.random.RandomState([1, 2, 3])
for num_args in [75]: for num_args in [32, 64, 128]:
for op_to_test in [theano.tensor.add, theano.tensor.mul]: for op_to_test in [theano.tensor.add, theano.tensor.mul]:
for nb_dim in [2, 3, 4, 5, 7]: for nb_dim in [2, 4, 8]:
shapes = [rng.randint(1, 5) for i in range(nb_dim)] shapes = [rng.randint(1, int(32 / nb_dim)) for i in range(nb_dim)]
args = [np.cast['float32'](rng.randn(*shapes)) args = [np.cast['float32'](rng.randn(*shapes))
for arg in range(0, num_args)] for arg in range(0, num_args)]
...@@ -467,30 +469,20 @@ def test_many_arg_elemwise(): ...@@ -467,30 +469,20 @@ def test_many_arg_elemwise():
outputs = [] outputs = []
for mode in [mode_with_gpu, mode_without_gpu]: for mode in [mode_with_gpu, mode_without_gpu]:
# test the optijmization local_gpu_elemwise_0 # test the optimization local_gpua_elemwise
f = theano.function( f = theano.function(
symb_args, op_to_test(*symb_args), symb_args, op_to_test(*symb_args))
mode=mode.excluding("local_gpu_elemwise_1"))
outputs.append(f(*args)) outputs.append(f(*args))
# assert that the test was done on the gpu.
if mode is mode_with_gpu:
assert any([isinstance(node.op, GpuElemwise)
for node in f.maker.fgraph.apply_nodes])
# test the optijmization local_gpu_elemwise_1
f = theano.function(
symb_args,
GpuFromHost(test_ctx_name)(op_to_test(*symb_args)),
mode=mode.excluding("local_gpu_elemwise_0"))
out = f(*args)
# assert that the test was done on the gpu. # assert that the test was done on the gpu.
if mode is mode_with_gpu: if mode is mode_with_gpu:
assert any([isinstance(node.op, GpuElemwise) nodelst = [node for node in f.maker.fgraph.apply_nodes]
for node in f.maker.fgraph.apply_nodes]) assert any(isinstance(node.op, GpuElemwise)
utt.assert_allclose(out, outputs[-1]) for node in nodelst)
assert not any(isinstance(node.op, Elemwise)
for node in nodelst
if not isinstance(node.op, GpuElemwise))
results_gpu, results_cpu = outputs results_gpu, results_cpu = outputs
utt.assert_allclose(results_gpu, results_cpu) utt.assert_allclose(results_gpu, results_cpu)
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论