提交 06b6fcb7 authored 作者: Faruk Ahmed's avatar Faruk Ahmed 提交者: Faruk Ahmed

fix conflict

updates updates update fixes
上级 a5c029dc
......@@ -751,16 +751,12 @@ def local_gpua_elemwise(op, context_name, inputs, outputs):
gpu_output = res(*new_inputs)
return [gpu_output]
elif op.scalar_op in (scalar.add, scalar.mul):
max_nb_inputs = max_inputs_to_GpuElemwise(outputs)
if max_nb_inputs > 1:
while len(inputs) > max_nb_inputs:
inputs = inputs[:-max_nb_inputs] + [res(*inputs[-max_nb_inputs:])]
return res(*inputs)
return split_huge_add_or_mul(outputs[0].owner, res).outputs
else:
return res
def split_huge_add_or_mul(node):
def split_huge_add_or_mul(node, op=None):
"""
For add and mul, it can happen that we have too many inputs.
That will make nvcc fail compilation of our current code.
......@@ -771,16 +767,19 @@ def split_huge_add_or_mul(node):
that can generate ops with too many inputs, and it checks for that.
"""
if op is None:
op = node.op
if node.op.scalar_op in (scalar.add, scalar.mul):
max_nb_inputs = max_inputs_to_GpuElemwise(node)
if max_nb_inputs <= 1 and len(node.inputs) > 1:
return False
while len(node.inputs) > max_nb_inputs:
inner_op = []
for i in range(0, len(node.inputs), max_nb_inputs):
inner_op.append(node.op(*node.inputs[i: i + max_nb_inputs]))
node = node.op(*inner_op).owner
return node
else:
while len(node.inputs) > max_nb_inputs:
inner_op = []
for i in range(0, len(node.inputs), max_nb_inputs):
inner_op.append(op(*node.inputs[i: i + max_nb_inputs]))
node = node.op(*inner_op).owner
return op(*node.inputs).owner
gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op(
GpuElemwise,
......
......@@ -15,7 +15,8 @@ from ..type import GpuArrayType, gpuarray_shared_constructor, get_context
from ..basic_ops import (
GpuAlloc, GpuAllocEmpty, GpuReshape, GpuFromHost, host_from_gpu)
from ..blas import GpuGemm
from ..elemwise import GpuCAReduceCuda, GpuCAReduceCPY, GpuElemwise
from ..elemwise import (GpuCAReduceCuda, GpuCAReduceCPY, GpuElemwise,
Elemwise, max_inputs_to_GpuElemwise)
from ..subtensor import GpuSubtensor
from ..linalg import GpuCusolverSolve, cusolver_available
......@@ -450,14 +451,15 @@ def test_local_gpu_elemwise():
def test_many_arg_elemwise():
# this test checks whether the + and * elemwise ops can handle
# extremely large numbers of arguments on gpu
# This test checks whether the + and * elemwise ops can handle
# extremely large numbers of arguments on gpu.
rng = np.random.RandomState([1, 2, 3])
for num_args in [75]:
for num_args in [32, 64, 128]:
for op_to_test in [theano.tensor.add, theano.tensor.mul]:
for nb_dim in [2, 3, 4, 5, 7]:
shapes = [rng.randint(1, 5) for i in range(nb_dim)]
for nb_dim in [2, 4, 8]:
shapes = [rng.randint(1, int(32 / nb_dim)) for i in range(nb_dim)]
args = [np.cast['float32'](rng.randn(*shapes))
for arg in range(0, num_args)]
......@@ -467,30 +469,20 @@ def test_many_arg_elemwise():
outputs = []
for mode in [mode_with_gpu, mode_without_gpu]:
# test the optimization local_gpu_elemwise_0
# test the optimization local_gpua_elemwise
f = theano.function(
symb_args, op_to_test(*symb_args),
mode=mode.excluding("local_gpu_elemwise_1"))
symb_args, op_to_test(*symb_args))
outputs.append(f(*args))
# assert that the test was done on the gpu.
if mode is mode_with_gpu:
assert any([isinstance(node.op, GpuElemwise)
for node in f.maker.fgraph.apply_nodes])
# test the optimization local_gpu_elemwise_1
f = theano.function(
symb_args,
GpuFromHost(test_ctx_name)(op_to_test(*symb_args)),
mode=mode.excluding("local_gpu_elemwise_0"))
out = f(*args)
# assert that the test was done on the gpu.
if mode is mode_with_gpu:
assert any([isinstance(node.op, GpuElemwise)
for node in f.maker.fgraph.apply_nodes])
utt.assert_allclose(out, outputs[-1])
nodelst = [node for node in f.maker.fgraph.apply_nodes]
assert any(isinstance(node.op, GpuElemwise)
for node in nodelst)
assert not any(isinstance(node.op, Elemwise)
for node in nodelst
if not isinstance(node.op, GpuElemwise))
results_gpu, results_cpu = outputs
utt.assert_allclose(results_gpu, results_cpu)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论