提交 41a8e89b authored 作者: Frederic's avatar Frederic 提交者: Arnaud Bergeron

Make GpuElemwise work with multiple outputs (the new back-end raises an error)

上级 2595fea0
...@@ -66,7 +66,7 @@ class NaiveAlgo(object): ...@@ -66,7 +66,7 @@ class NaiveAlgo(object):
def cache_version(self): def cache_version(self):
ver = self.scalar_op.c_code_cache_version() ver = self.scalar_op.c_code_cache_version()
if ver: if ver:
return (17, self.verbose, self.sync, ver) return (18, self.verbose, self.sync, ver)
else: else:
return ver return ver
...@@ -142,6 +142,8 @@ class NaiveAlgo(object): ...@@ -142,6 +142,8 @@ class NaiveAlgo(object):
# perform the scalar operation on the input and output references # perform the scalar operation on the input and output references
# TODO: What if the scalar_op needs support_code?? # TODO: What if the scalar_op needs support_code??
for ipos, i in enumerate(node.outputs):
print("npy_%s o%d_i;" % (i.dtype, ipos), file=sio)
task_code = self.scalar_op.c_code( task_code = self.scalar_op.c_code(
Apply(self.scalar_op, Apply(self.scalar_op,
[scalar.Scalar(dtype=input.type.dtype).make_variable() [scalar.Scalar(dtype=input.type.dtype).make_variable()
...@@ -150,9 +152,11 @@ class NaiveAlgo(object): ...@@ -150,9 +152,11 @@ class NaiveAlgo(object):
for output in node.outputs]), for output in node.outputs]),
nodename + '_scalar_', nodename + '_scalar_',
get_str_list_logical_scalar(node), get_str_list_logical_scalar(node),
['ii_o%i_data[0]' % ipos for ipos, i in enumerate(node.outputs)], ['o%i_i' % ipos for ipos, i in enumerate(node.outputs)],
sub=dict(fail='return;')) # TODO: set a failure code somehow!!! sub=dict(fail='return;')) # TODO: set a failure code somehow!!!
print(" ", task_code, file=sio) print(" ", task_code, file=sio)
for ipos, _ in enumerate(node.outputs):
print("o%i_data[i] = o%i_i;" % (ipos, ipos), file=sio)
print(" }", file=sio) print(" }", file=sio)
#indent = " "*(4*d+7) #indent = " "*(4*d+7)
...@@ -477,6 +481,8 @@ class NaiveAlgo(object): ...@@ -477,6 +481,8 @@ class NaiveAlgo(object):
print(" for (int i = idx; i < numEls; i += numThreads) {", file=sio) print(" for (int i = idx; i < numEls; i += numThreads) {", file=sio)
# perform the scalar operation on the input and output references # perform the scalar operation on the input and output references
# TODO: What if the scalar_op needs support_code?? # TODO: What if the scalar_op needs support_code??
for ipos, i in enumerate(node.outputs):
print("npy_%s o%d_i;" % (i.dtype, ipos), file=sio)
task_code = self.scalar_op.c_code( task_code = self.scalar_op.c_code(
Apply(self.scalar_op, Apply(self.scalar_op,
[scalar.Scalar(dtype=input.type.dtype).make_variable() [scalar.Scalar(dtype=input.type.dtype).make_variable()
...@@ -486,9 +492,11 @@ class NaiveAlgo(object): ...@@ -486,9 +492,11 @@ class NaiveAlgo(object):
, nodename + '_scalar_' , nodename + '_scalar_'
#, ['i%i_data[i]'%ipos for ipos, i in enumerate(node.inputs)] #, ['i%i_data[i]'%ipos for ipos, i in enumerate(node.inputs)]
, get_str_list_logical_scalar(node, data_str='i%i_data[i]') , get_str_list_logical_scalar(node, data_str='i%i_data[i]')
, ['o%i_data[i]'%ipos for ipos, i in enumerate(node.outputs)] , ['o%i_i'%ipos for ipos, i in enumerate(node.outputs)]
, sub=dict(fail='return;')) # TODO: set a failure code somehow!!! , sub=dict(fail='return;')) # TODO: set a failure code somehow!!!
print(" ", task_code, file=sio) print(" ", task_code, file=sio)
for ipos, _ in enumerate(node.outputs):
print("o%i_data[i] = o%i_i;" % (ipos, ipos), file=sio)
print(" }", file=sio) print(" }", file=sio)
print("}", file=sio) print("}", file=sio)
......
...@@ -618,16 +618,40 @@ def test_local_gpu_elemwise_0(): ...@@ -618,16 +618,40 @@ def test_local_gpu_elemwise_0():
assert sum(isinstance(node.op, tensor.Elemwise) for node in topo) == 1 assert sum(isinstance(node.op, tensor.Elemwise) for node in topo) == 1
utt.assert_allclose(f(a_v, b_v, c_v), a_v + b_v + c_v) utt.assert_allclose(f(a_v, b_v, c_v), a_v + b_v + c_v)
# Test multiple output
a_s = theano.scalar.float32()
a = tensor.fmatrix()
from theano.scalar.basic import identity
out_s = theano.scalar.Composite([a_s, b_s, c_s],
[identity(a_s), identity(c_s), identity(b_s)])
outs_op = tensor.Elemwise(out_s)
f = theano.function([a, b, c], outs_op(a, b, c), mode=mode_with_gpu)
topo = f.maker.fgraph.toposort()
assert sum(isinstance(node.op, cuda.GpuElemwise) for node in topo) == 1
assert sum(isinstance(node.op, tensor.Elemwise) for node in topo) == 0
out = f(a_v, b_v, c_v)
utt.assert_allclose(out[0], a_v)
utt.assert_allclose(out[1], c_v)
utt.assert_allclose(out[2], b_v)
# Test multiple output # Test multiple output
out_s = theano.scalar.Composite([a_s, b_s, c_s], [a_s + b_s, a_s * c_s]) out_s = theano.scalar.Composite([a_s, b_s, c_s], [a_s + b_s, a_s * c_s])
outs_op = tensor.Elemwise(out_s) outs_op = tensor.Elemwise(out_s)
f = theano.function([a, b, c], outs_op(a, b, c), mode=mode_with_gpu) f = theano.function([a, b, c], outs_op(a, b, c), mode=mode_with_gpu)
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert sum(isinstance(node.op, cuda.GpuElemwise) for node in topo) == 1 assert sum(isinstance(node.op, cuda.GpuElemwise) for node in topo) == 1
assert sum(isinstance(node.op, tensor.Elemwise) for node in topo) == 1 assert sum(isinstance(node.op, tensor.Elemwise) for node in topo) == 0
out = f(a_v, b_v, c_v) out = f(a_v, b_v, c_v)
utt.assert_allclose(out[0], a_v + b_v) utt.assert_allclose(out[0], a_v + b_v)
utt.assert_allclose(out[1], a_v + c_v) utt.assert_allclose(out[1], a_v * c_v)
# Test non-contiguous input
c = cuda.shared_constructor(c_v)
f = theano.function([a, b], outs_op(a[::2], b[::2], c[::2]),
mode=mode_with_gpu)
out = f(a_v, b_v)
utt.assert_allclose(out[0], a_v[::2] + b_v[::2])
utt.assert_allclose(out[1], a_v[::2] * c_v[::2])
def test_elemwise_fusion(): def test_elemwise_fusion():
......
...@@ -72,6 +72,8 @@ class GpuElemwise(HideC, Elemwise): ...@@ -72,6 +72,8 @@ class GpuElemwise(HideC, Elemwise):
res = Elemwise.make_node(self, *inputs) res = Elemwise.make_node(self, *inputs)
outputs = [GpuArrayType(broadcastable=o.type.broadcastable, outputs = [GpuArrayType(broadcastable=o.type.broadcastable,
dtype=o.type.dtype)() for o in res.outputs] dtype=o.type.dtype)() for o in res.outputs]
if len(outputs) > 1:
raise NotImplementedError()
inputs = [as_gpuarray_variable(i) for i in inputs] inputs = [as_gpuarray_variable(i) for i in inputs]
node = Apply(self, inputs, outputs) node = Apply(self, inputs, outputs)
......
...@@ -292,6 +292,23 @@ def test_local_gpu_elemwise(): ...@@ -292,6 +292,23 @@ def test_local_gpu_elemwise():
assert sum(type(node.op) == tensor.Elemwise for node in topo) == 0 assert sum(type(node.op) == tensor.Elemwise for node in topo) == 0
utt.assert_allclose(f(a_v, b_v, c_v), a_v + b_v + c_v) utt.assert_allclose(f(a_v, b_v, c_v), a_v + b_v + c_v)
return # Not yet implemented
# Test multiple output
a_s = theano.scalar.float32()
a = tensor.fmatrix()
from theano.scalar.basic import identity
out_s = theano.scalar.Composite([a_s, b_s, c_s],
[identity(a_s), identity(c_s), identity(b_s)])
outs_op = tensor.Elemwise(out_s)
f = theano.function([a, b, c], outs_op(a, b, c), mode=mode_with_gpu)
topo = f.maker.fgraph.toposort()
assert sum(isinstance(node.op, GpuElemwise) for node in topo) == 1
assert sum(type(node.op) == tensor.Elemwise for node in topo) == 0
out = f(a_v, b_v, c_v)
utt.assert_allclose(out[0], a_v)
utt.assert_allclose(out[1], c_v)
utt.assert_allclose(out[2], b_v)
# Test multiple output # Test multiple output
out_s = theano.scalar.Composite([a_s, b_s, c_s], [a_s + b_s, a_s * b_s]) out_s = theano.scalar.Composite([a_s, b_s, c_s], [a_s + b_s, a_s * b_s])
outs_op = tensor.Elemwise(out_s) outs_op = tensor.Elemwise(out_s)
...@@ -302,4 +319,12 @@ def test_local_gpu_elemwise(): ...@@ -302,4 +319,12 @@ def test_local_gpu_elemwise():
assert sum(type(node.op) == tensor.Elemwise for node in topo) == 0 assert sum(type(node.op) == tensor.Elemwise for node in topo) == 0
out = f(a_v, b_v, c_v) out = f(a_v, b_v, c_v)
utt.assert_allclose(out[0], a_v + b_v) utt.assert_allclose(out[0], a_v + b_v)
utt.assert_allclose(out[1], a_v + c_v) utt.assert_allclose(out[1], a_v * c_v)
# Test non-contiguous input
c = cuda.shared_constructor(numpy.asarray(c_v, dtype='float32'))
f = theano.function([a, b], outs_op(a[::2], b[::2], c[::2]),
mode=mode_with_gpu)
out = f(a_v, b_v)
utt.assert_allclose(out[0], a_v[::2] + b_v[::2])
utt.assert_allclose(out[1], a_v[::2] * c_v[::2])
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论