提交 bd34bddf authored 作者: Olivier Delalleau's avatar Olivier Delalleau

Merge pull request #544 from nouiz/fix_composite_new_inputs

Fix composite new inputs
...@@ -29,7 +29,8 @@ New Features ...@@ -29,7 +29,8 @@ New Features
(Frederic B.) (Frederic B.)
* debugprint does not print anymore the "|" symbol in a column after the last input. (Frederic B.) * debugprint does not print anymore the "|" symbol in a column after the last input. (Frederic B.)
* If you use Enthought Python Distribution (EPD) now we use its blas * If you use Enthought Python Distribution (EPD) now we use its blas
implementation by default. (Frederic B.) implementation by default (Tested Linux, Windows)
(Frederic B., Simon McGregor)
Sparse Sandbox graduate Sparse Sandbox graduate
* Remove0 op: it remove store element with value 0. (Frederic B.) * Remove0 op: it remove store element with value 0. (Frederic B.)
...@@ -54,6 +55,11 @@ Crash Fix ...@@ -54,6 +55,11 @@ Crash Fix
* Optimization print useless error when scipy is not available. (Frederic B.) * Optimization print useless error when scipy is not available. (Frederic B.)
* Gpu conv crash/slowdown on newer hardware? (James B.) * Gpu conv crash/slowdown on newer hardware? (James B.)
* Better error handling in gpu conv (Frederic B.) * Better error handling in gpu conv (Frederic B.)
* GPU optimization that moves element-wise ops to the gpu. It happened in
a particular execution order of this optimization and the
element-wise fusion optimization when upcasting some inputs to
float32 (to compute them on the gpu).
(Frederic B., reported by Sander Dieleman)
============= =============
Release Notes Release Notes
......
...@@ -20,7 +20,7 @@ from debugmode import DebugMode ...@@ -20,7 +20,7 @@ from debugmode import DebugMode
from profilemode import ProfileMode from profilemode import ProfileMode
from theano.compile.sharedvalue import shared, shared_constructor, SharedVariable from theano.compile.sharedvalue import shared, shared_constructor, SharedVariable
from theano.compile.pfunc import pfunc, Param from theano.compile.pfunc import pfunc, Param, rebuild_collect_shared
from function import function from function import function
...@@ -42,7 +42,7 @@ def rebuild_collect_shared(outputs, ...@@ -42,7 +42,7 @@ def rebuild_collect_shared(outputs,
inputs of the computational graph (or None) inputs of the computational graph (or None)
:type replace: dict :type replace: dict
:param replace: dictionary describing which subgraphs should be :param replace: dictionary describing which subgraphs should be
replaced by what replaced by what. orig_value => new_value
:type updates: dict :type updates: dict
:param updates: dictionary describing updates expressions for shared :param updates: dictionary describing updates expressions for shared
......
...@@ -165,6 +165,7 @@ class Apply(utils.object2): ...@@ -165,6 +165,7 @@ class Apply(utils.object2):
:returns: an Apply instance with the same op but different outputs. :returns: an Apply instance with the same op but different outputs.
""" """
assert isinstance(inputs, (list, tuple))
remake_node = False remake_node = False
new_inputs = inputs[:] new_inputs = inputs[:]
for i, (curr, new) in enumerate(zip(self.inputs, new_inputs)): for i, (curr, new) in enumerate(zip(self.inputs, new_inputs)):
......
...@@ -241,6 +241,43 @@ def test_huge_elemwise_fusion(): ...@@ -241,6 +241,43 @@ def test_huge_elemwise_fusion():
f() f()
def test_local_gpu_elemwise_0():
    """Check the GPU transfer of an elemwise graph whose inputs must be
    upcast to float32, both when the Composite is only created after all
    ops are already on the gpu, and when the Composite already exists on
    the cpu before the transfer optimization runs.
    """
    x = tensor.bmatrix()
    y = tensor.fmatrix()
    z = tensor.fmatrix()
    x_val = (numpy.random.rand(4, 5) * 10).astype("int8")
    y_val = (numpy.random.rand(4, 5) * 10).astype("float32")
    z_val = (numpy.random.rand(4, 5) * 10).astype("float32")

    # Case 1: due to the order of optimizations, the composite is only
    # created once every op is already on the gpu.
    fn = theano.function([x, y, z], [x + y + z], mode=mode_with_gpu)
    #theano.printing.debugprint(fn)
    nodes = fn.maker.env.toposort()
    assert sum(isinstance(n.op, cuda.GpuElemwise) for n in nodes) == 1
    assert sum(isinstance(n.op, tensor.Elemwise) for n in nodes) == 1
    fn(x_val, y_val, z_val)

    # Case 2: the composite already exists on the cpu before we move it
    # to the gpu.
    xs = theano.scalar.int8()
    ys = theano.scalar.float32()
    zs = theano.scalar.float32()
    comp = theano.scalar.Composite([xs, ys, zs], [xs + ys + zs])
    comp_op = tensor.Elemwise(comp)
    fn = theano.function([x, y, z], [comp_op(x, y, z)], mode=mode_with_gpu)
    #theano.printing.debugprint(fn)
    nodes = fn.maker.env.toposort()
    assert sum(isinstance(n.op, cuda.GpuElemwise) for n in nodes) == 1
    assert sum(isinstance(n.op, tensor.Elemwise) for n in nodes) == 1
    fn(x_val, y_val, z_val)
def test_elemwise_fusion(): def test_elemwise_fusion():
""" Test the the GpuElemwise fusion work correctly""" """ Test the the GpuElemwise fusion work correctly"""
shape = (3,4) shape = (3,4)
......
...@@ -2371,6 +2371,19 @@ class Composite(ScalarOp): ...@@ -2371,6 +2371,19 @@ class Composite(ScalarOp):
% (self.inputs_type, tuple(input_types))) % (self.inputs_type, tuple(input_types)))
return self.outputs_type return self.outputs_type
def make_node(self, *inputs):
    """Build an Apply node for this Composite.

    If the provided inputs' types differ from the types this Composite
    was constructed with (e.g. after an upcast), clone the internal
    graph with the new inputs substituted and wrap it in a fresh
    Composite specialized to those types.
    """
    declared_types = tuple([i.type for i in self.inputs])
    provided_types = tuple([i.type for i in inputs])
    if declared_types == provided_types:
        return super(Composite, self).make_node(*inputs)
    # Input types changed: rebuild the inner graph on the new inputs,
    # then delegate to a new Composite built from the cloned outputs.
    rebuilt = theano.compile.rebuild_collect_shared(
        self.outputs,
        replace=dict(zip(self.inputs, inputs)),
        rebuild_strict=False)
    new_op = Composite(inputs, rebuilt[1])
    return new_op.make_node(*inputs)
def perform(self, node, inputs, output_storage): def perform(self, node, inputs, output_storage):
for storage, impl in zip(output_storage, self._impls): for storage, impl in zip(output_storage, self._impls):
storage[0] = impl(inputs) storage[0] = impl(inputs)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论