提交 bd34bddf authored 作者: Olivier Delalleau's avatar Olivier Delalleau

Merge pull request #544 from nouiz/fix_composite_new_inputs

Fix composite new inputs
...@@ -29,7 +29,8 @@ New Features ...@@ -29,7 +29,8 @@ New Features
(Frederic B.) (Frederic B.)
* debugprint does not print anymore the "|" symbol in a column after the last input. (Frederic B.) * debugprint does not print anymore the "|" symbol in a column after the last input. (Frederic B.)
* If you use Enthought Python Distribution (EPD) now we use its blas * If you use Enthought Python Distribution (EPD) now we use its blas
implementation by default. (Frederic B.) implementation by default (Tested Linux, Windows)
(Frederic B., Simon McGregor)
Sparse Sandbox graduate Sparse Sandbox graduate
* Remove0 op: it remove store element with value 0. (Frederic B.) * Remove0 op: it remove store element with value 0. (Frederic B.)
...@@ -54,6 +55,11 @@ Crash Fix ...@@ -54,6 +55,11 @@ Crash Fix
* Optimization print useless error when scipy is not available. (Frederic B.) * Optimization print useless error when scipy is not available. (Frederic B.)
* Gpu conv crash/slowdown on newer hardware? (James B.) * Gpu conv crash/slowdown on newer hardware? (James B.)
* Better error handling in gpu conv (Frederic B.) * Better error handling in gpu conv (Frederic B.)
* GPU optimization that moves element-wise ops to the gpu. It happened in
a particular execution order of this optimization and the
element-wise fusion optimization when upcasting some inputs to
float32 (to compute them on the gpu).
(Frederic B., reported by Sander Dieleman)
============= =============
Release Notes Release Notes
......
...@@ -20,7 +20,7 @@ from debugmode import DebugMode ...@@ -20,7 +20,7 @@ from debugmode import DebugMode
from profilemode import ProfileMode from profilemode import ProfileMode
from theano.compile.sharedvalue import shared, shared_constructor, SharedVariable from theano.compile.sharedvalue import shared, shared_constructor, SharedVariable
from theano.compile.pfunc import pfunc, Param from theano.compile.pfunc import pfunc, Param, rebuild_collect_shared
from function import function from function import function
...@@ -42,7 +42,7 @@ def rebuild_collect_shared(outputs, ...@@ -42,7 +42,7 @@ def rebuild_collect_shared(outputs,
inputs of the computational graph (or None) inputs of the computational graph (or None)
:type replace: dict :type replace: dict
:param replace: dictionary describing which subgraphs should be :param replace: dictionary describing which subgraphs should be
replaced by what replaced by what. orig_value => new_value
:type updates: dict :type updates: dict
:param updates: dictionary describing updates expressions for shared :param updates: dictionary describing updates expressions for shared
......
...@@ -165,6 +165,7 @@ class Apply(utils.object2): ...@@ -165,6 +165,7 @@ class Apply(utils.object2):
:returns: an Apply instance with the same op but different outputs. :returns: an Apply instance with the same op but different outputs.
""" """
assert isinstance(inputs, (list, tuple))
remake_node = False remake_node = False
new_inputs = inputs[:] new_inputs = inputs[:]
for i, (curr, new) in enumerate(zip(self.inputs, new_inputs)): for i, (curr, new) in enumerate(zip(self.inputs, new_inputs)):
......
...@@ -241,6 +241,43 @@ def test_huge_elemwise_fusion(): ...@@ -241,6 +241,43 @@ def test_huge_elemwise_fusion():
f() f()
def test_local_gpu_elemwise_0():
    """Check the GPU transfer of an elemwise graph whose inputs must be
    upcast to float32, both when the Composite is only created after all
    ops are already on the gpu, and when the Composite already exists on
    the cpu before the transfer optimization runs.
    """
    x = tensor.bmatrix()
    y = tensor.fmatrix()
    z = tensor.fmatrix()
    x_val = (numpy.random.rand(4, 5) * 10).astype("int8")
    y_val = (numpy.random.rand(4, 5) * 10).astype("float32")
    z_val = (numpy.random.rand(4, 5) * 10).astype("float32")

    # Case 1: due to the order of optimizations, the composite is only
    # created once every op is already on the gpu.
    fn = theano.function([x, y, z], [x + y + z], mode=mode_with_gpu)
    #theano.printing.debugprint(fn)
    nodes = fn.maker.env.toposort()
    assert sum(isinstance(n.op, cuda.GpuElemwise) for n in nodes) == 1
    assert sum(isinstance(n.op, tensor.Elemwise) for n in nodes) == 1
    fn(x_val, y_val, z_val)

    # Case 2: the composite already exists on the cpu before we move it
    # to the gpu.
    xs = theano.scalar.int8()
    ys = theano.scalar.float32()
    zs = theano.scalar.float32()
    comp = theano.scalar.Composite([xs, ys, zs], [xs + ys + zs])
    comp_op = tensor.Elemwise(comp)
    fn = theano.function([x, y, z], [comp_op(x, y, z)], mode=mode_with_gpu)
    #theano.printing.debugprint(fn)
    nodes = fn.maker.env.toposort()
    assert sum(isinstance(n.op, cuda.GpuElemwise) for n in nodes) == 1
    assert sum(isinstance(n.op, tensor.Elemwise) for n in nodes) == 1
    fn(x_val, y_val, z_val)
def test_elemwise_fusion(): def test_elemwise_fusion():
""" Test the the GpuElemwise fusion work correctly""" """ Test the the GpuElemwise fusion work correctly"""
shape = (3,4) shape = (3,4)
......
...@@ -2371,6 +2371,19 @@ class Composite(ScalarOp): ...@@ -2371,6 +2371,19 @@ class Composite(ScalarOp):
% (self.inputs_type, tuple(input_types))) % (self.inputs_type, tuple(input_types)))
return self.outputs_type return self.outputs_type
def make_node(self, *inputs):
    """Build an Apply node for this Composite.

    If the provided inputs' types differ from the types this Composite
    was constructed with (e.g. after an upcast), clone the internal
    graph with the new inputs substituted and wrap it in a fresh
    Composite specialized to those types.
    """
    declared_types = tuple([i.type for i in self.inputs])
    provided_types = tuple([i.type for i in inputs])
    if declared_types == provided_types:
        return super(Composite, self).make_node(*inputs)
    # Input types changed: rebuild the inner graph on the new inputs,
    # then delegate to a new Composite built from the cloned outputs.
    rebuilt = theano.compile.rebuild_collect_shared(
        self.outputs,
        replace=dict(zip(self.inputs, inputs)),
        rebuild_strict=False)
    new_op = Composite(inputs, rebuilt[1])
    return new_op.make_node(*inputs)
def perform(self, node, inputs, output_storage): def perform(self, node, inputs, output_storage):
for storage, impl in zip(output_storage, self._impls): for storage, impl in zip(output_storage, self._impls):
storage[0] = impl(inputs) storage[0] = impl(inputs)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论