Commit 648438e7 authored by Frederic

Crash fix: GPU optimization that moves element-wise ops to the GPU.

The crash happened under a particular execution order of this optimization and the element-wise fusion optimization, when upcasting some inputs to float32 (to compute them on the GPU). (Frederic B., reported by Sander Dieleman)
Parent 65eeaaec
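A minimal sketch of the failing pattern, assuming a Theano build with the CUDA backend enabled; it mirrors the regression test added below, and `mode_with_gpu` is constructed the way the test suite does:

import numpy
import theano
import theano.tensor as tensor

mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')

a = tensor.bmatrix()  # int8: upcast to float32 so it can run on the GPU
b = tensor.fmatrix()
c = tensor.fmatrix()

# Before this fix, compiling this could crash when the move-to-GPU
# optimization and the elemwise fusion optimization ran in a particular
# order: the fused Composite kept its pre-upcast input types.
f = theano.function([a, b, c], a + b + c, mode=mode_with_gpu)
f((numpy.random.rand(4, 5) * 10).astype('int8'),
  numpy.random.rand(4, 5).astype('float32'),
  numpy.random.rand(4, 5).astype('float32'))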
@@ -54,6 +54,11 @@ Crash Fix
* Optimization print useless error when scipy is not available. (Frederic B.)
* Gpu conv crash/slowdown on newer hardware? (James B.)
* Better error handling in gpu conv (Frederic B.)
* GPU optimization that moves element-wise ops to the GPU. It happened
  under a particular execution order of this optimization and the
  element-wise fusion optimization, when upcasting some inputs to
  float32 (to compute them on the GPU).
  (Frederic B., reported by Sander Dieleman)
=============
Release Notes
...
@@ -241,6 +241,43 @@ def test_huge_elemwise_fusion():
    f()

def test_local_gpu_elemwise_0():
    """
    Test local_gpu_elemwise_0 when there is a dtype upcastable to
    float32.
    """
    a = tensor.bmatrix()
    b = tensor.fmatrix()
    c = tensor.fmatrix()

    a_v = (numpy.random.rand(4, 5) * 10).astype("int8")
    b_v = (numpy.random.rand(4, 5) * 10).astype("float32")
    c_v = (numpy.random.rand(4, 5) * 10).astype("float32")

    # Due to the order of optimizations, the Composite is created when
    # all the ops are already on the GPU.
    f = theano.function([a, b, c], [a + b + c], mode=mode_with_gpu)
    #theano.printing.debugprint(f)
    topo = f.maker.env.toposort()
    assert sum(isinstance(node.op, cuda.GpuElemwise) for node in topo) == 1
    assert sum(isinstance(node.op, tensor.Elemwise) for node in topo) == 1
    f(a_v, b_v, c_v)

    # Now test with the Composite already on the CPU before we move it
    # to the GPU.
    a_s = theano.scalar.int8()
    b_s = theano.scalar.float32()
    c_s = theano.scalar.float32()
    out_s = theano.scalar.Composite([a_s, b_s, c_s], [a_s + b_s + c_s])
    out_op = tensor.Elemwise(out_s)
    f = theano.function([a, b, c], [out_op(a, b, c)], mode=mode_with_gpu)
    #theano.printing.debugprint(f)
    topo = f.maker.env.toposort()
    assert sum(isinstance(node.op, cuda.GpuElemwise) for node in topo) == 1
    assert sum(isinstance(node.op, tensor.Elemwise) for node in topo) == 1
    f(a_v, b_v, c_v)

def test_elemwise_fusion():
    """Test that the GpuElemwise fusion works correctly."""
    shape = (3, 4)
...
@@ -2371,6 +2371,19 @@ class Composite(ScalarOp):
                            % (self.inputs_type, tuple(input_types)))
        return self.outputs_type

    def make_node(self, *inputs):
        if (tuple([i.type for i in self.inputs]) ==
                tuple([i.type for i in inputs])):
            return super(Composite, self).make_node(*inputs)
        else:
            # Make a new op with the right input types.
            res = theano.compile.rebuild_collect_shared(
                self.outputs,
                replace=dict(zip(self.inputs, inputs)),
                rebuild_strict=False)
            node = Composite(inputs, res[1]).make_node(*inputs)
            return node

    def perform(self, node, inputs, output_storage):
        for storage, impl in zip(output_storage, self._impls):
            storage[0] = impl(inputs)
...
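A hedged sketch of what the new `Composite.make_node` allows (the float64 example is illustrative, not taken from this commit): applying an existing Composite to scalars of a different dtype now rebuilds the op with the new input types instead of failing the `inputs_type` check in `output_types`.

import theano
import theano.scalar as scal

# A Composite built from float32 scalars.
x_s = scal.float32()
y_s = scal.float32()
comp = scal.Composite([x_s, y_s], [x_s + y_s])

# Apply it to float64 scalars: make_node now clones the inner graph with
# rebuild_collect_shared (rebuild_strict=False permits the dtype change)
# and builds a fresh Composite whose input types match.
a_s = scal.float64()
b_s = scal.float64()
node = comp.make_node(a_s, b_s)
assert node.outputs[0].type == scal.float64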