Commit e6e6a433 authored by nouiz

Merge pull request #1043 from goodfeli/fix_incsubtensor_opt

fix bug in gpu inc subtensor optimization and add a unit test
......@@ -822,6 +822,9 @@ def local_gpu_incsubtensor(node):
gpu_from_host(x),
gpu_from_host(y),
*coords)]
# Incrementing a float32 x results in a float32
# output even if y is float64, so we can downcast
# y to put it on GPU
if type(node.op) == tensor.IncSubtensor and \
node.inputs[0].dtype == "float32":
x, y = node.inputs[0:2]
......@@ -838,6 +841,8 @@ def local_gpu_incsubtensor(node):
go_gpu = True
gpu_y, = y.owner.inputs
else:
if y.dtype != 'float32':
y = tensor.cast(y, 'float32')
gpu_y = gpu_from_host(y)
if go_gpu:
return [host_from_gpu(GpuIncSubtensor(
......
......@@ -377,6 +377,29 @@ class TestIfElse(theano.tests.test_ifelse.test_ifelse):
def get_ifelse(self, n):
return theano.ifelse.IfElse(n, gpu=True, as_view=True)
def test_incsubtensor_mixed():
# This catches a bug that occurred when incrementing
# a float32 tensor by a float64 tensor.
# The result is defined to be float32, so it is OK
# to downcast the float64 increment in order to
# transfer it to the GPU.
# The bug was that the optimization called GpuFromHost
# without casting first, causing the optimization to
# fail.
X = tensor.fmatrix()
Y = tensor.dmatrix()
Z = tensor.inc_subtensor(X[0:1,0:1],Y)
f = theano.function([X,Y], Z, mode=mode_with_gpu)
packed, = f.maker.fgraph.inputs[1].clients
client, idx = packed
print client
assert isinstance(client.op, tensor.Elemwise)
assert isinstance(client.op.scalar_op, theano.scalar.Cast)
packed ,= client.outputs[0].clients
client, idx = packed
assert isinstance(client.op, cuda.GpuFromHost)
if __name__ == '__main__':
test_gpualloc()
test_opt_gpujoin_onlyajoin()
......
Markdown 格式
0%
您向此讨论添加了 0 人。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论