Commit 658976e8, authored by --global

Add optimization for PdbBreakpoint under gpuarray backend

Parent e507ada2
...@@ -15,6 +15,7 @@ from theano.gof import (local_optimizer, EquilibriumDB, ...@@ -15,6 +15,7 @@ from theano.gof import (local_optimizer, EquilibriumDB,
from theano.scan_module import scan_utils, scan_op, scan_opt from theano.scan_module import scan_utils, scan_op, scan_opt
from theano.tensor.nnet.conv import ConvOp from theano.tensor.nnet.conv import ConvOp
from theano.tests.breakpoint import PdbBreakpoint
from .type import GpuArrayType, GpuArrayConstant from .type import GpuArrayType, GpuArrayConstant
from .basic_ops import (host_from_gpu, gpu_from_host, from .basic_ops import (host_from_gpu, gpu_from_host,
HostFromGpu, GpuFromHost, HostFromGpu, GpuFromHost,
...@@ -330,6 +331,74 @@ def local_gpu_print_op(node): ...@@ -330,6 +331,74 @@ def local_gpu_print_op(node):
return new_op(gpu_x) return new_op(gpu_x)
@register_opt('fast_compile')
@local_optimizer([PdbBreakpoint])
def local_gpu_pdbbreakpoint_op(node):
    """Propagate host<->GPU transfers through a PdbBreakpoint node.

    The PdbBreakpoint op itself always stays on the host; this optimization
    only rewires its monitored inputs and outputs so that redundant
    HostFromGpu/GpuFromHost pairs around the breakpoint can be elided by
    later optimizations.  Returns the list of replacement outputs, or
    ``False`` when no rewrite applies.
    """
    if isinstance(node.op, PdbBreakpoint):

        old_inputs = node.inputs
        old_outputs = node.outputs

        # Input 0 is the breakpoint condition; it always stays on the host.
        new_inputs = node.inputs[:1]
        input_transfered = []

        # Go through the monitored variables, transfering to the GPU those
        # for which the input comes from the GPU or the output is consumed
        # on the GPU, while leaving the PdbBreakpoint node itself on host.
        nb_monitored_vars = len(node.outputs)
        for i in range(nb_monitored_vars):

            inp = old_inputs[i + 1]
            out = old_outputs[i]

            input_is_from_gpu = (inp.owner and
                                 isinstance(inp.owner.op, HostFromGpu))
            # BUGFIX: use any() over the clients.  The previous all() test
            # became False as soon as a single client was not a GpuFromHost,
            # even when another client did transfer the output to the GPU.
            # (A string "output" client has no .op; the short-circuit guard
            # keeps isinstance from being evaluated for it.)
            output_goes_to_gpu = any(c[0] != "output" and
                                     isinstance(c[0].op, GpuFromHost)
                                     for c in out.clients)

            if input_is_from_gpu:
                # The op should be applied on the GPU version of the input.
                # BUGFIX: the previous condition additionally required the
                # output NOT to go to the GPU, which skipped the rewrite in
                # exactly the case this optimization exists for (a
                # HostFromGpu/GpuFromHost pair straddling the breakpoint).
                new_inputs.append(inp.owner.inputs[0])
                input_transfered.append(True)

            elif output_goes_to_gpu:
                # The input should be transfered to the GPU.
                new_inputs.append(gpu_from_host(inp))
                input_transfered.append(True)

            else:
                # Both input and output stay on the host; no transfer.
                new_inputs.append(inp)
                input_transfered.append(False)

        # Only continue the optimization if at least one input has been
        # transfered to the GPU.
        if not any(input_transfered):
            return False

        # Apply the op on the new inputs.
        new_op_outputs = node.op(*new_inputs)

        # Ensure that new_op_outputs is a list of outputs (an op with a
        # single output returns a bare variable).
        if not isinstance(new_op_outputs, list):
            new_op_outputs = [new_op_outputs]

        # Transfer back to the host the outputs whose input was moved to
        # the GPU, so the replacement outputs keep the original host types
        # expected by existing clients.
        new_outputs = []
        for i, new_out in enumerate(new_op_outputs):
            if input_transfered[i]:
                new_outputs.append(host_from_gpu(new_out))
            else:
                new_outputs.append(new_out)

        return new_outputs

    return False
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Join]) @op_lifter([tensor.Join])
def local_gpua_join(node): def local_gpua_join(node):
......
Markdown is supported
0%
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Register or sign in to comment