Commit 658976e8, authored by --global

Add optimization for PdbBreakpoint under gpuarray backend

Parent e507ada2
...@@ -15,6 +15,7 @@ from theano.gof import (local_optimizer, EquilibriumDB, ...@@ -15,6 +15,7 @@ from theano.gof import (local_optimizer, EquilibriumDB,
from theano.scan_module import scan_utils, scan_op, scan_opt from theano.scan_module import scan_utils, scan_op, scan_opt
from theano.tensor.nnet.conv import ConvOp from theano.tensor.nnet.conv import ConvOp
from theano.tests.breakpoint import PdbBreakpoint
from .type import GpuArrayType, GpuArrayConstant from .type import GpuArrayType, GpuArrayConstant
from .basic_ops import (host_from_gpu, gpu_from_host, from .basic_ops import (host_from_gpu, gpu_from_host,
HostFromGpu, GpuFromHost, HostFromGpu, GpuFromHost,
...@@ -330,6 +331,74 @@ def local_gpu_print_op(node): ...@@ -330,6 +331,74 @@ def local_gpu_print_op(node):
return new_op(gpu_x) return new_op(gpu_x)
@register_opt('fast_compile')
@local_optimizer([PdbBreakpoint])
def local_gpu_pdbbreakpoint_op(node):
    """Propagate host<->GPU transfers through a PdbBreakpoint node.

    The PdbBreakpoint op itself always stays on the host; this optimization
    only rewires its monitored inputs and outputs so that redundant
    HostFromGpu/GpuFromHost pairs around the breakpoint can be elided by
    later optimizations.  Returns the list of replacement outputs, or
    ``False`` when no rewrite applies.
    """
    if isinstance(node.op, PdbBreakpoint):

        old_inputs = node.inputs
        old_outputs = node.outputs

        # Input 0 is the breakpoint condition; it always stays on the host.
        new_inputs = node.inputs[:1]
        input_transfered = []

        # Go through the monitored variables, transfering to the GPU those
        # for which the input comes from the GPU or the output is consumed
        # on the GPU, while leaving the PdbBreakpoint node itself on host.
        nb_monitored_vars = len(node.outputs)
        for i in range(nb_monitored_vars):

            inp = old_inputs[i + 1]
            out = old_outputs[i]

            input_is_from_gpu = (inp.owner and
                                 isinstance(inp.owner.op, HostFromGpu))
            # BUGFIX: use any() over the clients.  The previous all() test
            # became False as soon as a single client was not a GpuFromHost,
            # even when another client did transfer the output to the GPU.
            # (A string "output" client has no .op; the short-circuit guard
            # keeps isinstance from being evaluated for it.)
            output_goes_to_gpu = any(c[0] != "output" and
                                     isinstance(c[0].op, GpuFromHost)
                                     for c in out.clients)

            if input_is_from_gpu:
                # The op should be applied on the GPU version of the input.
                # BUGFIX: the previous condition additionally required the
                # output NOT to go to the GPU, which skipped the rewrite in
                # exactly the case this optimization exists for (a
                # HostFromGpu/GpuFromHost pair straddling the breakpoint).
                new_inputs.append(inp.owner.inputs[0])
                input_transfered.append(True)

            elif output_goes_to_gpu:
                # The input should be transfered to the GPU.
                new_inputs.append(gpu_from_host(inp))
                input_transfered.append(True)

            else:
                # Both input and output stay on the host; no transfer.
                new_inputs.append(inp)
                input_transfered.append(False)

        # Only continue the optimization if at least one input has been
        # transfered to the GPU.
        if not any(input_transfered):
            return False

        # Apply the op on the new inputs.
        new_op_outputs = node.op(*new_inputs)

        # Ensure that new_op_outputs is a list of outputs (an op with a
        # single output returns a bare variable).
        if not isinstance(new_op_outputs, list):
            new_op_outputs = [new_op_outputs]

        # Transfer back to the host the outputs whose input was moved to
        # the GPU, so the replacement outputs keep the original host types
        # expected by existing clients.
        new_outputs = []
        for i, new_out in enumerate(new_op_outputs):
            if input_transfered[i]:
                new_outputs.append(host_from_gpu(new_out))
            else:
                new_outputs.append(new_out)

        return new_outputs

    return False
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Join]) @op_lifter([tensor.Join])
def local_gpua_join(node): def local_gpua_join(node):
......
Markdown is supported
0%
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Register or sign in to comment