Commit e6a4c073, authored by Pascal Lamblin

Merge pull request #2423 from carriepl/breakpoint

Add breakpoint Op to Theano
...@@ -63,6 +63,7 @@ from theano.tensor import nlinalg ...@@ -63,6 +63,7 @@ from theano.tensor import nlinalg
from theano.tensor import slinalg from theano.tensor import slinalg
from theano.tensor.nnet.Conv3D import Conv3D from theano.tensor.nnet.Conv3D import Conv3D
from theano.tests.breakpoint import PdbBreakpoint
try: try:
# We need to be able to import this file even if cuda isn't avail. # We need to be able to import this file even if cuda isn't avail.
...@@ -1141,6 +1142,69 @@ def local_gpu_print_op(node): ...@@ -1141,6 +1142,69 @@ def local_gpu_print_op(node):
return False return False
@register_opt()
@local_optimizer([PdbBreakpoint])
def local_gpu_pdbbreakpoint_op(node):
    """Lift a PdbBreakpoint so it does not block host<->GPU transfers."""
    if not isinstance(node.op, PdbBreakpoint):
        return False

    # The first input is the condition; it always stays on the host.
    new_inputs = node.inputs[:1]
    moved_to_gpu = []

    # For every monitored variable, decide whether it should live on the
    # GPU: either its value already comes from the GPU, or one of its
    # consumers immediately sends it back there.
    for monitored, result in zip(node.inputs[1:], node.outputs):
        comes_from_gpu = (monitored.owner is not None and
                          isinstance(monitored.owner.op, HostFromGpu))
        sent_to_gpu = any([c[0] != "output" and
                           isinstance(c[0].op, GpuFromHost)
                           for c in result.clients])

        if comes_from_gpu:
            # Reuse the underlying GPU value instead of its host copy.
            new_inputs.append(monitored.owner.inputs[0])
            moved_to_gpu.append(True)
        elif sent_to_gpu:
            # Push the host value to the GPU before the breakpoint.
            new_inputs.append(gpu_from_host(monitored))
            moved_to_gpu.append(True)
        else:
            # No transfer is required for this variable.
            new_inputs.append(monitored)
            moved_to_gpu.append(False)

    # If nothing ended up on the GPU, leave the graph untouched.
    if not any(moved_to_gpu):
        return False

    # Re-apply the breakpoint on the (partially GPU) inputs, then bring
    # the moved outputs back to the host so the replacement variables
    # keep the original (host) types expected by the clients.
    gpu_outputs = node.op(*new_inputs, return_list=True)
    return [host_from_gpu(out) if on_gpu else out
            for out, on_gpu in zip(gpu_outputs, moved_to_gpu)]
def cast(x, dtype): def cast(x, dtype):
stype = scal.Scalar(dtype) stype = scal.Scalar(dtype)
cast_op = theano.tensor.Elemwise(scal.Identity(scal.specific_out(stype))) cast_op = theano.tensor.Elemwise(scal.Identity(scal.specific_out(stype)))
...@@ -2303,6 +2367,7 @@ def local_gpu_allocempty(node): ...@@ -2303,6 +2367,7 @@ def local_gpu_allocempty(node):
return [ret] return [ret]
return False return False
optdb.register('gpu_scanOp_make_inplace', optdb.register('gpu_scanOp_make_inplace',
scan_opt.ScanInplaceOptimizer(typeConstructor=typeConstructor, scan_opt.ScanInplaceOptimizer(typeConstructor=typeConstructor,
gpu_flag=True), gpu_flag=True),
......
...@@ -14,6 +14,7 @@ from theano import config, tensor ...@@ -14,6 +14,7 @@ from theano import config, tensor
import theano.tensor.tests.test_nlinalg import theano.tensor.tests.test_nlinalg
import theano.tensor.tests.test_opt as test_opt import theano.tensor.tests.test_opt as test_opt
from theano.tests.breakpoint import PdbBreakpoint
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
import theano.sandbox.cuda as cuda import theano.sandbox.cuda as cuda
...@@ -164,7 +165,7 @@ def test_gpuallocempty(): ...@@ -164,7 +165,7 @@ def test_gpuallocempty():
f_cpu = theano.function([], tensor.AllocEmpty('int32')(2,3)) f_cpu = theano.function([], tensor.AllocEmpty('int32')(2,3))
l_cpu = f_cpu.maker.fgraph.toposort() l_cpu = f_cpu.maker.fgraph.toposort()
assert not numpy.any([isinstance(x.op, basic_ops.GpuAllocEmpty) for x in l_cpu]) assert not numpy.any([isinstance(x.op, basic_ops.GpuAllocEmpty) for x in l_cpu])
class Test_local_elemwise_alloc(test_opt.Test_local_elemwise_alloc): class Test_local_elemwise_alloc(test_opt.Test_local_elemwise_alloc):
dtype = 'float32' dtype = 'float32'
...@@ -322,7 +323,7 @@ def test_opt_gpujoin_joinvectors_elemwise_then_minusone(): ...@@ -322,7 +323,7 @@ def test_opt_gpujoin_joinvectors_elemwise_then_minusone():
def test_opt_gpujoin_joinvectors_negativeaxes(): def test_opt_gpujoin_joinvectors_negativeaxes():
""" """
Test that negative axis concatenation works as expected. Test that negative axis concatenation works as expected.
""" """
...@@ -477,6 +478,25 @@ def test_print_op(): ...@@ -477,6 +478,25 @@ def test_print_op():
f(numpy.random.random((5, 5)).astype('float32')) f(numpy.random.random((5, 5)).astype('float32'))
def test_pdbbreakpoint_op():
    """Check that a PdbBreakpoint op does not block gpu optimization."""
    inp = tensor.fmatrix()

    # Build a graph: a breakpoint on the input, followed by a
    # computation that the optimizer should move to the GPU.
    trigger = tensor.gt(inp.sum(), 0)
    monitored = PdbBreakpoint(name='TestBreakpoint')(trigger, inp)
    fct = theano.function([inp], monitored ** 2, mode=mode_with_gpu)

    # The squaring must now run on the GPU, with a single transfer back
    # to the host at the very end of the compiled function.
    nodes = fct.maker.fgraph.toposort()
    assert isinstance(nodes[-2].op, cuda.GpuElemwise)
    assert nodes[-1].op == cuda.host_from_gpu
def test_huge_elemwise_fusion(): def test_huge_elemwise_fusion():
""" Test the the GpuElemwise fusion work correctly """ Test the the GpuElemwise fusion work correctly
We check that we fuse one node with part of its input We check that we fuse one node with part of its input
......
...@@ -15,6 +15,7 @@ from theano.gof import (local_optimizer, EquilibriumDB, ...@@ -15,6 +15,7 @@ from theano.gof import (local_optimizer, EquilibriumDB,
from theano.scan_module import scan_utils, scan_op, scan_opt from theano.scan_module import scan_utils, scan_op, scan_opt
from theano.tensor.nnet.conv import ConvOp from theano.tensor.nnet.conv import ConvOp
from theano.tests.breakpoint import PdbBreakpoint
from .type import GpuArrayType, GpuArrayConstant from .type import GpuArrayType, GpuArrayConstant
from .basic_ops import (host_from_gpu, gpu_from_host, from .basic_ops import (host_from_gpu, gpu_from_host,
HostFromGpu, GpuFromHost, HostFromGpu, GpuFromHost,
...@@ -330,6 +331,69 @@ def local_gpu_print_op(node): ...@@ -330,6 +331,69 @@ def local_gpu_print_op(node):
return new_op(gpu_x) return new_op(gpu_x)
@register_opt('fast_compile')
@local_optimizer([PdbBreakpoint])
def local_gpu_pdbbreakpoint_op(node):
    """Lift a PdbBreakpoint so it does not block host<->GPU transfers."""
    if not isinstance(node.op, PdbBreakpoint):
        return False

    # The first input is the condition; it always stays on the host.
    new_inputs = node.inputs[:1]
    moved_to_gpu = []

    # For every monitored variable, decide whether it should live on the
    # GPU: either its value already comes from the GPU, or one of its
    # consumers immediately sends it back there.
    for monitored, result in zip(node.inputs[1:], node.outputs):
        comes_from_gpu = (monitored.owner is not None and
                          isinstance(monitored.owner.op, HostFromGpu))
        sent_to_gpu = any([c[0] != "output" and
                           isinstance(c[0].op, GpuFromHost)
                           for c in result.clients])

        if comes_from_gpu:
            # Reuse the underlying GPU value instead of its host copy.
            new_inputs.append(monitored.owner.inputs[0])
            moved_to_gpu.append(True)
        elif sent_to_gpu:
            # Push the host value to the GPU before the breakpoint.
            new_inputs.append(gpu_from_host(monitored))
            moved_to_gpu.append(True)
        else:
            # No transfer is required for this variable.
            new_inputs.append(monitored)
            moved_to_gpu.append(False)

    # If nothing ended up on the GPU, leave the graph untouched.
    if not any(moved_to_gpu):
        return False

    # Re-apply the breakpoint on the (partially GPU) inputs, then bring
    # the moved outputs back to the host so the replacement variables
    # keep the original (host) types expected by the clients.
    gpu_outputs = node.op(*new_inputs, return_list=True)
    return [host_from_gpu(out) if on_gpu else out
            for out, on_gpu in zip(gpu_outputs, moved_to_gpu)]
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Join]) @op_lifter([tensor.Join])
def local_gpua_join(node): def local_gpua_join(node):
......
...@@ -2,6 +2,7 @@ import numpy ...@@ -2,6 +2,7 @@ import numpy
import theano import theano
from theano import tensor from theano import tensor
from theano.tests.breakpoint import PdbBreakpoint
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
from theano.tests.unittest_tools import SkipTest from theano.tests.unittest_tools import SkipTest
from theano.tensor.tests import test_basic from theano.tensor.tests import test_basic
...@@ -186,6 +187,25 @@ def test_print_op(): ...@@ -186,6 +187,25 @@ def test_print_op():
f(numpy.random.random((5, 5)).astype('float32')) f(numpy.random.random((5, 5)).astype('float32'))
def test_pdbbreakpoint_op():
    """Check that a PdbBreakpoint op does not block gpu optimization."""
    inp = tensor.fmatrix()

    # Build a graph: a breakpoint on the input, followed by a
    # computation that the optimizer should move to the GPU.
    trigger = tensor.gt(inp.sum(), 0)
    monitored = PdbBreakpoint(name='TestBreakpoint')(trigger, inp)
    fct = theano.function([inp], monitored ** 2, mode=mode_with_gpu)

    # The squaring must now run on the GPU, with a single transfer back
    # to the host at the very end of the compiled function.
    nodes = fct.maker.fgraph.toposort()
    assert isinstance(nodes[-2].op, GpuElemwise)
    assert nodes[-1].op == host_from_gpu
def test_local_gpu_elemwise_careduce(): def test_local_gpu_elemwise_careduce():
x = theano.tensor.matrix() x = theano.tensor.matrix()
o = (x*x).sum() o = (x*x).sum()
......
import numpy
import pdb
import theano
from theano.gof import Op, Apply
from theano.gradient import DisconnectedType
class PdbBreakpoint(Op):
    """
    This is an identity-like op with the side effect of enforcing a
    conditional breakpoint, inside a theano function, based on a symbolic
    scalar condition. Its outputs are views of the monitored variables.

    @type name: String
    @param name: name of the conditional breakpoint. To be printed when the
                 breakpoint is activated.

    :note: WARNING. At least one of the outputs of the op must be used
           otherwise the op will be removed from the Theano graph
           due to its outputs being unused

    :note: WARNING. Employing the function inside a theano graph can prevent
           Theano from applying certain optimizations to improve
           performance, reduce memory consumption and/or reduce
           numerical instability.

           Detailed explanation:
           As of 2014-12-01 the PdbBreakpoint op is not known by any
           optimization. Setting a PdbBreakpoint op in the middle of a
           pattern that is usually optimized out will block the optimization.

    Example:

    .. code-block:: python

        import theano
        import theano.tensor as T
        from theano.tests.breakpoint import PdbBreakpoint

        input = T.fvector()
        target = T.fvector()

        # Mean squared error between input and target
        mse = (input - target) ** 2

        # Conditional breakpoint to be activated if the total MSE is higher
        # than 100. The breakpoint will monitor the inputs, targets as well
        # as the individual error values
        breakpointOp = PdbBreakpoint("MSE too high")
        condition = T.gt(mse.sum(), 100)
        mse, monitored_input, monitored_target = breakpointOp(condition, mse,
                                                              input, target)

        # Compile the theano function
        fct = theano.function([input, target], mse)

        # Use the function
        print(fct([10, 0], [10, 5]))  # Will NOT activate the breakpoint
        print(fct([0, 0], [10, 5]))  # Will activate the breakpoint
    """

    __props__ = ("name",)

    def __init__(self, name):
        # Printed when the breakpoint triggers, so the user can tell
        # which of several breakpoints fired.
        self.name = name

    def make_node(self, condition, *monitored_vars):
        """Build an Apply node monitoring ``monitored_vars``.

        :param condition: symbolic scalar; the breakpoint fires at runtime
                          when it evaluates to a true value.
        :param monitored_vars: variables passed through unchanged as the
                               node's outputs.
        """
        # Ensure that condition is a theano tensor
        if not isinstance(condition, theano.Variable):
            condition = theano.tensor.as_tensor_variable(condition)

        # Validate that the condition is a scalar (else it is not obvious
        # how it should be evaluated)
        assert condition.ndim == 0

        # Because the user might be tempted to instantiate PdbBreakpoint
        # only once and apply it many times on different numbers of inputs,
        # we must create a new instance of the op here, define the instance
        # attributes (view_map and inp_types) in that instance and then
        # apply it on the inputs.
        new_op = PdbBreakpoint(name=self.name)
        new_op.view_map = {}
        new_op.inp_types = []
        for i in range(len(monitored_vars)):
            # Every output i is a view of the input i+1 because of the
            # leading condition input.
            new_op.view_map[i] = [i + 1]
            new_op.inp_types.append(monitored_vars[i].type)

        # Build the Apply node
        inputs = [condition] + list(monitored_vars)
        outputs = [inp.type() for inp in monitored_vars]
        return Apply(op=new_op, inputs=inputs, outputs=outputs)

    def perform(self, node, inputs, output_storage):
        """Forward the monitored inputs, dropping into pdb if the
        condition (first input) is true."""
        condition = inputs[0]

        if condition:
            try:
                monitored = [numpy.asarray(inp) for inp in inputs[1:]]
            except Exception:
                # Was a bare `except:`; narrowed so KeyboardInterrupt and
                # SystemExit are not swallowed and converted to ValueError.
                raise ValueError("Some of the inputs to the PdbBreakpoint op "
                                 "'%s' could not be casted to NumPy arrays" %
                                 self.name)

            print("\n")
            print("-------------------------------------------------")
            print("Conditional breakpoint '%s' activated\n" % self.name)
            print("The monitored variables are stored, in order,")
            print("in the list variable 'monitored' as NumPy arrays.\n")
            print("Their contents can be altered and, when execution")
            print("resumes, the updated values will be used.")
            print("-------------------------------------------------")

            pdb.set_trace()

            # Take the new values in monitored, cast them back to their
            # original type and store them in the output_storage
            for i in range(len(output_storage)):
                output_storage[i][0] = self.inp_types[i].filter(monitored[i])
        else:
            # Simply return views on the monitored variables
            for i in range(len(output_storage)):
                output_storage[i][0] = inputs[i + 1]

    def grad(self, inputs, output_gradients):
        # The condition contributes nothing to any output, so its gradient
        # is disconnected; the monitored inputs pass gradients through
        # unchanged (identity).
        return ([DisconnectedType()()] + output_gradients)

    def infer_shape(self, inputs, input_shapes):
        # Return the shape of every input but the condition (first input)
        return input_shapes[1:]

    def connection_pattern(self, node):
        """Condition connects to no output; input n feeds output n-1."""
        nb_inp = len(node.inputs)
        nb_out = nb_inp - 1

        # First input is connected to no output and every other input n is
        # connected to output n-1
        connections = [[out_idx == inp_idx - 1 for out_idx in range(nb_out)]
                       for inp_idx in range(nb_inp)]
        return connections
import numpy
import theano
import theano.tensor as T
from theano.tests import unittest_tools as utt
from theano.tests.breakpoint import PdbBreakpoint
class TestPdbBreakpoint(utt.InferShapeTester):

    def setUp(self):
        """Build a small graph with a conditional breakpoint on it."""
        super(TestPdbBreakpoint, self).setUp()

        # Sample computation mixing tensors with different numbers of
        # dimensions.
        self.input1 = T.fmatrix()
        self.input2 = T.fscalar()
        diff = self.input1 - self.input2
        self.output = T.dot(diff, diff.transpose())

        # Breakpoint that fires when the summed output grows too large.
        self.breakpointOp = PdbBreakpoint("Sum of output too high")
        self.condition = T.gt(self.output.sum(), 1000)
        monitored = self.breakpointOp(self.condition, self.input1,
                                      self.input2, self.output)
        (self.monitored_input1,
         self.monitored_input2,
         self.monitored_output) = monitored

    def test_infer_shape(self):
        """The op's inferred shapes must match the monitored variables."""
        mat_value = numpy.arange(6).reshape(2, 3).astype("float32")
        self._compile_and_check([self.input1, self.input2],
                                [self.monitored_input1,
                                 self.monitored_input2,
                                 self.monitored_output],
                                [mat_value, 10.0],
                                PdbBreakpoint)

    def test_grad(self):
        """Gradients must flow through the breakpoint unchanged."""
        mat_value = numpy.arange(9).reshape(3, 3).astype("float32")
        scalar_value = 10.0

        grads = [T.grad(self.monitored_input1.sum(), self.input1),
                 T.grad(self.monitored_input2.sum(), self.input2)]

        # Also output monitored_input1 so Theano does not optimize the
        # PdbBreakpoint op out of the function graph.
        fct = theano.function([self.input1, self.input2],
                              grads + [self.monitored_input1])

        computed = fct(mat_value, scalar_value)[:-1]
        expected = [numpy.ones((3, 3), dtype="float32"),
                    numpy.array(1., dtype="float32")]
        for got, want in zip(computed, expected):
            numpy.testing.assert_allclose(got, want)

    def test_fprop(self):
        """Forward pass returns the monitored inputs unchanged."""
        mat_value = numpy.arange(9).reshape(3, 3).astype("float32")
        scalar_value = 10.0

        fct = theano.function([self.input1, self.input2],
                              [self.monitored_input1,
                               self.monitored_input2])
        result = fct(mat_value, scalar_value)
        numpy.testing.assert_allclose(result[0], mat_value)
        numpy.testing.assert_allclose(result[1], scalar_value)

    def test_connection_pattern(self):
        """Condition connects to nothing; input i feeds output i-1."""
        node = self.monitored_output.owner
        pattern = self.breakpointOp.connection_pattern(node)
        assert pattern == [[0, 0, 0], [1, 0, 0], [0, 1, 0], [0, 0, 1]]
Markdown formatting supported
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to post a comment