Commit 421b712f authored by nouiz

Merge pull request #604 from lamblin/debugmode_preallocated_output

Improved testing of preallocated outputs in DebugMode
......@@ -63,7 +63,10 @@ Reference
This mode catches several kinds of internal error:
- inconsistent c_code and perform implementations (see `BadCLinkerOutput`)
- inconsistent outputs when calling the same Op twice with the same
inputs, for instance if the c_code and perform implementations are
inconsistent, or in case of incorrect handling of output memory
(see `BadThunkOutput`)
- a variable replacing another when their runtime values don't match. This is a symptom of
an incorrect optimization step, or faulty Op implementation (raises `BadOptimization`)
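For instance, compiling a small function under this mode (as the tests in
this changeset do) runs all these checks on every Apply node::

    import theano
    import theano.tensor

    x = theano.tensor.dvector('x')
    # Each Op is executed and cross-checked while f runs.
    f = theano.function([x], 2 * x, mode='DEBUG_MODE')
    f([1.0, 2.0, 3.0])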
......@@ -144,11 +147,17 @@ The following are DebugMode exceptions you might encounter:
.. class:: BadCLinkerOutput(DebugModeError)
.. class:: BadThunkOutput(DebugModeError)
This exception means that the python (``perform``) and c (``c_code``)
implementations of an Op didn't compute the same thing as they were supposed to.
The problem might be a bug in either ``perform`` or ``c_code`` (or both).
This exception means that different calls to the same Op with the same
inputs did not compute the same thing as they were supposed to.
For instance, it can happen if the python (``perform``) and c (``c_code``)
implementations of the Op are inconsistent (the problem might be a bug in
either ``perform`` or ``c_code``, or both). It can also happen if
``perform`` or ``c_code`` does not correctly handle output memory that
has been preallocated (for instance, if it did not clear the memory before
accumulating into it, or if it assumed the memory layout was C-contiguous
even when it is not).
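As an illustration, here is a minimal sketch of a hypothetical ``perform``
(not part of this changeset) with the kind of bug this exception reports::

    import numpy

    def perform(self, node, inputs, output_storage):
        x, = inputs
        out = output_storage[0]
        if out[0] is None or out[0].shape != x.shape:
            out[0] = numpy.zeros(x.shape, dtype=x.dtype)
        # BUG: accumulates into the (possibly preallocated, non-zero)
        # buffer; it should overwrite it, e.g. ``out[0][...] = 2 * x``.
        out[0] += 2 * x

With a freshly-allocated (zeroed) buffer the result is correct, but when
DebugMode runs the thunk again with a preallocated buffer filled with a
default value, the two runs disagree and ``BadThunkOutput`` is raised.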
......
......@@ -385,6 +385,8 @@ import theano and print the config variable, as in:
A list of kinds of preallocated memory to use as output buffers for
each Op's computations, separated by ``:``. Implemented modes are:
* ``"initial"``: initial storage present in storage map
(for instance, it can happen in the inner function of Scan),
* ``"previous"``: reuse previously-returned memory,
* ``"c_contiguous"``: newly-allocated C-contiguous memory,
* ``"f_contiguous"``: newly-allocated Fortran-contiguous memory,
......@@ -394,6 +396,15 @@ import theano and print the config variable, as in:
In order not to test with preallocated memory, use an empty string, ``""``.
.. attribute:: config.DebugMode.check_preallocated_output_ndim
Positive int value, default: 4.
When testing with "strided" preallocated output memory, test
all combinations of strides over that number of (inner-most)
dimensions. You may want to reduce that number to reduce memory or
time usage, but it is advised to keep a minimum of 2.
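For example, to test only C-contiguous and strided buffers, with stride
combinations restricted to the two innermost dimensions (the same
config-based pattern the tests in this changeset use), one might set::

    from theano import config

    config.DebugMode.check_preallocated_output = 'c_contiguous:strided'
    config.DebugMode.check_preallocated_output_ndim = 2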
.. attribute:: config.DebugMode.warn_input_not_reused
Bool value, default: True
......
......@@ -16,7 +16,7 @@ from theano import gof
from theano.gof import Env, graph, utils, link, ops_with_inner_function
from theano.gof.link import raise_with_op
from theano.gof.cc import CLinker
from theano.gof.python25 import any, product as itertools_product
from theano.gof.python25 import all, any, product as itertools_product
from theano.configparser import (config, AddConfigVar, BoolParam, IntParam,
StrParam)
from theano.compile.function_module import (FunctionMaker,
......@@ -64,7 +64,7 @@ AddConfigVar('DebugMode.warn_input_not_reused',
def is_valid_check_preallocated_output_param(param):
if not isinstance(param, basestring):
return False
valid = ["previous", "c_contiguous", "f_contiguous",
valid = ["initial", "previous", "c_contiguous", "f_contiguous",
"strided", "wrong_size", "ALL", ""]
for p in param.split(":"):
if p not in valid:
......@@ -74,6 +74,7 @@ def is_valid_check_preallocated_output_param(param):
AddConfigVar('DebugMode.check_preallocated_output',
('Test thunks with pre-allocated memory as output storage. '
'This is a list of strings separated by ":". Valid values are: '
'"initial" (initial storage in storage map, happens with Scan),'
'"previous" (previously-returned memory), '
'"c_contiguous", "f_contiguous", '
'"strided" (positive and negative strides), '
......@@ -82,6 +83,15 @@ AddConfigVar('DebugMode.check_preallocated_output',
StrParam('', is_valid=is_valid_check_preallocated_output_param),
in_c_key=False)
AddConfigVar('DebugMode.check_preallocated_output_ndim',
('When testing with "strided" preallocated output memory, '
'test all combinations of strides over that number of '
'(inner-most) dimensions. You may want to reduce that number '
'to reduce memory or time usage, but it is advised to keep a '
'minimum of 2.'),
IntParam(4, lambda i: i > 0),
in_c_key=False)
import logging
_logger = logging.getLogger("theano.compile.debugmode")
_logger.setLevel(logging.WARNING)
......@@ -114,24 +124,35 @@ class DebugModeError(Exception):
pass
class BadCLinkerOutput(DebugModeError):
"""Exception: an Op's c_code and perform implementations don't agree."""
class BadThunkOutput(DebugModeError):
"""
Exception: Calling the same Op twice gives inconsistent outputs.
It can be raised, for instance, if an Op's c_code and perform methods
do not agree, or if one of these methods does not give the same result
when called twice with the same inputs (but different memory layouts
for the output).
"""
r = None
"""The `Variable` instance for which conflicting values were computed"""
val_py = None
"""The value computed by `r.owner.op.perform`"""
thunk1 = ''
val1 = None
"""The value computed by `thunk1`"""
val_c = None
"""The value computed by `r.owner.op.c_code`"""
thunk2 = ''
val2 = None
"""The value computed by `thunk2`"""
def __init__(self, r, val_py, val_c):
def __init__(self, r, thunk1, val1, thunk2, val2):
"""Initialize members"""
DebugModeError.__init__(self) # to be compatible with python2.4
self.r = r
self.val_py = val_py
self.val_c = val_c
self.thunk1 = thunk1
self.val1 = val1
self.thunk2 = thunk2
self.val2 = val2
def offending_op(self):
"""Return the Op class whose c_code and perform
......@@ -145,45 +166,47 @@ class BadCLinkerOutput(DebugModeError):
"""Return a pretty multiline string representating the cause
of the exception"""
sio = StringIO()
print >> sio, "BadCLinkerOutput"
print >> sio, " variable:", self.r
print >> sio, " Outputs Type :", self.r.type
print >> sio, " Inputs Type:", [i.type for i in self.r.owner.inputs]
print >> sio, "BadThunkOutput"
print >> sio, " variable :", self.r
print >> sio, " Outputs Type:", self.r.type
print >> sio, " Inputs Type :", [i.type for i in self.r.owner.inputs]
print >> sio, " Apply :", self.r.owner
print >> sio, " val_py :", self.val_py
print >> sio, " val_c :", self.val_c
print >> sio, " thunk1 :", self.thunk1
print >> sio, " thunk2 :", self.thunk2
print >> sio, " val1 :", self.val1
print >> sio, " val2 :", self.val2
print >> sio, " op :", self.offending_op()
try:
ssio = StringIO()
print >> ssio, " PyValue shape, dtype, strides, min, max, n_inf, n_nan:",
print >> ssio, self.val_py.shape,
print >> ssio, self.val_py.dtype,
print >> ssio, self.val_py.strides,
print >> ssio, self.val_py.min(),
print >> ssio, self.val_py.max(),
print >> ssio, numpy.isinf(self.val_py).sum(),
print >> ssio, numpy.isnan(self.val_py).sum(),
print >> ssio, " Value 1 : shape, dtype, strides, min, max, n_inf, n_nan:",
print >> ssio, self.val1.shape,
print >> ssio, self.val1.dtype,
print >> ssio, self.val1.strides,
print >> ssio, self.val1.min(),
print >> ssio, self.val1.max(),
print >> ssio, numpy.isinf(self.val1).sum(),
print >> ssio, numpy.isnan(self.val1).sum(),
# only if all succeed do we add anything to sio
print >> sio, ssio.getvalue()
except Exception:
pass
try:
ssio = StringIO()
print >> ssio, " CValue shape, dtype, strides, min, max, n_inf, n_nan:",
print >> ssio, self.val_c.shape,
print >> ssio, self.val_c.dtype,
print >> ssio, self.val_c.strides,
print >> ssio, self.val_c.min(),
print >> ssio, self.val_c.max(),
print >> ssio, numpy.isinf(self.val_c).sum(),
print >> ssio, numpy.isnan(self.val_c).sum(),
print >> ssio, " Value 2 : shape, dtype, strides, min, max, n_inf, n_nan:",
print >> ssio, self.val2.shape,
print >> ssio, self.val2.dtype,
print >> ssio, self.val2.strides,
print >> ssio, self.val2.min(),
print >> ssio, self.val2.max(),
print >> ssio, numpy.isinf(self.val2).sum(),
print >> ssio, numpy.isnan(self.val2).sum(),
# only if all succeed do we add anything to sio
print >> sio, ssio.getvalue()
except Exception:
pass
try:
ov = numpy.asarray(self.val_c)
nv = numpy.asarray(self.val_py)
ov = numpy.asarray(self.val1)
nv = numpy.asarray(self.val2)
ssio = StringIO()
absdiff = numpy.absolute(nv - ov)
print >> ssio, " Max Abs Diff: ", numpy.max(absdiff)
......@@ -670,18 +693,27 @@ def _optcheck_env(input_specs, output_specs, accept_inplace=False):
def _check_inputs(node, storage_map, r_vals, dr_vals, active_nodes,
clobber_dr_vals=True,
perform=None, warn_input_not_reused=True):
"""Raise BadDestroyMap if necessary, update dr_vals"""
"""
Raise BadDestroyMap if necessary, update dr_vals
Returns a list of output variables that actually worked inplace
(their value is aliased to the value of at least one input).
"""
destroyed_idx_list = []
destroy_map = getattr(node.op, 'destroy_map', {})
for o_pos, i_pos_list in destroy_map.iteritems():
destroyed_idx_list.extend(i_pos_list)
destroyed_res_list = [node.inputs[i] for i in destroyed_idx_list]
if warn_input_not_reused and destroyed_res_list:
dmap = getattr(node.op, 'destroy_map', {})
for oo, ii in dmap.iteritems():
out_var = storage_map[node.outputs[oo]][0]
in_var = storage_map[node.inputs[ii[0]]][0]
actually_inplace_outputs = []
dmap = getattr(node.op, 'destroy_map', {})
for oo, ii in dmap.iteritems():
out_var = storage_map[node.outputs[oo]][0]
in_var = storage_map[node.inputs[ii[0]]][0]
if _may_share_memory(out_var, in_var):
actually_inplace_outputs.append(node.outputs[oo])
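# Note: an Op may declare an output in its destroy_map and still
# return freshly-allocated memory; only outputs whose value really
# aliases an input are collected here, so that the preallocated-output
# tests can skip them later.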
if warn_input_not_reused and destroyed_res_list:
if isinstance(node.op, theano.compile.mode.OutputGuard):
# The point of OutputGuard is to be declared as destructive
# while not destroying anything
......@@ -691,11 +723,14 @@ def _check_inputs(node, storage_map, r_vals, dr_vals, active_nodes,
"as destroyed was not changed for node '%s'",
ii[0], str(node))
if warn_input_not_reused:
vmap = getattr(node.op, 'view_map', {})
for oo, ii in vmap.iteritems():
out_var = storage_map[node.outputs[oo]][0]
in_var = storage_map[node.inputs[ii[0]]][0]
vmap = getattr(node.op, 'view_map', {})
for oo, ii in vmap.iteritems():
out_var = storage_map[node.outputs[oo]][0]
in_var = storage_map[node.inputs[ii[0]]][0]
if _may_share_memory(out_var, in_var):
actually_inplace_outputs.append(node.outputs[oo])
if warn_input_not_reused:
# We don't try to optimize simple scalars and empty ndarrays,
# as this is not worth our time. This happens at least in
# Subtensor when the output is a scalar. But this depends on
......@@ -727,6 +762,8 @@ def _check_inputs(node, storage_map, r_vals, dr_vals, active_nodes,
raise BadDestroyMap(node, r_idx, r_vals[r],
storage_map[r][0], perform)
return actually_inplace_outputs
def _check_viewmap(node, storage_map):
"""
......@@ -994,7 +1031,8 @@ _find_bad_optimizations = _find_bad_optimizations0
def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
storage_map, r_vals, dr_vals, perform, active_order_set):
storage_map, r_vals, dr_vals, perform, active_order_set,
inplace_outs, init_outputs):
'''Preallocate outputs in different memory layouts'''
# To avoid circular imports
......@@ -1004,21 +1042,49 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
from theano.sandbox.cuda import CudaNdarray
from theano.sandbox.cuda import dimshuffle as cuda_dimshuffle
# TODO: Sparse, Scalar
# TODO: Sparse? Scalar does not really make sense.
# Do not preallocate memory for outputs that actually work inplace
considered_outputs = []
for r in node.outputs:
if r not in inplace_outs:
considered_outputs.append(r)
# Output storage that was initially present in the storage_map
if 'initial' in prealloc_modes or 'ALL' in prealloc_modes:
initial_outputs = {}
for r in considered_outputs:
if r in init_outputs:
initial_outputs[r] = init_outputs[r]
if initial_outputs:
yield ('initial', initial_outputs)
# reuse_output: use a copy of the same storage returned the first time
# TODO: optimization warning if the storage in reuse_outputs
# is not reused
if 'previous' in prealloc_modes or 'ALL' in prealloc_modes:
reuse_outputs = {}
for r in node.outputs:
for r in considered_outputs:
# We want to reuse the exact same memory buffer,
# so we keep the copy in r_vals
new_r = _lessbroken_deepcopy(r_vals[r])
reuse_outputs[r] = r_vals[r]
r_vals[r] = new_r
# Sometimes, outputs can be aliased together.
# I'm not sure why it is legitimate, but there are tests about it.
# So, we cannot fill r_vals[r] with def_val yet; we have to wait
# until all output values are deepcopied.
for r in considered_outputs:
# There is no risk of overwriting inputs, since r does not work
# inplace.
if isinstance(r.type, (TensorType, CudaNdarrayType)):
reuse_outputs[r][...] = numpy.asarray(
def_val).astype(r.type.dtype)
yield ('previous', reuse_outputs)
if reuse_outputs:
yield ('previous', reuse_outputs)
# clear memory that is not needed any more
del reuse_outputs
......@@ -1026,13 +1092,13 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
# (for TensorType and CudaNdarray, else None)
if 'c_contiguous' in prealloc_modes or 'ALL' in prealloc_modes:
c_cont_outputs = {}
for r in node.outputs:
for r in considered_outputs:
if isinstance(r.type, (TensorType, CudaNdarrayType)):
# Build a C-contiguous buffer
new_buf = r.type.value_zeros(r_vals[r].shape)
# CudaNdarrays don't have a flags field
# assert new_buf.flags["C_CONTIGUOUS"]
new_buf += numpy.asarray(def_val).astype(r.type.dtype)
new_buf[...] = numpy.asarray(def_val).astype(r.type.dtype)
c_cont_outputs[r] = new_buf
......@@ -1044,13 +1110,13 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
# (for TensorType, only)
if 'f_contiguous' in prealloc_modes or 'ALL' in prealloc_modes:
f_cont_outputs = {}
for r in node.outputs:
for r in considered_outputs:
if isinstance(r.type, (TensorType, CudaNdarrayType)):
new_buf = numpy.zeros(
shape=r_vals[r].shape,
dtype=r_vals[r].dtype,
order='F')
new_buf += def_val
new_buf[...] = def_val
if isinstance(r.type, CudaNdarrayType):
# When the CudaNdarray is built, the underlying memory
# is c-contiguous, so we transpose it before and after.
......@@ -1067,34 +1133,79 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
# We assume that the different outputs of the same Op will behave
# independently, and there is no need to test over all combinations
# of outputs (the time taken is prohibitive).
# When all outputs on a certain dimension are broadcastable, the Op
# can assume that the shape is 1 on that dimension, and stride testing
# is less relevant.
# Dimensions should be aligned on the innermost index, so we iterate
# from the end of the shapes.
max_ndim = 0
for r in node.outputs:
rev_out_broadcastable = []
for r in considered_outputs:
if isinstance(r.type, (TensorType, CudaNdarrayType)):
max_ndim = max(max_ndim, r.ndim)
if max_ndim < r.ndim:
rev_out_broadcastable += [True] * (r.ndim - max_ndim)
max_ndim = r.ndim
assert len(rev_out_broadcastable) == max_ndim
for i, b in enumerate(r.broadcastable[::-1]):
rev_out_broadcastable[i] = rev_out_broadcastable[i] and b
out_broadcastable = rev_out_broadcastable[::-1]
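# Illustrative example: with one output broadcastable as (True, False)
# (a row) and another as (False, True) (a column), as with the
# VecAsRowAndCol Op in the tests, out_broadcastable ends up
# [False, False]: a dimension stays broadcastable only if it is
# broadcastable in every considered output.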
if 'strided' in prealloc_modes or 'ALL' in prealloc_modes:
check_ndim = config.DebugMode.check_preallocated_output_ndim
# Initial allocation
init_strided = {}
for r in node.outputs:
for r in considered_outputs:
if isinstance(r.type, (TensorType, CudaNdarrayType)):
# Create a buffer twice as large in every dimension
new_buf = r.type.value_zeros(
[(s * 2) for s in r_vals[r].shape])
# Create a buffer twice as large in every dimension,
# except if broadcastable, or for dimensions above
# config.DebugMode.check_preallocated_output_ndim
buf_shape = []
for s, b in zip(r_vals[r].shape, r.broadcastable):
if b or ((r.ndim - len(buf_shape)) > check_ndim):
buf_shape.append(s)
else:
buf_shape.append(s * 2)
new_buf = r.type.value_zeros(buf_shape)
new_buf[...] = numpy.asarray(def_val).astype(r.type.dtype)
init_strided[r] = new_buf
for step_signs in itertools_product((-1, 1), repeat=max_ndim):
# The number of combinations is exponential in the number of
# dimensions, and some ops can have tens of outputs. To prevent
# tests from lasting days, we use the same strides for all
# dimensions but the last check_ndim ones.
# Moreover, to avoid memory problems, we do not test with strides
# 2 and -2 on those dimensions.
step_signs_list = []
for b in out_broadcastable[-check_ndim:]:
if b:
step_signs_list.append((1,))
else:
step_signs_list.append((-1, 1))
# Use the same step on all dimensions before the last check_ndim.
if all(out_broadcastable[:-check_ndim]):
step_signs_list = [(1,)] + step_signs_list
else:
step_signs_list = [(-1, 1)] + step_signs_list
for step_signs in itertools_product(*step_signs_list):
for step_size in (1, 2):
strided = {}
steps = [s * step_size for s in step_signs]
# First, the dimensions above check_ndim, then the other ones
# Do not test with 2 or -2 for dimensions above check_ndim
steps = [step_signs[0]] * len(out_broadcastable[:-check_ndim])
steps += [s * step_size for s in step_signs[1:]]
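# Illustrative example: with max_ndim == 6 and check_ndim == 4,
# step_signs == (1, -1, 1, 1, -1) and step_size == 2 give
# steps == [1, 1, -2, 2, 2, -2]: one shared sign for the two
# outermost dimensions, full sign/size combinations for the four
# innermost ones.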
name = 'strided%s' % str(tuple(steps))
for r in node.outputs:
for r in considered_outputs:
if r in init_strided:
# Build lists of slices, for strides and shapes
strides = []
shapes = []
for i, size in enumerate(r_vals[r].shape):
strides.append(slice(None, None, steps[i]))
shapes.append(slice(None, size, None))
strides.append(slice(None, None, steps[i]))
r_buf = init_strided[r]
......@@ -1103,15 +1214,19 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
assert r_buf.shape == r_vals[r].shape
r_buf[...] = numpy.asarray(def_val).astype(r_buf.dtype)
strided[r] = r_buf
yield (name, strided)
if strided:
yield (name, strided)
del strided
if 'wrong_size' in prealloc_modes or 'ALL' in prealloc_modes:
# For each dimension, try size-1, size, size+1
for dim in xrange(max_ndim):
for dim, b in enumerate(out_broadcastable):
if b:
# The shape has to be 1
continue
shape_diff = [0] * max_ndim
for diff in (-1, 1):
shape_diff[dim] = diff
......@@ -1119,22 +1234,25 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
wrong_size = {}
name = 'wrong_size%s' % str(tuple(shape_diff))
for r in node.outputs:
for r in considered_outputs:
if isinstance(r.type, (TensorType, CudaNdarrayType)):
r_shape_diff = shape_diff[:r.ndim]
out_shape = [max((s + sd), 0)
for s, sd in zip(r_vals[r].shape,
r_shape_diff)]
new_buf = r.type.value_zeros(r_vals[r].shape)
new_buf += numpy.asarray(def_val).astype(r.type.dtype)
new_buf = r.type.value_zeros(out_shape)
new_buf[...] = numpy.asarray(
def_val).astype(r.type.dtype)
wrong_size[r] = new_buf
yield (name, wrong_size)
if wrong_size:
yield (name, wrong_size)
del wrong_size
def _check_preallocated_output(node, thunk, prealloc_modes, def_val,
storage_map, r_vals, dr_vals, perform, active_order_set):
storage_map, r_vals, dr_vals, perform, active_order_set,
inplace_outs, init_outputs):
'''Try to apply thunk() on different output storages'''
# If node has an inner compiled Theano function with mode DebugMode,
......@@ -1163,17 +1281,33 @@ def _check_preallocated_output(node, thunk, prealloc_modes, def_val,
changed_inner_mode = True
_logger.info('changing inner mode')
# Set of inputs that are marked as destroyed or viewed
aliased_inputs = set()
dmap = getattr(node.op, 'destroy_map', {})
vmap = getattr(node.op, 'view_map', {})
for i, r in enumerate(node.inputs):
if any(i in v for v in (dmap.values() + vmap.values())):
aliased_inputs.add(r)
_logger.debug('starting preallocated output checking')
for (name, out_map) in _get_preallocated_maps(
node, thunk, prealloc_modes, def_val, storage_map, r_vals,
dr_vals, perform, active_order_set):
dr_vals, perform, active_order_set, inplace_outs,
init_outputs):
_logger.debug(' name = %s', name)
# Copy the inputs over, if they were marked as destroyed
dmap = getattr(node.op, 'destroy_map', {})
for i, r in enumerate(node.inputs):
if any(i in v for v in dmap.values()):
storage_map[r][0] = _lessbroken_deepcopy(r_vals[r])
thunk_name = '%s with %s output' % (perform, name)
if not out_map:
# Map is empty, there is no need to execute thunk() again
_logger.warn('%s: out_map is empty', name)
continue
# Copy the inputs over, if they were marked as destroyed or viewed
# (we will overwrite the output at some point, which can in turn
# destroy the input it aliases)
for r in aliased_inputs:
storage_map[r][0] = _lessbroken_deepcopy(r_vals[r])
# Get the appropriate output storages
# (no copy)
......@@ -1186,13 +1320,13 @@ def _check_preallocated_output(node, thunk, prealloc_modes, def_val,
for r in node.outputs:
if not r.type.is_valid_value(storage_map[r][0]):
raise InvalidValueError(r, storage_map[r][0],
hint='%s with %s output' % (perform, name),
hint=thunk_name,
specific_hint=r.type.value_validity_msg(
storage_map[r][0]))
_check_inputs(node, storage_map, r_vals, dr_vals, active_order_set,
clobber_dr_vals=False,
perform='%s with output %s' % (perform, name),
perform=thunk_name,
warn_input_not_reused=False)
_check_viewmap(node, storage_map)
......@@ -1200,8 +1334,9 @@ def _check_preallocated_output(node, thunk, prealloc_modes, def_val,
for r in node.outputs:
if not r.type.values_eq_approx(r_vals[r], storage_map[r][0]):
# TODO: indicate it is not a C/Py problem
raise BadCLinkerOutput(r, val_py=r_vals[r],
val_c=storage_map[r][0])
raise BadThunkOutput(r,
thunk1='Reference value', val1=r_vals[r],
thunk2=thunk_name, val2=storage_map[r][0])
# Clear storage_map
for r in node.outputs:
......@@ -1617,11 +1752,14 @@ class _Linker(gof.link.LocalLinker):
storage_map[r][0] = None
r_vals_initialized.append(r)
# TODO: store them in another map, and test the thunks on
# store preallocated outputs in another map, and test the thunks on
# them as output storages.
init_outputs = {}
for r in storage_map:
if r in env.outputs:
storage_map[r][0] = None
if storage_map[r][0] is not None:
init_outputs[r] = storage_map[r][0]
storage_map[r][0] = None
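# For instance, the inner function of Scan can receive output storage
# that is already allocated; keeping it in init_outputs lets the
# 'initial' preallocation mode test that case.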
#####
# Precondition: the storage map is empty, transferred
......@@ -1673,11 +1811,11 @@ class _Linker(gof.link.LocalLinker):
raise InvalidValueError(r, storage_map[r][0],
hint='perform output',
specific_hint=hint2)
_check_inputs(node, storage_map, r_vals, dr_vals,
active_order_set,
clobber_dr_vals=True, perform='py',
warn_input_not_reused=config.DebugMode.warn_input_not_reused)
py_inplace_outs = _check_inputs(
node, storage_map, r_vals, dr_vals,
active_order_set,
clobber_dr_vals=True, perform='py',
warn_input_not_reused=config.DebugMode.warn_input_not_reused)
_check_viewmap(node, storage_map)
......@@ -1705,7 +1843,9 @@ class _Linker(gof.link.LocalLinker):
r_vals=r_vals,
dr_vals=dr_vals,
perform='py',
active_order_set=active_order_set)
active_order_set=active_order_set,
inplace_outs=py_inplace_outs,
init_outputs=init_outputs)
# print >> sys.stderr, i, "DEBUGMODE thunk_py %100s %50s %30s" % (node,
#[(id(o), numpy.asarray(storage_map[o][0])[0,0]) for o in node.inputs],
......@@ -1717,6 +1857,7 @@ class _Linker(gof.link.LocalLinker):
clobber = True
if thunk_py:
dmap = getattr(node.op, 'destroy_map', {})
vmap = getattr(node.op, 'view_map', {})
for i, r in enumerate(node.inputs):
# if thunk_py ran, and we still got this far,
# it means that the destroy_map of the Op (and view_map) are
......@@ -1725,7 +1866,10 @@ class _Linker(gof.link.LocalLinker):
# fact not been destroyed.
# Therefore... we only need to overwrite inputs that *have*
# been marked as destroyed.
if any(i in v for v in dmap.values()):
# Inputs marked as viewed are unsafe too,
# because the corresponding output can
# be destroyed.
if any(i in v for v in (dmap.values() + vmap.values())):
storage_map[r][0] = _lessbroken_deepcopy(r_vals[r])
clobber = False
......@@ -1750,10 +1894,11 @@ class _Linker(gof.link.LocalLinker):
self.maker.mode.require_matching_strides,
node.op)
_check_inputs(node, storage_map, r_vals,
dr_vals, active_order_set,
clobber_dr_vals=clobber, perform='c',
warn_input_not_reused=config.DebugMode.warn_input_not_reused)
c_inplace_outs = _check_inputs(
node, storage_map, r_vals,
dr_vals, active_order_set,
clobber_dr_vals=clobber, perform='c',
warn_input_not_reused=config.DebugMode.warn_input_not_reused)
_check_viewmap(node, storage_map)
......@@ -1766,7 +1911,9 @@ class _Linker(gof.link.LocalLinker):
if not r.type.values_eq_approx(r_vals[r], storage_map[r][0]):
#import pdb; pdb.set_trace()
#r.type.values_eq_approx(r_vals[r], storage_map[r][0])
raise BadCLinkerOutput(r, val_py=r_vals[r], val_c=storage_map[r][0])
raise BadThunkOutput(r,
thunk1='perform', val1=r_vals[r],
thunk2='c_code', val2=storage_map[r][0])
else:
#print >> sys.stderr, i, "DEBUGMODE storing reference output %x" % id(storage_map[r][0])
#retrieve each output from the storage_map
......@@ -1793,7 +1940,9 @@ class _Linker(gof.link.LocalLinker):
r_vals=r_vals,
dr_vals=dr_vals,
perform='c code',
active_order_set=active_order_set)
active_order_set=active_order_set,
inplace_outs=c_inplace_outs,
init_outputs=init_outputs)
# print >> sys.stderr, i, "DEBUGMODE thunk_c %100s %50s %30s" % (node,
#[(id(o), numpy.asarray(storage_map[o][0])[0,0]) for o in node.inputs],
......@@ -2176,7 +2325,10 @@ class DebugMode(Mode):
This mode catches several kinds of internal error:
- inconsistent c_code and perform implementations (see `BadCLinkerOutput`)
- inconsistent outputs when calling the same Op twice with the same
inputs, for instance if the c_code and perform implementations are
inconsistent, or in case of incorrect handling of output memory
(see `BadThunkOutput`),
- a variable replacing another when their runtime values don't
match. This is a symptom of an incorrect optimization step, or
......
from nose.plugins.skip import SkipTest
import unittest
import numpy
from theano import config
......@@ -7,7 +10,6 @@ import theano.tensor
from theano.compile import debugmode
import theano.compile
from theano.tests import unittest_tools as utt
import unittest
def test0():
......@@ -194,7 +196,7 @@ wb1i = WeirdBrokenOp('times1_inplace')
wb1 = WeirdBrokenOp('times1')
def test_badclinkeroutput():
def test_badthunkoutput():
a = theano.tensor.dvector()
b = theano.tensor.dvector()
......@@ -210,7 +212,7 @@ def test_badclinkeroutput():
f_good([1.0, 2.0, 3.0], [2, 3, 4])
try:
f_inconsistent([1.0, 2.0, 3.0], [2, 3, 4])
except debugmode.BadCLinkerOutput, e:
except debugmode.BadThunkOutput, e:
#print repr(e)
assert e.r.owner.op is inconsistent
return # TEST PASS
......@@ -651,7 +653,48 @@ class BrokenCImplementationAdd(gof.Op):
""" % dict(locals(), **sub)
class VecAsRowAndCol(gof.Op):
"""
Transforms a vector into a row and a column.
This Op exists to check everything is correct when an Op has
two outputs with different broadcasting patterns.
"""
def __eq__(self, other):
return type(self) == type(other)
def __hash__(self):
return hash(type(self))
def make_node(self, v):
if not isinstance(v, gof.Variable):
v = theano.tensor.as_tensor_variable(v)
assert v.type.ndim == 1
type_class = type(v.type)
out_r_type = type_class(dtype=v.dtype, broadcastable=(True, False))
out_c_type = type_class(dtype=v.dtype, broadcastable=(False, True))
return gof.Apply(self, [v], [out_r_type(), out_c_type()])
def perform(self, node, inp, out):
v, = inp
r, c = out
lv = v.shape[0]
if (r[0] is None) or (r[0].shape != (1, lv)):
r[0] = node.outputs[0].type.value_zeros((1, lv))
if (c[0] is None) or (c[0].shape != (lv, 1)):
c[0] = node.outputs[1].type.value_zeros((lv, 1))
# Python loop because CudaNdarrays do not support newaxis
for i in range(lv):
r[0][0, i] = v[i]
c[0][i, 0] = v[i]
class Test_preallocated_output(unittest.TestCase):
def setUp(self):
self.rng = numpy.random.RandomState(seed=utt.fetch_seed())
def test_f_contiguous(self):
a = theano.tensor.fmatrix('a')
......@@ -660,30 +703,42 @@ class Test_preallocated_output(unittest.TestCase):
# Needed so that z is not the output of the graph
out = theano.tensor.dot(z, numpy.eye(7))
rng = numpy.random.RandomState(seed=utt.fetch_seed())
a_val = rng.randn(7, 7).astype('float32')
b_val = rng.randn(7, 7).astype('float32')
a_val = self.rng.randn(7, 7).astype('float32')
b_val = self.rng.randn(7, 7).astype('float32')
init_conf_val = config.DebugMode.check_preallocated_output
try:
# Should work
config.DebugMode.check_preallocated_output = 'c_contiguous'
f = theano.function([a, b], out, mode='DEBUG_MODE')
out_val = f(a_val, b_val)
#print 'out_val =', out_val
#print out_val.strides
# Should work for now (0.4.0), because the C thunk does not care
# at all about what is in storage_map initially.
# When it changes, the call to f should raise an Exception,
# since the output buffer is used incorrectly.
config.DebugMode.check_preallocated_output = 'f_contiguous'
f = theano.function([a, b], out, mode='DEBUG_MODE')
out_val = f(a_val, b_val)
#print 'out_val =', out_val
#print out_val.strides
finally:
config.DebugMode.check_preallocated_output = init_conf_val
# Should work
mode = debugmode.DebugMode(
check_preallocated_output=['c_contiguous'])
f = theano.function([a, b], out, mode=mode)
out_val = f(a_val, b_val)
#print 'out_val =', out_val
#print out_val.strides
# Should raise an Exception, since the output buffer is
# used incorrectly.
mode = debugmode.DebugMode(
check_preallocated_output=['f_contiguous'])
f = theano.function([a, b], out, mode=mode)
self.assertRaises(debugmode.BadThunkOutput, f, a_val, b_val)
def test_output_broadcast_tensor(self):
v = theano.tensor.fvector('v')
c, r = VecAsRowAndCol()(v)
f = theano.function([v], [c, r])
v_val = self.rng.randn(5).astype('float32')
f(v_val)
def test_output_broadcast_cuda(self):
from theano.sandbox import cuda
if not cuda.cuda_available:
raise SkipTest("Optional package Cuda disabled")
v = cuda.fvector('v')
c, r = VecAsRowAndCol()(v)
f = theano.function([v], [c, r])
v_val = cuda.CudaNdarray(self.rng.randn(5).astype('float32'))
f(v_val)