Commit be9316f0 authored by Pascal Lamblin

Do not test preallocated output for inplace outs

Also, do not return a preallocated map if it is empty, and fill the "previous" storage with the default value.
Parent 4c2d9e04
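Note on the second point of the message: the patch both stops yielding empty preallocation maps and guards against them on the consumer side. A minimal sketch of that pattern, using a hypothetical `preallocated_maps` stand-in rather than the real `_get_preallocated_maps`:

```python
import numpy

def preallocated_maps(output_shapes):
    # Hypothetical stand-in: yield (mode_name, {output: buffer}) pairs,
    # skipping empty maps, as the patch now does for 'previous',
    # 'strided' and 'wrong_size'.
    buffers = dict((name, numpy.zeros(shape))
                   for name, shape in output_shapes.items())
    if buffers:
        yield ('previous', buffers)

# Consumer side: guard again anyway, as _check_preallocated_output does.
for name, out_map in preallocated_maps({'y': (2, 3)}):
    if not out_map:
        continue  # empty map: no point re-running the thunk
    print(name, sorted(out_map))
```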
@@ -670,18 +670,27 @@ def _optcheck_env(input_specs, output_specs, accept_inplace=False):
 def _check_inputs(node, storage_map, r_vals, dr_vals, active_nodes,
                   clobber_dr_vals=True,
                   perform=None, warn_input_not_reused=True):
-    """Raise BadDestroyMap if necessary, update dr_vals"""
+    """
+    Raise BadDestroyMap if necessary, update dr_vals
+
+    Returns a list of output variables that actually worked inplace
+    (their value is aliased to the value of at least one input).
+    """
     destroyed_idx_list = []
     destroy_map = getattr(node.op, 'destroy_map', {})
     for o_pos, i_pos_list in destroy_map.iteritems():
         destroyed_idx_list.extend(i_pos_list)
     destroyed_res_list = [node.inputs[i] for i in destroyed_idx_list]
 
-    if warn_input_not_reused and destroyed_res_list:
-        dmap = getattr(node.op, 'destroy_map', {})
-        for oo, ii in dmap.iteritems():
-            out_var = storage_map[node.outputs[oo]][0]
-            in_var = storage_map[node.inputs[ii[0]]][0]
+    actually_inplace_outputs = []
+    dmap = getattr(node.op, 'destroy_map', {})
+    for oo, ii in dmap.iteritems():
+        out_var = storage_map[node.outputs[oo]][0]
+        in_var = storage_map[node.inputs[ii[0]]][0]
+        if _may_share_memory(out_var, in_var):
+            actually_inplace_outputs.append(node.outputs[oo])
+
+        if warn_input_not_reused and destroyed_res_list:
             if isinstance(node.op, theano.compile.mode.OutputGuard):
                 # The point of OutputGuard is to be declared as destructive
                 # while not destroying anything
@@ -691,11 +700,14 @@ def _check_inputs(node, storage_map, r_vals, dr_vals, active_nodes,
                     "as destroyed was not changed for node '%s'",
                     ii[0], str(node))
 
-    if warn_input_not_reused:
-        vmap = getattr(node.op, 'view_map', {})
-        for oo, ii in vmap.iteritems():
-            out_var = storage_map[node.outputs[oo]][0]
-            in_var = storage_map[node.inputs[ii[0]]][0]
+    vmap = getattr(node.op, 'view_map', {})
+    for oo, ii in vmap.iteritems():
+        out_var = storage_map[node.outputs[oo]][0]
+        in_var = storage_map[node.inputs[ii[0]]][0]
+        if _may_share_memory(out_var, in_var):
+            actually_inplace_outputs.append(node.outputs[oo])
+
+        if warn_input_not_reused:
             # We don't try to optimize simple scalar and empty ndarray,
             # as this is not worth our time. This happen at least in
             # Subtensor when the output is a scalar But this depend on
@@ -727,6 +739,8 @@ def _check_inputs(node, storage_map, r_vals, dr_vals, active_nodes,
                 raise BadDestroyMap(node, r_idx, r_vals[r],
                                     storage_map[r][0], perform)
 
+    return actually_inplace_outputs
+
 
 def _check_viewmap(node, storage_map):
     """
@@ -994,7 +1008,8 @@ _find_bad_optimizations = _find_bad_optimizations0
 def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
-        storage_map, r_vals, dr_vals, perform, active_order_set):
+        storage_map, r_vals, dr_vals, perform, active_order_set,
+        inplace_outs):
     '''Preallocate outputs in different memory layouts'''
 
     # To avoid circular imports
@@ -1006,20 +1021,37 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
     # TODO: Sparse? Scalar does not really make sense.
 
+    # Do not preallocate memory for outputs that actually work inplace
+    considered_outputs = []
+    for r in node.outputs:
+        if r not in inplace_outs:
+            considered_outputs.append(r)
+
     # reuse_output: use a copy of the same storage returned the first time
     # TODO: optimization warning if the storage in reuse_outputs
     # is not reused
-    # TODO: skip all this for outputs that actually worked inplace
     if 'previous' in prealloc_modes or 'ALL' in prealloc_modes:
         reuse_outputs = {}
-        for r in node.outputs:
+        for r in considered_outputs:
             # We want to reuse the exact same memory buffer,
             # so we keep the copy in r_vals
             new_r = _lessbroken_deepcopy(r_vals[r])
             reuse_outputs[r] = r_vals[r]
             r_vals[r] = new_r
 
-        yield ('previous', reuse_outputs)
+        # Sometimes, outputs can be aliased together.
+        # I'm not sure why it is legitimate, but there are tests about it.
+        # So, we cannot fill r_vals[r] with def_val yet, we have to wait
+        # until all output values are deepcopied.
+        for r in considered_outputs:
+            # There is no risk to overwrite inputs, since r does not work
+            # inplace.
+            if isinstance(r.type, (TensorType, CudaNdarrayType)):
+                reuse_outputs[r][...] = numpy.asarray(
+                        def_val).astype(r.type.dtype)
+
+        if reuse_outputs:
+            yield ('previous', reuse_outputs)
         # clear memory that is not needed any more
         del reuse_outputs
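The deferred fill above matters because two outputs may alias the same buffer: writing `def_val` into the kept copy before every output value has been deepcopied would corrupt the other output's reference value. The fill itself, sketched with a hypothetical sentinel value:

```python
import numpy

def_val = -1.5                         # hypothetical sentinel
buf = numpy.arange(6.0).reshape(2, 3)  # stands in for reuse_outputs[r]

# Broadcast the sentinel over the whole buffer, cast to its dtype,
# mirroring reuse_outputs[r][...] = numpy.asarray(def_val).astype(...)
buf[...] = numpy.asarray(def_val).astype(buf.dtype)
assert (buf == -1.5).all()
```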
@@ -1027,7 +1059,7 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
     # (for TensorType and CudaNdarray, else None)
     if 'c_contiguous' in prealloc_modes or 'ALL' in prealloc_modes:
         c_cont_outputs = {}
-        for r in node.outputs:
+        for r in considered_outputs:
             if isinstance(r.type, (TensorType, CudaNdarrayType)):
                 # Build a C-contiguous buffer
                 new_buf = r.type.value_zeros(r_vals[r].shape)
@@ -1045,7 +1077,7 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
     # (for TensorType, only)
     if 'f_contiguous' in prealloc_modes or 'ALL' in prealloc_modes:
         f_cont_outputs = {}
-        for r in node.outputs:
+        for r in considered_outputs:
             if isinstance(r.type, (TensorType, CudaNdarrayType)):
                 new_buf = numpy.zeros(
                     shape=r_vals[r].shape,
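For reference, the two memory layouts these modes exercise can be produced directly in NumPy; a sketch (not the `value_zeros` code path used above):

```python
import numpy

shape = (3, 4)
c_buf = numpy.zeros(shape)             # row-major, C-contiguous
f_buf = numpy.zeros(shape, order='F')  # column-major, Fortran-contiguous

assert c_buf.flags['C_CONTIGUOUS'] and not c_buf.flags['F_CONTIGUOUS']
assert f_buf.flags['F_CONTIGUOUS'] and not f_buf.flags['C_CONTIGUOUS']
```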
@@ -1089,7 +1121,7 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
     if 'strided' in prealloc_modes or 'ALL' in prealloc_modes:
         # Initial allocation
         init_strided = {}
-        for r in node.outputs:
+        for r in considered_outputs:
             if isinstance(r.type, (TensorType, CudaNdarrayType)):
                 # Create a buffer twice as large in every dimension,
                 # except if broadcastable, or for dimensions above 4
@@ -1149,7 +1181,8 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
                     r_buf[...] = numpy.asarray(def_val).astype(r_buf.dtype)
                 strided[r] = r_buf
 
-            yield (name, strided)
+            if strided:
+                yield (name, strided)
             del strided
 
     if 'wrong_size' in prealloc_modes or 'ALL' in prealloc_modes:
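The 'strided' mode's allocate-double-then-slice trick, simplified to a fixed step of 2 (a sketch of the surrounding code's approach, not an excerpt):

```python
import numpy

shape = (3, 4)
# Allocate a buffer twice as large in every dimension...
big = numpy.zeros([2 * s for s in shape])
# ...then keep every second element: same shape, non-contiguous strides.
r_buf = big[tuple(slice(None, None, 2) for _ in shape)]

assert r_buf.shape == shape
assert not r_buf.flags['C_CONTIGUOUS']
r_buf[...] = numpy.asarray(-1.0).astype(r_buf.dtype)  # fill with def_val
```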
@@ -1166,7 +1199,7 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
         wrong_size = {}
         name = 'wrong_size%s' % str(tuple(shape_diff))
 
-        for r in node.outputs:
+        for r in considered_outputs:
             if isinstance(r.type, (TensorType, CudaNdarrayType)):
                 r_shape_diff = shape_diff[:r.ndim]
                 out_shape = [max((s + sd), 0)
@@ -1177,12 +1210,14 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
                     def_val).astype(r.type.dtype)
                 wrong_size[r] = new_buf
 
-        yield (name, wrong_size)
+        if wrong_size:
+            yield (name, wrong_size)
         del wrong_size
 
 
 def _check_preallocated_output(node, thunk, prealloc_modes, def_val,
-        storage_map, r_vals, dr_vals, perform, active_order_set):
+        storage_map, r_vals, dr_vals, perform, active_order_set,
+        inplace_outs):
     '''Try to apply thunk() on different output storages'''
 
     # If node has an inner compiled Theano function with mode DebugMode,
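The 'wrong_size' mode perturbs each output's shape by a per-dimension delta, clamping at zero; the arithmetic, with a hypothetical `shape_diff`:

```python
shape_diff = [1, -1, 0, 0]   # hypothetical per-dimension deltas
r_shape = (5, 1, 3)          # an output's reference shape (ndim == 3)

r_shape_diff = shape_diff[:len(r_shape)]
out_shape = [max(s + sd, 0) for s, sd in zip(r_shape, r_shape_diff)]
assert out_shape == [6, 0, 3]  # one dim grown, one clamped to 0, one kept
```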
@@ -1211,20 +1246,30 @@ def _check_preallocated_output(node, thunk, prealloc_modes, def_val,
             changed_inner_mode = True
             _logger.info('changing inner mode')
 
+    # Set of inputs that are marked as destroyed or viewed
+    aliased_inputs = set()
+    dmap = getattr(node.op, 'destroy_map', {})
+    vmap = getattr(node.op, 'view_map', {})
+    for i, r in enumerate(node.inputs):
+        if any(i in v for v in (dmap.values() + vmap.values())):
+            aliased_inputs.add(r)
+
     _logger.debug('starting preallocated output checking')
     for (name, out_map) in _get_preallocated_maps(
             node, thunk, prealloc_modes, def_val, storage_map, r_vals,
-            dr_vals, perform, active_order_set):
+            dr_vals, perform, active_order_set, inplace_outs):
         _logger.debug(' name = %s', name)
 
+        if not out_map:
+            # Map is empty, there is no need to execute thunk() again
+            _logger.warn('%s: out_map is empty', name)
+            continue
+
         # Copy the inputs over, if they were marked as destroyed or viewed
         # (we will destroy the output at some point so it can destroy
         # the input)
-        dmap = getattr(node.op, 'destroy_map', {})
-        vmap = getattr(node.op, 'view_map', {})
-        for i, r in enumerate(node.inputs):
-            if any(i in v for v in (dmap.values() + vmap.values())):
-                storage_map[r][0] = _lessbroken_deepcopy(r_vals[r])
+        for r in aliased_inputs:
+            storage_map[r][0] = _lessbroken_deepcopy(r_vals[r])
 
         # Get the appropriate output storages
         # (no copy)
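Hoisting `aliased_inputs` out of the loop avoids recomputing it for every preallocation mode. The union it builds over `destroy_map` and `view_map` can be sketched on toy dicts (a hypothetical Op's maps, not a real one):

```python
# Output index -> list of input indices the output destroys or views.
dmap = {0: [0]}   # output 0 destroys input 0
vmap = {1: [2]}   # output 1 is a view of input 2
inputs = ['a', 'b', 'c']

aliased_inputs = set()
for i, r in enumerate(inputs):
    if any(i in v for v in (list(dmap.values()) + list(vmap.values()))):
        aliased_inputs.add(r)

assert aliased_inputs == set(['a', 'c'])
```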
@@ -1724,11 +1769,11 @@ class _Linker(gof.link.LocalLinker):
                         raise InvalidValueError(r, storage_map[r][0],
                                                 hint='perform output',
                                                 specific_hint=hint2)
-                _check_inputs(node, storage_map, r_vals, dr_vals,
-                              active_order_set,
-                              clobber_dr_vals=True, perform='py',
-                              warn_input_not_reused=config.DebugMode.warn_input_not_reused)
+                py_inplace_outs = _check_inputs(
+                        node, storage_map, r_vals, dr_vals,
+                        active_order_set,
+                        clobber_dr_vals=True, perform='py',
+                        warn_input_not_reused=config.DebugMode.warn_input_not_reused)
 
                 _check_viewmap(node, storage_map)
@@ -1756,7 +1801,8 @@ class _Linker(gof.link.LocalLinker):
                             r_vals=r_vals,
                             dr_vals=dr_vals,
                             perform='py',
-                            active_order_set=active_order_set)
+                            active_order_set=active_order_set,
+                            inplace_outs=py_inplace_outs)
 
 # print >> sys.stderr, i, "DEBUGMODE thunk_py %100s %50s %30s" % (node,
 #[(id(o), numpy.asarray(storage_map[o][0])[0,0]) for o in node.inputs],
@@ -1805,10 +1851,11 @@ class _Linker(gof.link.LocalLinker):
                                 self.maker.mode.require_matching_strides,
                                 node.op)
 
-                    _check_inputs(node, storage_map, r_vals,
-                                  dr_vals, active_order_set,
-                                  clobber_dr_vals=clobber, perform='c',
-                                  warn_input_not_reused=config.DebugMode.warn_input_not_reused)
+                    c_inplace_outs = _check_inputs(
+                            node, storage_map, r_vals,
+                            dr_vals, active_order_set,
+                            clobber_dr_vals=clobber, perform='c',
+                            warn_input_not_reused=config.DebugMode.warn_input_not_reused)
 
                     _check_viewmap(node, storage_map)
@@ -1848,7 +1895,8 @@ class _Linker(gof.link.LocalLinker):
                                 r_vals=r_vals,
                                 dr_vals=dr_vals,
                                 perform='c code',
-                                active_order_set=active_order_set)
+                                active_order_set=active_order_set,
+                                inplace_outs=c_inplace_outs)
 
 # print >> sys.stderr, i, "DEBUGMODE thunk_c %100s %50s %30s" % (node,
 #[(id(o), numpy.asarray(storage_map[o][0])[0,0]) for o in node.inputs],