Commit 68006606 authored by Pascal Lamblin

New 'strided' and 'wrong_size' preallocated outputs

Also, create the memory buffers as needed.
Parent 5a34781f
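The new modes are selected through the `DebugMode.check_preallocated_output` flag whose validation is updated below. A minimal usage sketch, assuming the usual Theano config access (the compiled function itself is arbitrary and only for illustration):

```python
# Sketch: exercise only the two new preallocation modes under DebugMode.
# The flag takes ":"-separated values, as validated in the diff below.
# (It could equally be set via THEANO_FLAGS in the environment.)
import theano
import theano.tensor as T

theano.config.DebugMode.check_preallocated_output = 'strided:wrong_size'

x = T.dmatrix('x')
# DebugMode will re-run each thunk with the preallocated output buffers
# generated by _get_preallocated_maps() below.
f = theano.function([x], 2 * x, mode='DEBUG_MODE')
```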
@@ -16,6 +16,7 @@ from theano import gof
 from theano.gof import Env, graph, utils, link
 from theano.gof.link import raise_with_op
 from theano.gof.cc import CLinker
+from theano.gof.python25 import product as itertools_product
 from theano.configparser import (config, AddConfigVar, BoolParam, IntParam,
                                  StrParam)
 from theano.compile.function_module import (FunctionMaker,
@@ -64,7 +65,7 @@ def is_valid_check_preallocated_output_param(param):
     if not isinstance(param, basestring):
         return False
     valid = ["previous", "c_contiguous", "f_contiguous",
-             "neg_strides", "ALL", ""]
+             "strided", "wrong_size", "ALL", ""]
     for p in param.split(":"):
         if p not in valid:
             return False
@@ -75,7 +76,8 @@ AddConfigVar('DebugMode.check_preallocated_output',
         'This is a list of strings separated by ":". Valid values are: '
         '"previous" (previously-returned memory), '
         '"c_contiguous", "f_contiguous", '
-        '"neg_strides" (negative strides), and '
+        '"strided" (positive and negative strides), '
+        '"wrong_size" (larger and smaller dimensions), and '
         '"ALL" (all of the above).'),
        StrParam('ALL', is_valid=is_valid_check_preallocated_output_param),
        in_c_key=False)
@@ -987,19 +989,17 @@ def _find_bad_optimizations2(order, reasons, r_vals):
 _find_bad_optimizations = _find_bad_optimizations0


-def _check_preallocated_output(node, thunk, prealloc_modes, def_val,
+def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
         storage_map, r_vals, dr_vals, perform, active_order_set):
-    '''Try to apply thunk() on different output storages'''
+    '''Preallocate outputs in different memory layouts'''

     # To avoid circular imports
     from theano.tensor import TensorType
     from theano.sandbox.cuda import cuda_available, CudaNdarrayType
     if cuda_available:
         from theano.sandbox.cuda import CudaNdarray
+        from theano.sandbox.cuda import dimshuffle as cuda_dimshuffle

-    # List of (name, map) pairs of the settings to test
-    prealloc_maps = []
     # TODO: Sparse, Scalar
     # TODO: wrong shape, more stride patterns
@@ -1015,7 +1015,9 @@ def _check_preallocated_output(node, thunk, prealloc_modes, def_val,
             reuse_outputs[r] = r_vals[r]
             r_vals[r] = new_r

-        prealloc_maps.append(('previous', reuse_outputs))
+        yield ('previous', reuse_outputs)
+        # clear memory that is not needed any more
+        del reuse_outputs
     # c_cont_output: use a c-contiguous array
     # (for TensorType and CudaNdarray, else None)
@@ -1034,29 +1036,115 @@ def _check_preallocated_output(node, thunk, prealloc_modes, def_val,
             c_cont_outputs[r] = new_buf

         if len(c_cont_outputs):
-            prealloc_maps.append(('c_contiguous', c_cont_outputs))
+            yield ('c_contiguous', c_cont_outputs)
+            del c_cont_outputs
     # f_cont_output: use a fortran-contiguous ndarray
     # (for TensorType only)
     if 'f_contiguous' in prealloc_modes or 'ALL' in prealloc_modes:
         f_cont_outputs = {}
         for r in node.outputs:
-            if isinstance(r.type, TensorType):
+            if isinstance(r.type, (TensorType, CudaNdarrayType)):
                 new_buf = numpy.zeros(
                         shape=r_vals[r].shape,
                         dtype=r_vals[r].dtype,
                         order='F')
                 new_buf += def_val
+                if isinstance(r.type, CudaNdarrayType):
+                    # When the CudaNdarray is built, the underlying memory
+                    # is c-contiguous, so we transpose it before and after.
+                    new_buf = CudaNdarray(new_buf.T)
+                    new_buf = cuda_dimshuffle(new_buf,
+                            range(new_buf.ndim)[::-1])
                 f_cont_outputs[r] = new_buf

         if len(f_cont_outputs):
-            prealloc_maps.append(('f_contiguous', f_cont_outputs))
+            yield ('f_contiguous', f_cont_outputs)
+            del f_cont_outputs
+    # We assume that the different outputs of the same Op will behave
+    # independently, and there is no need to test over all combinations
+    # of outputs (the time taken is prohibitive).
+    max_ndim = 0
+    for r in node.outputs:
+        if isinstance(r.type, (TensorType, CudaNdarrayType)):
+            max_ndim = max(max_ndim, r.ndim)
+    if 'strided' in prealloc_modes or 'ALL' in prealloc_modes:
+        # Initial allocation
+        init_strided = {}
+        for r in node.outputs:
+            if isinstance(r.type, (TensorType, CudaNdarrayType)):
+                # Create a buffer twice as large in every dimension
+                new_buf = numpy.zeros(
+                        shape=[(s * 2) for s in r_vals[r].shape],
+                        dtype=r_vals[r].dtype)
+                if isinstance(r.type, CudaNdarrayType):
+                    new_buf = CudaNdarray(new_buf)
+                init_strided[r] = new_buf
+
+        for step_signs in itertools_product((-1, 1), repeat=max_ndim):
+            for step_size in (1, 2):
+                strided = {}
+                steps = [s * step_size for s in step_signs]
+                name = 'strided%s' % str(tuple(steps))
+                for r in node.outputs:
+                    if r in init_strided:
+                        # Build lists of slices, for strides and shapes
+                        strides = []
+                        shapes = []
+                        for i, size in enumerate(r_vals[r].shape):
+                            strides.append(slice(None, None, steps[i]))
+                            shapes.append(slice(None, size, None))
+                        r_buf = init_strided[r]
+                        if r_buf.ndim > 0:
+                            r_buf = r_buf[tuple(strides)][tuple(shapes)]
+                        assert r_buf.shape == r_vals[r].shape
+
+                        if isinstance(r.type, CudaNdarrayType):
+                            # It seems stupid, but we need to allocate a
+                            # new ndarray and copy it into the GPU one.
+                            new_rbuf = numpy.zeros(r_vals[r].shape, dtype=r.dtype)
+                            new_rbuf += def_val
+                            r_buf[...] = CudaNdarray(new_rbuf)
+                        else:
+                            r_buf[...] = def_val
+                        strided[r] = r_buf
+
-    if 'neg_strides' in prealloc_maps:
-        raise NotImplementedError('Negative strides in'
-                                  ' check_preallocated_output')
+                yield (name, strided)
+                del strided
+    if 'wrong_size' in prealloc_modes or 'ALL' in prealloc_modes:
+        # For each dimension, try size - 1, size, size + 1
+        for shape_diff in itertools_product((-1, 0, 1), repeat=max_ndim):
+            wrong_size = {}
+            name = 'wrong_size%s' % str(tuple(shape_diff))
+            for r in node.outputs:
+                if isinstance(r.type, (TensorType, CudaNdarrayType)):
+                    r_shape_diff = shape_diff[:r.ndim]
+                    out_shape = [max((s + sd), 0)
+                            for s, sd in zip(r_vals[r].shape, r_shape_diff)]
+                    new_buf = numpy.zeros(
+                            shape=out_shape,
+                            dtype=r.dtype)
+                    if isinstance(r.type, CudaNdarrayType):
+                        new_buf = CudaNdarray(new_buf)
+                    wrong_size[r] = new_buf
+
+            yield (name, wrong_size)
+            del wrong_size
+
+
+def _check_preallocated_output(node, thunk, prealloc_modes, def_val,
+        storage_map, r_vals, dr_vals, perform, active_order_set):
+    '''Try to apply thunk() on different output storages'''
+
-    for (name, out_map) in prealloc_maps:
+    for (name, out_map) in _get_preallocated_maps(node, thunk, prealloc_modes,
+            def_val, storage_map, r_vals, dr_vals, perform, active_order_set):
         # _logger.debug('name = %s, perform = %s', name, perform)

         # Copy the inputs over again
         for r in node.inputs:
...
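The core of the 'strided' mode in the diff above is a slicing trick: allocate a base buffer twice as large in every dimension, take views with step sizes of ±1 and ±2, then trim each view back to the expected output shape. This yields non-contiguous output storage with both positive and negative strides. A minimal NumPy-only sketch of the trick, independent of Theano (the function name and `def_val` fill are illustrative):

```python
from itertools import product
import numpy

def strided_views(shape, dtype='float64', def_val=0):
    """Yield strided views of a 2x-sized buffer, one per sign/step pattern."""
    base = numpy.zeros([2 * s for s in shape], dtype=dtype)
    for step_signs in product((-1, 1), repeat=len(shape)):
        for step_size in (1, 2):
            steps = [sign * step_size for sign in step_signs]
            # Slice with the signed steps, then trim back to `shape`
            view = base[tuple(slice(None, None, st) for st in steps)]
            view = view[tuple(slice(None, s, None) for s in shape)]
            assert view.shape == tuple(shape)
            view[...] = def_val  # overwrite with the default fill value
            yield steps, view

# Each view has the requested shape but unusual strides
for steps, v in strided_views((3, 4), def_val=-1):
    print(steps, v.strides, v.flags['C_CONTIGUOUS'])
```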
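Similarly, the 'wrong_size' mode enumerates every off-by-one perturbation of the output shape, so an Op that writes into a preallocated buffer without checking its shape gets caught. A NumPy-only sketch of that enumeration, mirroring the `itertools_product((-1, 0, 1), repeat=max_ndim)` loop in the diff (function name illustrative):

```python
from itertools import product
import numpy

def wrong_size_buffers(shape, dtype='float64'):
    """Yield buffers whose dimensions are each off by -1, 0, or +1."""
    for shape_diff in product((-1, 0, 1), repeat=len(shape)):
        out_shape = [max(s + sd, 0) for s, sd in zip(shape, shape_diff)]
        yield shape_diff, numpy.zeros(out_shape, dtype=dtype)

# 3**2 = 9 candidate buffers for a (3, 4) output; only one is correctly sized
for diff, buf in wrong_size_buffers((3, 4)):
    print(diff, buf.shape)
```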