提交 7c4871ce authored 作者: Brandon T. Willard's avatar Brandon T. Willard 提交者: Brandon T. Willard

Remove non-sequence settings from ScanInfo

上级 6f685799
......@@ -1033,6 +1033,13 @@ def scan(
n_sit_sot=n_sit_sot,
n_shared_outs=n_shared_outs,
n_nit_sot=n_nit_sot,
)
local_op = Scan(
inner_inputs,
new_outs,
info,
mode=mode,
truncate_gradient=truncate_gradient,
name=name,
gpua=False,
......@@ -1042,8 +1049,6 @@ def scan(
strict=strict,
)
local_op = Scan(inner_inputs, new_outs, info, mode)
##
# Step 8. Compute the outputs using the scan op
##
......
......@@ -100,13 +100,6 @@ class ScanInfo:
n_sit_sot: int
n_shared_outs: int
n_nit_sot: int
truncate_gradient: bool = False
name: Optional[str] = None
gpua: bool = False
as_while: bool = False
profile: Optional[Union[str, bool]] = None
allow_gc: bool = True
strict: bool = True
TensorConstructorType = Callable[[List[bool], Union[str, np.generic]], TensorType]
......@@ -466,7 +459,7 @@ class ScanMethodsMixin:
# output with type GpuArrayType
from aesara.gpuarray import GpuArrayType
if not self.info.gpua:
if not self.gpua:
for inp in self.inputs:
if isinstance(inp.type, GpuArrayType):
raise TypeError(
......@@ -496,6 +489,13 @@ class Scan(Op, ScanMethodsMixin):
info: ScanInfo,
mode: Optional[Mode] = None,
typeConstructor: Optional[TensorConstructorType] = None,
truncate_gradient: bool = False,
name: Optional[str] = None,
gpua: bool = False,
as_while: bool = False,
profile: Optional[Union[str, bool]] = None,
allow_gc: bool = True,
strict: bool = True,
):
r"""
......@@ -506,33 +506,85 @@ class Scan(Op, ScanMethodsMixin):
outputs
Outputs of the inner function of `Scan`.
info
Dictionary containing different properties of the `Scan` `Op` (like
number of different types of arguments, name, mode, if it should run on
GPU or not, etc.).
A collection of information about the sequences and taps.
mode
The compilation mode for the inner graph.
The mode used to compile the inner-graph.
If you prefer the computations of one step of `scan` to be done
differently than the entire function, you can use this parameter to
describe how the computations in this loop are done (see
`aesara.function` for details about possible values and their meaning).
typeConstructor
Function that constructs an equivalent to Aesara `TensorType`.
Function that constructs a `TensorType` for the outputs.
truncate_gradient
`truncate_gradient` is the number of steps to use in truncated
back-propagation through time (BPTT). If you compute gradients through
a `Scan` `Op`, they are computed using BPTT. By providing a different
value then ``-1``, you choose to use truncated BPTT instead of classical
BPTT, where you go for only `truncate_gradient` number of steps back in
time.
name
When profiling `scan`, it is helpful to provide a name for any
instance of `scan`.
For example, the profiler will produce an overall profile of your code
as well as profiles for the computation of one step of each instance of
`Scan`. The `name` of the instance appears in those profiles and can
greatly help to disambiguate information.
gpua
If ``True``, this `Op` should run on a GPU.
as_while
Whether or not the `Scan` is a ``while``-loop.
profile
If ``True`` or a non-empty string, a profile object will be created and
attached to the inner graph of `Scan`. When `profile` is ``True``, the
profiler results will use the name of the `Scan` instance, otherwise it
will use the passed string. The profiler only collects and prints
information when running the inner graph with the `CVM` `Linker`.
allow_gc
Set the value of `allow_gc` for the internal graph of the `Scan`. If
set to ``None``, this will use the value of
`aesara.config.scan__allow_gc`.
The full `Scan` behavior related to allocation is determined by this
value and the flag `aesara.config.allow_gc`. If the flag
`allow_gc` is ``True`` (default) and this `allow_gc` is ``False``
(default), then we let `Scan` allocate all intermediate memory
on the first iteration, and they are not garbage collected
after that first iteration; this is determined by `allow_gc`. This can
speed up allocation of the subsequent iterations. All those temporary
allocations are freed at the end of all iterations; this is what the
flag `aesara.config.allow_gc` means.
If you use pre-allocation and this `Scan` is on GPU, the speed up from
`allow_gc` is small. If you are missing memory, disabling `allow_gc`
could help you run graphs that request much memory.
strict
If ``True``, all the shared variables used in the inner-graph must be provided.
Notes
-----
`typeConstructor` had been added to refactor how
Aesara deals with the GPU. If it runs on the GPU, scan needs
to construct certain outputs (those who reside in the GPU
memory) as the GPU-specific type. However we can not import
gpu code in this file (as it is in sandbox, and not available
on each machine) so the workaround is that the GPU
optimization passes to the constructor of this class a
function that is able to construct a GPU type. This way the
class `Scan` does not need to be aware of the details for the
GPU, it just constructs any tensor using this function (which
by default constructs normal tensors).
`typeConstructor` was added to refactor how Aesara deals with the
GPU. If it runs on the GPU, `Scan` needs to construct certain outputs
(those that reside in GPU memory) as the GPU-specific `Type`. Since we
cannot import GPU code here, the GPU optimizations pass the constructor
of this class a function that is able to construct a GPU `Type`. This
way the class `Scan` does not need to be aware of the GPU details--it
simply constructs tensors using this function (which by default
constructs normal tensors).
TODO: Clean up this approach and everything else related to GPUs; it's
all currently a very leaky set of abstractions.
"""
# adding properties into self
self.inputs = inputs
self.outputs = outputs
self.info = info
self.truncate_gradient = truncate_gradient
self.name = name
self.gpua = gpua
self.as_while = as_while
self.profile = profile
self.allow_gc = allow_gc
self.strict = strict
self.__dict__.update(dataclasses.asdict(info))
# Clone mode_instance, altering "allow_gc" for the linker,
......@@ -591,11 +643,6 @@ class Scan(Op, ScanMethodsMixin):
if not hasattr(self, "name") or self.name is None:
self.name = "scan_fn"
# to have a fair __eq__ comparison later on, we update the info with
# the actual mode used to compile the function and the name of the
# function that we set in case none was given
self.info = dataclasses.replace(self.info, name=self.name)
# Pre-computing some values to speed up perform
self.mintaps = [np.min(x) for x in self.tap_array]
self.mintaps += [0 for x in range(self.n_nit_sot)]
......@@ -606,8 +653,9 @@ class Scan(Op, ScanMethodsMixin):
self.nit_sot_arg_offset = self.shared_arg_offset + self.n_shared_outs
self.n_outs = self.n_mit_mot + self.n_mit_sot + self.n_sit_sot
self.n_tap_outs = self.n_mit_mot + self.n_mit_sot
if self.info.gpua:
self._hash_inner_graph = self.info.gpu_hash
if self.gpua:
self._hash_inner_graph = self.gpu_hash
else:
# Do the missing inputs check here to have the error early.
for var in graph_inputs(self.outputs, self.inputs):
......@@ -1072,6 +1120,24 @@ class Scan(Op, ScanMethodsMixin):
if self.info != other.info:
return False
if self.gpua != other.gpua:
return False
if self.as_while != other.as_while:
return False
if self.profile != other.profile:
return False
if self.truncate_gradient != other.truncate_gradient:
return False
if self.name != other.name:
return False
if self.allow_gc != other.allow_gc:
return False
# Compare inner graphs
# TODO: Use `self.inner_fgraph == other.inner_fgraph`
if len(self.inputs) != len(other.inputs):
......@@ -1115,7 +1181,19 @@ class Scan(Op, ScanMethodsMixin):
return aux_txt
def __hash__(self):
return hash((type(self), self._hash_inner_graph, self.info))
return hash(
(
type(self),
self._hash_inner_graph,
self.info,
self.gpua,
self.as_while,
self.profile,
self.truncate_gradient,
self.name,
self.allow_gc,
)
)
def make_thunk(self, node, storage_map, compute_map, no_recycling, impl=None):
"""
......@@ -2661,18 +2739,12 @@ class Scan(Op, ScanMethodsMixin):
n_seqs=len(outer_inp_seqs),
n_mit_sot=0,
tap_array=tuple(tuple(v) for v in new_tap_array),
gpua=False,
n_mit_mot=len(outer_inp_mitmot),
n_mit_mot_outs=n_mitmot_outs,
mit_mot_out_slices=tuple(tuple(v) for v in mitmot_out_taps),
truncate_gradient=self.truncate_gradient,
n_sit_sot=n_sitsot_outs,
n_shared_outs=0,
n_nit_sot=n_nit_sot,
as_while=False,
profile=self.profile,
name=f"grad_of_{self.name}" if self.name else None,
allow_gc=self.allow_gc,
)
outer_inputs = (
......@@ -2694,7 +2766,18 @@ class Scan(Op, ScanMethodsMixin):
)
inner_gfn_outs = inner_out_mitmot + inner_out_sitsot + inner_out_nitsot
local_op = Scan(inner_gfn_ins, inner_gfn_outs, info, self.mode)
local_op = Scan(
inner_gfn_ins,
inner_gfn_outs,
info,
mode=self.mode,
truncate_gradient=self.truncate_gradient,
gpua=False,
as_while=False,
profile=self.profile,
name=f"grad_of_{self.name}" if self.name else None,
allow_gc=self.allow_gc,
)
outputs = local_op(*outer_inputs)
if type(outputs) not in (list, tuple):
outputs = [outputs]
......@@ -3013,17 +3096,22 @@ class Scan(Op, ScanMethodsMixin):
n_nit_sot=self.n_nit_sot * 2,
n_shared_outs=self.n_shared_outs,
n_mit_mot_outs=n_mit_mot_outs * 2,
tap_array=tuple(tuple(v) for v in new_tap_array),
mit_mot_out_slices=tuple(tuple(v) for v in self.mit_mot_out_slices) * 2,
)
local_op = Scan(
inner_ins,
inner_outs,
info,
mode=self.mode,
gpua=False,
as_while=self.as_while,
profile=self.profile,
truncate_gradient=self.truncate_gradient,
name=f"rop_of_{self.name}" if self.name else None,
allow_gc=self.allow_gc,
tap_array=tuple(tuple(v) for v in new_tap_array),
mit_mot_out_slices=tuple(tuple(v) for v in self.mit_mot_out_slices) * 2,
)
local_op = Scan(inner_ins, inner_outs, info, self.mode)
outputs = local_op(*scan_inputs)
if type(outputs) not in (list, tuple):
outputs = [outputs]
......
......@@ -217,7 +217,19 @@ def remove_constants_and_unused_inputs_scan(fgraph, node):
if len(nw_inner) != len(op_ins):
op_outs = clone_replace(op_outs, replace=givens)
nw_info = dataclasses.replace(op.info, n_seqs=nw_n_seqs)
nwScan = Scan(nw_inner, op_outs, nw_info, op.mode)
nwScan = Scan(
nw_inner,
op_outs,
nw_info,
mode=op.mode,
gpua=op.gpua,
as_while=op.as_while,
profile=op.profile,
truncate_gradient=op.truncate_gradient,
# TODO: This seems questionable
name=op.name,
allow_gc=op.allow_gc,
)
nw_outs = nwScan(*nw_outer, return_list=True)
return dict([("remove", [node])] + list(zip(node.outputs, nw_outs)))
else:
......@@ -396,7 +408,19 @@ class PushOutNonSeqScan(GlobalOptimizer):
op_ins = clean_inputs + nw_inner
# Reconstruct node
nwScan = Scan(op_ins, op_outs, op.info, op.mode)
nwScan = Scan(
op_ins,
op_outs,
op.info,
mode=op.mode,
gpua=op.gpua,
as_while=op.as_while,
profile=op.profile,
truncate_gradient=op.truncate_gradient,
# TODO: This seems questionable
name=op.name,
allow_gc=op.allow_gc,
)
# Do not call make_node for test_value
nw_node = nwScan(*(node.inputs + nw_outer), return_list=True)[0].owner
......@@ -666,7 +690,19 @@ class PushOutSeqScan(GlobalOptimizer):
nw_info = dataclasses.replace(
op.info, n_seqs=op.info.n_seqs + len(nw_inner)
)
nwScan = Scan(op_ins, op_outs, nw_info, op.mode)
nwScan = Scan(
op_ins,
op_outs,
nw_info,
mode=op.mode,
gpua=op.gpua,
as_while=op.as_while,
profile=op.profile,
truncate_gradient=op.truncate_gradient,
# TODO: This seems questionable
name=op.name,
allow_gc=op.allow_gc,
)
# Do not call make_node for test_value
nw_node = nwScan(
*(node.inputs[:1] + nw_outer + node.inputs[1:]),
......@@ -751,7 +787,9 @@ class PushOutScanOutput(GlobalOptimizer):
# Use `ScanArgs` to parse the inputs and outputs of scan for ease of
# use
args = ScanArgs(node.inputs, node.outputs, op.inputs, op.outputs, op.info)
args = ScanArgs(
node.inputs, node.outputs, op.inputs, op.outputs, op.info, op.as_while
)
new_scan_node = None
clients = {}
......@@ -921,6 +959,7 @@ class PushOutScanOutput(GlobalOptimizer):
new_scan_node.op.inputs,
new_scan_node.op.outputs,
new_scan_node.op.info,
new_scan_node.op.as_while,
)
new_outs = new_scan_args.outer_out_nit_sot[-len(add_as_nitsots) :]
......@@ -952,7 +991,14 @@ class PushOutScanOutput(GlobalOptimizer):
new_scan_args.inner_inputs,
new_scan_args.inner_outputs,
new_scan_args.info,
old_scan_node.op.mode,
mode=old_scan_node.op.mode,
gpua=old_scan_node.op.gpua,
as_while=old_scan_node.op.as_while,
profile=old_scan_node.op.profile,
truncate_gradient=old_scan_node.op.truncate_gradient,
# TODO: This seems questionable
name=old_scan_node.op.name,
allow_gc=old_scan_node.op.allow_gc,
)
# Create the Apply node for the scan op
......@@ -1059,7 +1105,18 @@ class ScanInplaceOptimizer(GlobalOptimizer):
typeConstructor = self.typeInfer(node)
new_op = Scan(
op.inputs, op.outputs, op.info, op.mode, typeConstructor=typeConstructor
op.inputs,
op.outputs,
op.info,
mode=op.mode,
typeConstructor=typeConstructor,
gpua=op.gpua,
as_while=op.as_while,
profile=op.profile,
truncate_gradient=op.truncate_gradient,
# TODO: This seems questionable
name=op.name,
allow_gc=op.allow_gc,
)
destroy_map = op.destroy_map.copy()
......@@ -1086,9 +1143,7 @@ class ScanInplaceOptimizer(GlobalOptimizer):
alloc_ops = (Alloc, AllocEmpty)
nodes = fgraph.toposort()[::-1]
scan_nodes = [
x
for x in nodes
if (isinstance(x.op, Scan) and x.op.info.gpua == self.gpua_flag)
x for x in nodes if (isinstance(x.op, Scan) and x.op.gpua == self.gpua_flag)
]
for scan_idx in range(len(scan_nodes)):
......@@ -1593,7 +1648,20 @@ class ScanSaveMem(GlobalOptimizer):
return
# Do not call make_node for test_value
new_outs = Scan(inps, outs, info, op.mode)(*node_ins, return_list=True)
new_op = Scan(
inps,
outs,
info,
mode=op.mode,
gpua=op.gpua,
as_while=op.as_while,
profile=op.profile,
truncate_gradient=op.truncate_gradient,
# TODO: This seems questionable
name=op.name,
allow_gc=op.allow_gc,
)
new_outs = new_op(*node_ins, return_list=True)
old_new = []
# 3.7 Get replace pairs for those outputs that do not change
......@@ -1871,15 +1939,21 @@ class ScanMerge(GlobalOptimizer):
n_sit_sot=sum([nd.op.n_sit_sot for nd in nodes]),
n_shared_outs=sum([nd.op.n_shared_outs for nd in nodes]),
n_nit_sot=sum([nd.op.n_nit_sot for nd in nodes]),
truncate_gradient=nodes[0].op.truncate_gradient,
)
old_op = nodes[0].op
new_op = Scan(
new_inner_ins,
new_inner_outs,
info,
mode=old_op.mode,
profile=old_op.profile,
truncate_gradient=old_op.truncate_gradient,
allow_gc=old_op.allow_gc,
name="&".join([nd.op.name for nd in nodes]),
gpua=False,
as_while=as_while,
profile=nodes[0].op.profile,
allow_gc=nodes[0].op.allow_gc,
)
new_op = Scan(new_inner_ins, new_inner_outs, info, nodes[0].op.mode)
new_outs = new_op(*outer_ins)
if not isinstance(new_outs, (list, tuple)):
......@@ -2005,7 +2079,12 @@ def scan_merge_inouts(fgraph, node):
# Equivalent inputs will be stored in inp_equiv, then a new
# scan node created without duplicates.
a = ScanArgs(
node.inputs, node.outputs, node.op.inputs, node.op.outputs, node.op.info
node.inputs,
node.outputs,
node.op.inputs,
node.op.outputs,
node.op.info,
node.op.as_while,
)
inp_equiv = {}
......@@ -2044,13 +2123,32 @@ def scan_merge_inouts(fgraph, node):
a_inner_outs = a.inner_outputs
inner_outputs = clone_replace(a_inner_outs, replace=inp_equiv)
op = Scan(inner_inputs, inner_outputs, info, node.op.mode)
outputs = op(*outer_inputs)
new_op = Scan(
inner_inputs,
inner_outputs,
info,
mode=node.op.mode,
gpua=node.op.gpua,
as_while=node.op.as_while,
profile=node.op.profile,
truncate_gradient=node.op.truncate_gradient,
# TODO: This seems questionable
name=node.op.name,
allow_gc=node.op.allow_gc,
)
outputs = new_op(*outer_inputs)
if not isinstance(outputs, (list, tuple)):
outputs = [outputs]
na = ScanArgs(outer_inputs, outputs, op.inputs, op.outputs, op.info)
na = ScanArgs(
outer_inputs,
outputs,
new_op.inputs,
new_op.outputs,
new_op.info,
new_op.as_while,
)
remove = [node]
else:
na = a
......@@ -2302,7 +2400,19 @@ class PushOutDot1(GlobalOptimizer):
new_inner_inps, new_inner_outs = reconstruct_graph(
_new_inner_inps, _new_inner_outs
)
new_op = Scan(new_inner_inps, new_inner_outs, new_info, op.mode)
new_op = Scan(
new_inner_inps,
new_inner_outs,
new_info,
mode=op.mode,
gpua=op.gpua,
as_while=op.as_while,
profile=op.profile,
truncate_gradient=op.truncate_gradient,
# TODO: This seems questionable
name=op.name,
allow_gc=op.allow_gc,
)
_scan_inputs = (
[node.inputs[0]]
+ outer_seqs
......
......@@ -701,12 +701,6 @@ def compress_outs(op, not_required, inputs):
n_sit_sot=0,
n_shared_outs=0,
n_nit_sot=0,
truncate_gradient=op.info.truncate_gradient,
name=op.info.name,
gpua=op.info.gpua,
as_while=op.info.as_while,
profile=op.info.profile,
allow_gc=op.info.allow_gc,
)
op_inputs = op.inputs[: op.n_seqs]
......@@ -886,16 +880,18 @@ class ScanArgs:
_inner_inputs,
_inner_outputs,
info,
as_while,
clone=True,
):
self.n_steps = outer_inputs[0]
self.as_while = as_while
if clone:
rval = reconstruct_graph(_inner_inputs, _inner_outputs, "")
else:
rval = (_inner_inputs, _inner_outputs)
if info.as_while:
if self.as_while:
self.cond = [rval[1][-1]]
inner_outputs = rval[1][:-1]
else:
......@@ -1000,18 +996,6 @@ class ScanArgs:
assert p == len(outer_outputs)
assert q == len(inner_outputs)
self.other_info = {
k: getattr(info, k)
for k in (
"truncate_gradient",
"name",
"gpua",
"as_while",
"profile",
"allow_gc",
)
}
@staticmethod
def from_node(node, clone=False):
from aesara.scan.op import Scan
......@@ -1024,6 +1008,7 @@ class ScanArgs:
node.op.inputs,
node.op.outputs,
node.op.info,
node.op.as_while,
clone=clone,
)
......@@ -1041,14 +1026,8 @@ class ScanArgs:
n_shared_outs=0,
n_mit_mot_outs=0,
mit_mot_out_slices=(),
truncate_gradient=-1,
name=None,
gpua=False,
as_while=False,
profile=False,
allow_gc=False,
)
res = cls([1], [], [], [], info)
res = cls([1], [], [], [], info, False)
res.n_steps = None
return res
......@@ -1152,7 +1131,6 @@ class ScanArgs:
n_shared_outs=len(self.outer_in_shared),
n_mit_mot_outs=sum(len(s) for s in self.mit_mot_out_slices),
mit_mot_out_slices=tuple(self.mit_mot_out_slices),
**self.other_info,
)
def get_alt_field(self, var_info, alt_prefix):
......@@ -1341,7 +1319,6 @@ class ScanArgs:
"mit_mot_out_slices",
"mit_mot_in_slices",
"mit_sot_in_slices",
"other_info",
)
):
setattr(res, attr, copy.copy(getattr(self, attr)))
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论