Commit 418a5f1b authored by Frédéric Bastien, committed by GitHub

Merge pull request #4996 from Thrandis/ccw

Scan with Checkpoints (part 2)
@@ -529,6 +529,40 @@ As a rule, scan always expects the condition to be the last thing returned
by the inner function, otherwise an error will be raised.
Reducing Scan's memory usage
----------------------------
This section presents the ``scan_checkpoints`` function. In short, this
function reduces the memory usage of scan (at the cost of more computation
time) by not keeping in memory all the intermediate time steps of the loop,
and recomputing them when computing the gradients. This function is therefore
only useful if you need to compute the gradient of the output of scan with
respect to its inputs, and shouldn't be used otherwise.

Before going into more detail, here are its current limitations:

* It only works in the case where only the output of the last time step is
  needed, like when computing ``A**k`` or in an `encoder-decoder` setup.
* It only accepts sequences of the same length.
* If ``n_steps`` is specified, it must have the same value as the length of
  the sequences.
* It is singly-recurrent, meaning that only the previous time step can be used
  to compute the current one (i.e. ``h[t]`` can only depend on ``h[t-1]``). In
  other words, ``taps`` cannot be used in ``sequences`` and ``outputs_info``.
Often, in order to be able to compute the gradients through scan operations,
Theano needs to keep in memory some intermediate computations of scan. This
can sometimes use a prohibitively large amount of memory.
``scan_checkpoints`` makes it possible to discard some of those intermediate
steps and recompute them when computing the gradients. Its ``save_every_N``
argument specifies the number of time steps to run without storing the
intermediate results. For example, ``save_every_N = 4`` will reduce the memory
usage by a factor of 4, at the cost of recomputing 3/4 of the time steps of
the forward loop. Since the grad of scan is about 6x slower than the forward,
a ~20% slowdown is expected. Apart from the ``save_every_N`` argument and the
current limitations, the usage of this function is similar to the classic
``scan`` function.
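
As an illustration, here is a minimal sketch (not part of the original text)
that rewrites the ``A**k`` example from the scan tutorial using
``scan_checkpoints``; the ``save_every_N`` value is illustrative and must
divide the number of steps:

.. code-block:: python

    import theano
    import theano.tensor as T

    k = T.iscalar("k")
    A = T.vector("A")

    # Same step function and arguments as with ``scan``; only
    # ``save_every_N`` is new. ``k`` must be a multiple of 10 here.
    result, updates = theano.scan_checkpoints(
        fn=lambda prior_result, A: prior_result * A,
        outputs_info=T.ones_like(A),
        non_sequences=A,
        n_steps=k,
        save_every_N=10)

    # Only the last time step of each chunk is kept; taking the gradient
    # recomputes the discarded steps of the forward loop.
    final_result = result[-1]
    grad_A = theano.grad(final_result.sum(), A)

    power = theano.function(inputs=[A, k],
                            outputs=[final_result, grad_A])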
Optimizing Scan's performance
-----------------------------
@@ -612,4 +646,4 @@ reference
.. autofunction:: theano.foldl
.. autofunction:: theano.foldr
.. autofunction:: theano.scan
.. autofunction:: theano.scan_checkpoints
@@ -80,7 +80,8 @@ from theano.misc.safe_asarray import _asarray
from theano.printing import pprint, pp
from theano.scan_module import (scan, map, reduce, foldl, foldr, clone,
                                scan_checkpoints)
from theano.updates import OrderedUpdates
@@ -40,5 +40,6 @@ __contact__ = "Razvan Pascanu <r.pascanu@gmail>"
from theano.scan_module import scan_opt
from theano.scan_module.scan import scan
from theano.scan_module.scan_checkpoints import scan_checkpoints
from theano.scan_module.scan_views import map, reduce, foldl, foldr
from theano.scan_module.scan_utils import clone, until
from __future__ import absolute_import, print_function, division
import theano
def scan_checkpoints(fn, sequences=[], outputs_info=None, non_sequences=[],
name="checkpointscan_fn", n_steps=None, save_every_N=10):
"""Scan function that uses less memory, but is more restrictive.
In :func:`~theano.scan`, if you compute the gradient of the output
with respect to the input, you will have to store the intermediate
results at each time step, which can be prohibitively huge. This
function allows to do ``save_every_N`` steps of forward computations
without storing the intermediate results, and to recompute them during
the gradient computation.
Notes
-----
Current assumptions:
* Every sequence has the same length.
* If ``n_steps`` is specified, it has the same value as the length of
any sequence.
* The value of ``save_every_N`` divides the number of steps the scan
will run without remainder.
* Only singly-recurrent and non-recurrent outputs are used.
No multiple recurrences.
* Only the last timestep of any output will ever be used.
Parameters
----------
fn
``fn`` is a function that describes the operations involved in one
step of ``scan``. See the documentation of :func:`~theano.scan`
for more information.
sequences
``sequences`` is the list of Theano variables or dictionaries
describing the sequences ``scan`` has to iterate over. All
sequences must be the same length in this version of ``scan``.
outputs_info
``outputs_info`` is the list of Theano variables or dictionaries
describing the initial state of the outputs computed
recurrently.
    non_sequences
        ``non_sequences`` is the list of arguments that are passed to
        ``fn`` at each step. One can opt to exclude variables
        used in ``fn`` from this list, as long as they are part of the
        computational graph, though for clarity we encourage not to do so.
    n_steps
        ``n_steps`` is the number of steps to iterate given as an int
        or Theano scalar. If any of the input sequences do not have
        enough elements, scan will raise an error. If the **value is 0**,
        the outputs will have **0 rows**. If the value is negative,
        ``scan`` will run backwards in time. If the ``go_backwards`` flag
        is already set and also ``n_steps`` is negative, ``scan`` will run
        forward in time. If ``n_steps`` is not provided, ``scan`` will
        figure out the number of steps it should run given its input
        sequences.
    save_every_N
        ``save_every_N`` is the number of steps to go without storing
        the computations of ``scan`` (i.e. they will have to be recomputed
        during the gradient computation).

    Returns
    -------
    tuple
        Tuple of the form ``(outputs, updates)`` as in :func:`~theano.scan`,
        but with a small change: it only contains the output at every
        ``save_every_N``-th step. The time steps that are not returned by
        this function will be recomputed during the gradient computation
        (if any).
See Also
--------
:func:`~theano.scan`: Looping in Theano.
"""
# Standardize the format of input arguments
if not isinstance(sequences, list):
sequences = [sequences]
if not isinstance(outputs_info, list):
outputs_info = [outputs_info]
if not isinstance(non_sequences, list):
non_sequences = [non_sequences]
# Check that outputs_info has no taps:
for element in outputs_info:
if isinstance(element, dict) and 'taps' in element:
raise RuntimeError("scan_checkpoints doesn't work with taps.")
# Determine how many steps the original scan would run
if n_steps is None:
n_steps = sequences[0].shape[0]
# Compute the number of steps of the inner and of the outer scan
    o_n_steps = theano.tensor.cast(n_steps // save_every_N, 'int64')
i_n_steps = save_every_N
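    # ``save_every_N`` is assumed to divide ``n_steps`` exactly (see the
    # docstring): the outer scan runs ``n_steps // save_every_N`` times,
    # and each of its steps runs an inner scan of ``save_every_N`` steps.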
# Establish the input variables of the outer scan
    o_sequences = [s.reshape([s.shape[0] // save_every_N, save_every_N] +
                             [s.shape[i] for i in range(1, s.ndim)],
                             s.ndim + 1) for s in sequences]
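    # For example, with ``save_every_N = 10``, a sequence of shape (100, d)
    # is reshaped above to (10, 10, d): the outer scan iterates over the 10
    # chunks while the inner scan handles the 10 time steps of each chunk.
    # Outputs for which no initial state is given (``None`` entries in
    # ``outputs_info``, "nitsot" outputs in scan jargon) are created by the
    # inner scan itself.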
new_nitsots = [i for i in outputs_info if i is None]
o_nonsequences = non_sequences + [i_n_steps]
def outer_step(*args):
# Separate the received arguments into their respective (seq, outputs
# from previous iterations, nonseqs) categories
i_sequences = list(args[:len(o_sequences)])
i_prev_outputs = list(args[len(o_sequences):-len(o_nonsequences)])
i_non_sequences = list(args[-len(o_nonsequences):])
i_outputs_infos = i_prev_outputs + [None, ] * len(new_nitsots)
# Call the user-provided function with the proper arguments
results, updates = theano.scan(fn=fn,
sequences=i_sequences,
outputs_info=i_outputs_infos,
non_sequences=i_non_sequences[:-1],
name=name + "_inner",
n_steps=i_non_sequences[-1])
        if not isinstance(results, list):
            results = [results]
        # Keep only the last timestep of every output, but keep all the
        # updates
        return [r[-1] for r in results], updates
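    # Loop over the chunks. ``allow_gc=True`` is what allows the memory used
    # by each inner scan to be freed between iterations of the outer scan,
    # which is where the memory saving comes from.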
results, updates = theano.scan(fn=outer_step,
sequences=o_sequences,
outputs_info=outputs_info,
non_sequences=o_nonsequences,
name=name + "_outer",
n_steps=o_n_steps, allow_gc=True)
return results, updates
from __future__ import absolute_import, print_function, division
import numpy
import unittest
import theano
import theano.tensor as T
try:
from pygpu.gpuarray import GpuArrayException
PYGPU_AVAILABLE = True
except ImportError:
PYGPU_AVAILABLE = False
class TestScanCheckpoint(unittest.TestCase):
def setUp(self):
self.k = T.iscalar("k")
self.A = T.vector("A")
result, _ = theano.scan(
fn=lambda prior_result, A: prior_result * A,
outputs_info=T.ones_like(self.A),
non_sequences=self.A,
n_steps=self.k)
result_check, _ = theano.scan_checkpoints(
fn=lambda prior_result, A: prior_result * A,
outputs_info=T.ones_like(self.A),
non_sequences=self.A,
n_steps=self.k,
save_every_N=100)
self.result = result[-1]
self.result_check = result_check[-1]
self.grad_A = T.grad(self.result.sum(), self.A)
self.grad_A_check = T.grad(self.result_check.sum(), self.A)
def test_forward_pass(self):
"""Test forward computation of A**k."""
f = theano.function(inputs=[self.A, self.k],
outputs=[self.result, self.result_check])
out, out_check = f(range(10), 100)
assert numpy.allclose(out, out_check)
def test_backward_pass(self):
"""Test gradient computation of A**k."""
f = theano.function(inputs=[self.A, self.k],
outputs=[self.grad_A, self.grad_A_check])
out, out_check = f(range(10), 100)
assert numpy.allclose(out, out_check)
@unittest.skipUnless(PYGPU_AVAILABLE, 'Requires pygpu.')
def test_memory(self):
"""Test that scan_checkpoint reduces memory usage."""
if None not in theano.gpuarray.type.list_contexts():
return unittest.SkipTest('Requires gpuarray backend.')
f = theano.function(inputs=[self.A, self.k],
outputs=self.grad_A)
f_check = theano.function(inputs=[self.A, self.k],
outputs=self.grad_A_check)
free_gmem = theano.gpuarray.type._context_reg[None].free_gmem
        data = numpy.ones(free_gmem // 3000, dtype=numpy.float32)
# Check that it works with the checkpoints
f_check(data, 1000)
# Check that the basic scan fails in that case
self.assertRaises(GpuArrayException, f, data, 1000)
def test_taps_error(self):
"""Test that an error rises if we use taps in outputs_info."""
self.assertRaises(RuntimeError, theano.scan_checkpoints,
lambda: None, [], {'initial': self.A, 'taps': [-2]})