Merge pull request #2802 from dmitriy-serdyuk/pickling

WIP: Custom pickler for shared variables

Merge pull request #2802 from dmitriy-serdyuk/pickling
2907f95a · Bart van Merriënboer · 67783f2e · 804d0b6c · 2907f95a · 2907f95a
--- a/doc/library/index.txt
+++ b/doc/library/index.txt
@@ -20,6 +20,7 @@ Types and Ops that you can use to build and compile expression graphs.
   sparse/sandbox
   scalar/index
   gof/index
+   misc/pkl_utils
   scan
   sandbox/index
   typed_list

--- a/doc/library/misc/pkl_utils.txt
+++ b/doc/library/misc/pkl_utils.txt
+.. _libdoc_misc:
+================================================
+:mod:`misc.pkl_utils` - Tools for serialization.
+================================================
+.. autofunction:: theano.misc.pkl_utils.dump
+.. autofunction:: theano.misc.pkl_utils.load
+.. seealso::
+    :ref:`tutorial_loadsave`
--- a/doc/tutorial/loading_and_saving.txt
+++ b/doc/tutorial/loading_and_saving.txt
@@ -114,6 +114,33 @@ For instance, you can define functions along the lines of:
        self.training_set = cPickle.load(file(self.training_set_file, 'rb'))
+Robust Serialization
+====================
+This type of serialization uses some helper functions particular to Theano. It
+serializes the object using Python's pickling protocol, but any ``ndarray`` or
+``CudaNdarray`` objects contained within the object are saved separately as NPY
+files. These NPY files and the pickled file are all saved together in a single
+ZIP file.
+The main advantage of this approach is that you don't even need Theano installed
+in order to look at the values of shared variables that you pickled. You can
+just load the parameters manually with ``numpy``.
+.. code-block:: python
+    numpy.load('model.zip')
+This approach could be beneficial if you are sharing your model with people who
+might not have Theano installed, who are using a different Python version, or if
+you are planning to save your model for a long time (in which case version
+mismatches might make it difficult to unpickle objects).
+.. autofunction:: theano.misc.pkl_utils.dump
+.. autofunction:: theano.misc.pkl_utils.load
 Long-Term Serialization
 =======================
@@ -154,4 +181,3 @@ functions to reflect the change in name:
 For more information on advanced use of ``pickle`` and its internals, see Python's
 pickle_ documentation.
--- a/theano/misc/pkl_utils.py
+++ b/theano/misc/pkl_utils.py
@@ -4,11 +4,32 @@ Utility classes and methods to pickle parts of symbolic graph.
 These pickled graphs can be used, for instance, as cases for
 unit tests or regression tests.
 """
+import numpy
+import os
 import pickle
 import sys
+import tempfile
+import zipfile
+import warnings
+from collections import defaultdict
+from contextlib import closing
+from pickle import HIGHEST_PROTOCOL
+from theano.compat.six import BytesIO
+try:
+    from pickle import DEFAULT_PROTOCOL
+except ImportError:
+    DEFAULT_PROTOCOL = HIGHEST_PROTOCOL
 import theano
+from theano import config
 from theano.compat import PY3
 from theano.compat.six import string_types
+from theano.compile.sharedvalue import SharedVariable
+try:
+    from theano.sandbox.cuda import cuda_ndarray
+except ImportError:
+    cuda_ndarray = None
 __docformat__ = "restructuredtext en"
 __authors__ = "Pascal Lamblin"
@@ -93,3 +114,232 @@ if PY3:
 else:
    class CompatUnpickler(pickle.Unpickler):
        pass
+class PersistentNdarrayID(object):
+    """Persistent-id callable that stores ndarrays as members of a zip file.
+    An instance is meant to be assigned to ``Pickler.persistent_id``. Every
+    object whose type is exactly :class:`numpy.ndarray` is written to the zip
+    file in NPY format and replaced in the pickle stream by a string id; any
+    other object yields ``None`` so that it is pickled normally.
+    :param zip_file: A zip file handle that the NumPy arrays will be saved to.
+    :type zip_file: :class:`zipfile.ZipFile`
+    .. note::
+        The convention for persistent ids given by this class and its derived
+        classes is that the name should take the form `type.name` where `type`
+        can be used by the persistent loader to determine how to load the
+        object, while `name` is human-readable and as descriptive as possible.
+    """
+    def __init__(self, zip_file):
+        self.zip_file = zip_file
+        # Number of automatically generated (`array_N`) names handed out.
+        self.count = 0
+        # Maps id(array) -> persistent id already assigned to that array.
+        self.seen = {}
+    def _resolve_name(self, obj):
+        """Return the next free `array_N` name; `obj` itself is not used."""
+        index = self.count
+        self.count = index + 1
+        return 'array_{0}'.format(index)
+    def __call__(self, obj):
+        """Return the persistent id for `obj`, or ``None`` if not an ndarray.
+        The array is written to the zip file the first time it is seen; later
+        calls with the same object return the id recorded in ``self.seen``.
+        """
+        # Exact type match: subclasses of ndarray are left to regular pickling.
+        if type(obj) is not numpy.ndarray:
+            return None
+        key = id(obj)
+        if key in self.seen:
+            return self.seen[key]
+        member = self._resolve_name(obj)
+        zipadd(lambda f: numpy.lib.format.write_array(f, obj),
+               self.zip_file, member)
+        persid = 'ndarray.{0}'.format(member)
+        self.seen[key] = persid
+        return persid
+class PersistentCudaNdarrayID(PersistentNdarrayID):
+    """Persist ``CudaNdarray`` objects, as well as ndarrays, to a zip file.
+    Objects whose type is exactly ``CudaNdarray`` are converted with
+    :func:`numpy.asarray`, written to the zip file in NPY format and given a
+    persistent id of the form `cuda_ndarray.name`. Everything else is
+    delegated to :class:`PersistentNdarrayID`.
+    :param zip_file: A zip file handle that the arrays will be saved to.
+    :type zip_file: :class:`zipfile.ZipFile`
+    """
+    def __call__(self, obj):
+        """Return the persistent id for `obj`, or ``None`` to pickle inline."""
+        # `cuda_ndarray` is None when the CUDA backend could not be imported.
+        if (cuda_ndarray is not None and
+                type(obj) is cuda_ndarray.cuda_ndarray.CudaNdarray):
+            if id(obj) not in self.seen:
+                def write_array(f):
+                    # The NPY writer needs a NumPy array, so convert first.
+                    numpy.lib.format.write_array(f, numpy.asarray(obj))
+                name = self._resolve_name(obj)
+                zipadd(write_array, self.zip_file, name)
+                self.seen[id(obj)] = 'cuda_ndarray.{0}'.format(name)
+            return self.seen[id(obj)]
+        # The parent's result must be returned: dropping it makes the pickler
+        # see ``None`` for plain ndarrays and pickle them inline as well.
+        return super(PersistentCudaNdarrayID, self).__call__(obj)
+class PersistentSharedVariableID(PersistentCudaNdarrayID):
+    """Uses shared variable names when persisting to zip file.
+    If a shared variable has a name, this name is used as the name of the
+    NPY file inside of the zip file. NumPy arrays that aren't matched to a
+    shared variable are persisted as usual (i.e. `array_0`, `array_1`,
+    etc.)
+    :param zip_file: A zip file handle that the arrays will be saved to.
+    :type zip_file: :class:`zipfile.ZipFile`
+    :param allow_unnamed: Allow shared variables without a name to be
+        persisted. Defaults to ``True``.
+    :type allow_unnamed: bool, optional
+    :param allow_duplicates: Allow multiple shared variables to have the same
+        name, in which case they will be numbered e.g. `x`, `x_2`, `x_3`, etc.
+        Defaults to ``True``.
+    :type allow_duplicates: bool, optional
+    :raises ValueError:
+        If an unnamed shared variable is encountered and `allow_unnamed` is
+        ``False``, if two shared variables have the same name and
+        `allow_duplicates` is ``False``, or if a shared variable is named
+        `pkl`.
+    """
+    def __init__(self, zip_file, allow_unnamed=True, allow_duplicates=True):
+        super(PersistentSharedVariableID, self).__init__(zip_file)
+        # Base name -> number of arrays already saved under that name.
+        self.name_counter = defaultdict(int)
+        # id(value array of a named shared variable) -> shared variable name.
+        self.ndarray_names = {}
+        self.allow_unnamed = allow_unnamed
+        self.allow_duplicates = allow_duplicates
+    def _resolve_name(self, obj):
+        """Return the zip member name for `obj`.
+        Arrays registered as the value of a named shared variable use that
+        name (suffixed with `_2`, `_3`, ... for duplicates); all other arrays
+        fall back to the automatic `array_N` names.
+        """
+        if id(obj) not in self.ndarray_names:
+            return super(PersistentSharedVariableID, self)._resolve_name(obj)
+        name = self.ndarray_names[id(obj)]
+        count = self.name_counter[name]
+        # Count under the *base* name. Counting under the suffixed name would
+        # leave the base counter at 1, so every later duplicate would be
+        # called `<name>_2` again.
+        self.name_counter[name] += 1
+        if count:
+            if not self.allow_duplicates:
+                raise ValueError("multiple shared variables with the name "
+                                 "`{0}` found".format(name))
+            name = '{0}_{1}'.format(name, count + 1)
+        return name
+    def __call__(self, obj):
+        """Record shared variable names, then defer to the parent class.
+        A shared variable itself is pickled normally. Its name is recorded
+        against the id of the value held in ``obj.container.storage[0]``, so
+        the name can be used when that value is persisted.
+        """
+        if isinstance(obj, SharedVariable):
+            if obj.name:
+                # `pkl` is the zip member that holds the pickle stream itself.
+                if obj.name == 'pkl':
+                    raise ValueError("can't pickle shared variable with name "
+                                     "`pkl`")
+                self.ndarray_names[id(obj.container.storage[0])] = obj.name
+            elif not self.allow_unnamed:
+                raise ValueError("unnamed shared variable, {0}".format(obj))
+        return super(PersistentSharedVariableID, self).__call__(obj)
+class PersistentNdarrayLoad(object):
+    """Load NumPy arrays that were persisted to a zip file when pickling.
+    An instance is meant to be assigned to ``Unpickler.persistent_load``. It
+    receives persistent ids of the form `type.name`, reads the NPY member
+    `name` from the zip file and returns the array. Ids of type
+    `cuda_ndarray` are turned back into a ``CudaNdarray`` unless
+    ``config.experimental.unpickle_gpu_on_cpu`` is set.
+    :param zip_file: The zip file handle in which the NumPy arrays are saved.
+    :type zip_file: :class:`zipfile.ZipFile`
+    :raises ImportError:
+        If a `cuda_ndarray` id is encountered, CUDA is unavailable and
+        ``config.experimental.unpickle_gpu_on_cpu`` is not set.
+    """
+    def __init__(self, zip_file):
+        self.zip_file = zip_file
+        # Persistent id -> object already loaded for it. An array referenced
+        # several times is given a single id when dumping, so it must come
+        # back as a single object here.
+        self.cache = {}
+    def __call__(self, persid):
+        """Return the array stored under the persistent id `persid`."""
+        if persid in self.cache:
+            return self.cache[persid]
+        # Split on the first dot only: the member name may contain dots
+        # itself, e.g. a shared variable called `layer.W`.
+        array_type, name = persid.split('.', 1)
+        with closing(self.zip_file.open(name)) as member:
+            array = numpy.lib.format.read_array(member)
+        if array_type == 'cuda_ndarray':
+            if config.experimental.unpickle_gpu_on_cpu:
+                # directly return numpy array
+                warnings.warn("config.experimental.unpickle_gpu_on_cpu is set "
+                              "to True. Unpickling CudaNdarray as "
+                              "numpy.ndarray")
+                result = array
+            elif cuda_ndarray:
+                result = cuda_ndarray.cuda_ndarray.CudaNdarray(array)
+            else:
+                raise ImportError("Cuda not found. Cannot unpickle "
+                                  "CudaNdarray")
+        else:
+            result = array
+        self.cache[persid] = result
+        return result
+def dump(obj, file_handler, protocol=DEFAULT_PROTOCOL,
+         persistent_id=PersistentSharedVariableID):
+    """Pickles an object to a zip file using external persistence.
+    :param obj: The object to pickle.
+    :type obj: object
+    :param file_handler: The file handle to save the object to. A zip archive
+        is written to it, so it must be opened in binary mode (``'wb'``).
+    :type file_handler: file
+    :param protocol: The pickling protocol to use. Unlike Python's built-in
+        pickle, the default is set to `2` instead of 0 for Python 2. The
+        Python 3 default (level 3) is maintained.
+    :type protocol: int, optional
+    :param persistent_id: The callable that persists certain objects in the
+        object hierarchy to separate files inside of the zip file. For example,
+        :class:`PersistentNdarrayID` saves any :class:`numpy.ndarray` to a
+        separate NPY file inside of the zip file. It is called once with the
+        open :class:`zipfile.ZipFile` and the result is used as the pickler's
+        ``persistent_id``.
+    :type persistent_id: callable
+    .. note::
+        The final file is simply a zipped file containing at least one file,
+        `pkl`, which contains the pickled object. It can contain any other
+        number of external objects. Note that the zip files are compatible with
+        NumPy's :func:`numpy.load` function.
+    >>> import numpy
+    >>> import theano
+    >>> foo_1 = theano.shared(0, name='foo')
+    >>> foo_2 = theano.shared(1, name='foo')
+    >>> with open('model.zip', 'wb') as f:
+    ...     dump((foo_1, foo_2, numpy.array(2)), f)
+    >>> numpy.load('model.zip').keys()
+    ['foo', 'foo_2', 'array_0', 'pkl']
+    >>> numpy.load('model.zip')['foo']
+    array(0)
+    >>> with open('model.zip', 'rb') as f:
+    ...     foo_1, foo_2, array = load(f)
+    >>> array
+    array(2)
+    """
+    with closing(zipfile.ZipFile(file_handler, 'w', zipfile.ZIP_DEFLATED,
+                                 allowZip64=True)) as zip_file:
+        def write_pickle(f):
+            p = pickle.Pickler(f, protocol=protocol)
+            # Arrays met while pickling are added to the same archive.
+            p.persistent_id = persistent_id(zip_file)
+            p.dump(obj)
+        # `pkl` is added only after `write_pickle` returns, so it ends up
+        # after the array members in the archive.
+        zipadd(write_pickle, zip_file, 'pkl')
+def load(f, persistent_load=PersistentNdarrayLoad):
+    """Load a file that was dumped to a zip file.
+    :param f: The file handle to the zip file to load the object from. It
+        must be opened in binary mode (``'rb'``).
+    :type f: file
+    :param persistent_load: The persistent loading function to use for
+        unpickling. This must be compatible with the `persistent_id` function
+        used when pickling. It is called once with the open
+        :class:`zipfile.ZipFile` and the result is used as the unpickler's
+        ``persistent_load``.
+    :type persistent_load: callable, optional
+    :returns: The unpickled object.
+    .. warning::
+        This unpickles the `pkl` member of the archive. Unpickling can
+        execute arbitrary code, so only load files from trusted sources.
+    """
+    with closing(zipfile.ZipFile(f, 'r')) as zip_file:
+        # Read the whole pickle stream up front and release the member handle.
+        with closing(zip_file.open('pkl')) as pkl_file:
+            pickled = pkl_file.read()
+        p = pickle.Unpickler(BytesIO(pickled))
+        # Arrays are read from the archive on demand while unpickling, so the
+        # zip file has to stay open until `p.load()` has returned.
+        p.persistent_load = persistent_load(zip_file)
+        return p.load()
+def zipadd(func, zip_file, name):
+    """Calls a function with a file object, saving it to a zip file.
+    `func` writes to a named temporary file, which is then added to the
+    archive under `name`. The temporary file is removed afterwards, even when
+    `func` or the zip write raises.
+    :param func: The function to call. It receives one argument, a file
+        object opened for binary writing.
+    :type func: callable
+    :param zip_file: The zip file that `func` should write its data to.
+    :type zip_file: :class:`zipfile.ZipFile`
+    :param name: The name of the file inside of the zipped archive that `func`
+        should save its data to.
+    :type name: str
+    """
+    # delete=False: the file has to survive being closed so that
+    # `zip_file.write` can read it back by name. Removal is done by hand.
+    temp_file = tempfile.NamedTemporaryFile('wb', delete=False)
+    try:
+        try:
+            func(temp_file)
+        finally:
+            # Close (and flush) before the archive reads the file by name.
+            temp_file.close()
+        zip_file.write(temp_file.name, arcname=name)
+    finally:
+        if os.path.isfile(temp_file.name):
+            os.remove(temp_file.name)
--- a/theano/misc/tests/test_pkl_utils.py
+++ b/theano/misc/tests/test_pkl_utils.py
+import numpy
+from numpy.testing import assert_allclose
+from nose.plugins.skip import SkipTest
+import theano
+import theano.sandbox.cuda as cuda_ndarray
+from theano.sandbox.cuda.type import CudaNdarrayType
+from theano.sandbox.cuda.var import CudaNdarraySharedVariable
+from theano.sandbox.rng_mrg import MRG_RandomStreams
+from theano.misc.pkl_utils import dump, load
+def test_dump_load():
+    """Round-trip a CUDA shared variable through `dump` and `load`.
+    Skipped when the CUDA backend is not enabled.
+    """
+    if not cuda_ndarray.cuda_enabled:
+        raise SkipTest('Optional package cuda disabled')
+    var_type = CudaNdarrayType((1, 1), name='x')
+    original = CudaNdarraySharedVariable('x', var_type, [[1]], False)
+    with open('test', 'wb') as out_file:
+        dump(original, out_file)
+    with open('test', 'rb') as in_file:
+        restored = load(in_file)
+    # Both the name and the value must survive the round trip.
+    assert restored.name == 'x'
+    assert_allclose(restored.get_value(), [[1]])
+def test_dump_load_mrg():
+    """Check that an `MRG_RandomStreams` object survives `dump` / `load`."""
+    streams = MRG_RandomStreams(use_cuda=cuda_ndarray.cuda_enabled)
+    with open('test', 'wb') as out_file:
+        dump(streams, out_file)
+    with open('test', 'rb') as in_file:
+        restored = load(in_file)
+    assert type(restored) is MRG_RandomStreams
+def test_dump_zip_names():
+    """Check the member names of the archive written by `dump`.
+    Two shared variables that are both called `foo` must be stored as `foo`
+    and `foo_2`, the anonymous array as `array_0`, and the pickle as `pkl`.
+    """
+    foo_1 = theano.shared(0, name='foo')
+    foo_2 = theano.shared(1, name='foo')
+    with open('model.zip', 'wb') as f:
+        dump((foo_1, foo_2, numpy.array(2)), f)
+    # `keys()` may return a view rather than a list, and a view never compares
+    # equal to a list, so convert before comparing.
+    keys = list(numpy.load('model.zip').keys())
+    assert keys == ['foo', 'foo_2', 'array_0', 'pkl']
+    foo = numpy.load('model.zip')['foo']
+    assert foo == numpy.array(0)
+    with open('model.zip', 'rb') as f:
+        foo_1, foo_2, array = load(f)
+    assert array == numpy.array(2)