Commit 97cf87dc authored by Pascal Lamblin

Merge pull request #3576 from nouiz/mixed6

[ENG] Doc, BLAS timing, better errors, doc to use pydot-ng, function_dump now strips some attributes.
...@@ -63,8 +63,9 @@ The following libraries and software are optional:
 `Git <http://git-scm.com>`_
     To download bleeding-edge versions of Theano.
-`pydot <https://code.google.com/p/pydot/>`_
+`pydot-ng <https://github.com/pydot/pydot-ng>`_ or `pydot <https://code.google.com/p/pydot/>`_
     To be able to make pictures of the Theano computation graph.
+    pydot-ng is a pydot-compatible replacement that supports newer Python versions.
 `NVIDIA CUDA drivers and SDK`_
     Required for GPU code generation/execution on NVIDIA GPUs
......
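The pydot-ng fallback described above can be sketched as a small import helper. This is an illustrative sketch, not Theano's code; the helper name `first_importable` is mine:

```python
import importlib


def first_importable(names):
    """Return the first module from `names` that imports cleanly, else None."""
    for name in names:
        try:
            return importlib.import_module(name)
        except ImportError:
            continue
    return None


# Prefer pydot-ng (works on newer Pythons), fall back to plain pydot.
pydot = first_importable(["pydot_ng", "pydot"])
```

If neither package is installed, `pydot` is simply `None` and graph-picture features can be disabled gracefully.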
...@@ -13,6 +13,10 @@
 .. autofunction:: theano.misc.pkl_utils.load

+.. autoclass:: theano.misc.pkl_utils.StripPickler

+.. autoclass:: theano.misc.pkl_utils.CompatUnpickler

 .. seealso::
     :ref:`tutorial_loadsave`
......
...@@ -9,8 +9,4 @@
     :synopsis: Neighbours Ops
 .. moduleauthor:: LISA

-API
-===
-
-.. automodule:: theano.sandbox.neighbours
-    :members:
+:ref:`Moved <libdoc_tensor_nnet_neighbours>`
...@@ -8,11 +8,13 @@ Basic Tensor Functionality
 .. testsetup::

+    import theano
     import theano.tensor as T
     from theano.tensor import scalar, iscalar, TensorType, dmatrix, ivector
     from theano.tensor import set_subtensor, inc_subtensor, batched_dot
     from theano import shared
     import numpy
+    import numpy as np

 Theano supports any kind of Python object, but its focus is support for
 symbolic matrix expressions. When you type,
...@@ -656,6 +658,7 @@ dimensions, see :meth:`_tensor_py_operators.dimshuffle`.
 `len(reps)` must be equal and, if specified, `ndim` must be
 equal to both.

+.. autofunction:: roll

 Creating Tensor
 ===============
...@@ -1108,14 +1111,14 @@ Theano indexing with a "mask" (incorrect approach):
     <BLANKLINE>
            [[3, 4, 5],
             [3, 4, 5],
-            [3, 4, 5]]], dtype=int8)
+            [3, 4, 5]]])

 Getting a Theano result like NumPy:

 .. doctest:: indexing

     >>> t[(t > 4).nonzero()].eval()
-    array([5, 6, 7, 8], dtype=int8)
+    array([5, 6, 7, 8])

 The gradient of advanced indexing in many cases needs NumPy
 1.8. It is not released yet as of April 30th, 2013. You can use NumPy
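The `.nonzero()` pattern in that doctest can be mimicked in plain Python to see what advanced indexing with index arrays does. A sketch only; `nonzero` and `take` are illustrative helpers, not Theano or NumPy API:

```python
def nonzero(mask):
    # Indices where the mask is true, like ndarray.nonzero() on 1-D input.
    return [i for i, m in enumerate(mask) if m]


def take(seq, indices):
    # Select elements by an index list, like NumPy advanced indexing t[idx].
    return [seq[i] for i in indices]


t = [0, 1, 2, 3, 4, 5, 6, 7, 8]
# Emulates t[(t > 4).nonzero()]: first compute the indices, then index.
selected = take(t, nonzero([v > 4 for v in t]))
```

This is why the "mask" approach is wrong in Theano: a boolean array used directly as an index is not interpreted as a list of positions, while `.nonzero()` converts it into one.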
...@@ -1742,13 +1745,13 @@ Linear Algebra
             [1, 1, 1],
             [2, 2, 2],
             [3, 3, 3],
-            [4, 4, 4]], dtype=int8)
+            [4, 4, 4]])
     >>> a[1].eval()
     array([[0, 1, 2],
            [0, 1, 2],
            [0, 1, 2],
            [0, 1, 2],
-           [0, 1, 2]], dtype=int8)
+           [0, 1, 2]])

 .. function:: ogrid
...@@ -1766,9 +1769,9 @@
            [1],
            [2],
            [3],
-           [4]], dtype=int8)
+           [4]])
     >>> b[1].eval()
-    array([[0, 1, 2]], dtype=int8)
+    array([[0, 1, 2]])

 Gradient / Differentiation
......
...@@ -4,7 +4,7 @@
 :mod:`neighbours` -- Ops for working with images in convolutional nets
 =======================================================================

-.. module:: sandbox.neighbours
+.. module:: theano.tensor.nnet.neighbours
     :platform: Unix, Windows
     :synopsis: Ops for working with images in conv nets
 .. moduleauthor:: LISA
......
...@@ -9,6 +9,7 @@
     :synopsis: ops for performing various forms of downsampling
 .. moduleauthor:: LISA

+.. seealso:: :func:`theano.tensor.nnet.neighbours.images2neibs`

 .. autofunction:: theano.tensor.signal.downsample.max_pool_2d
 .. autofunction:: theano.tensor.signal.downsample.max_pool_2d_same_size
......
...@@ -567,3 +567,6 @@ the Ops from Theano and Pylearn2.
     # Where filename is a string to a file that we will write to.

 Then send us filename.

+.. autoclass:: theano.tests.breakpoint.PdbBreakpoint
...@@ -2,7 +2,6 @@
 Define the `function` function.
 """
-import six.moves.cPickle as pickle
 import logging
 import traceback as tb
...@@ -24,7 +23,8 @@ def function_dump(filename, inputs, outputs=None, mode=None, updates=None,
                   givens=None,
                   no_default_updates=False, accept_inplace=False, name=None,
                   rebuild_strict=True, allow_input_downcast=None, profile=None,
-                  on_unused_input=None):
+                  on_unused_input=None,
+                  extra_tag_to_remove=None):
     """
     This is helpful to make a reproducible case for problems during Theano
     compilation.
...@@ -49,6 +49,11 @@ def function_dump(filename, inputs, outputs=None, mode=None, updates=None,
     >>> d = cPickle.load(open("func_dump.bin", "rb"))  # doctest: +SKIP
     >>> f = theano.function(**d)  # doctest: +SKIP

+    Note:
+    The parameter extra_tag_to_remove is passed to the StripPickler used.
+    To pickle a graph made by Blocks, it must be:
+    ['annotations', 'replacement_of', 'aggregation_scheme', 'roles']
     """
     assert isinstance(filename, string_types)
     d = dict(inputs=inputs, outputs=outputs, mode=mode, updates=updates,
...@@ -58,7 +63,11 @@ def function_dump(filename, inputs, outputs=None, mode=None, updates=None,
              allow_input_downcast=allow_input_downcast, profile=profile,
              on_unused_input=on_unused_input)
     with open(filename, 'wb') as f:
-        pickle.dump(d, f, -1)
+        import theano.misc.pkl_utils
+        pickler = theano.misc.pkl_utils.StripPickler(
+            f, protocol=-1,
+            extra_tag_to_remove=extra_tag_to_remove)
+        pickler.dump(d)

 def function(inputs, outputs=None, mode=None, updates=None, givens=None,
......
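The dump-then-report workflow that `function_dump` supports can be sketched with plain pickling. This is a minimal stand-in under my own names (`function_dump_sketch`), not the real function, which additionally strips tags via StripPickler:

```python
import io
import pickle


def function_dump_sketch(fileobj, **function_kwargs):
    # Pickle the keyword arguments that would be passed to theano.function,
    # so a failing compilation can be reproduced on another machine.
    pickle.dump(function_kwargs, fileobj, protocol=-1)


# Round-trip through an in-memory buffer instead of a file on disk.
buf = io.BytesIO()
function_dump_sketch(buf, inputs=["x"], outputs=["2*x"], mode=None)
buf.seek(0)
restored = pickle.load(buf)
```

On the receiving end, the restored dict would be splatted back into the compiler, as in the docstring's `theano.function(**d)` example.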
...@@ -1627,6 +1627,11 @@ class ScanProfileStats(ProfileStats):
         super(ScanProfileStats, self).__init__(atexit_print, **kwargs)
         self.name = name

+    def summary_globals(self, file):
+        # Do nothing; we don't want to print an extra global summary here.
+        pass

     def summary_function(self, file):
         # RP: every time we compile a function, a ProfileStats is created for
         # that function. This means that every time an optimization replaces
......
...@@ -308,7 +308,18 @@ class ReplaceValidate(History, Validator):
             msg = str(e)
             s1 = 'The type of the replacement must be the same'
             s2 = 'does not belong to this FunctionGraph'
-            if (s1 not in msg and s2 not in msg):
+            s3 = 'maximum recursion depth exceeded'
+            if s3 in msg:
+                # There is nothing safe we can do to recover from this,
+                # so don't revert: reverting raises a different error
+                # that isn't helpful.
+                e.args += (
+                    "Please, report this to theano-dev mailing list."
+                    " As a temporary work around, you can raise Python"
+                    " stack limit with:"
+                    " import sys; sys.setrecursionlimit(10000)",)
+                raise
+            elif (s1 not in msg and s2 not in msg):
                 out = sys.stderr
                 print("<<!! BUG IN FGRAPH.REPLACE OR A LISTENER !!>>",
                       type(e), e, reason, file=out)
......
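The `sys.setrecursionlimit` workaround suggested in that error message can be wrapped so the old limit is restored afterwards. The helper name `with_recursion_limit` is mine, not Theano's:

```python
import sys


def with_recursion_limit(limit, fn, *args):
    # Temporarily raise the Python recursion limit, call fn, then restore
    # the previous limit, mirroring the workaround in the error message.
    old = sys.getrecursionlimit()
    sys.setrecursionlimit(limit)
    try:
        return fn(*args)
    finally:
        sys.setrecursionlimit(old)


def depth(n):
    # A deliberately deep recursion to exercise the raised limit.
    return 0 if n == 0 else 1 + depth(n - 1)


limit_before = sys.getrecursionlimit()
result = with_recursion_limit(max(limit_before, 5000), depth, 2000)
```

For the Theano case one would wrap the compilation call itself, e.g. `with_recursion_limit(10000, theano.function, inputs, outputs)`.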
...@@ -259,18 +259,18 @@ if __name__ == "__main__":
         K600
         GTX Titan X      0.45s  0.47s
-        GTX Titan Black  0.64s
+        GTX Titan Black  0.64s  0.64s
         GTX Titan(D15U-50)
         GTX 780
         GTX 980
         GTX 970
         GTX 680          1.57s
         GRID K520
-        GTX 750 Ti       2.01s
-        GTX 580          2.47s
-        GTX 480          2.88s
-        GTX 660          2.32s
-        GTX 750          2.37s
+        GTX 750 Ti       2.01s  2.01s
+        GTX 750          2.46s  2.37s
+        GTX 660          2.32s  2.32s
+        GTX 580          2.42s  2.47s
+        GTX 480          2.87s  2.88s
         GT 610          33.5s
         """)
......
...@@ -55,12 +55,19 @@ class StripPickler(Pickler):
         strip_pickler.dump(fn_args)
         f.close()
     """
+    def __init__(self, file, protocol=0, extra_tag_to_remove=None):
+        # Can't use super() as Pickler isn't a new-style class.
+        Pickler.__init__(self, file, protocol)
+        self.tag_to_remove = ['trace', 'test_value']
+        if extra_tag_to_remove:
+            self.tag_to_remove.extend(extra_tag_to_remove)

     def save(self, obj):
         # Remove the tag.trace attribute from Variable and Apply nodes
         if isinstance(obj, theano.gof.utils.scratchpad):
-            if hasattr(obj, 'trace'):
-                del obj.trace
+            for tag in self.tag_to_remove:
+                if hasattr(obj, tag):
+                    del obj.__dict__[tag]

         # Remove manually-added docstring of Elemwise ops
         elif (isinstance(obj, theano.tensor.Elemwise)):
             if '__doc__' in obj.__dict__:
...@@ -107,12 +114,38 @@ def load_reduce(self):
 if PY3:
     class CompatUnpickler(pickle._Unpickler):
+        """
+        Allow reloading, in Python 3, some pickled NumPy ndarrays.
+
+        Examples
+        --------
+        with open(fname, 'rb') as fp:
+            if PY3:
+                u = CompatUnpickler(fp, encoding="latin1")
+            else:
+                u = CompatUnpickler(fp)
+            mat = u.load()
+        """
         pass

     # Register `load_reduce` defined above in CompatUnpickler
     CompatUnpickler.dispatch[pickle.REDUCE[0]] = load_reduce
 else:
     class CompatUnpickler(pickle.Unpickler):
+        """
+        Allow reloading, in Python 3, some pickled NumPy ndarrays.
+
+        Examples
+        --------
+        with open(fname, 'rb') as fp:
+            if PY3:
+                u = CompatUnpickler(fp, encoding="latin1")
+            else:
+                u = CompatUnpickler(fp)
+            mat = u.load()
+        """
         pass
......
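The tag-stripping behaviour can be reproduced outside Theano with `pickle._Pickler` (the pure-Python pickler, which allows overriding `save`; the C `pickle.Pickler` does not). This is a self-contained sketch: `Scratchpad` stands in for `theano.gof.utils.scratchpad`, and `StrippingPickler` is my name, not the real class:

```python
import io
import pickle


class Scratchpad(object):
    """Stand-in for theano.gof.utils.scratchpad: a plain attribute bag."""


class StrippingPickler(pickle._Pickler):
    # Minimal analogue of StripPickler: drop listed attributes from
    # Scratchpad objects before they are serialized.
    def __init__(self, file, protocol=None, extra_tag_to_remove=None):
        pickle._Pickler.__init__(self, file, protocol)
        self.tag_to_remove = ['trace', 'test_value']
        if extra_tag_to_remove:
            self.tag_to_remove.extend(extra_tag_to_remove)

    def save(self, obj, save_persistent_id=True):
        if isinstance(obj, Scratchpad):
            for tag in self.tag_to_remove:
                # Removing the attribute keeps it out of the pickle stream.
                obj.__dict__.pop(tag, None)
        return pickle._Pickler.save(self, obj, save_persistent_id)


sp = Scratchpad()
sp.trace = "large debug traceback we do not want to ship"
sp.payload = 42
buf = io.BytesIO()
StrippingPickler(buf, protocol=2).dump(sp)
restored = pickle.loads(buf.getvalue())
```

After the round trip, `restored` keeps `payload` but has no `trace` attribute, which is exactly the point of stripping before `function_dump` writes its file.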
...@@ -13,7 +13,7 @@ import theano.sandbox.cuda as cuda_ndarray
 from theano.sandbox.cuda.type import CudaNdarrayType
 from theano.sandbox.cuda.var import CudaNdarraySharedVariable
 from theano.sandbox.rng_mrg import MRG_RandomStreams
-from theano.misc.pkl_utils import dump, load
+from theano.misc.pkl_utils import dump, load, StripPickler

 class T_dump_load(unittest.TestCase):
...@@ -69,3 +69,25 @@ class T_dump_load(unittest.TestCase):
         with open('model.zip', 'rb') as f:
             foo_1, foo_2, foo_3, array = load(f)
         assert array == numpy.array(3)

+class TestStripPickler(unittest.TestCase):
+    def setUp(self):
+        # Work in a temporary directory to avoid cluttering the repository
+        self.origdir = os.getcwd()
+        self.tmpdir = mkdtemp()
+        os.chdir(self.tmpdir)

+    def tearDown(self):
+        # Get back to the original dir, and delete the temporary one
+        os.chdir(self.origdir)
+        if self.tmpdir is not None:
+            shutil.rmtree(self.tmpdir)

+    def test0(self):
+        with open('test.pkl', 'wb') as f:
+            m = theano.tensor.matrix()
+            dest_pkl = 'my_test.pkl'
+            f = open(dest_pkl, 'wb')
+            strip_pickler = StripPickler(f, protocol=-1)
+            strip_pickler.dump(m)
...@@ -429,6 +429,10 @@ def use(device,
                 # query the active GPU. If we check the active GPU before
                 # the device is initialized we will always receive 0
                 # even if another device is selected later.
+                if not hasattr(cuda_ndarray.cuda_ndarray, 'select_a_gpu'):
+                    raise Exception(
+                        "Delete your Theano cache. The automatic"
+                        " recompilation did not work.")
                 cuda_ndarray.cuda_ndarray.select_a_gpu()
                 use.device_number = active_device_number()
             # This is needed to initialize the cublas handle.
......
...@@ -2888,7 +2888,7 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
         out[0] = x

     def c_code_cache_version(self):
-        return (5,)
+        return (6,)

     def c_code(self, node, name, inputs, outputs, sub):
         if (self.set_instead_of_inc) or \
...@@ -2951,7 +2951,7 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
             } else {
                 y_rowind_obj = PyInt_FromLong(j);
             }
-            row_y = CudaNdarray_Subscript(py_%(y)s, y_rowind_obj);
+            row_y = CudaNdarray_Subscript((PyObject*)%(y)s, y_rowind_obj);
             if (row_y == NULL) {
                 Py_XDECREF(row_y);
...@@ -3302,7 +3302,7 @@ class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
         return """
         PyObject * add_result = CudaNdarray_inplace_add((PyObject *) zview,
-                                                        (PyObject *) py_%(x)s);
+                                                        (PyObject *) %(x)s);
         if (! add_result )
         {
...@@ -3318,7 +3318,7 @@ class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
     def c_code_cache_version(self):
         parent_version = super(GpuIncSubtensor, self).c_code_cache_version()
         if parent_version:
-            return parent_version + (1,)
+            return parent_version + (2,)
         return ()
......
...@@ -159,7 +159,7 @@ class CURAND_Base(GpuOp):
         int odims[%(ndim)s];
         int n_elements = 1;
         int must_alloc_sample = ((NULL == %(o_sample)s)
-                || !CudaNdarray_Check(py_%(o_sample)s)
+                || !CudaNdarray_Check((PyObject*)%(o_sample)s)
                 || (CudaNdarray_NDIM(%(o_sample)s) != %(ndim)s));

         if (PyArray_NDIM(%(size)s) != 1)
...@@ -246,7 +246,7 @@ class CURAND_Base(GpuOp):
         return code

     def c_code_cache_version(self):
-        return (3,)
+        return (4,)

 class CURAND_Normal(CURAND_Base):
......
...@@ -131,13 +131,13 @@ def test_memory_lazy():
     executed in the graph. This messes with the [c]vm gc implementation.
     """
     shapes = (50, 100)
-    # more_alloc1 and more_alloc2 is not the same for both dtype.
+    # more_alloc1 is not the same for both dtypes.
     # When dtype is float32, the computation is done on the gpu.
     # This inserts constants on the gpu during compilation,
     # which raises the number of allocs.
     # When dtype is float64, only the shared is on the gpu and it is
     # transferred to the cpu for computation. So no extra alloc after
     # compilation.
-    # more_alloc1 if after the first compilation, more_alloc2 after the second.
+    # more_alloc1 is after the first compilation.
     for dtype, more_alloc1 in [("float32", 1),
                                ("float64", 0)]:
         print(dtype)
......
...@@ -53,13 +53,13 @@ class MultinomialFromUniform(Op):
         return [T.zeros_like(x) for x in ins]

     def c_code_cache_version(self):
-        return (5,)
+        return (6,)

     def c_code(self, node, name, ins, outs, sub):
         (pvals, unis) = ins
         (z,) = outs
         if self.odtype == 'auto':
-            t = "PyArray_TYPE((PyArrayObject*) py_%(pvals)s)" % locals()
+            t = "PyArray_TYPE(%(pvals)s)" % locals()
         else:
             t = theano.scalar.Scalar(self.odtype).dtype_specs()[1]
         if t.startswith('theano_complex'):
......
...@@ -445,7 +445,9 @@ class mrg_uniform(mrg_uniform_base):
         }
         }
         Py_XDECREF(%(o_rstate)s);
-        %(o_rstate)s = (PyArrayObject*)PyArray_FromAny(py_%(rstate)s, NULL, 0, 0, %(o_rstate_requirement)s,NULL);
+        %(o_rstate)s = (PyArrayObject*)PyArray_FromAny(
+            (PyObject*)%(rstate)s,
+            NULL, 0, 0, %(o_rstate_requirement)s, NULL);
         if (PyArray_NDIM(%(o_rstate)s) != 2)
         {
...@@ -526,7 +528,7 @@ class mrg_uniform(mrg_uniform_base):
         """ % locals()

     def c_code_cache_version(self):
-        return (2,)
+        return (3,)

 class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
...@@ -655,7 +657,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
         int n_elements = 1;
         int n_streams, n_streams_used_in_this_call;
         int must_alloc_sample = ((NULL == %(o_sample)s)
-                || !CudaNdarray_Check(py_%(o_sample)s)
+                || !CudaNdarray_Check((PyObject*)%(o_sample)s)
                 || !CudaNdarray_is_c_contiguous(%(o_sample)s)
                 || (CudaNdarray_NDIM(%(o_sample)s) != %(ndim)s));
...@@ -691,7 +693,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
             %(fail)s;
         }
         }
-        if (!CudaNdarray_Check(py_%(rstate)s))
+        if (!CudaNdarray_Check((PyObject*)%(rstate)s))
         {
             PyErr_Format(PyExc_ValueError, "rstate must be cudandarray");
             %(fail)s;
...@@ -764,7 +766,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
         """ % locals()

     def c_code_cache_version(self):
-        return (9,)
+        return (10,)

 class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
...@@ -913,7 +915,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
         unsigned int n_elements = 1;
         unsigned int n_streams;
         int must_alloc_sample = ((NULL == %(o_sample)s)
-                || !pygpu_GpuArray_Check(py_%(o_sample)s)
+                || !pygpu_GpuArray_Check((PyObject*)%(o_sample)s)
                 || !(%(o_sample)s->ga.flags & GA_C_CONTIGUOUS)
                 || (PyGpuArray_NDIM(%(o_sample)s) != %(ndim)s));
...@@ -950,7 +952,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
             %(fail)s;
         }
         }
-        if (!pygpu_GpuArray_Check(py_%(rstate)s))
+        if (!pygpu_GpuArray_Check((PyObject*)%(rstate)s))
         {
             PyErr_Format(PyExc_ValueError, "rstate must be gpuarray");
             %(fail)s;
......
...@@ -3968,7 +3968,7 @@ pprint.assign(lambda pstate, r: r.owner and isinstance(r.owner.op, Join),
 def roll(x, shift, axis=None):
     """
-    Convenience function to roll `TensorType`s along the given axis.
+    Convenience function to roll TensorTypes along the given axis.

     Syntax copies numpy.roll function.
...@@ -3986,7 +3986,7 @@ def roll(x, shift, axis=None):
     Returns
     -------
     tensor
-        Output tensor, with the same shape as ``x``.
     """
     if axis is None:
......
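For intuition, the behaviour `roll` copies from `numpy.roll` can be written in a few lines of plain Python for a flat sequence. An illustrative helper under my own name, not the Theano implementation:

```python
def roll_list(seq, shift):
    # Roll a flat sequence like numpy.roll: elements shifted right by
    # `shift` positions wrap around to the front; negative shifts roll left.
    n = len(seq)
    if n == 0:
        return list(seq)
    shift %= n
    if shift == 0:
        return list(seq)
    return list(seq[-shift:]) + list(seq[:-shift])
```

Reducing `shift` modulo `len(seq)` is what makes over-long and negative shifts behave like NumPy's.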
...@@ -973,7 +973,10 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
             if len(inc_shape) == len(data_shape) and (
                     len(inc_shapes) == 0 or inc_shape[0] != 1):
                 inc_shape = (n_to_inc,) + inc_shape[1:]
-            inc_size = numpy.product(inc_shape)
+            # The param dtype is needed when inc_shape is empty.
+            # By default, numpy.product would return a float, and
+            # rng.uniform with NumPy 1.10 will raise a deprecation warning.
+            inc_size = numpy.product(inc_shape, dtype='int')
             # Corresponding numeric variable.
             inc_num = rng.uniform(size=inc_size).astype(self.dtype)
             inc_num = inc_num.reshape(inc_shape)
......
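The comment above boils down to: the product of an empty shape tuple must be the integer 1, while NumPy's default accumulator for an empty product is a float. The same invariant in plain Python, as an illustration rather than the test's actual code:

```python
import operator
from functools import reduce


def int_product(shape):
    # Product of a (possibly empty) shape tuple as a plain int, matching
    # what numpy.product(shape, dtype='int') guarantees: the empty
    # product is the integer 1, never a float.
    return reduce(operator.mul, shape, 1)
```

Passing an integer size (not `1.0`) to `rng.uniform(size=...)` is what avoids the NumPy 1.10 deprecation warning the comment mentions.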
...@@ -12,8 +12,8 @@ class PdbBreakpoint(Op):
     conditional breakpoint, inside a theano function, based on a symbolic
     scalar condition.

-    @type name: String
-    @param name: name of the conditional breakpoint. To be printed when the
+    :type name: String
+    :param name: name of the conditional breakpoint. To be printed when the
     breakpoint is activated.

     :note: WARNING. At least one of the outputs of the op must be used
......