Commit 9c19d300 authored by Frédéric Bastien

Merge pull request #3528 from nouiz/mixed4

Mixed: crash fix, doc, doc memory/speed trade-off, fix tests
......@@ -225,3 +225,42 @@ preferable not to use those dtypes together.
To help you find where float64 variables are created, see the
:attr:`warn_float64` Theano flag.
Theano memory/speed trade-off
-----------------------------
There are a few things you can easily do to change the trade-off
between speed and memory usage. Unless noted otherwise, these affect
both CPU and GPU memory usage.
Could speed up computation and lower memory usage:
- :ref:`CuDNN <libdoc_cuda_dnn>`: by default, the CuDNN convolution uses less
memory than the Theano version, but some flags allow it to use more
memory. GPU only.
- Multi-GPU support (available shortly).
Could raise memory usage but speed up computation:
- :attr:`config.lib.cnmem` =1 # Does not raise memory usage much by itself, but can matter if you are at the limit of the available GPU memory. GPU only.
- :attr:`config.allow_gc` =False
- :attr:`config.optimizer_excluding` =low_memory (GPU only for now).
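As a concrete starting point, the speed-oriented flags above can be combined on the command line. This is only a sketch, and ``my_script.py`` is a hypothetical script name:

```shell
# Trade GPU memory for speed: pre-allocate GPU memory with CNMeM,
# keep intermediate storage alive, and skip the low_memory optimizations.
THEANO_FLAGS='lib.cnmem=1,allow_gc=False,optimizer_excluding=low_memory' python my_script.py
```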
Could lower the memory usage, but raise computation time:
- :attr:`config.scan.allow_gc` =True # Probably not a significant slowdown if config.lib.cnmem is used.
- :attr:`config.scan.allow_output_prealloc` =False
- Use :func:`batch_normalization()
<theano.tensor.nnet.bn.batch_normalization>`. It uses less memory
than building the corresponding Theano graph.
- Disable one or more of the Scan optimizations:
- ``optimizer_excluding=scanOp_pushout_seqs_ops``
- ``optimizer_excluding=scan_pushout_dot1``
- ``optimizer_excluding=scanOp_pushout_output``
- Disable all optimizations tagged as raising memory usage:
``optimizer_excluding=more_mem`` (currently only the 3 Scan optimizations above)
- `float16 <https://github.com/Theano/Theano/issues/2908>`_.
If you want to analyze the memory usage during computation, the
simplest approach is to let the memory error happen during Theano
execution and use the Theano flag :attr:`exception_verbosity=high`.
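Conversely, the memory-saving flags above can be combined, together with the verbose error reporting just mentioned. Again a sketch only, with a hypothetical script name:

```shell
# Trade speed for memory: garbage-collect inside Scan, disable output
# preallocation, exclude optimizations tagged as raising memory usage,
# and get a detailed report if a memory error still occurs.
THEANO_FLAGS='scan.allow_gc=True,scan.allow_output_prealloc=False,optimizer_excluding=more_mem,exception_verbosity=high' python my_script.py
```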
......@@ -208,7 +208,7 @@ import theano and print the config variable, as in:
significant speed up on functions with many ops that are fast to
execute, but this increases Theano's memory usage.
.. attribute:: scan.allow_output_prealloc
.. attribute:: config.scan.allow_output_prealloc
Bool value, either ``True`` or ``False``
......@@ -219,6 +219,18 @@ import theano and print the config variable, as in:
give a significant speed up with Scan at the cost of slightly increased
memory usage.
.. attribute:: config.scan.allow_gc
Bool value, either ``True`` or ``False``
Default: ``False``
Allow/disallow garbage collection inside of Scan.
If config.allow_gc is ``True``, but config.scan.allow_gc is
``False``, then the inner graph of Scan will be garbage-collected
only after all iterations have run. This is the default.
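The default combination described above (gc enabled globally, but deferred until after all Scan iterations) corresponds to this ``.theanorc`` fragment; a sketch of the config-file form, assuming the usual section-per-prefix layout:

```ini
[global]
allow_gc = True

[scan]
allow_gc = False
```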
.. attribute:: openmp
Bool value: either True or False
......@@ -612,7 +624,7 @@ import theano and print the config variable, as in:
cost of precision. This also disables support for denormal
numbers.
.. attribute:: optimizer_excluding
.. attribute:: config.optimizer_excluding
Default: ""
......
.. _libdoc_gof_graph:
==============================================
:mod:`graph` -- Interface for the Theano graph
==============================================
.. module:: graph
:platform: Unix, Windows
:synopsis: Interface for types of symbolic variables
.. moduleauthor:: LISA
---------
Reference
---------
.. automodule:: theano.gof.graph
:members:
......@@ -13,6 +13,7 @@
.. toctree::
:maxdepth: 1
graph
fgraph
toolbox
type
......
......@@ -4,7 +4,7 @@
:mod:`type` -- Interface for types of variables
================================================
.. module:: fgraph
.. module:: type
:platform: Unix, Windows
:synopsis: Interface for types of symbolic variables
.. moduleauthor:: LISA
......
......@@ -117,7 +117,8 @@ then be used like a normal Python function.
.. note::
As a shortcut, you can skip step 3, and just use a variable's
:func:`eval` method. The :func:`eval` method is not as flexible
:func:`eval <theano.gof.graph.Variable.eval>` method.
The :func:`eval` method is not as flexible
as :func:`function` but it can do everything we've covered in
the tutorial so far. It has the added benefit of not requiring
you to import :func:`function` . Here is how :func:`eval` works:
......
......@@ -137,7 +137,7 @@ is controlled by the value of the ``mode`` parameter.
Theano defines the following modes by name:
- ``'FAST_COMPILE'``: Apply just a few graph optimizations and only use Python implementations.
- ``'FAST_COMPILE'``: Apply just a few graph optimizations and only use Python implementations. So GPU is disabled.
- ``'FAST_RUN'``: Apply all optimizations and use C implementations where possible.
- ``'DebugMode'``: Verify the correctness of all optimizations, and compare C and Python
implementations. This mode can take much longer than the other modes, but can identify
......
......@@ -783,5 +783,9 @@ Modify and execute to support *stride* (i.e. to avoid constraining the input to
Note
----
See :ref:`example_other_random` to know how to handle random numbers
on the GPU.
* See :ref:`example_other_random` to know how to handle random numbers
on the GPU.
* The mode ``FAST_COMPILE`` disables C code, so it also disables the GPU. You
can use the Theano flag ``optimizer=fast_compile`` to speed up
compilation and keep the GPU.
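For example, as a Theano flag on the command line (a sketch; the script name is hypothetical):

```shell
# Faster compilation than the default optimizer, but unlike
# mode=FAST_COMPILE the C code and the GPU stay enabled.
THEANO_FLAGS='optimizer=fast_compile' python my_script.py
```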
......@@ -560,29 +560,33 @@ class Function(object):
Copy this function. The copied function will have a maker and
fgraph separate from the original function. The user can choose
whether to separate storage by changing the share_memory argument.
---------------------
Params:
share_memory -- { boolean } Default is False. When True, two
function share intermediate storages(storages except input and
Parameters
----------
share_memory : boolean
When True, the two functions share intermediate storage (all storage
except input and output storage). Otherwise the two functions share
only partial storage and the same maker. If two functions share memory and
allow_gc=False, this will increase execution speed and save memory.
swap -- { dict } Dictionary that map old SharedVariables to new
swap : dict
Dictionary that maps old SharedVariables to new
SharedVariables. Default is None.
NOTE: The shared variable swap is only done in the new returned
function, not in the user graph.
delete_updates -- { boolean } Default is False. If True, Copied
function will not have update.
name -- { string } If provided, will be the name of the new
delete_updates : boolean
If True, the copied function will not have updates.
name : string
If provided, will be the name of the new
Function. Otherwise, it will be the old name + " copy".
profile -- as theano.function profile parameter
---------------------
Returns:
func -- Copied theano.Function
profile :
Same as the theano.function profile parameter.
Returns
-------
Copied theano.Function
"""
# helper function
def checkSV(sv_ori, sv_rpl):
......
......@@ -116,21 +116,14 @@ def _contains_cycle(fgraph, orderings):
# this is faster than calling get_parents
owner = var.owner
if owner:
parents = [owner]
else:
parents = []
# variables don't appear in orderings, so we don't need to worry
# about that here
if parents:
for parent in parents:
# insert node in node_to_children[r]
# (if r is not already in node_to_children,
# initialize it to [])
node_to_children.setdefault(parent, []).append(var)
parent_counts[var] = len(parents)
if owner:
# insert node in node_to_children[r]
# (if r is not already in node_to_children,
# initialize it to [])
node_to_children.setdefault(owner, []).append(var)
parent_counts[var] = 1
else:
visitable.append(var)
parent_counts[var] = 0
......
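Since a Theano variable has at most one owner, the refactoring above replaces the intermediate parent list with a single check on `owner`. A minimal, Theano-free sketch of this counting step (the `owner` mapping here is a stand-in for illustration, not the real API):

```python
from collections import defaultdict

def init_counts(variables, owner):
    """Prepare the structures a Kahn-style cycle check walks over:
    each variable has at most one parent (its owner), so its parent
    count is 0 or 1, and ownerless variables are immediately visitable."""
    node_to_children = defaultdict(list)
    parent_counts = {}
    visitable = []  # variables with no unvisited parents
    for var in variables:
        own = owner.get(var)
        if own is not None:
            # register var as a child of its owner
            node_to_children[own].append(var)
            parent_counts[var] = 1
        else:
            visitable.append(var)
            parent_counts[var] = 0
    return node_to_children, parent_counts, visitable
```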
......@@ -451,6 +451,30 @@ class Variable(Node):
inputs_to_values
A dictionary mapping theano Variables to values.
Examples
--------
>>> import theano.tensor as T
>>> x = T.dscalar('x')
>>> y = T.dscalar('y')
>>> z = x + y
>>> z.eval({x : 16.3, y : 12.1})
array(28.4)
We passed :func:`eval` a dictionary mapping symbolic theano
variables to the values to substitute for them, and it returned
the numerical value of the expression.
Notes
-----
`eval` will be slow the first time you call it on a variable --
it needs to call :func:`function` to compile the expression behind
the scenes. Subsequent calls to :func:`eval` on that same variable
will be fast, because the variable caches the compiled function.
This way of computing has more overhead than a normal Theano
function, so don't use it too much in real scripts.
"""
if inputs_to_values is None:
......
......@@ -1423,9 +1423,11 @@ class PatternSub(LocalOptimizer):
def __init__(self, in_pattern, out_pattern,
allow_multiple_clients=False,
skip_identities_fn=None, name=None, pdb=False,
tracks=(), get_nodes=None):
tracks=(), get_nodes=None,
values_eq_approx=None):
self.in_pattern = in_pattern
self.out_pattern = out_pattern
self.values_eq_approx = values_eq_approx
if isinstance(in_pattern, (list, tuple)):
self.op = self.in_pattern[0]
elif isinstance(in_pattern, dict):
......@@ -1467,6 +1469,8 @@ class PatternSub(LocalOptimizer):
ret = self.transform(real_node, get_nodes=False)
if ret is not False and ret is not None:
assert len(real_node.outputs) == len(ret)
if self.values_eq_approx:
ret.tag.values_eq_approx = self.values_eq_approx
return dict(izip(real_node.outputs, ret))
if node.op != self.op:
......@@ -1550,8 +1554,10 @@ class PatternSub(LocalOptimizer):
else:
return pattern.clone()
p = self.out_pattern
new = build(p, u)
return [new]
ret = build(p, u)
if self.values_eq_approx:
ret.tag.values_eq_approx = self.values_eq_approx
return [ret]
else:
return False
......
......@@ -743,7 +743,6 @@ class GpuCAReduce(GpuOp):
%(z)s = (CudaNdarray*) CudaNdarray_NewDims(%(nd_out)s, new_dims);
if (NULL == %(z)s)
{
PyErr_Format(PyExc_RuntimeError, "Failed to allocate output");
%(fail)s;
}
}
......@@ -1832,7 +1831,7 @@ class GpuCAReduce(GpuOp):
""" % locals(), file=sio)
def c_code_cache_version_apply(self, node):
version = [13] # the version corresponding to the c code in this Op
version = [14] # the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend...
scalar_node = Apply(self.scalar_op,
......
......@@ -2233,11 +2233,14 @@ def local_gpu_eye(node):
if (host_input.owner and
isinstance(host_input.owner.op, tensor.Eye) and
host_input.owner.op.dtype == "float32"):
if tensor.extract_constant(host_input.owner.inputs[2]) != 0:
return
return [gpu_eye(*host_input.owner.inputs)]
if isinstance(node.op, tensor.Eye) and node.op.dtype == "float32":
if any([(i.owner and isinstance(i.owner.op, HostFromGpu))
for i in node.inputs]):
if tensor.extract_constant(node.inputs[2]) != 0:
return
return [host_from_gpu(gpu_eye(*node.inputs))]
return False
......
......@@ -1224,7 +1224,7 @@ def test_shared_cudandarray():
def test_gpueye():
def check(dtype, N, M_=None):
def check(dtype, N, M_=None, K=0):
# Theano does not accept None as a tensor.
# So we must use a real value.
M = M_
......@@ -1234,22 +1234,24 @@ def test_gpueye():
M = N
N_symb = T.iscalar()
M_symb = T.iscalar()
k_symb = numpy.asarray(0)
k_symb = numpy.asarray(K)
out = T.eye(N_symb, M_symb, k_symb, dtype=dtype)
f = theano.function([N_symb, M_symb],
B.as_cuda_ndarray_variable(out),
mode=mode_with_gpu)
result = numpy.asarray(f(N, M))
utt.assert_allclose(result, numpy.eye(N, M_, dtype=dtype))
utt.assert_allclose(result, numpy.eye(N, M_, K, dtype=dtype))
assert result.dtype == numpy.dtype(dtype)
assert any([isinstance(node.op, B.GpuEye)
for node in f.maker.fgraph.toposort()])
if K == 0:
assert any([isinstance(node.op, B.GpuEye)
for node in f.maker.fgraph.toposort()])
for dtype in ['float32']:
yield check, dtype, 3
# M != N, k = 0
yield check, dtype, 3, 5
yield check, dtype, 5, 3
yield check, dtype, 5, 3, 1
class test_size(unittest.TestCase):
......
......@@ -212,8 +212,9 @@ class BlockSparse_Gemv_and_Outer(unittest.TestCase):
def op(b, h, W):
return self.gemv_op(b.take(oIdx, axis=0), W, h, iIdx, oIdx)
utt.verify_grad(metaop, [b_val, h_val, W_val], mode=self.mode)
utt.verify_grad(op, [b_val, h_val, W_val], mode=self.mode)
eps = 3e-3
utt.verify_grad(metaop, [b_val, h_val, W_val], mode=self.mode, eps=eps)
utt.verify_grad(op, [b_val, h_val, W_val], mode=self.mode, eps=eps)
def test_sparseblockgemv_grad_1(self):
"""
......
......@@ -2683,10 +2683,10 @@ class __ComparisonOpSD(gof.op.Op):
x, y = as_sparse_variable(x), tensor.as_tensor_variable(y)
assert y.type.ndim == 2
out = tensor.TensorType(dtype='uint8', broadcastable=(False, False))()
return gof.Apply(self,
[x, y],
[SparseType(dtype='uint8',
format=x.type.format)()])
[out])
def perform(self, node, inputs, outputs):
(x, y) = inputs
......@@ -2694,7 +2694,9 @@ class __ComparisonOpSD(gof.op.Op):
assert _is_sparse(x)
assert x.shape == y.shape
assert _is_dense(y)
out[0] = self.comparison(x, y).astype('uint8')
o = self.comparison(x, y).astype('uint8')
o = numpy.asarray(o)
out[0] = o
def infer_shape(self, node, ins_shapes):
return [ins_shapes[0]]
......
......@@ -2992,7 +2992,7 @@ def mean(input, axis=None, dtype=None, op=False, keepdims=False,
For gpu, if you specify dtype=float32, everything will be done on the gpu.
"""
input = as_tensor_variable(input)
if op:
if dtype not in (None, 'float64'):
raise NotImplementedError(
......
......@@ -18,6 +18,7 @@ from theano.configparser import AddConfigVar, BoolParam
from theano.printing import pprint
from theano.tensor import basic as tensor
from theano.tensor import elemwise, opt, NotScalarConstantError
from theano.tensor.type import values_eq_approx_remove_inf
############
......@@ -314,6 +315,9 @@ theano.compile.optdb['uncanonicalize'].register("local_hard_sigmoid",
class ScalarSoftplus(scalar.UnaryScalarOp):
"""
This helps numerical stability.
"""
@staticmethod
def static_impl(x):
if x < -30.0:
......@@ -378,6 +382,7 @@ logsigm_to_softplus = gof.PatternSub(
(tensor.log, (sigmoid, 'x')),
(tensor.neg, (softplus, (tensor.neg, 'x'))),
allow_multiple_clients=True,
values_eq_approx=values_eq_approx_remove_inf,
skip_identities_fn=_skip_mul_1)
......@@ -403,12 +408,14 @@ log1msigm_to_softplus = gof.PatternSub(
(sigmoid, 'x'))),
(tensor.neg, (softplus, 'x')),
allow_multiple_clients=True,
values_eq_approx=values_eq_approx_remove_inf,
skip_identities_fn=_skip_mul_1)
log1pexp_to_softplus = gof.PatternSub(
(tensor.log1p,
(tensor.exp, 'x')),
(softplus, 'x'),
values_eq_approx=values_eq_approx_remove_inf,
allow_multiple_clients=True)
opt.register_stabilize(logsigm_to_softplus, name='logsigm_to_softplus')
......
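The patterns registered above are nested tuples whose string leaves name pattern variables. A toy, Theano-free sketch of how such tuple patterns match an expression tree (a hypothetical helper for illustration, not the real PatternSub implementation):

```python
def match(pattern, expr, bindings=None):
    """Match a nested-tuple pattern such as ('log', ('sigmoid', 'x'))
    against an expression tree; string leaves are pattern variables
    that bind to the corresponding subexpression."""
    if bindings is None:
        bindings = {}
    if isinstance(pattern, str):
        # pattern variable: bind it, or check consistency with a prior binding
        if pattern in bindings:
            return bindings if bindings[pattern] == expr else None
        bindings[pattern] = expr
        return bindings
    if (not isinstance(expr, tuple) or len(pattern) != len(expr)
            or pattern[0] != expr[0]):
        return None  # different op or arity: no match
    for p, e in zip(pattern[1:], expr[1:]):
        if match(p, e, bindings) is None:
            return None
    return bindings
```

A successful match yields the bindings needed to instantiate the output pattern in its place.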
......@@ -412,10 +412,10 @@ class TestConv3D(utt.InferShapeTester):
H_shape = self.H_shape_func()
# make index maps
h = N.zeros(H_shape[1:])
r = N.zeros(H_shape[1:])
c = N.zeros(H_shape[1:])
t = N.zeros(H_shape[1:])
h = N.zeros(H_shape[1:], dtype='int32')
r = N.zeros(H_shape[1:], dtype='int32')
c = N.zeros(H_shape[1:], dtype='int32')
t = N.zeros(H_shape[1:], dtype='int32')
for qi in xrange(0, H_shape[4]):
h[:, :, :, qi] = qi
......
......@@ -4464,6 +4464,10 @@ class T_mean(unittest.TestCase):
data = rand(50)
assert numpy.allclose(f(data), numpy.mean(data))
def test_list(self):
ll = [theano.shared(0.), theano.shared(2.)]
assert tensor.mean(ll).eval() == 1
class test_matinv(unittest.TestCase):
......@@ -6090,11 +6094,16 @@ def test_var():
assert numpy.allclose(numpy.var(a_val, axis=2), f(a_val))
def test_sum_overflow():
"""Ensure that overflow errors are a little bit harder to get"""
a = Tensor(dtype='int8', broadcastable=[False])()
f = function([a], sum(a))
assert f([1] * 300) == 300
class T_sum(unittest.TestCase):
def test_sum_overflow(self):
"""Ensure that overflow errors are a little bit harder to get"""
a = Tensor(dtype='int8', broadcastable=[False])()
f = function([a], sum(a))
assert f([1] * 300) == 300
def test_list(self):
ll = [theano.shared(0.), theano.shared(2.)]
assert tensor.sum(ll).eval() == 2
@dec.skipif(
......
......@@ -4229,7 +4229,9 @@ def test_constant_get_stabilized():
"""
x2 = T.scalar()
y2 = T.log(1 + T.exp(x2))
f2 = theano.function([x2], y2)
mode = theano.compile.get_default_mode()
mode.check_isfinite = False
f2 = theano.function([x2], y2, mode=mode)
try:
assert len(f2.maker.fgraph.toposort()) == 1
assert f2.maker.fgraph.toposort()[0].op == \
......@@ -4238,14 +4240,14 @@ def test_constant_get_stabilized():
x = T.as_tensor_variable(800)
y = T.log(1 + T.exp(x))
f = theano.function([], y)
f = theano.function([], y, mode=mode)
assert len(f.maker.fgraph.toposort()) == 0
assert numpy.isinf(f())
# When this error is fixed, the following line should be ok.
assert f() == 800, f()
except (AssertionError, theano.compile.debugmode.InvalidValueError):
except AssertionError:
raise SkipTest('Theano optimizes constant before stabilization. '
'This breaks stabilization optimization in some '
'cases. See #504.')
......