提交 80264d01 authored 作者: Arnaud Bergeron's avatar Arnaud Bergeron

Fix remaining test problems in documentation.

All tests pass!
上级 f747933f
...@@ -312,8 +312,7 @@ Pretty Printing ...@@ -312,8 +312,7 @@ Pretty Printing
~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~
>>> theano.printing.pprint(prediction) # doctest: +NORMALIZE_WHITESPACE >>> theano.printing.pprint(prediction) # doctest: +NORMALIZE_WHITESPACE
'gt((TensorConstant{1} / (TensorConstant{1} + exp(((-(x \\dot w)) - b)))), 'gt((TensorConstant{1} / (TensorConstant{1} + exp(((-(x \\dot w)) - b)))), TensorConstant{0.5})'
TensorConstant{0.5})'
Debug Print Debug Print
...@@ -321,7 +320,7 @@ Debug Print ...@@ -321,7 +320,7 @@ Debug Print
The graph before optimization: The graph before optimization:
>>> theano.printing.debugprint(prediction) # doctest: +NORMALIZE_WHITESPACE >>> theano.printing.debugprint(prediction) # doctest: +NORMALIZE_WHITESPACE, +SKIP
Elemwise{gt,no_inplace} [@A] '' Elemwise{gt,no_inplace} [@A] ''
|Elemwise{true_div,no_inplace} [@B] '' |Elemwise{true_div,no_inplace} [@B] ''
| |DimShuffle{x} [@C] '' | |DimShuffle{x} [@C] ''
...@@ -342,7 +341,7 @@ The graph before optimization: ...@@ -342,7 +341,7 @@ The graph before optimization:
The graph after optimization: The graph after optimization:
>>> theano.printing.debugprint(predict) # doctest: +NORMALIZE_WHITESPACE >>> theano.printing.debugprint(predict) # doctest: +NORMALIZE_WHITESPACE, +SKIP
Elemwise{Composite{GT(scalar_sigmoid((-((-i0) - i1))), i2)}} [@A] '' 4 Elemwise{Composite{GT(scalar_sigmoid((-((-i0) - i1))), i2)}} [@A] '' 4
|CGemv{inplace} [@B] '' 3 |CGemv{inplace} [@B] '' 3
| |Alloc [@C] '' 2 | |Alloc [@C] '' 2
...@@ -364,7 +363,7 @@ Picture Printing of Graphs ...@@ -364,7 +363,7 @@ Picture Printing of Graphs
The graph before optimization: The graph before optimization:
>>> theano.printing.pydotprint(prediction, outfile="pics/logreg_pydotprint_prediction.png", var_with_name_simple=True) >>> theano.printing.pydotprint(prediction, outfile="pics/logreg_pydotprint_prediction.png", var_with_name_simple=True) # doctest: +SKIP
The output file is available at pics/logreg_pydotprint_prediction.png The output file is available at pics/logreg_pydotprint_prediction.png
.. image:: ./pics/logreg_pydotprint_prediction.png .. image:: ./pics/logreg_pydotprint_prediction.png
...@@ -372,7 +371,7 @@ The output file is available at pics/logreg_pydotprint_prediction.png ...@@ -372,7 +371,7 @@ The output file is available at pics/logreg_pydotprint_prediction.png
The graph after optimization: The graph after optimization:
>>> theano.printing.pydotprint(predict, outfile="pics/logreg_pydotprint_predict.png", var_with_name_simple=True) >>> theano.printing.pydotprint(predict, outfile="pics/logreg_pydotprint_predict.png", var_with_name_simple=True) # doctest: +SKIP
The output file is available at pics/logreg_pydotprint_predict.png The output file is available at pics/logreg_pydotprint_predict.png
.. image:: ./pics/logreg_pydotprint_predict.png .. image:: ./pics/logreg_pydotprint_predict.png
...@@ -380,7 +379,7 @@ The output file is available at pics/logreg_pydotprint_predict.png ...@@ -380,7 +379,7 @@ The output file is available at pics/logreg_pydotprint_predict.png
The optimized training graph: The optimized training graph:
>>> theano.printing.pydotprint(train, outfile="pics/logreg_pydotprint_train.png", var_with_name_simple=True) >>> theano.printing.pydotprint(train, outfile="pics/logreg_pydotprint_train.png", var_with_name_simple=True) # doctest: +SKIP
The output file is available at pics/logreg_pydotprint_train.png The output file is available at pics/logreg_pydotprint_train.png
.. image:: ./pics/logreg_pydotprint_train.png .. image:: ./pics/logreg_pydotprint_train.png
......
...@@ -56,7 +56,8 @@ Simple example ...@@ -56,7 +56,8 @@ Simple example
>>> a = theano.tensor.vector("a") # declare symbolic variable >>> a = theano.tensor.vector("a") # declare symbolic variable
>>> b = a + a**10 # build symbolic expression >>> b = a + a**10 # build symbolic expression
>>> f = theano.function([a], b) # compile function >>> f = theano.function([a], b) # compile function
>>> print f([0,1,2]) # prints `array([0,2,1026])` >>> f([0,1,2])
array([ 0., 2., 1026.])
====================================================== ===================================================== ====================================================== =====================================================
...@@ -332,7 +333,7 @@ Details regarding symbolic broadcasting... ...@@ -332,7 +333,7 @@ Details regarding symbolic broadcasting...
Differentiation details Differentiation details
----------------------- -----------------------
>>> gw,gb = T.grad(cost, [w,b]) >>> gw,gb = T.grad(cost, [w,b]) # doctest: +SKIP
* T.grad works symbolically: takes and returns a Theano variable * T.grad works symbolically: takes and returns a Theano variable
......
...@@ -148,8 +148,7 @@ Pretty Printing ...@@ -148,8 +148,7 @@ Pretty Printing
~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~
>>> theano.printing.pprint(prediction) # doctest: +NORMALIZE_WHITESPACE >>> theano.printing.pprint(prediction) # doctest: +NORMALIZE_WHITESPACE
'gt((TensorConstant{1} / (TensorConstant{1} + exp(((-(x \\dot w)) - b)))), 'gt((TensorConstant{1} / (TensorConstant{1} + exp(((-(x \\dot w)) - b)))), TensorConstant{0.5})'
TensorConstant{0.5})'
Debug Print Debug Print
...@@ -157,8 +156,11 @@ Debug Print ...@@ -157,8 +156,11 @@ Debug Print
The graph before optimization: The graph before optimization:
>>> theano.printing.debugprint(prediction) # doctest: +NORMALIZE_WHITESPACE .. doctest::
Elemwise{gt,no_inplace} [@A] '' :options: +SKIP
>>> theano.printing.debugprint(prediction) # doctest: +NORMALIZE_WHITESPACE
Elemwise{gt,no_inplace} [@A] ''
|Elemwise{true_div,no_inplace} [@B] '' |Elemwise{true_div,no_inplace} [@B] ''
| |DimShuffle{x} [@C] '' | |DimShuffle{x} [@C] ''
| | |TensorConstant{1} [@D] | | |TensorConstant{1} [@D]
...@@ -178,20 +180,23 @@ The graph before optimization: ...@@ -178,20 +180,23 @@ The graph before optimization:
The graph after optimization: The graph after optimization:
>>> theano.printing.debugprint(predict) # doctest: +NORMALIZE_WHITESPACE .. doctest::
Elemwise{Composite{GT(scalar_sigmoid((-((-i0) - i1))), i2)}} [@A] '' 4 :options: +SKIP
|CGemv{inplace} [@B] '' 3
| |Alloc [@C] '' 2 >>> theano.printing.debugprint(predict) # doctest: +NORMALIZE_WHITESPACE
| | |TensorConstant{0.0} [@D] Elemwise{Composite{GT(scalar_sigmoid((-((-i0) - i1))), i2)}} [@A] '' 4
| | |Shape_i{0} [@E] '' 1 |CGemv{inplace} [@B] '' 3
| | |x [@F] | |Alloc [@C] '' 2
| |TensorConstant{1.0} [@G] | | |TensorConstant{0.0} [@D]
| |x [@F] | | |Shape_i{0} [@E] '' 1
| |w [@H] | | |x [@F]
| |TensorConstant{0.0} [@D] | |TensorConstant{1.0} [@G]
|InplaceDimShuffle{x} [@I] '' 0 | |x [@F]
| |b [@J] | |w [@H]
|TensorConstant{(1,) of 0.5} [@K] | |TensorConstant{0.0} [@D]
|InplaceDimShuffle{x} [@I] '' 0
| |b [@J]
|TensorConstant{(1,) of 0.5} [@K]
Picture Printing of Graphs Picture Printing of Graphs
...@@ -201,24 +206,33 @@ Picture Printing of Graphs ...@@ -201,24 +206,33 @@ Picture Printing of Graphs
The graph before optimization: The graph before optimization:
>>> theano.printing.pydotprint(prediction, outfile="pics/logreg_pydotprint_prediction.png", var_with_name_simple=True) .. doctest::
The output file is available at pics/logreg_pydotprint_prediction.png :options: +SKIP
>>> theano.printing.pydotprint(prediction, outfile="pics/logreg_pydotprint_prediction.png", var_with_name_simple=True)
The output file is available at pics/logreg_pydotprint_prediction.png
.. image:: ./pics/logreg_pydotprint_prediction.png .. image:: ./pics/logreg_pydotprint_prediction.png
:width: 800 px :width: 800 px
The graph after optimization: The graph after optimization:
>>> theano.printing.pydotprint(predict, outfile="pics/logreg_pydotprint_predict.png", var_with_name_simple=True) .. doctest::
The output file is available at pics/logreg_pydotprint_predict.png :options: +SKIP
>>> theano.printing.pydotprint(predict, outfile="pics/logreg_pydotprint_predict.png", var_with_name_simple=True)
The output file is available at pics/logreg_pydotprint_predict.png
.. image:: ./pics/logreg_pydotprint_predict.png .. image:: ./pics/logreg_pydotprint_predict.png
:width: 800 px :width: 800 px
The optimized training graph: The optimized training graph:
>>> theano.printing.pydotprint(train, outfile="pics/logreg_pydotprint_train.png", var_with_name_simple=True) .. doctest::
The output file is available at pics/logreg_pydotprint_train.png :options: +SKIP
>>> theano.printing.pydotprint(train, outfile="pics/logreg_pydotprint_train.png", var_with_name_simple=True)
The output file is available at pics/logreg_pydotprint_train.png
.. image:: ./pics/logreg_pydotprint_train.png .. image:: ./pics/logreg_pydotprint_train.png
:width: 1500 px :width: 1500 px
......
...@@ -54,8 +54,8 @@ Simple example ...@@ -54,8 +54,8 @@ Simple example
>>> a = theano.tensor.vector("a") # declare symbolic variable >>> a = theano.tensor.vector("a") # declare symbolic variable
>>> b = a + a ** 10 # build symbolic expression >>> b = a + a ** 10 # build symbolic expression
>>> f = theano.function([a], b) # compile function >>> f = theano.function([a], b) # compile function
>>> print f([0, 1, 2]) # prints `array([0, 2, 1026])` >>> f([0, 1, 2])
array([ 0., 2., 1026.])
====================================================== ===================================================== ====================================================== =====================================================
Unoptimized graph Optimized graph Unoptimized graph Optimized graph
...@@ -118,7 +118,7 @@ Where are those optimization applied? ...@@ -118,7 +118,7 @@ Where are those optimization applied?
# Log(1-sigmoid(var)) -> -sigmoid(var) # Log(1-sigmoid(var)) -> -sigmoid(var)
prediction = p_1 > 0.5 prediction = p_1 > 0.5
cost = xent.mean() + 0.01 * (w ** 2).sum() cost = xent.mean() + 0.01 * (w ** 2).sum()
gw,gb = tt.grad(cost, [w, b]) gw, gb = tt.grad(cost, [w, b])
train = theano.function( train = theano.function(
inputs=[x, y], inputs=[x, y],
...@@ -294,7 +294,7 @@ Details regarding symbolic broadcasting... ...@@ -294,7 +294,7 @@ Details regarding symbolic broadcasting...
Differentiation details Differentiation details
----------------------- -----------------------
>>> gw,gb = tt.grad(cost, [w,b]) >>> gw, gb = tt.grad(cost, [w,b]) # doctest: +SKIP
* tt.grad works symbolically: takes and returns a Theano variable * tt.grad works symbolically: takes and returns a Theano variable
......
...@@ -3,10 +3,10 @@ ...@@ -3,10 +3,10 @@
Glossary Glossary
======== ========
.. .. testsetup::
# This is for the doctests in the file
>>> import theano import theano
>>> from theano import tensor from theano import tensor
.. glossary:: .. glossary::
......
...@@ -10,7 +10,9 @@ To run Theano on the Mammouth cluster, follow these simple steps: ...@@ -10,7 +10,9 @@ To run Theano on the Mammouth cluster, follow these simple steps:
the goodies for using the latest and greatest (optimized) libraries the goodies for using the latest and greatest (optimized) libraries
(numpy, scipy, etc.) (numpy, scipy, etc.)
>>> source /home/bastienf/.local.bashrc .. code-block:: sh
source /home/bastienf/.local.bashrc
Perhaps even put this in your ``.bashrc`` Perhaps even put this in your ``.bashrc``
......
...@@ -18,9 +18,11 @@ the interface for compiling graphs into callable objects. ...@@ -18,9 +18,11 @@ the interface for compiling graphs into callable objects.
You've already seen example usage in the basic tutorial... something like this: You've already seen example usage in the basic tutorial... something like this:
>>> import theano
>>> x = theano.tensor.dscalar() >>> x = theano.tensor.dscalar()
>>> f = theano.function([x], 2*x) >>> f = theano.function([x], 2*x)
>>> print f(4) # prints 8.0 >>> f(4)
array(8.0)
The idea here is that we've compiled the symbolic graph (``2*x``) into a function that can be called on a number and will do some computations. The idea here is that we've compiled the symbolic graph (``2*x``) into a function that can be called on a number and will do some computations.
......
...@@ -4,6 +4,10 @@ ...@@ -4,6 +4,10 @@
:mod:`utils` -- Utilities functions operating on the graph :mod:`utils` -- Utilities functions operating on the graph
========================================================== ==========================================================
.. testsetup:: *
from theano.gof.utils import *
.. module:: utils .. module:: utils
:platform: Unix, Windows :platform: Unix, Windows
:synopsis: Utilities functions operating on the graph :synopsis: Utilities functions operating on the graph
......
...@@ -9,6 +9,10 @@ ...@@ -9,6 +9,10 @@
:synopsis: low-level automatic differentiation :synopsis: low-level automatic differentiation
.. moduleauthor:: LISA .. moduleauthor:: LISA
.. testsetup:: *
from theano.gradient import *
Symbolic gradient is usually computed from :func:`gradient.grad`, which offers a Symbolic gradient is usually computed from :func:`gradient.grad`, which offers a
more convenient syntax for the common case of wanting the gradient in some more convenient syntax for the common case of wanting the gradient in some
expressions with respect to a scalar cost. The :func:`grad_sources_inputs` expressions with respect to a scalar cost. The :func:`grad_sources_inputs`
......
...@@ -5,6 +5,10 @@ ...@@ -5,6 +5,10 @@
:mod:`misc.pkl_utils` - Tools for serialization. :mod:`misc.pkl_utils` - Tools for serialization.
================================================ ================================================
.. testsetup:: *
from theano.misc.pkl_utils import *
.. autofunction:: theano.misc.pkl_utils.dump .. autofunction:: theano.misc.pkl_utils.dump
.. autofunction:: theano.misc.pkl_utils.load .. autofunction:: theano.misc.pkl_utils.load
......
...@@ -9,6 +9,10 @@ ...@@ -9,6 +9,10 @@
:synopsis: Provides the Print Op and graph-printing routines. :synopsis: Provides the Print Op and graph-printing routines.
.. moduleauthor:: LISA .. moduleauthor:: LISA
.. testsetup::
import theano
Guide Guide
====== ======
...@@ -19,12 +23,13 @@ Intermediate values in a computation cannot be printed in ...@@ -19,12 +23,13 @@ Intermediate values in a computation cannot be printed in
the normal python way with the print statement, because Theano has no *statements*. the normal python way with the print statement, because Theano has no *statements*.
Instead there is the :class:`Print` Op. Instead there is the :class:`Print` Op.
>>> from theano import tensor as T, function, printing
>>> x = T.dvector() >>> x = T.dvector()
>>> hello_world_op = printing.Print('hello world') >>> hello_world_op = printing.Print('hello world')
>>> printed_x = hello_world_op(x) >>> printed_x = hello_world_op(x)
>>> f = function([x], printed_x) >>> f = function([x], printed_x)
>>> f([1, 2, 3]) >>> r = f([1, 2, 3])
>>> # output: "hello world __str__ = [ 1. 2. 3.]" hello world __str__ = [ 1. 2. 3.]
If you print more than one thing in a function like `f`, they will not If you print more than one thing in a function like `f`, they will not
necessarily be printed in the order that you think. The order might even depend necessarily be printed in the order that you think. The order might even depend
...@@ -46,14 +51,15 @@ Theano also provides :func:`theano.printing.pydotprint` that creates a png image ...@@ -46,14 +51,15 @@ Theano also provides :func:`theano.printing.pydotprint` that creates a png image
1) The first is :func:`theano.pp`. 1) The first is :func:`theano.pp`.
>>> from theano import pp, tensor as T
>>> x = T.dscalar('x') >>> x = T.dscalar('x')
>>> y = x ** 2 >>> y = x ** 2
>>> gy = T.grad(y, x) >>> gy = T.grad(y, x)
>>> pp(gy) # print out the gradient prior to optimization >>> pp(gy) # print out the gradient prior to optimization
'((fill((x ** 2), 1.0) * 2) * (x ** (2 - 1)))' '((fill((x ** TensorConstant{2}), TensorConstant{1.0}) * TensorConstant{2}) * (x ** (TensorConstant{2} - TensorConstant{1})))'
>>> f = function([x], gy) >>> f = function([x], gy)
>>> pp(f.maker.fgraph.outputs[0]) >>> pp(f.maker.fgraph.outputs[0])
'(2.0 * x)' '(TensorConstant{2.0} * x)'
The parameter in T.dscalar('x') in the first line is the name of this variable The parameter in T.dscalar('x') in the first line is the name of this variable
in the graph. This name is used when printing the graph to make it more readable. in the graph. This name is used when printing the graph to make it more readable.
...@@ -74,8 +80,7 @@ iteration number or other kinds of information in the name. ...@@ -74,8 +80,7 @@ iteration number or other kinds of information in the name.
2) The second function to print a graph is :func:`theano.printing.debugprint` 2) The second function to print a graph is :func:`theano.printing.debugprint`
>>> theano.printing.debugprint(f.maker.fgraph.outputs[0]) # doctest: +NORMALIZE_WHITESPACE
>>> theano.printing.debugprint(f.maker.fgraph.outputs[0])
Elemwise{mul,no_inplace} [@A] '' Elemwise{mul,no_inplace} [@A] ''
|TensorConstant{2.0} [@B] |TensorConstant{2.0} [@B]
|x [@C] |x [@C]
...@@ -100,7 +105,7 @@ happen when that Variable has already been printed. Where else has it been ...@@ -100,7 +105,7 @@ happen when that Variable has already been printed. Where else has it been
printed? Look for debugprint identifier using the Find feature of your text printed? Look for debugprint identifier using the Find feature of your text
editor. editor.
>>> theano.printing.debugprint(gy) >>> theano.printing.debugprint(gy) # doctest: +NORMALIZE_WHITESPACE
Elemwise{mul} [@A] '' Elemwise{mul} [@A] ''
|Elemwise{mul} [@B] '' |Elemwise{mul} [@B] ''
| |Elemwise{second,no_inplace} [@C] '' | |Elemwise{second,no_inplace} [@C] ''
...@@ -113,10 +118,10 @@ Elemwise{mul} [@A] '' ...@@ -113,10 +118,10 @@ Elemwise{mul} [@A] ''
|x [@E] |x [@E]
|Elemwise{sub} [@I] '' |Elemwise{sub} [@I] ''
|TensorConstant{2} [@F] |TensorConstant{2} [@F]
|InplaceDimShuffle{} [@J] '' |DimShuffle{} [@J] ''
|TensorConstant{1} [@K] |TensorConstant{1} [@K]
>>> theano.printing.debugprint(gy, depth=2) >>> theano.printing.debugprint(gy, depth=2) # doctest: +NORMALIZE_WHITESPACE
Elemwise{mul} [@A] '' Elemwise{mul} [@A] ''
|Elemwise{mul} [@B] '' |Elemwise{mul} [@B] ''
|Elemwise{pow} [@C] '' |Elemwise{pow} [@C] ''
......
...@@ -63,23 +63,25 @@ The following example builds a matrix and returns its columns. It ...@@ -63,23 +63,25 @@ The following example builds a matrix and returns its columns. It
prints the i-th column, i.e. a list of indices in the column and their prints the i-th column, i.e. a list of indices in the column and their
corresponding value in the second list. corresponding value in the second list.
>>> import numpy as np
>>> import scipy.sparse as sp
>>> data = np.asarray([7, 8, 9]) >>> data = np.asarray([7, 8, 9])
>>> indices = np.asarray([0, 1, 2]) >>> indices = np.asarray([0, 1, 2])
>>> indptr = np.asarray([0, 2, 3, 3]) >>> indptr = np.asarray([0, 2, 3, 3])
>>> m = sp.csc_matrix((data, indices, indptr), shape=(3, 3)) >>> m = sp.csc_matrix((data, indices, indptr), shape=(3, 3))
>>> print m.toarray() >>> m.toarray()
[[7 0 0] array([[7, 0, 0],
[8 0 0] [8, 0, 0],
[0 9 0]] [0, 9, 0]])
>>> i = 0 >>> i = 0
>>> print m.indices[m.indptr[i]:m.indptr[i+1]], m.data[m.indptr[i]:m.indptr[i+1]] >>> m.indices[m.indptr[i]:m.indptr[i+1]], m.data[m.indptr[i]:m.indptr[i+1]]
[0, 1] [7, 8] (array([0, 1], dtype=int32), array([7, 8]))
>>> i = 1 >>> i = 1
>>> print m.indices[m.indptr[i]:m.indptr[i+1]], m.data[m.indptr[i]:m.indptr[i+1]] >>> m.indices[m.indptr[i]:m.indptr[i+1]], m.data[m.indptr[i]:m.indptr[i+1]]
[2] [9] (array([2], dtype=int32), array([9]))
>>> i = 2 >>> i = 2
>>> print m.indices[m.indptr[i]:m.indptr[i+1]], m.data[m.indptr[i]:m.indptr[i+1]] >>> m.indices[m.indptr[i]:m.indptr[i+1]], m.data[m.indptr[i]:m.indptr[i+1]]
[] [] (array([], dtype=int32), array([], dtype=int64))
CSR Matrix CSR Matrix
---------- ----------
...@@ -97,23 +99,25 @@ The following example builds a matrix and returns its rows. It prints ...@@ -97,23 +99,25 @@ The following example builds a matrix and returns its rows. It prints
the i-th row, i.e. a list of indices in the row and their the i-th row, i.e. a list of indices in the row and their
corresponding value in the second list. corresponding value in the second list.
>>> import numpy as np
>>> import scipy.sparse as sp
>>> data = np.asarray([7, 8, 9]) >>> data = np.asarray([7, 8, 9])
>>> indices = np.asarray([0, 1, 2]) >>> indices = np.asarray([0, 1, 2])
>>> indptr = np.asarray([0, 2, 3, 3]) >>> indptr = np.asarray([0, 2, 3, 3])
>>> m = sp.csr_matrix((data, indices, indptr), shape=(3, 3)) >>> m = sp.csr_matrix((data, indices, indptr), shape=(3, 3))
>>> print m.toarray() >>> m.toarray()
[[7 8 0] array([[7, 8, 0],
[0 0 9] [0, 0, 9],
[0 0 0]] [0, 0, 0]])
>>> i = 0 >>> i = 0
>>> print m.indices[m.indptr[i]:m.indptr[i+1]], m.data[m.indptr[i]:m.indptr[i+1]] >>> m.indices[m.indptr[i]:m.indptr[i+1]], m.data[m.indptr[i]:m.indptr[i+1]]
[0, 1] [7, 8] (array([0, 1], dtype=int32), array([7, 8]))
>>> i = 1 >>> i = 1
>>> print m.indices[m.indptr[i]:m.indptr[i+1]], m.data[m.indptr[i]:m.indptr[i+1]] >>> m.indices[m.indptr[i]:m.indptr[i+1]], m.data[m.indptr[i]:m.indptr[i+1]]
[2] [9] (array([2], dtype=int32), array([9]))
>>> i = 2 >>> i = 2
>>> print m.indices[m.indptr[i]:m.indptr[i+1]], m.data[m.indptr[i]:m.indptr[i+1]] >>> m.indices[m.indptr[i]:m.indptr[i+1]], m.data[m.indptr[i]:m.indptr[i+1]]
[] [] (array([], dtype=int32), array([], dtype=int64))
List of Implemented Operations List of Implemented Operations
============================== ==============================
......
...@@ -1665,8 +1665,8 @@ Linear Algebra ...@@ -1665,8 +1665,8 @@ Linear Algebra
[0, 1, 2], [0, 1, 2],
[0, 1, 2], [0, 1, 2],
[0, 1, 2], [0, 1, 2],
[0, 1, 2]], dtype=int8) [0, 1, 2]], dtype=int8)
.. function:: ogrid .. function:: ogrid
:returns: an instance which returns an open (i.e. not fleshed out) mesh-grid :returns: an instance which returns an open (i.e. not fleshed out) mesh-grid
...@@ -1685,8 +1685,8 @@ Linear Algebra ...@@ -1685,8 +1685,8 @@ Linear Algebra
[3], [3],
[4]], dtype=int8) [4]], dtype=int8)
>>> b[1].eval() >>> b[1].eval()
array([[0, 1, 2, 3]], dtype=int8) array([[0, 1, 2]], dtype=int8)
Gradient / Differentiation Gradient / Differentiation
========================== ==========================
......
...@@ -2,6 +2,10 @@ ...@@ -2,6 +2,10 @@
:mod:`tensor.extra_ops` -- Tensor Extra Ops :mod:`tensor.extra_ops` -- Tensor Extra Ops
=================================================================== ===================================================================
.. testsetup:: *
from theano.tensor.extra_ops import *
.. module:: tensor.extra_ops .. module:: tensor.extra_ops
:platform: Unix, Windows :platform: Unix, Windows
:synopsis: Tensor Extra Ops :synopsis: Tensor Extra Ops
......
...@@ -2,6 +2,10 @@ ...@@ -2,6 +2,10 @@
:mod:`tensor.utils` -- Tensor Utils :mod:`tensor.utils` -- Tensor Utils
=================================================================== ===================================================================
.. testsetup::
from theano.tensor.utils import *
.. module:: tensor.utils .. module:: tensor.utils
:platform: Unix, Windows :platform: Unix, Windows
:synopsis: Tensor Utils :synopsis: Tensor Utils
......
=================
Automatic updates
=================
.. note:
Proposed 2010 01 13
Done 2010 04 ??
The Module version of RandomStreams could arrange for the automatic update of
certain inputs (such as the random number generators) at the time of make(), so
that certain *obvious* patterns would work:
>>> rs = RandomStreams()
>>> u = rs.uniform(...)
>>> f = theano.function([], u)
>>> assert not numpy.all(f() == f())
Unfortunately, with shared variables this does not work! Function needs to be
told which shared variables to update. The current workaround is to do this:
>>> theano.function([], u, updates=rs.updates())
or this:
>>> theano.function([], u, updates=[u.update])
But it is all too easy to forget to do either of these workarounds, and
accidentally run a program whose random numbers are the same in every call.
Proposal
========
Add an optional `default_update` attribute to Shared variables. This will be
consulted by function. If no update expression is given for this variable in
the updates list, then this default will be inserted. Note well: a value of None for the
default_update means to update with a value of None! To have no default update,
make sure that the default_update attribute is not defined.
Add an optional argument to function: `no_default_updates`. This argument defaults to
False, which results in the current semantics.
A True value here would mean "ignore all default_update expressions", and this
would be useful for disabling implicit behaviour.
A list of shared variables here would mean to ignore the
default_update_expressions in these specific variables.
Alternatives
============
Consider a singleton 'NOUPDATE' object that can be used as a pseudo-expression
in the update list. This doesn't introduce a new keyword argument, which makes
it slightly more awkward to document in theano.function. Really though, I have
no strong feelings between this and the no_updates paramter.
...@@ -22,17 +22,20 @@ max. The third argument is an array into which the result can be ...@@ -22,17 +22,20 @@ max. The third argument is an array into which the result can be
written. written.
So for example: So for example:
.. code-block:: python
.. doctest::
>>> max(3, 4) :options: +SKIP
4
>>> numpy.max(3, 4) >>> import numpy
3 >>> max(3, 4)
>>> a,b,c = [numpy.asarray(i) for i in [0,1,2]] 4
>>> numpy.max(a,b,c) >>> numpy.max(3, 4) # This is an error
0 3
>>> c >>> a, b, c = [numpy.asarray(i) for i in [0, 1, 2]]
array(0) >>> numpy.max(a, b, c) # This is an error
0
>>> c
array(0)
Be careful! Be careful!
......
...@@ -21,36 +21,50 @@ should be written: ...@@ -21,36 +21,50 @@ should be written:
Defining a shared variable for the lookup table Defining a shared variable for the lookup table
>>> lookup_table = theano.shared(matrix_ndarray). .. code-block:: python
lookup_table = theano.shared(matrix_ndarray)
Getting a subset of the table (some rows or some columns) by passing Getting a subset of the table (some rows or some columns) by passing
an integer vector of indices corresponding to those rows or columns. an integer vector of indices corresponding to those rows or columns.
>>> subset = lookup_table[vector_of_indices] .. code-block:: python
subset = lookup_table[vector_of_indices]
From now on, use only 'subset'. Do not call lookup_table[vector_of_indices] From now on, use only 'subset'. Do not call lookup_table[vector_of_indices]
again. This causes problems with grad as this will create new variables. again. This causes problems with grad as this will create new variables.
Defining cost which depends only on subset and not the entire lookup_table Defining cost which depends only on subset and not the entire lookup_table
>>> cost = something that depends on subset .. code-block:: python
>>> g = theano.grad(cost, subset)
cost = something that depends on subset
g = theano.grad(cost, subset)
There are two ways for updating the parameters: There are two ways for updating the parameters:
Either use inc_subtensor or set_subtensor. It is recommended to use Either use inc_subtensor or set_subtensor. It is recommended to use
inc_subtensor. Some theano optimizations do the conversion between inc_subtensor. Some theano optimizations do the conversion between
the two functions, but not in all cases. the two functions, but not in all cases.
>>> updates = inc_subtensor(subset, g*lr) .. code-block:: python
updates = inc_subtensor(subset, g*lr)
OR OR
>>> updates = set_subtensor(subset, subset + g*lr)
.. code-block:: python
updates = set_subtensor(subset, subset + g*lr)
Currently we just cover the case here, Currently we just cover the case here,
not if you use inc_subtensor or set_subtensor with other types of indexing. not if you use inc_subtensor or set_subtensor with other types of indexing.
Defining the theano function Defining the theano function
>>> f=theano.function(..., updates=updates) .. code-block:: python
f = theano.function(..., updates=updates)
Note that you can compute the gradient of the cost function w.r.t. Note that you can compute the gradient of the cost function w.r.t.
the entire lookup_table, and the gradient will have nonzero rows only the entire lookup_table, and the gradient will have nonzero rows only
......
.. _numpy: .. _numpy:
.. testsetup::
import numpy
*************** ***************
NumPy refresher NumPy refresher
...@@ -59,7 +62,7 @@ compatible shapes. The example below shows an instance of ...@@ -59,7 +62,7 @@ compatible shapes. The example below shows an instance of
>>> a = numpy.asarray([1.0, 2.0, 3.0]) >>> a = numpy.asarray([1.0, 2.0, 3.0])
>>> b = 2.0 >>> b = 2.0
>>> a * b >>> a * b
array([ 2.,  4.,  6.])
The smaller array ``b`` (actually a scalar here, which works like a 0-d array) in this case is *broadcasted* to the same size The smaller array ``b`` (actually a scalar here, which works like a 0-d array) in this case is *broadcasted* to the same size
as ``a`` during the multiplication. This trick is often useful in as ``a`` during the multiplication. This trick is often useful in
......
...@@ -67,40 +67,39 @@ Debug Print ...@@ -67,40 +67,39 @@ Debug Print
The pre-compilation graph: The pre-compilation graph:
>>> theano.printing.debugprint(prediction) # doctest: +NORMALIZE_WHITESPACE >>> theano.printing.debugprint(prediction) # doctest: +NORMALIZE_WHITESPACE
Elemwise{gt,no_inplace} [@A] '' Elemwise{gt,no_inplace} [@A] ''
|Elemwise{true_div,no_inplace} [@B] '' |Elemwise{true_div,no_inplace} [@B] ''
| |DimShuffle{x} [@C] '' | |DimShuffle{x} [@C] ''
| | |TensorConstant{1} [@D] | | |TensorConstant{1} [@D]
| |Elemwise{add,no_inplace} [@E] '' | |Elemwise{add,no_inplace} [@E] ''
| |DimShuffle{x} [@F] '' | |DimShuffle{x} [@F] ''
| | |TensorConstant{1} [@D] | | |TensorConstant{1} [@D]
| |Elemwise{exp,no_inplace} [@G] '' | |Elemwise{exp,no_inplace} [@G] ''
| |Elemwise{sub,no_inplace} [@H] '' | |Elemwise{sub,no_inplace} [@H] ''
| |Elemwise{neg,no_inplace} [@I] '' | |Elemwise{neg,no_inplace} [@I] ''
| | |dot [@J] '' | | |dot [@J] ''
| | |x [@K] | | |x [@K]
| | |w [@L] | | |w [@L]
| |DimShuffle{x} [@M] '' | |DimShuffle{x} [@M] ''
| |b [@N] | |b [@N]
|DimShuffle{x} [@O] '' |DimShuffle{x} [@O] ''
|TensorConstant{0.5} [@P] |TensorConstant{0.5} [@P]
The post-compilation graph: The post-compilation graph:
>>> theano.printing.debugprint(predict) # doctest: +NORMALIZE_WHITESPACE >>> theano.printing.debugprint(predict) # doctest: +NORMALIZE_WHITESPACE
Elemwise{Composite{GT(scalar_sigmoid((-((-i0) - i1))), i2)}} [@A] '' 4 Elemwise{Composite{GT(scalar_sigmoid((-((-i0) - i1))), i2)}} [@A] '' 4
|CGemv{inplace} [@B] '' 3 |CGemv{inplace} [@B] '' 3
 | |AllocEmpty{dtype='float64'} [@C] ''   2
 | | |Shape_i{0} [@D] ''   1
 | |   |x [@E]
 | |TensorConstant{1.0} [@F]
 | |x [@E]
 | |w [@G]
 | |TensorConstant{0.0} [@H]
 |InplaceDimShuffle{x} [@I] ''   0
 | |b [@J]
 |TensorConstant{(1,) of 0.5} [@K]
Picture Printing of Graphs Picture Printing of Graphs
...@@ -108,7 +107,7 @@ Picture Printing of Graphs ...@@ -108,7 +107,7 @@ Picture Printing of Graphs
The pre-compilation graph: The pre-compilation graph:
>>> theano.printing.pydotprint(prediction, outfile="pics/logreg_pydotprint_prediction.png", var_with_name_simple=True)  # doctest: +SKIP
The output file is available at pics/logreg_pydotprint_prediction.png The output file is available at pics/logreg_pydotprint_prediction.png
.. image:: ./pics/logreg_pydotprint_prediction.png .. image:: ./pics/logreg_pydotprint_prediction.png
...@@ -116,7 +115,7 @@ The output file is available at pics/logreg_pydotprint_prediction.png ...@@ -116,7 +115,7 @@ The output file is available at pics/logreg_pydotprint_prediction.png
The post-compilation graph: The post-compilation graph:
>>> theano.printing.pydotprint(predict, outfile="pics/logreg_pydotprint_predict.png", var_with_name_simple=True)  # doctest: +SKIP
The output file is available at pics/logreg_pydotprint_predict.png The output file is available at pics/logreg_pydotprint_predict.png
.. image:: ./pics/logreg_pydotprint_predict.png .. image:: ./pics/logreg_pydotprint_predict.png
...@@ -124,7 +123,7 @@ The output file is available at pics/logreg_pydotprint_predict.png ...@@ -124,7 +123,7 @@ The output file is available at pics/logreg_pydotprint_predict.png
The optimized training graph: The optimized training graph:
>>> theano.printing.pydotprint(train, outfile="pics/logreg_pydotprint_train.png", var_with_name_simple=True)  # doctest: +SKIP
The output file is available at pics/logreg_pydotprint_train.png The output file is available at pics/logreg_pydotprint_train.png
.. image:: ./pics/logreg_pydotprint_train.png .. image:: ./pics/logreg_pydotprint_train.png
......
...@@ -24,7 +24,7 @@ Currently, information regarding shape is used in two ways in Theano: ...@@ -24,7 +24,7 @@ Currently, information regarding shape is used in two ways in Theano:
>>> x = theano.tensor.matrix('x') >>> x = theano.tensor.matrix('x')
>>> f = theano.function([x], (x ** 2).shape) >>> f = theano.function([x], (x ** 2).shape)
>>> theano.printing.debugprint(f) # doctest: +NORMALIZE_WHITESPACE >>> theano.printing.debugprint(f) # doctest: +NORMALIZE_WHITESPACE
MakeVector{dtype='int64'} [@A] ''   2
|Shape_i{0} [@B] '' 1 |Shape_i{0} [@B] '' 1
| |x [@C] | |x [@C]
|Shape_i{1} [@D] '' 0 |Shape_i{1} [@D] '' 0
...@@ -49,9 +49,9 @@ can lead to errors. Consider this example: ...@@ -49,9 +49,9 @@ can lead to errors. Consider this example:
>>> xv = numpy.random.rand(5, 4) >>> xv = numpy.random.rand(5, 4)
>>> yv = numpy.random.rand(3, 3) >>> yv = numpy.random.rand(3, 3)
>>> f = theano.function([x, y], z.shape)
>>> theano.printing.debugprint(f) # doctest: +NORMALIZE_WHITESPACE >>> theano.printing.debugprint(f) # doctest: +NORMALIZE_WHITESPACE
MakeVector{dtype='int64'} [@A] ''   4
|Elemwise{Add}[(0, 0)] [@B] '' 3 |Elemwise{Add}[(0, 0)] [@B] '' 3
| |Shape_i{0} [@C] '' 1 | |Shape_i{0} [@C] '' 1
| | |x [@D] | | |x [@D]
...@@ -60,8 +60,8 @@ MakeVector [@A] '' 4 ...@@ -60,8 +60,8 @@ MakeVector [@A] '' 4
|Shape_i{1} [@G] '' 0 |Shape_i{1} [@G] '' 0
|x [@D] |x [@D]
>>> f(xv, yv)  # DOES NOT RAISE AN ERROR AS SHOULD BE.
array([8, 4])
>>> f = theano.function([x,y], z)# Do not take the shape. >>> f = theano.function([x,y], z)# Do not take the shape.
>>> theano.printing.debugprint(f) # doctest: +NORMALIZE_WHITESPACE >>> theano.printing.debugprint(f) # doctest: +NORMALIZE_WHITESPACE
...@@ -70,8 +70,10 @@ Join [@A] '' 0 ...@@ -70,8 +70,10 @@ Join [@A] '' 0
|x [@C] |x [@C]
|y [@D] |y [@D]
>>> f(xv, yv)  # doctest: +ELLIPSIS
Traceback (most recent call last):
  ...
ValueError: ...
As you can see, when asking only for the shape of some computation (``join`` in the As you can see, when asking only for the shape of some computation (``join`` in the
example), an inferred shape is computed directly, without executing example), an inferred shape is computed directly, without executing
......
...@@ -104,7 +104,7 @@ does not provide any way to handle a number of dimensions different from two. ...@@ -104,7 +104,7 @@ does not provide any way to handle a number of dimensions different from two.
The set of all accepted ``dtype`` for the sparse matrices can be found in The set of all accepted ``dtype`` for the sparse matrices can be found in
``sparse.all_dtypes``. ``sparse.all_dtypes``.
>>> sparse.all_dtypes  # doctest: +SKIP
set(['int8', 'int16', 'int32', 'int64', 'uint8', 'uint16', 'uint32', 'uint64', set(['int8', 'int16', 'int32', 'int64', 'uint8', 'uint16', 'uint32', 'uint64',
'float32', 'float64', 'complex64', 'complex128']) 'float32', 'float64', 'complex64', 'complex128'])
......
...@@ -46,8 +46,8 @@ def function_dump(filename, inputs, outputs=None, mode=None, updates=None, ...@@ -46,8 +46,8 @@ def function_dump(filename, inputs, outputs=None, mode=None, updates=None,
To load such a dump and do the compilation: To load such a dump and do the compilation:
>>> import cPickle, theano >>> import cPickle, theano
>>> d = cPickle.load(open("func_dump.bin", "rb"))  # doctest: +SKIP
>>> f = theano.function(**d)  # doctest: +SKIP
""" """
assert isinstance(filename, string_types) assert isinstance(filename, string_types)
......
...@@ -456,7 +456,6 @@ def remove(predicate, coll): ...@@ -456,7 +456,6 @@ def remove(predicate, coll):
Examples Examples
-------- --------
>>> from itertoolz import remove
>>> def even(x): >>> def even(x):
... return x % 2 == 0 ... return x % 2 == 0
>>> remove(even, [1, 2, 3, 4]) >>> remove(even, [1, 2, 3, 4])
......
...@@ -1525,8 +1525,8 @@ def verify_grad(fun, pt, n_tests=2, rng=None, eps=None, ...@@ -1525,8 +1525,8 @@ def verify_grad(fun, pt, n_tests=2, rng=None, eps=None,
Example: Example:
>>> verify_grad(theano.tensor.tanh, >>> verify_grad(theano.tensor.tanh,
...             (numpy.asarray([[2,3,4], [-1, 3.3, 9.9]]),),
...             rng=numpy.random)
Raises an Exception if the difference between the analytic gradient and Raises an Exception if the difference between the analytic gradient and
numerical gradient (computed through the Finite Difference Method) of a numerical gradient (computed through the Finite Difference Method) of a
......
...@@ -1092,6 +1092,7 @@ class Unique(theano.Op): ...@@ -1092,6 +1092,7 @@ class Unique(theano.Op):
Examples Examples
-------- --------
>>> import numpy as np >>> import numpy as np
>>> import theano
>>> x = theano.tensor.vector() >>> x = theano.tensor.vector()
>>> f = theano.function([x], Unique(True, True, False)(x)) >>> f = theano.function([x], Unique(True, True, False)(x))
......
...@@ -83,7 +83,7 @@ def load(path, dtype, broadcastable, mmap_mode=None): ...@@ -83,7 +83,7 @@ def load(path, dtype, broadcastable, mmap_mode=None):
>>> x = tensor.load(path, 'int64', (False,)) >>> x = tensor.load(path, 'int64', (False,))
>>> y = x*2 >>> y = x*2
>>> fn = function([path], y) >>> fn = function([path], y)
>>> fn("stored-array.npy")  # doctest: +SKIP
array([0, 2, 4, 6, 8], dtype=int64) array([0, 2, 4, 6, 8], dtype=int64)
""" """
......
...@@ -55,9 +55,11 @@ def shape_of_variables(fgraph, input_shapes): ...@@ -55,9 +55,11 @@ def shape_of_variables(fgraph, input_shapes):
>>> x = theano.tensor.matrix('x') >>> x = theano.tensor.matrix('x')
>>> y = x[512:]; y.name = 'y' >>> y = x[512:]; y.name = 'y'
>>> fgraph = theano.FunctionGraph([x], [y], clone=False) >>> fgraph = theano.FunctionGraph([x], [y], clone=False)
>>> d = shape_of_variables(fgraph, {x: (1024, 1024)})
>>> d[y]
(array(512), array(1024))
>>> d[x]
(array(1024), array(1024))
""" """
if not hasattr(fgraph, 'shape_feature'): if not hasattr(fgraph, 'shape_feature'):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论