Commit b6ea8d67 authored by serdyuk

Merge github.com:Theano/Theano into doc_fix

@@ -242,3 +242,41 @@ Numba Ops
Want C speed without writing C code for your new Op? You can use Numba
to generate the C code for you! Here is an `example
Op <https://gist.github.com/nouiz/5492778#file-theano_op-py>`_ doing that.
.. _alternate_theano_types:

Alternate Theano Types
======================

Most ops in Theano are used to manipulate tensors. However, Theano also
supports many other variable types. The supported types are listed below,
along with pointers to the relevant documentation.
* :class:`TensorType <tensor.TensorType>` : Theano type that represents
  a multidimensional array containing elements that all have the same
  type. Variables of this Theano type are represented in C as objects of
  class
  `PyArrayObject <http://docs.scipy.org/doc/numpy/reference/c-api.types-and-structures.html#PyArrayObject>`_.

* :ref:`TypedList <libdoc_typed_list>` : Theano type that represents a
  typed list (a list where every element in the list has the same Theano
  type). Variables of this Theano type are represented in C as objects
  of class `PyListObject <https://docs.python.org/2/c-api/list.html>`_.

* :ref:`Scalar <libdoc_scalar>` : Theano type that represents a C
  primitive type. The C type associated with this Theano type is the
  represented C primitive itself.

* :ref:`SparseType <sparse_ops>` : Theano type used to represent sparse
  tensors. There is no equivalent C type for this Theano type, but you
  can split a sparse variable into its parts as TensorVariables. Those
  can then be used as inputs to an op with C code.

* :class:`Generic <theano.gof.type.Generic>` : Theano type that
  represents a simple Python Object. Variables of this Theano type are
  represented in C as objects of class `PyObject
  <https://docs.python.org/2/c-api/structures.html#c.PyObject>`_.

* :class:`CDataType <theano.gof.type.CDataType>` : Theano type that
  represents a C data type. The C type associated with this Theano type
  depends on the data being represented.
@@ -4,7 +4,6 @@

==========================
Frequently Asked Questions
==========================

TypeError: object of type 'TensorVariable' has no len()
-------------------------------------------------------
@@ -63,6 +62,13 @@ compilation but it will also use more memory because

``optimizer_excluding=inplace`` excludes inplace optimizations, resulting
in a trade-off between speed of compilation and memory usage.
The Theano flag `reoptimize_unpickled_function` controls whether an unpickled
Theano function should reoptimize its graph. Theano users can use the standard
Python pickle tools to save a compiled Theano function. When pickling, both the
graph before and the graph after optimization are saved, including shared
variables. When the flag is set to True, the graph is reoptimized when it is
unpickled. Otherwise, the optimization is skipped and the optimized graph from
the pickled file is used directly.
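Like any Theano flag, this can be set for a single run through the ``THEANO_FLAGS`` environment variable. A configuration sketch (the script name ``load_model.py`` is illustrative):

```shell
# Reuse the optimized graph stored in the pickle instead of reoptimizing.
THEANO_FLAGS='reoptimize_unpickled_function=False' python load_model.py
```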
Faster Theano function
----------------------

...
@@ -47,21 +47,11 @@

    :type: class:`Container`
.. autofunction:: theano.compile.sharedvalue.shared

.. function:: shared_constructor(ctor)

    Append `ctor` to the list of shared constructors (see :func:`shared`).

    Each registered constructor ``ctor`` will be called like this:
@@ -69,12 +59,4 @@

        ctor(value, name=name, strict=strict, **kwargs)

    If it does not support the given value, it must raise a TypeError.
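The dispatch logic described above — try each registered constructor and use the first one that does not raise a TypeError — can be sketched in plain Python. The names here (``registry``, ``my_shared``, ``int_ctor``) are illustrative, not Theano's actual internals:

```python
# Illustrative sketch of the constructor-dispatch pattern described above.
# Names are hypothetical; this is not Theano's implementation.
registry = []

def shared_constructor(ctor):
    """Register `ctor`; constructors are tried in reverse order."""
    registry.append(ctor)
    return ctor

def my_shared(value, name=None, strict=False, **kwargs):
    # Try the most recently registered constructor first.
    for ctor in reversed(registry):
        try:
            return ctor(value, name=name, strict=strict, **kwargs)
        except TypeError:
            continue  # this ctor does not support `value`; try the next
    raise TypeError("No suitable constructor for %r" % (value,))

@shared_constructor
def int_ctor(value, name=None, strict=False, **kwargs):
    if not isinstance(value, int):
        raise TypeError("int_ctor only accepts ints")
    return ("int-shared", value, name)
```

Here ``my_shared(3)`` succeeds through ``int_ctor``, while ``my_shared("x")`` falls through every constructor and raises a TypeError, mirroring the contract stated above.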
@@ -335,7 +335,7 @@ import theano and print the config variable, as in:

    Default: False

    Does the memory profile print the min peak memory usage?
    It only works when profile=True, profile_memory=True

.. attribute:: profiling.destination

@@ -462,6 +462,20 @@ import theano and print the config variable, as in:

    Link arguments to link against a (Fortran) level-3 blas implementation.
.. attribute:: config.experimental.local_alloc_elemwise_assert

    Bool value: either True or False
    Default: True

    When the local_alloc_optimization is applied, add an assert to highlight
    shape errors. Without such asserts, this optimization could hide errors
    in the user code. We add the assert only if we cannot infer that the
    shapes are equivalent; as such, this optimization does not always
    introduce an assert in the graph. Removing the assert could speed up
    execution.
.. attribute:: config.cuda.root

    Default: $CUDA_ROOT or failing that, "/usr/local/cuda"
@@ -683,6 +697,16 @@ import theano and print the config variable, as in:

    optimization phase. Theano users do not need to use this. This is
    to help debug shape errors in Theano optimization.
.. attribute:: config.reoptimize_unpickled_function

    Bool value, default: True

    Theano users can use the standard Python pickle tools to save a compiled
    Theano function. When pickling, both the graph before and the graph after
    optimization are saved, including shared variables. When set to True, the
    graph is reoptimized when it is unpickled. Otherwise, the optimization is
    skipped and the optimized graph is used directly.
.. attribute:: config.exception_verbosity

    String Value: ``'low'``, ``'high'``.

...
@@ -9,11 +9,11 @@

   :synopsis: low-level automatic differentiation

.. moduleauthor:: LISA

Symbolic gradient is usually computed from :func:`gradient.grad`, which offers a
more convenient syntax for the common case of wanting the gradient in some
expressions with respect to a scalar cost. The :func:`grad_sources_inputs`
function does the underlying work, and is more flexible, but is also more
awkward to use when :func:`gradient.grad` can do the job.

.. automodule:: theano.gradient

...
@@ -754,6 +754,8 @@ Creating Tensor

>>> f(x, x, x, x).shape
(2, 2, 4, 4)

.. autofunction:: theano.tensor.basic.choose
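``choose`` follows ``numpy.choose`` semantics: given an index array ``a`` and a list of ``choices``, output element ``i`` is taken from row ``choices[a[i]]`` at position ``i``. A plain-Python sketch of the 1-d case (illustrative only, not the Theano implementation):

```python
# Plain-Python sketch of 1-d choose semantics (mirrors numpy.choose):
# out[i] = choices[a[i]][i]
def choose(a, choices):
    return [choices[idx][i] for i, idx in enumerate(a)]

# out[0] = choices[2][0] = 20, out[1] = choices[3][1] = 31, ...
result = choose([2, 3, 1, 0],
                [[0,  1,  2,  3],
                 [10, 11, 12, 13],
                 [20, 21, 22, 23],
                 [30, 31, 32, 33]])
# result == [20, 31, 12, 3]
```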
Reductions
==========
@@ -1630,125 +1632,11 @@ Linear Algebra

Gradient / Differentiation
==========================

.. automodule:: theano.gradient
    :members: grad
Return symbolic gradients for one or more variables with respect to some
cost. See the :ref:`gradient <libdoc_gradient>` page for complete documentation
of the gradient module.
For more information about how automatic differentiation works in Theano,
see :mod:`gradient`. For information on how to implement the gradient of
a certain Op, see :func:`grad`.
:type cost: 0-d tensor variable
:type wrt: tensor variable or list of tensor variables
:type g_cost: same as type of `cost`
:type consider_constant: list of variables
:type warn_type: bool
:param cost: a scalar with respect to which we are differentiating
:param wrt: term[s] for which we want gradients
:param g_cost: the gradient on the cost
:param consider_constant: variables whose gradients will be held at 0.
:param warn_type: True will trigger warnings via the logging module when
the gradient on an expression has a different type than the original
expression
:rtype: variable or list of variables (matching `wrt`)
:returns: gradients of the cost with respect to each of the `wrt` terms
.. function:: subgraph_grad(wrt, end, start=None, cost=None, details=False)
With respect to `wrt`, computes gradients of cost and/or from existing
`start` gradients, up to the `end` variables of a symbolic digraph.
In other words, computes gradients for a subgraph of the
symbolic theano function. Ignores all disconnected inputs.
This can be useful when one needs to perform the gradient descent
iteratively (e.g. one layer at a time in an MLP), or when a particular
operation is not differentiable in theano (e.g. stochastic sampling
from a multinomial). In the latter case, the gradient of the
non-differentiable process could be approximated by a user-defined
formula, which could be calculated using the gradients of a cost
with respect to samples (0s and 1s). These gradients are obtained
by performing a subgraph_grad from the `cost` or previously known gradients
(`start`) up to the outputs of the stochastic process (`end`).
A dictionary mapping gradients obtained from the user-defined
differentiation of the process, to variables, could then be fed into
another subgraph_grad as `start` with any other `cost` (e.g. weight decay).
In an MLP, we could use subgraph_grad to iteratively backpropagate:
.. testcode:: subgraph_grad

    import theano
    import numpy as np

    x, t = theano.tensor.fvector('x'), theano.tensor.fvector('t')
    w1 = theano.shared(np.random.randn(3, 4))
    w2 = theano.shared(np.random.randn(4, 2))
    a1 = theano.tensor.tanh(theano.tensor.dot(x, w1))
    a2 = theano.tensor.tanh(theano.tensor.dot(a1, w2))
    cost2 = theano.tensor.sqr(a2 - t).sum()
    cost2 += theano.tensor.sqr(w2.sum())
    cost1 = theano.tensor.sqr(w1.sum())

    params = [[w2], [w1]]
    costs = [cost2, cost1]
    grad_ends = [[a1], [x]]

    next_grad = None
    param_grads = []
    for i in xrange(2):
        param_grad, next_grad = theano.subgraph_grad(
            wrt=params[i], end=grad_ends[i],
            start=next_grad, cost=costs[i]
        )
        next_grad = dict(zip(grad_ends[i], next_grad))
        param_grads.extend(param_grad)
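The staging idea underneath ``subgraph_grad`` — stop the backward pass at intermediate variables, then feed those gradients back in as the ``start`` of the next stage — can be illustrated with plain scalar calculus. This is a toy sketch of the chain rule in two stages, not Theano code:

```python
# Toy scalar illustration of staged backpropagation (chain rule in two stages).
# cost = (a1 * w2)**2 where a1 = x * w1; we differentiate w.r.t. w2 first,
# then pass d(cost)/d(a1) as the "start" gradient of the next stage.
x, w1, w2 = 3.0, 2.0, 5.0
a1 = x * w1                  # forward, stage 1
cost = (a1 * w2) ** 2        # forward, stage 2

# Stage 1 of the backward pass: gradients w.r.t. w2 and the stage boundary a1.
g_w2 = 2 * (a1 * w2) * a1    # d(cost)/d(w2)
g_a1 = 2 * (a1 * w2) * w2    # d(cost)/d(a1), the "start" gradient for stage 2

# Stage 2: continue from g_a1 down to w1.
g_w1 = g_a1 * x              # d(cost)/d(w1) = d(cost)/d(a1) * d(a1)/d(w1)
# g_w1 == 900.0, matching d/dw1 of (x*w1*w2)**2 = 2 * x**2 * w1 * w2**2
```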
:type wrt: list of variables
:param wrt:
    Gradients are computed with respect to `wrt`.

:type end: list of variables
:param end:
    Theano variables at which to end gradient descent (they are
    considered constant in theano.grad). For convenience, the
    gradients with respect to these variables are also returned.

:type start: dictionary of variables
:param start:
    If not None, a dictionary mapping variables to their
    gradients. This is useful when the gradient on some variables
    is known. These are used to compute the gradients backwards up
    to the variables in `end` (they are used as known_grad in
    theano.grad).

:type cost: scalar (0-dimensional) variable
:param cost:
    Additional costs for which to compute the gradients. For
    example, these could be weight decay, an l1 constraint, MSE,
    NLL, etc. May optionally be None if start is provided.

    .. warning::

        If the gradient of `cost` with respect to any of the `start`
        variables is already part of the `start` dictionary, then it
        may be counted twice with respect to `wrt` and `end`.

:type details: bool
:param details:
    When True, additionally returns the list of gradients from
    `start` and of `cost`, respectively, with respect to `wrt` (not
    `end`).

:rtype: Tuple of 2 or 4 Lists of Variables
:return: Returns lists of gradients with respect to `wrt` and `end`,
         respectively.

.. versionadded:: 0.6.1
.. _R_op_list:

...
@@ -22,6 +22,33 @@

.. moduleauthor:: LISA
.. note::

    As of October 21st, 2014, the default GPU image convolution
    changed. Here is the algorithm:

    - If we can use `cuDNN <https://developer.nvidia.com/cuDNN>`_, use it.
    - If not, use the gemm version (slower than cuDNN, uses more memory).

    If users do not want the extra memory usage of the gemm
    version, they can enable the legacy code, which is even slower but
    does not use extra memory. For this, use the Theano flag
    ``optimizer_excluding=conv_gemm``.

    There is no reason to use the legacy code or the gemm version if
    cuDNN is available.

    Two other options:

    - There is also the fft version, which is the fastest in some cases,
      but uses even more memory. It does not support striding to reduce
      computation and has some shape restrictions.
    - There is also the cuda_convnet convolution in Pylearn2. It uses a
      different memory layout, has shape restrictions, but does not use
      extra memory and is faster than the legacy convolution.
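The legacy fallback described above is selected through a Theano flag. A configuration sketch (the script name ``my_script.py`` is illustrative):

```shell
# Exclude the gemm convolution optimization, falling back to the legacy code.
THEANO_FLAGS='optimizer_excluding=conv_gemm' python my_script.py
```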
TODO: Give examples on how to use these things! They are pretty complicated.

- Convolution operators implemented:

...
==================
Advanced Indexing
==================

Continue the Advanced Indexing project that is on either github or bitbucket.
@@ -17,6 +17,76 @@ Isolating the Problem/Testing Theano Compiler

You can run your Theano function in a :ref:`DebugMode<using_debugmode>`.
This tests the Theano optimizations and helps to find where NaN, inf and other problems come from.
Interpreting Error Messages
---------------------------

Even in its default configuration, Theano tries to display useful error
messages. Consider the following faulty code.

.. code-block:: python

    import numpy as np
    import theano
    import theano.tensor as T

    x = T.vector()
    y = T.vector()
    z = x + x
    z = z + y
    f = theano.function([x, y], z)
    f(np.ones((2,)), np.ones((3,)))
Running the code above, we see:

.. code-block:: bash

    Traceback (most recent call last):
      File "test0.py", line 10, in <module>
        f(np.ones((2,)), np.ones((3,)))
      File "/PATH_TO_THEANO/theano/compile/function_module.py", line 605, in __call__
        self.fn.thunks[self.fn.position_of_error])
      File "/PATH_TO_THEANO/theano/compile/function_module.py", line 595, in __call__
        outputs = self.fn()
    ValueError: Input dimension mis-match. (input[0].shape[0] = 3, input[1].shape[0] = 2)
    Apply node that caused the error: Elemwise{add,no_inplace}(<TensorType(float64, vector)>, <TensorType(float64, vector)>, <TensorType(float64, vector)>)
    Inputs types: [TensorType(float64, vector), TensorType(float64, vector), TensorType(float64, vector)]
    Inputs shapes: [(3,), (2,), (2,)]
    Inputs strides: [(8,), (8,), (8,)]
    Inputs scalar values: ['not scalar', 'not scalar', 'not scalar']

    HINT: Re-running with most Theano optimization disabled could give you a back-traces when this node was created. This can be done with by setting the Theano flags 'optimizer=fast_compile'. If that does not work, Theano optimization can be disabled with 'optimizer=None'.
    HINT: Use the Theano flag 'exception_verbosity=high' for a debugprint of this apply node.
Arguably the most useful information is approximately half-way through
the error message, where the kind of error is displayed along with its
cause (`ValueError: Input dimension mis-match. (input[0].shape[0] = 3,
input[1].shape[0] = 2`).
Below it, some other information is given, such as the apply node that
caused the error, as well as the input types, shapes, strides and
scalar values.
The two hints can also be helpful when debugging. Using the Theano flag
``optimizer=fast_compile`` or ``optimizer=None`` can often tell you
the faulty line, while ``exception_verbosity=high`` will display a
debugprint of the apply node. Using these hints, the end of the error
message becomes:
.. code-block:: bash

    Backtrace when the node is created:
      File "test0.py", line 8, in <module>
        z = z + y

    Debugprint of the apply node:
    Elemwise{add,no_inplace} [@A] <TensorType(float64, vector)> ''
     |Elemwise{add,no_inplace} [@B] <TensorType(float64, vector)> ''
     | |<TensorType(float64, vector)> [@C] <TensorType(float64, vector)>
     | |<TensorType(float64, vector)> [@C] <TensorType(float64, vector)>
     |<TensorType(float64, vector)> [@D] <TensorType(float64, vector)>
Here we can see that the error can be traced back to the line ``z = z + y``.
For this example, using ``optimizer=fast_compile`` worked. If it did not,
you could set ``optimizer=None`` or use test values.
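Both hints can be combined in a single re-run by setting the flags on the command line (``test0.py`` refers to the faulty script above):

```shell
# Minimal optimization plus verbose exceptions, per the two HINTs.
THEANO_FLAGS='optimizer=fast_compile,exception_verbosity=high' python test0.py
```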
Using Test Values
-----------------
@@ -26,13 +96,19 @@ on-the-fly, before a ``theano.function`` is ever compiled. Since optimizations

haven't been applied at this stage, it is easier for the user to locate the
source of some bug. This functionality is enabled through the config flag
``theano.config.compute_test_value``. Its use is best shown through the
following example. Here, we use ``exception_verbosity=high`` and
``optimizer=fast_compile``, which would not tell you the line at fault.
``optimizer=None`` would, and it could therefore be used instead of test values.
.. code-block:: python

    import numpy
    import theano
    import theano.tensor as T

    # compute_test_value is 'off' by default, meaning this feature is inactive
    theano.config.compute_test_value = 'off'  # Use 'warn' to activate this feature

    # configure shared variables
    W1val = numpy.random.rand(2, 10, 10).astype(theano.config.floatX)
@@ -42,6 +118,8 @@ following example.

    # input which will be of shape (5,10)
    x = T.matrix('x')

    # provide Theano with a default test-value
    #x.tag.test_value = numpy.random.rand(5, 10)
    # transform the shared variable in some way. Theano does not
    # know off hand that the matrix func_of_W1 has shape (20, 10)

@@ -61,35 +139,32 @@ Running the above code generates the following error message:

.. code-block:: bash
    Definition in:
      File "/u/desjagui/workspace/PYTHON/theano/gof/opt.py", line 1102, in apply
        lopt_change = self.process_node(fgraph, node, lopt)
      File "/u/desjagui/workspace/PYTHON/theano/gof/opt.py", line 882, in process_node
        replacements = lopt.transform(node)
      File "/u/desjagui/workspace/PYTHON/Theano/theano/tensor/blas.py", line 1030, in local_dot_to_dot22
        return [_dot22(*node.inputs)]
      File "/u/desjagui/workspace/PYTHON/Theano/theano/gof/op.py", line 324, in __call__
        self.add_tag_trace(node)
    For the full definition stack trace set the Theano flags traceback.limit to -1

    Traceback (most recent call last):
      File "test1.py", line 31, in <module>
        f(numpy.random.rand(5, 10))
      File "PATH_TO_THEANO/theano/compile/function_module.py", line 605, in __call__
        self.fn.thunks[self.fn.position_of_error])
      File "PATH_TO_THEANO/theano/compile/function_module.py", line 595, in __call__
        outputs = self.fn()
    ValueError: Shape mismatch: x has 10 cols (and 5 rows) but y has 20 rows (and 10 cols)
    Apply node that caused the error: Dot22(x, DimShuffle{1,0}.0)
    Inputs types: [TensorType(float64, matrix), TensorType(float64, matrix)]
    Inputs shapes: [(5, 10), (20, 10)]
    Inputs strides: [(80, 8), (8, 160)]
    Inputs scalar values: ['not scalar', 'not scalar']

    Debugprint of the apply node:
    Dot22 [@A] <TensorType(float64, matrix)> ''
     |x [@B] <TensorType(float64, matrix)>
     |DimShuffle{1,0} [@C] <TensorType(float64, matrix)> ''
       |Flatten{2} [@D] <TensorType(float64, matrix)> ''
         |DimShuffle{2,0,1} [@E] <TensorType(float64, 3D)> ''
           |W1 [@F] <TensorType(float64, 3D)>

    HINT: Re-running with most Theano optimization disabled could give you a back-traces when this node was created. This can be done with by setting the Theano flags 'optimizer=fast_compile'. If that does not work, Theano optimization can be disabled with 'optimizer=None'.

If the above is not informative enough, by instrumenting the code ever
so slightly, we can get Theano to reveal the exact source of the error.
.. code-block:: python
@@ -108,18 +183,22 @@ value. This allows Theano to evaluate symbolic expressions on-the-fly (by

calling the ``perform`` method of each op), as they are being defined. Sources
of error can thus be identified with much more precision and much earlier in
the compilation pipeline. For example, running the above code yields the
following error message, which properly identifies *line 24* as the culprit.
.. code-block:: bash
    Traceback (most recent call last):
      File "test2.py", line 24, in <module>
        h1 = T.dot(x, func_of_W1)
      File "PATH_TO_THEANO/theano/tensor/basic.py", line 4734, in dot
        return _dot(a, b)
      File "PATH_TO_THEANO/theano/gof/op.py", line 545, in __call__
        required = thunk()
      File "PATH_TO_THEANO/theano/gof/op.py", line 752, in rval
        r = p(n, [x[0] for x in i], o)
      File "PATH_TO_THEANO/theano/tensor/basic.py", line 4554, in perform
        z[0] = numpy.asarray(numpy.dot(x, y))
    ValueError: matrices are not aligned
The ``compute_test_value`` mechanism works as follows:

...
.. _tut_multi_cores:
=============================
Multi cores support in Theano
=============================

...
@@ -494,7 +494,8 @@ def char_from_number(number):

def debugprint(r, prefix='', depth=-1, done=None, print_type=False,
               file=sys.stdout, print_destroy_map=False,
               print_view_map=False, order=None, ids='CHAR',
               stop_on_name=False, prefix_child=None,
               scan_ops=None):
    """Print the graph leading to `r` to given depth.

    :param r: Variable instance

@@ -502,10 +503,10 @@ def debugprint(r, prefix='', depth=-1, done=None, print_type=False,
    :param depth: maximum recursion depth (Default -1 for unlimited).
    :param done: dict of Apply instances that have already been printed
                 and their associated printed ids
    :param print_type: whether to print the Variable type after the other infos
    :param file: file-like object to which to print
    :param print_destroy_map: whether to print the op destroy_map after other info
    :param print_view_map: whether to print the op view_map after other info
    :param order: If not empty will print the index in the toposort.
    :param ids: How do we print the identifier of the variable
                id - print the python id value

@@ -514,6 +515,8 @@ def debugprint(r, prefix='', depth=-1, done=None, print_type=False,
                "" - don't print an identifier
    :param stop_on_name: When True, if a node in the graph has a name,
                         we don't print anything below it.
    :param scan_ops: Scan ops in the graph will be added inside this list
                     for later printing purposes.
    """
    if depth == 0:

@@ -525,6 +528,9 @@ def debugprint(r, prefix='', depth=-1, done=None, print_type=False,
    if done is None:
        done = dict()
    if scan_ops is None:
        scan_ops = []

    if print_type:
        type_str = ' <%s>' % r.type
    else:

@@ -575,37 +581,45 @@ def debugprint(r, prefix='', depth=-1, done=None, print_type=False,
        o = ''
        if order:
            o = str(order.index(r.owner))
        already_printed = a in done  # get_id_str put it in the dict
        id_str = get_id_str(a)

        if len(a.outputs) == 1:
            print >> file, '%s%s %s%s \'%s\' %s %s %s' % (prefix, a.op,
                                                          id_str,
                                                          type_str,
                                                          r_name,
                                                          destroy_map_str,
                                                          view_map_str,
                                                          o)
        else:
            print >> file, '%s%s.%i %s%s \'%s\' %s %s %s' % (prefix, a.op,
                                                             a.outputs.index(r),
                                                             id_str, type_str,
                                                             r_name,
                                                             destroy_map_str,
                                                             view_map_str,
                                                             o)
        if not already_printed:
            if (not stop_on_name or
                    not (hasattr(r, 'name') and r.name is not None)):
                new_prefix = prefix_child + ' |'
                new_prefix_child = prefix_child + ' |'

                for idx, i in enumerate(a.inputs):
                    if idx == len(a.inputs) - 1:
                        new_prefix_child = prefix_child + ' '

                    if hasattr(i, 'owner') and hasattr(i.owner, 'op'):
                        if isinstance(i.owner.op, theano.scan_module.scan_op.Scan):
                            scan_ops.append(i)

                    debugprint(i, new_prefix, depth=depth - 1, done=done,
                               print_type=print_type, file=file, order=order,
                               ids=ids, stop_on_name=stop_on_name,
                               prefix_child=new_prefix_child, scan_ops=scan_ops)
    else:
        #this is an input variable
        id_str = get_id_str(r)
@@ -624,7 +638,6 @@ def _optcheck_fgraph(input_specs, output_specs, accept_inplace=False):

    :type accept_inplace: Bool

    :rtype: `FunctionGraph`
    :returns: a new FunctionGraph with a cloned graph, with debugging `Feature` instances already installed.
    """
    orig_inputs = [spec.variable for spec in input_specs]
    updates = [spec.update for spec in input_specs if spec.update]
@@ -2152,7 +2165,7 @@ class _Maker(FunctionMaker):  # inheritance buys a few helper functions

        # Check if some input variables are unused
        self._check_unused_inputs(inputs, outputs, on_unused_input)

        # Make a list of (SymbolicInput|SymbolicInputKits, indices, [SymbolicInput,...]), one
        # tuple for each input. (See Function.indices for more details)
        indices = [[input] + self.expand_in(input, _inputs) for input in inputs]

...
...@@ -169,15 +169,33 @@ def shared(value, name=None, strict=False, allow_downcast=None, **kwargs): ...@@ -169,15 +169,33 @@ def shared(value, name=None, strict=False, allow_downcast=None, **kwargs):
"""Return a SharedVariable Variable, initialized with a copy or
reference of `value`.

This function iterates over
:ref:`constructor functions <shared_constructor>`
to find a suitable SharedVariable subclass.
The suitable one is the first constructor that accepts the given value.

This function is meant as a convenient default. If you want to use a
specific shared variable constructor, consider calling it directly.

``theano.shared`` is a shortcut to this function.

:note: By passing kwargs, you effectively limit the set of
potential constructors to those that can accept those kwargs.

:note: Some shared variables have ``borrow`` as an extra kwarg.
`See <http://deeplearning.net/software/theano/tutorial/aliasing.html#borrowing-when-creating-shared-variables>`_ for details.

:note: Some shared variables have ``broadcastable`` as an extra kwarg.
As shared variable shapes can change, all dimensions default
to not being broadcastable, even if ``value`` has a shape of 1
along some dimension. This parameter allows you to create,
for example, a `row` or `column` 2d tensor.

.. attribute:: constructors

A list of shared variable constructors that will be tried in reverse
order.
"""
try:
...
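The constructor dispatch described in the docstring above can be sketched in plain Python. This is an illustrative sketch only — the names `register_constructor`, `generic_shared`, and `scalar_shared` are invented for the example and are not Theano's real API. The point it shows: constructors are tried in reverse registration order, and the first one that accepts the value (does not raise `TypeError`) wins.

```python
# Invented, minimal model of the shared() constructor dispatch.
constructors = []

def register_constructor(ctor):
    constructors.append(ctor)
    return ctor

@register_constructor
def generic_shared(value, **kwargs):
    # Fallback constructor: accepts any value.
    return ("generic", value)

@register_constructor
def scalar_shared(value, **kwargs):
    # A more specific constructor, registered later so it is tried first.
    if not isinstance(value, (int, float)):
        raise TypeError("scalar_shared only accepts numbers")
    return ("scalar", value)

def shared(value, **kwargs):
    # Constructors are tried in reverse registration order; the first
    # one that accepts the value is used.
    for ctor in reversed(constructors):
        try:
            return ctor(value, **kwargs)
        except TypeError:
            continue
    raise TypeError("No suitable constructor for %r" % (value,))
```

Passing kwargs narrows the candidate set in the same way: a constructor that cannot accept a given kwarg raises `TypeError` and the next one is tried.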
...@@ -118,6 +118,7 @@ AddConfigVar('print_active_device',
BoolParam(True, allow_override=False),
in_c_key=False)

# Do not add FAST_RUN_NOGC to this list (nor any other ALL CAPS shortcut).
# The way to get FAST_RUN_NOGC is with the flag 'linker=c|py_nogc'.
# The old all capital letter way of working is deprecated as it is not
...@@ -465,6 +466,12 @@ AddConfigVar('unpickle_function',
BoolParam(True),
in_c_key=False)
AddConfigVar('reoptimize_unpickled_function',
"Re-optimize the graph when a Theano function is unpickled from disk.",
BoolParam(True, allow_override=True),
in_c_key=False)
"""Note to developers:
Generally your exceptions should use an apply node's __str__
method when exception_verbosity == 'low'. When exception_verbosity
...@@ -538,3 +545,11 @@ AddConfigVar('check_input',
"(particularly for scalars) and reduce the number of generated C "
"files.",
BoolParam(True))
AddConfigVar('cache_optimizations',
"WARNING: work in progress, does not work yet. "
"Specify if the optimization cache should be used. This cache "
"stores any optimized graph and its optimizations. It currently "
"slows down the first optimization a lot, and could still contain "
"some bugs. Use at your own risk.",
BoolParam(False))
...@@ -55,7 +55,7 @@ from theano.gof.link import \
Container, Linker, LocalLinker, PerformLinker, WrapLinker, WrapLinkerMany
from theano.gof.op import \
Op, OpenMPOp, PureOp, COp, ops_with_inner_function
from theano.gof.opt import (
Optimizer,
...
...@@ -662,6 +662,7 @@ class DestroyHandler(toolbox.Bookkeeper):
The following data structures remain to be converted:
<unknown>
"""
pickle_rm_attr = ["destroyers"]

def __init__(self, do_imports_on_attach=True):
self.fgraph = None
...@@ -720,15 +721,7 @@ class DestroyHandler(toolbox.Bookkeeper):
" or in conflict with another plugin.")

####### Annotate the FunctionGraph ############
self.unpickle(fgraph)
fgraph.destroy_handler = self
self.fgraph = fgraph
...@@ -743,6 +736,15 @@ class DestroyHandler(toolbox.Bookkeeper):
if self.do_imports_on_attach:
toolbox.Bookkeeper.on_attach(self, fgraph)
def unpickle(self, fgraph):
def get_destroyers_of(r):
droot, impact, root_destroyer = self.refresh_droot_impact()
try:
return [root_destroyer[droot[r]]]
except Exception:
return []
fgraph.destroyers = get_destroyers_of
def refresh_droot_impact(self):
"""
Makes sure self.droot, self.impact, and self.root_destroyer are
...
...@@ -87,6 +87,11 @@ class FunctionGraph(utils.object2):
#TODO: document what variables are[not] set in the FunctionGraph when a feature
is added via the constructor. How constructed is the FunctionGraph?
Note: the intermediate nodes between 'inputs' and 'outputs' are not explicitly
passed.

:param inputs: input nodes of the graph, usually declared by the user.
:param outputs: output nodes of the graph.
:param clone: If true, we will clone the graph. This is
useful to remove the constant cache problem.
...@@ -724,17 +729,42 @@ class FunctionGraph(utils.object2):
return self.__str__()
### clone ###
def clone(self, check_integrity=True):
"""WRITEME"""
return self.clone_get_equiv(check_integrity)[0]

def clone_get_equiv(self, check_integrity=True):
"""WRITEME"""
equiv = graph.clone_get_equiv(self.inputs, self.outputs)
if check_integrity:
self.check_integrity()
e = FunctionGraph([equiv[i] for i in self.inputs],
[equiv[o] for o in self.outputs])
if check_integrity:
e.check_integrity()
for feature in self._features:
e.attach_feature(feature)
return e, equiv
def __getstate__(self):
"""This is needed because some features introduce instance methods,
which are not picklable.
"""
d = self.__dict__.copy()
for feature in self._features:
for attr in getattr(feature, "pickle_rm_attr", []):
del d[attr]
# The class Updater takes functions as parameters; they may be
# lambda functions, which are unpicklable.
# execute_callbacks_times holds references to optimizers, which
# can't be pickled because decorators with parameters aren't
# picklable.
if "execute_callbacks_times" in d:
del d["execute_callbacks_times"]
return d
def __setstate__(self, dct):
self.__dict__.update(dct)
for feature in self._features:
if hasattr(feature, "unpickle"):
feature.unpickle(self)
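The `__getstate__`/`__setstate__` pair above implements a reusable pattern: each feature declares the unpicklable attributes it installed (`pickle_rm_attr`) and provides an `unpickle()` hook to reinstall them after deserialization. Here is a minimal, self-contained sketch with invented class names; it round-trips through `copy.deepcopy`, which exercises the same `__getstate__`/`__setstate__` hooks that pickle uses.

```python
import copy

class HistoryFeature(object):
    # Attributes this feature installs that cannot be pickled.
    pickle_rm_attr = ["checkpoint"]

    def unpickle(self, graph):
        # Reinstall the unpicklable helper (a lambda) after deserialization.
        graph.checkpoint = lambda: len(graph.log)

class Graph(object):
    def __init__(self):
        self.log = [1, 2, 3]
        self._features = [HistoryFeature()]
        self._features[0].unpickle(self)

    def __getstate__(self):
        d = self.__dict__.copy()
        # Drop every attribute a feature declared as unpicklable.
        for feature in self._features:
            for attr in getattr(feature, "pickle_rm_attr", []):
                del d[attr]
        return d

    def __setstate__(self, dct):
        self.__dict__.update(dct)
        # Ask each feature to reinstall what was dropped.
        for feature in self._features:
            if hasattr(feature, "unpickle"):
                feature.unpickle(self)

g = Graph()
g2 = copy.deepcopy(g)  # round-trips through __getstate__/__setstate__
```

With real pickling the flow is identical: the lambda never enters the serialized state, and the feature recreates it on load.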
...@@ -135,9 +135,14 @@ class Apply(Node):
if len(self.outputs) == 1:
return self.outputs[0]
else:
raise AttributeError(
"%s.default_output should be an output index." % self.op)
elif not isinstance(do, (int, long)):
raise AttributeError("%s.default_output should be an int or long" %
self.op)
elif do < 0 or do >= len(self.outputs):
raise AttributeError("%s.default_output is out of range." %
self.op)
return self.outputs[do]

def env_getter(self):
...@@ -873,6 +878,7 @@ def is_same_graph(var1, var2, givens=None, debug=False):
# Get result from the merge-based function.
rval1 = is_same_graph_with_merge(var1=var1, var2=var2, givens=givens)
# Get result from the function `equal_computations` from scan_utils.
use_equal_computations = True
if givens:
# We need to build the `in_xs` and `in_ys` lists. To do this, we need
...
...@@ -1024,7 +1024,7 @@ static PyTypeObject lazylinker_ext_CLazyLinkerType = {
static PyObject * get_version(PyObject *dummy, PyObject *args)
{
PyObject *result = PyFloat_FromDouble(0.21);
return result;
}
...
...@@ -14,7 +14,8 @@ from theano.gof import cmodule
_logger = logging.getLogger('theano.gof.lazylinker_c')
force_compile = False
version = 0.21  # must match constant returned in function get_version()

def try_import():
global lazylinker_ext
...@@ -22,6 +23,7 @@ def try_import():
import lazylinker_ext
del sys.path[0]

def try_reload():
sys.path[0:0] = [config.compiledir]
reload(lazylinker_ext)
...
...@@ -154,9 +154,10 @@ def raise_with_op(node, thunk=None, exc_info=None):
else:
hints.append(
"HINT: Re-running with most Theano optimizations disabled could"
" give you a back-trace of when this node was created. This can"
" be done by setting the Theano flag"
" 'optimizer=fast_compile'. If that does not work,"
" Theano optimizations can be disabled with 'optimizer=None'.")

if theano.config.exception_verbosity == 'high':
f = StringIO.StringIO()
...@@ -616,6 +617,7 @@ class PerformLinker(LocalLinker):
f.allow_gc = self.allow_gc  # HACK: this is a way of passing an arg to Function.__call__
add_clear_storage(f, computed, storage_map)
f.storage_map = storage_map
return f, [Container(input, storage) for input, storage in zip(fgraph.inputs, input_storage)], \
[Container(output, storage, True) for output, storage in zip(fgraph.outputs, output_storage)], \
...
...@@ -13,6 +13,8 @@ __contact__ = "theano-dev <theano-dev@googlegroups.com>"
__docformat__ = "restructuredtext en"

import logging
import numpy
import os
import sys
import warnings
...@@ -974,3 +976,177 @@ int main( int argc, const char* argv[] )
self.update_self_openmp()
return super(OpenMPOp, self).make_thunk(node, storage_map,
compute_map, no_recycling)
class COp(Op):
""" Class to allow an op to have an external C implementation.
An op can use this class by inheriting from it and calling its
__init__() method, providing it with a path to an external file containing
the C implementation and the name of the function, in that file, to call
to perform the computations for the op.
"""
def __init__(self, func_file, func_name):
self.func_file = func_file
self.func_name = func_name
# Define the markers that can be used to delimit sections in the
# external C code
self.support_code_marker = "THEANO_SUPPORT_CODE_SECTION"
self.apply_code_marker = "THEANO_APPLY_CODE_SECTION"
self.c_code_markers = [self.support_code_marker,
self.apply_code_marker]
# Load the external C code
with open(self.func_file, "r") as f:
self.func_code = f.read()

# Separate the contents of the file into sections and validate that
# at least one of the necessary code sections has been defined
self.code_sections = self.parse_external_c_code(self.func_code)
if not any(marker in self.code_sections
for marker in self.c_code_markers):
raise RuntimeError("The provided C implementation does not "
"define a support code section or an apply "
"code section.")
def parse_external_c_code(self, code):
# Obtain the positions of the C code markers used in the C code
positions = [(code.index(marker), marker)
for marker in self.c_code_markers if marker in code]
# Go over the markers in their order of occurrence and extract
# the C code they concern
positions.sort()
code_sections = {}
for i in range(len(positions)):
marker_start, marker = positions[i]
if i < len(positions) - 1:
# This is not the last section in the code : extract the code
# between the beginning of the current marker and the
# beginning of the next one.
next_marker_start = positions[i+1][0]
section = code[marker_start: next_marker_start]
else:
# This is the last section in the code : extract the remaining
# C code
section = code[marker_start:]
cleaned_section = section.replace(marker, "")
code_sections[marker] = cleaned_section
return code_sections
def c_code_cache_version(self):
return hash(self.func_code)
def c_support_code(self):
if self.support_code_marker in self.code_sections:
return self.code_sections[self.support_code_marker]
else:
raise utils.MethodNotDefined("c_support_code",
type(self), self.__class__.__name__)
def c_support_code_apply(self, node, name):
if self.apply_code_marker in self.code_sections:
apply_code = self.code_sections[self.apply_code_marker]
if hasattr(self, 'check_inputs') and self.check_inputs == False:
return apply_code
else:
define_macros, undef_macros = self.get_c_macros(node, name)
return os.linesep.join([define_macros, apply_code,
undef_macros])
else:
raise utils.MethodNotDefined("c_support_code_apply",
type(self), self.__class__.__name__)
def format_c_function_args(self, inp, out):
# Generate a string containing the arguments sent to the external C
# function. The argstring will be of the format:
# "input0, input1, input2, &output0, &output1"
return ", ".join(list(inp) + ["&%s" % o for o in out])
def get_c_macros(self, node, name):
define_template = "#define %s %s" + os.linesep
undef_template = "#undef %s" + os.linesep
define_macros = ""
undef_macros = ""
# Extract the various properties of the input and output variables
variables = node.inputs + node.outputs
variable_names = (["INPUT_%i" % i for i in range(len(node.inputs))] +
["OUTPUT_%i" % i for i in range(len(node.outputs))])
variable_dtypes_names = [v.dtype for v in variables]
variable_dtypes = [numpy.dtype(d) for d in variable_dtypes_names]
variable_typenums = [d.num for d in variable_dtypes]
variable_itemsizes = [d.itemsize for d in variable_dtypes]
# Generate dtype macros
for i in range(len(variables)):
macro_name = "DTYPE_" + variable_names[i]
macro_value = "npy_" + variable_dtypes_names[i]
define_macros += define_template % (macro_name, macro_value)
undef_macros += undef_template % macro_name
# Generate typenum macros
for i in range(len(variables)):
macro_name = "TYPENUM_" + variable_names[i]
macro_value = variable_typenums[i]
define_macros += define_template % (macro_name, macro_value)
undef_macros += undef_template % macro_name
# Generate itemsize macros
for i in range(len(variables)):
macro_name = "ITEMSIZE_" + variable_names[i]
macro_value = variable_itemsizes[i]
define_macros += define_template % (macro_name, macro_value)
undef_macros += undef_template % macro_name
# Generate a macro to mark code as being apply-specific
define_macros += define_template % ("APPLY_SPECIFIC(str)",
"str##_%s" % name)
undef_macros += undef_template % "APPLY_SPECIFIC"
return define_macros, undef_macros
def c_code(self, node, name, inp, out, sub):
func_name = self.func_name
func_args = self.format_c_function_args(inp, out)
fail = sub['fail']
# Generate the code to define/undefine the C macros
define_macros, undef_macros = self.get_c_macros(node, name)
# Generate the C code
c_code = """
%(define_macros)s
{
int result = %(func_name)s(%(func_args)s);
if (result != 0)
{
%(fail)s;
}
}
%(undef_macros)s
""" % locals()
return c_code
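The section-splitting logic in `parse_external_c_code` can be exercised on its own. Below is a standalone version of it (assuming the same two marker strings used in `__init__`) applied to a toy external C file: each marker begins a section that runs until the next marker, or the end of the file.

```python
SUPPORT = "THEANO_SUPPORT_CODE_SECTION"
APPLY = "THEANO_APPLY_CODE_SECTION"

def parse_external_c_code(code, markers=(SUPPORT, APPLY)):
    # Locate each marker present in the code, in order of occurrence.
    positions = sorted((code.index(m), m) for m in markers if m in code)
    sections = {}
    for i, (start, marker) in enumerate(positions):
        # A section runs from its marker to the next marker (or EOF),
        # with the marker itself stripped out.
        end = positions[i + 1][0] if i + 1 < len(positions) else len(code)
        sections[marker] = code[start:end].replace(marker, "")
    return sections

c_file = """\
THEANO_SUPPORT_CODE_SECTION
int helper(int x) { return x + 1; }
THEANO_APPLY_CODE_SECTION
int APPLY_SPECIFIC(run)(int x, int *out) { *out = helper(x); return 0; }
"""
sections = parse_external_c_code(c_file)
```

The `APPLY_SPECIFIC(run)` call in the toy C file relies on the macro that `get_c_macros` defines per apply node, so the same external file can be compiled once per instantiation without symbol clashes.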
...@@ -22,8 +22,6 @@ import theano
from theano import config
from theano.gof.python25 import any, all, deque

_logger = logging.getLogger('theano.gof.opt')
...@@ -154,7 +152,7 @@ def inplace_optimizer(f):
class SeqOptimizer(Optimizer, list):
# inherit from Optimizer first to get Optimizer.__hash__
"""WRITEME
Takes a list of L{Optimizer} instances and applies them
sequentially.
...@@ -825,6 +823,68 @@ class LocalOptimizer(object):
(' ' * level), self.__class__.__name__, id(self))
class LocalSeqOptimizer(LocalOptimizer, list):
"""
This allows trying a group of local optimizers in sequence.
As soon as one of them applies, we return without trying the
following ones.
"""
# inherit from Optimizer first to get Optimizer.__hash__
def __init__(self, *opts, **kw):
"""WRITEME"""
if len(opts) == 1 and isinstance(opts[0], (list, tuple)):
opts = opts[0]
self[:] = opts
self.failure_callback = kw.pop('failure_callback', None)
def tracks(self):
t = []
for l in self:
tt = l.tracks()
if tt:
t.extend(tt)
return t
def transform(self, node):
"""Transform a subgraph whose output is `node`.

Subclasses should implement this function so that it returns one of
the following:

- False, to indicate that no optimization can be applied to this
`node`;
- <list of variables>, to use in place of `node`'s outputs in the
greater graph; or
- dict(old variables -> new variables), a dictionary mapping old
variables to the new variables that replace them.

:type node: an Apply instance
"""
for l in self:
ret = l.transform(node)
if ret:
return ret
def add_requirements(self, fgraph):
"""
If this local optimization wants to add some requirements to the
fgraph, this is the place to do it.
"""
for l in self:
l.add_requirements(fgraph)
def print_summary(self, stream=sys.stdout, level=0, depth=-1):
name = getattr(self, 'name', None)
print >> stream, "%s%s %s id=%i" % (
(' ' * level), self.__class__.__name__, name, id(self))
# This way, -1 will do all depth
if depth != 0:
depth -= 1
for opt in self:
opt.print_summary(stream, level=(level + 2), depth=depth)
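The dispatch rule of `LocalSeqOptimizer` ("the first optimizer that returns something truthy wins") can be illustrated with plain callables standing in for `LocalOptimizer` instances. The node representation and the names `LocalSeq`, `fold_add_zero`, and `fold_mul_one` are invented for this sketch.

```python
class LocalSeq(list):
    """Try each optimizer in order; return the first truthy result."""
    def transform(self, node):
        for opt in self:
            ret = opt(node)
            if ret:
                return ret
        return False  # no optimizer applied

def fold_add_zero(node):
    # x + 0 -> x
    if node[0] == "add" and node[2] == 0:
        return node[1]
    return False

def fold_mul_one(node):
    # x * 1 -> x
    if node[0] == "mul" and node[2] == 1:
        return node[1]
    return False

seq = LocalSeq([fold_add_zero, fold_mul_one])
```

Because only the first applicable rewrite fires per call, later optimizers in the sequence act as fallbacks rather than being composed on one pass.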
class FromFunctionLocalOptimizer(LocalOptimizer):
"""WRITEME"""
def __init__(self, fn, tracks=None, requirements=()):
...@@ -1241,6 +1301,30 @@ class PatternSub(LocalOptimizer):

# Use the following classes to apply LocalOptimizers
class Updater:
def __init__(self, importer, pruner, chin):
self.importer = importer
self.pruner = pruner
self.chin = chin
def on_import(self, fgraph, node, reason):
if self.importer:
self.importer(node)
def on_prune(self, fgraph, node, reason):
if self.pruner:
self.pruner(node)
def on_change_input(self, fgraph, node, i, r, new_r, reason):
if self.chin:
self.chin(node, i, r, new_r, reason)
def on_detach(self, fgraph):
# To allow pickling this object
self.importer = None
self.pruner = None
self.chin = None
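Moving `Updater` to module level (instead of defining it inside the method with closure-captured callbacks, as the removed code below did) matters for serialization: lambdas and locally defined classes cannot be pickled, which is why `on_detach` clears the callbacks before a graph carrying this feature is serialized. A quick check of that constraint, using an invented `FeatureStub` in place of the real class:

```python
import pickle

# Lambdas (and other locally defined functions) cannot be pickled;
# this is why the callbacks must be cleared before serialization.
try:
    pickle.dumps(lambda: None)
    lambda_picklable = True
except Exception:
    lambda_picklable = False

class FeatureStub(object):
    """Mimics Updater.on_detach clearing unpicklable callbacks."""
    def __init__(self, importer):
        self.importer = importer

    def on_detach(self):
        # Null the callback so the object stays picklable even when
        # the callback itself is a lambda.
        self.importer = None

f = FeatureStub(importer=lambda node: node)
f.on_detach()
```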
class NavigatorOptimizer(Optimizer):
"""Abstract class
...@@ -1329,18 +1413,7 @@ class NavigatorOptimizer(Optimizer):
if importer is None and pruner is None:
return None

u = Updater(importer, pruner, chin)
fgraph.attach_feature(u)
return u
...
...@@ -223,6 +223,7 @@ class SequenceDB(DB):
other tags) fast_run and fast_compile optimizers are drawn is a SequenceDB.
"""
seq_opt = opt.SeqOptimizer

def __init__(self, failure_callback=opt.SeqOptimizer.warn):
super(SequenceDB, self).__init__()
...@@ -256,13 +257,13 @@ class SequenceDB(DB):
# the order we want.
opts.sort(key=lambda obj: obj.name)
opts.sort(key=lambda obj: self.__position__[obj.name])
ret = self.seq_opt(opts, failure_callback=self.failure_callback)
if hasattr(tags[0], 'name'):
ret.name = tags[0].name
return ret

def print_summary(self, stream=sys.stdout):
print >> stream, self.__class__.__name__ + " (id %i)" % id(self)
positions = self.__position__.items()

def c(a, b):
...@@ -279,6 +280,13 @@ class SequenceDB(DB):
return sio.getvalue()
class LocalSequenceDB(SequenceDB):
"""
This generates local optimizers instead of global ones.
"""
seq_opt = opt.LocalSeqOptimizer
class ProxyDB(DB):
"""
Wrap an existing proxy.
...
import pickle
import unittest

import theano
from theano.gof import CachedConstantError, FunctionGraph
from theano import tensor as tt


class TFunctionGraph(unittest.TestCase):
...@@ -15,3 +17,10 @@ class TFunctionGraph(unittest.TestCase):
v = theano.tensor.constant(1)
assert v.cached
FunctionGraph([], [v + 1])
def test_pickle(self):
v = tt.vector()
func = theano.gof.FunctionGraph([v], [v + 1])
s = pickle.dumps(func)
func2 = pickle.loads(s)
...@@ -20,6 +20,7 @@ from theano import tensor
from theano.ifelse import ifelse
import theano


class TestCallbacks(unittest.TestCase):
"""
Test the VM_Linker's callback argument, which can be useful for debugging.
...@@ -34,7 +35,7 @@ class TestCallbacks(unittest.TestCase):
def test_callback(self):
a, b, c = tensor.scalars('abc')
f = function([a, b, c], (a + b) + c,
mode=Mode(
optimizer=None,
linker=vm.VM_Linker(callback=self.callback)))
...@@ -44,13 +45,12 @@ class TestCallbacks(unittest.TestCase):
f(1, 2, 3)
assert sum(self.n_callbacks.values()) == len(f.maker.fgraph.toposort()) * 2

def test_callback_with_ifelse(self):
a, b, c = tensor.scalars('abc')
f = function([a, b, c], ifelse(a, 2*b, 2*c),
mode=Mode(
optimizer=None,
linker=vm.VM_Linker(callback=self.callback)))
f(1, 2, 3)
assert self.n_callbacks['IfElse'] == 2
...@@ -71,6 +71,7 @@ def test_speed():
for d in xrange(depth):
z = (z+z)
return z

def time_numpy():
steps_a = 5
steps_b = 100
...@@ -78,10 +79,10 @@ def test_speed():
numpy_version(x, steps_a)
t0 = time.time()
# print numpy_version(x, steps_a)
t1 = time.time()
t2 = time.time()
# print numpy_version(x, steps_b)
t3 = time.time()
t_a = t1 - t0
t_b = t3 - t2
...@@ -94,18 +95,17 @@ def test_speed():
steps_a = 5
steps_b = 100
x = tensor.vector()
a = build_graph(x, steps_a)
b = build_graph(x, steps_b)

f_a = function([x], a,
mode=Mode(optimizer=None, linker=linker()),
#profile='f_a speed test %s'%name,
)
f_b = function([x], b,
mode=Mode(optimizer=None, linker=linker()),
#profile='f_b speed test %s'%name,
)

f_a([2.0, 3.0])
t0 = time.time()
...@@ -122,17 +122,18 @@ def test_speed():
t_b = t3 - t2
print "%s takes %f s/Kop" % (
name,
(1000*(t_b-t_a) / (steps_b - steps_a)))

time_linker('c|py', OpWiseCLinker)
time_linker('vmLinker', vm.VM_Linker)
time_linker('vmLinker_nogc', lambda: vm.VM_Linker(allow_gc=False))
if theano.config.cxx:
time_linker('vmLinker_CLOOP', lambda: vm.VM_Linker(allow_gc=False,
use_cloop=True))
time_numpy()


def test_speed_lazy():
def build_graph(x, depth=5):
...@@ -148,17 +149,16 @@ def test_speed_lazy():
a = build_graph(x, steps_a)
b = build_graph(x, steps_b)

f_a = function([x], a,
mode=Mode(optimizer=None,
linker=linker()),
#profile='f_a lazy ifelse %s'%name,
)
f_b = function([x], b,
mode=Mode(optimizer=None,
linker=linker()),
#profile='f_b lazy ifelse %s'%name,
)

f_a([2.0])
t0 = time.time()
...@@ -179,15 +179,20 @@ def test_speed_lazy():
(1000*(t_b-t_a) / (steps_b - steps_a)))

time_linker('vmLinker', vm.VM_Linker)
time_linker('vmLinker_nogc', lambda: vm.VM_Linker(allow_gc=False))
if theano.config.cxx:
time_linker('vmLinker_C', lambda: vm.VM_Linker(allow_gc=False,
use_cloop=True))


def test_allow_gc_cvm():
mode = theano.config.mode
if mode in ['DEBUG_MODE', 'DebugMode']:
mode = "FAST_RUN"

v = theano.tensor.vector()
f = theano.function([v], v + 1, mode=mode)
f([1])

n = list(f.maker.fgraph.apply_nodes)[0].outputs[0]
assert f.fn.storage_map[n][0] is None
...@@ -262,8 +267,8 @@ if run_memory_usage_tests: ...@@ -262,8 +267,8 @@ if run_memory_usage_tests:
a = build_graph(x, steps_a) a = build_graph(x, steps_a)
f_a = function([x], a, f_a = function([x], a,
mode=Mode(optimizer=None, mode=Mode(optimizer=None,
linker=linker())) linker=linker()))
for i in xrange(100000): for i in xrange(100000):
f_a([2.0]) f_a([2.0])
...@@ -296,8 +301,8 @@ if run_memory_usage_tests: ...@@ -296,8 +301,8 @@ if run_memory_usage_tests:
a = build_graph(x, steps_a) a = build_graph(x, steps_a)
f_a = function([x], a, f_a = function([x], a,
mode=Mode(optimizer=None, mode=Mode(optimizer=None,
linker=linker())) linker=linker()))
for i in xrange(500000): for i in xrange(500000):
f_a([2.0]) f_a([2.0])
......
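The benchmark above estimates per-step cost by differencing the timings of two graph sizes. A minimal plain-Python sketch of that measurement pattern (the names `per_step_ms` and `work` are illustrative stand-ins, not Theano code):

```python
import time

def per_step_ms(fn, steps_a, steps_b, reps=50):
    """Estimate the cost per extra step by differencing two run lengths."""
    t0 = time.time()
    for _ in range(reps):
        fn(steps_a)
    t1 = time.time()
    for _ in range(reps):
        fn(steps_b)
    t2 = time.time()
    # ((time for steps_b runs) - (time for steps_a runs)) per extra step
    return 1000 * ((t2 - t1) - (t1 - t0)) / (reps * (steps_b - steps_a))

def work(steps):
    # stand-in for evaluating a graph with `steps` nodes
    x = 2.0
    for _ in range(steps):
        x = x * 0.5 + 1.0
    return x

print("%.6f ms/step" % per_step_ms(work, 100, 1000))
```

Differencing two run lengths cancels the fixed per-call overhead, which is why the test reports milliseconds per extra step rather than raw wall time.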
@@ -104,7 +104,32 @@ class Bookkeeper(Feature):
        self.on_prune(fgraph, node, 'Bookkeeper.detach')


+class GetCheckpoint:
+
+    def __init__(self, history, fgraph):
+        self.h = history
+        self.fgraph = fgraph
+
+    def __call__(self):
+        return len(self.h.history[self.fgraph])
+
+
+class LambdExtract:
+
+    def __init__(self, fgraph, node, i, r, reason=None):
+        self.fgraph = fgraph
+        self.node = node
+        self.i = i
+        self.r = r
+        self.reason = reason
+
+    def __call__(self):
+        return self.fgraph.change_input(self.node, self.i, self.r,
+                                        reason=("Revert", self.reason))
+
+
class History(Feature):
+    pickle_rm_attr = ["checkpoint", "revert"]

    def __init__(self):
        self.history = {}
@@ -114,7 +139,14 @@ class History(Feature):
            raise AlreadyThere("History feature is already present or in"
                               " conflict with another plugin.")
        self.history[fgraph] = []
-        fgraph.checkpoint = lambda: len(self.history[fgraph])
+        # Don't call unpickle here: ReplaceValidate.on_attach()'s call to
+        # History.on_attach() would then call ReplaceValidate.unpickle
+        # and not History.unpickle.
+        fgraph.checkpoint = GetCheckpoint(self, fgraph)
+        fgraph.revert = partial(self.revert, fgraph)
+
+    def unpickle(self, fgraph):
+        fgraph.checkpoint = GetCheckpoint(self, fgraph)
        fgraph.revert = partial(self.revert, fgraph)

    def on_detach(self, fgraph):
@@ -126,8 +158,7 @@ class History(Feature):
        if self.history[fgraph] is None:
            return
        h = self.history[fgraph]
-        h.append(lambda: fgraph.change_input(node, i, r,
-                                             reason=("Revert", reason)))
+        h.append(LambdExtract(fgraph, node, i, r, reason))

    def revert(self, fgraph, checkpoint):
        """
@@ -144,47 +175,66 @@
class Validator(Feature):
+    pickle_rm_attr = ["validate", "consistent"]

    def on_attach(self, fgraph):
        for attr in ('validate', 'validate_time'):
            if hasattr(fgraph, attr):
                raise AlreadyThere("Validator feature is already present or in"
                                   " conflict with another plugin.")
+        # Don't call unpickle here: ReplaceValidate.on_attach()'s call to
+        # Validator.on_attach() would then call ReplaceValidate.unpickle
+        # and not Validator.unpickle.
+        fgraph.validate = partial(self.validate_, fgraph)
+        fgraph.consistent = partial(self.consistent_, fgraph)

-        def validate():
-            t0 = time.time()
-            ret = fgraph.execute_callbacks('validate')
-            t1 = time.time()
-            if fgraph.profile:
-                fgraph.profile.validate_time += t1 - t0
-            return ret
-        fgraph.validate = validate
-
-        def consistent():
-            try:
-                fgraph.validate()
-                return True
-            except Exception:
-                return False
-        fgraph.consistent = consistent
+    def unpickle(self, fgraph):
+        fgraph.validate = partial(self.validate_, fgraph)
+        fgraph.consistent = partial(self.consistent_, fgraph)

    def on_detach(self, fgraph):
        del fgraph.validate
        del fgraph.consistent

+    def validate_(self, fgraph):
+        t0 = time.time()
+        ret = fgraph.execute_callbacks('validate')
+        t1 = time.time()
+        if fgraph.profile:
+            fgraph.profile.validate_time += t1 - t0
+        return ret
+
+    def consistent_(self, fgraph):
+        try:
+            fgraph.validate()
+            return True
+        except Exception:
+            return False


class ReplaceValidate(History, Validator):
+    pickle_rm_attr = ["replace_validate", "replace_all_validate",
+                      "replace_all_validate_remove"] + \
+        History.pickle_rm_attr + Validator.pickle_rm_attr

    def on_attach(self, fgraph):
-        History.on_attach(self, fgraph)
-        Validator.on_attach(self, fgraph)
-        for attr in ('replace_validate', 'replace_all_validate'):
+        for attr in ('replace_validate', 'replace_all_validate',
+                     'replace_all_validate_remove'):
            if hasattr(fgraph, attr):
                raise AlreadyThere("ReplaceValidate feature is already present"
                                   " or in conflict with another plugin.")
+        History.on_attach(self, fgraph)
+        Validator.on_attach(self, fgraph)
+        self.unpickle(fgraph)
+
+    def unpickle(self, fgraph):
+        History.unpickle(self, fgraph)
+        Validator.unpickle(self, fgraph)
        fgraph.replace_validate = partial(self.replace_validate, fgraph)
-        fgraph.replace_all_validate = partial(self.replace_all_validate, fgraph)
+        fgraph.replace_all_validate = partial(self.replace_all_validate,
+                                              fgraph)
        fgraph.replace_all_validate_remove = partial(
            self.replace_all_validate_remove, fgraph)
@@ -247,6 +297,12 @@ class ReplaceValidate(History, Validator):
            print >> out, reason, replacements
        raise ReplacementDidntRemovedError()

+    def __getstate__(self):
+        d = self.__dict__.copy()
+        if "history" in d:
+            del d["history"]
+        return d


class NodeFinder(Bookkeeper):
......
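The motivation for `GetCheckpoint` and `LambdExtract` above is picklability: a lambda stored on the fgraph cannot be pickled at all, while an instance of a module-level callable class holding the same state can be (hence also the `pickle_rm_attr` lists for the bound partials that still cannot). A small illustrative sketch, with the made-up name `GetLength` standing in for `GetCheckpoint`:

```python
import pickle

class GetLength:
    """Picklable stand-in for ``lambda: len(seq)``; state lives in
    instance attributes, and the class is found again by import."""
    def __init__(self, seq):
        self.seq = seq

    def __call__(self):
        return len(self.seq)

# A lambda cannot be pickled: functions are pickled by qualified name,
# and '<lambda>' cannot be looked up again on unpickling.
try:
    pickle.dumps(lambda: 0)
    lambda_ok = True
except Exception:
    lambda_ok = False

print("lambda picklable:", lambda_ok)   # False
print(GetLength([1, 2, 3])())           # 3
```

The same reasoning applies to the closures `validate`/`consistent` replaced by the `validate_`/`consistent_` methods plus `partial` in the diff.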
@@ -694,7 +694,7 @@ class VM_Linker(link.LocalLinker):
            if k.owner and k.clients:
                ls = []
                for cl in k.clients:
-                    if cl[0] is not 'output':
+                    if cl[0] != 'output':
                        ls += cl[0].outputs
                dependencies[k] += ls
        return dependencies
@@ -924,7 +924,7 @@ class VM_Linker(link.LocalLinker):
            self.updated_vars
            )
        vm.storage_map = storage_map
        return (vm,
                [link.Container(input, storage)
......
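The `cl[0] is not 'output'` fix above matters because `is` tests object identity, which for strings depends on CPython's interning, an implementation detail that can change between builds and code paths. A quick illustration:

```python
a = 'output'
b = ''.join(['out', 'put'])   # equal value, but built at runtime

print(a == b)   # True: value equality, which is what the code intends
print(a is b)   # False in CPython: a distinct, non-interned object
```

Relying on `is` for string comparison happens to work for short literals that get interned, which is exactly why such bugs go unnoticed until a non-interned string shows up.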
@@ -356,9 +356,21 @@ def grad(cost, wrt, consider_constant=None,
         disconnected_inputs='raise', add_names=True,
         known_grads=None, return_disconnected='zero'):
    """
-    :type cost: Scalar (0-dimensional) Variable.
+    Return symbolic gradients for one or more variables with respect to some
+    cost.
+
+    For more information about how automatic differentiation works in Theano,
+    see :mod:`gradient`. For information on how to implement the gradient of
+    a certain Op, see :func:`grad`.
+
+    :type cost: Scalar (0-dimensional) tensor variable.
        May optionally be None if known_grads is provided.
-    :type wrt: Variable or list of Variables.
+    :param cost: a scalar with respect to which we are differentiating
+    :type wrt: Tensor variable or list of variables.
+    :param wrt: term[s] for which we want gradients
+    :type consider_constant: list of variables
    :param consider_constant: a list of expressions not to backpropagate
        through
@@ -389,9 +401,10 @@ def grad(cost, wrt, consider_constant=None,
        None
        - 'Disconnected' : returns variables of type DisconnectedType
-    :rtype: Variable or list/tuple of Variables (depending upon `wrt`)
+    :rtype: variable or list/tuple of Variables (matching `wrt`)
-    :return: symbolic expression of gradient of `cost` with respect to `wrt`.
+    :return: symbolic expression of the gradient of `cost` with respect to
+        each of the `wrt` terms.
        If an element of `wrt` is not differentiable with respect
        to the output, then a zero variable is returned.
        It returns an object of same type as `wrt`: a list/tuple
@@ -567,6 +580,33 @@ def subgraph_grad(wrt, end, start=None, cost=None, details=False):
        subgraph_grad as `start` with any other `cost` (e.g. weight
        decay).

+    In an MLP, we could use subgraph_grad to iteratively backpropagate:
+
+    .. code-block:: python
+
+        x, t = theano.tensor.fvector('x'), theano.tensor.fvector('t')
+        w1 = theano.shared(np.random.randn(3, 4))
+        w2 = theano.shared(np.random.randn(4, 2))
+        a1 = theano.tensor.tanh(theano.tensor.dot(x, w1))
+        a2 = theano.tensor.tanh(theano.tensor.dot(a1, w2))
+        cost2 = theano.tensor.sqr(a2 - t).sum()
+        cost2 += theano.tensor.sqr(w2.sum())
+        cost1 = theano.tensor.sqr(w1.sum())
+
+        params = [[w2], [w1]]
+        costs = [cost2, cost1]
+        grad_ends = [[a1], [x]]
+
+        next_grad = None
+        param_grads = []
+        for i in xrange(2):
+            param_grad, next_grad = theano.subgraph_grad(
+                wrt=params[i], end=grad_ends[i],
+                start=next_grad, cost=costs[i]
+            )
+            next_grad = dict(zip(grad_ends[i], next_grad))
+            param_grads.extend(param_grad)
+
    :type wrt: list of variables
    :param wrt:
        Gradients are computed with respect to `wrt`.
@@ -593,7 +633,14 @@ def subgraph_grad(wrt, end, start=None, cost=None, details=False):
        : If the gradients of `cost` with respect to any of the `start`
        variables is already part of the `start` dictionary, then it may
        be counted twice with respect to `wrt` and `end`.

+    .. warning::
+
+        If the gradients of `cost` with respect to any of the `start`
+        variables is already part of the `start` dictionary, then it
+        may be counted twice with respect to `wrt` and `end`.
+
    :type details: bool
    :param details:
        When True, additionally returns the list of gradients from
@@ -605,6 +652,7 @@ def subgraph_grad(wrt, end, start=None, cost=None, details=False):
    :return: Returns lists of gradients with respect to `wrt` and `end`,
        respectively.

+    .. versionadded:: 0.6.1
    '''
    assert ((cost is not None) or (start is not None))
    assert isinstance(end, list)
......
@@ -435,7 +435,7 @@ where each of the optimizations does the following things:
    acceptable_ops = (theano.tensor.basic.Dot,
                      theano.tensor.basic.Reshape,
                      theano.tensor.basic.Shape,
-                     theano.tensor.basic.SpecifyShape,
+                     theano.tensor.SpecifyShape,
                      theano.tensor.basic.MaxAndArgmax,
                      theano.tensor.Subtensor,
                      theano.tensor.IncSubtensor,
......
@@ -201,41 +201,43 @@ if __name__ == "__main__":

    Test time in float32

-   cuda version        6.0  5.5  5.0  4.2  4.1  4.0  3.2  3.0  # note
+   cuda version   6.5  6.0  5.5  5.0  4.2  4.1  4.0  3.2  3.0  # note
    gpu
    K6000/NOECC         0.06s
    K40                 0.07s
    K20m/ECC            0.07s
    K20/NOECC           0.07s
    M2090               0.19s
    C2075               0.25s
    M2075               0.25s
    M2070               0.25s  0.27s  0.32s
    M2070-Q             0.48s  0.27s  0.32s
    M2050(Amazon)       0.25s
    C1060                      0.46s
    K600                1.04s
    GTX Titan Black     0.05s
    GTX Titan(D15U-50)  0.06s  0.06s  don't work
    GTX 780             0.06s
+   GTX 970             0.08s
    GTX 680             0.11s  0.12s  0.154s  0.218s
    GTX 580             0.16s  0.16s  0.164s  0.203s
    GTX 480             0.19s  0.19s  0.192s  0.237s  0.27s
+   GTX 750 Ti          0.20s
    GTX 470             0.23s  0.23s  0.238s  0.297s  0.34s
    GTX 660             0.18s  0.20s  0.23s
    GTX 560             0.30s
    GTX 650 Ti          0.27s
    GTX 765M            0.27s
    GTX 460             0.37s  0.45s
    GTX 285             0.42s  0.452s  0.452s  0.40s  # cuda 3.0 seems faster? driver version?
    750M                0.49s
    GTX 550 Ti          0.57s
    GT 520              2.68s  3.06s
    520M                2.44s  3.19s  # with bumblebee on Ubuntu 12.04
    GT 220              3.80s
    GT 210              6.35s
    8500 GT             10.68s
    """

    t, impl = execute(not options.print_only, not options.quiet,
......
@@ -44,6 +44,8 @@ if MutableSet is not None:
    import weakref

    class Link(object):
+        # This means we need to use a pickle protocol different from the
+        # default. Otherwise, there are pickling errors.
        __slots__ = 'prev', 'next', 'key', '__weakref__'

        def __getstate__(self):
......
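The comment added above concerns the interaction of `__slots__` with pickle: a slotted class has no instance `__dict__`, so the oldest pickle protocol cannot serialize it without explicit state hooks, and the `__weakref__` slot must be excluded from any state in either case since weak references are not picklable. A rough sketch of the mechanics (not OrderedSet's actual code):

```python
import weakref

class Link(object):
    __slots__ = 'prev', 'next', 'key', '__weakref__'

l = Link()
l.key = 'a'

# No instance __dict__: state has to be gathered slot by slot,
# which is what a custom __getstate__ does.
print(hasattr(l, '__dict__'))            # False
state = {s: getattr(l, s) for s in Link.__slots__
         if s != '__weakref__' and hasattr(l, s)}
print(state)                             # {'key': 'a'}

# the '__weakref__' slot re-enables weak references despite __slots__
r = weakref.ref(l)
print(r() is l)                          # True
```

With such a `__getstate__`/`__setstate__` pair in place, the newer binary pickle protocols handle the slotted instances cleanly.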
@@ -102,10 +102,38 @@ def debugprint(obj, depth=-1, print_type=False,
    else:
        raise TypeError("debugprint cannot print an object of this type",
                        obj)

+    scan_ops = []
    for r in results_to_print:
+        # Add the parent scan op to the list as well
+        if hasattr(r.owner, 'op') and isinstance(r.owner.op,
+                                                 theano.scan_module.scan_op.Scan):
+            scan_ops.append(r)
+
        debugmode.debugprint(r, depth=depth, done=done, print_type=print_type,
                             file=_file, order=order, ids=ids,
-                            stop_on_name=stop_on_name)
+                            scan_ops=scan_ops, stop_on_name=stop_on_name)
+
+    if len(scan_ops) > 0:
+        print >> file, ""
+        new_prefix = ' >'
+        new_prefix_child = ' >'
+        print >> file, "Inner graphs of the scan ops:"
+
+        for s in scan_ops:
+            print >> file, ""
+            debugmode.debugprint(s, depth=depth, done=done,
+                                 print_type=print_type, file=_file, ids=ids,
+                                 scan_ops=scan_ops, stop_on_name=stop_on_name)
+
+            for idx, i in enumerate(s.owner.op.outputs):
+                if hasattr(i, 'owner') and hasattr(i.owner, 'op'):
+                    if isinstance(i.owner.op, theano.scan_module.scan_op.Scan):
+                        scan_ops.append(i)
+
+                debugmode.debugprint(r=i, prefix=new_prefix, depth=depth,
+                                     done=done, print_type=print_type,
+                                     file=file, ids=ids,
+                                     stop_on_name=stop_on_name,
+                                     prefix_child=new_prefix_child,
+                                     scan_ops=scan_ops)

    if file is _file:
        return file
    elif file == 'str':
@@ -964,7 +992,7 @@ def pydotprint_variables(vars,
        if nd.owner:
            plot_apply(nd.owner, depth)
    try:
-        g.write_png(outfile, prog='dot')
+        g.write(outfile, prog='dot', format=format)
    except pd.InvocationException, e:
        # Some versions of pydot are bugged/don't work correctly with
        # empty labels. Provide a better user error message.
@@ -978,6 +1006,7 @@ def pydotprint_variables(vars,
                " Theano. Using another version of pydot could"
                " fix this problem. The pydot error is: " +
                e.message)
+        raise
    print 'The output file is available at', outfile
......
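The change above accumulates Scan nodes while printing the main graph, then prints each collected inner graph in a second pass. That control flow can be sketched without Theano; the dict-based `nodes` below are made-up stand-ins for apply nodes:

```python
def render(nodes):
    """Render a flat graph first, then the inner graph of any 'scan'-like
    node collected along the way, mirroring debugprint above."""
    lines = []
    nested = []
    for n in nodes:
        lines.append(n['name'])
        if 'inner' in n:          # analogous to isinstance(op, Scan)
            nested.append(n)
    if nested:
        lines.append('Inner graphs:')
        for n in nested:
            lines.append('%s:' % n['name'])
            # inner nodes get the ' >' prefix, as in the diff
            lines.extend(' > ' + i for i in n['inner'])
    return lines

for line in render([{'name': 'add'},
                    {'name': 'scan', 'inner': ['mul', 'tanh']}]):
    print(line)
```

Appending to `scan_ops` while iterating over it (as the real code does for nested Scans) makes the loop also visit inner graphs discovered along the way.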
@@ -3025,15 +3025,17 @@ CudaNdarray_ptr_int_size(PyObject* _unused, PyObject* args)
{
    int *gpu_data = (int*)device_malloc(sizeof(int)*2);
    if(gpu_data == NULL){
-        return PyErr_Format(PyExc_MemoryError,
-                "CudaNdarray_ptr_int_size: Can't allocate memory on the gpu.");
+        return NULL;
    }
    get_gpu_ptr_size<<<1,1>>>(gpu_data);
-    if (cudaSuccess != cudaGetLastError()){
+
+    cudaError_t cudaErr = cudaGetLastError();
+    if (cudaSuccess != cudaErr){
        device_free(gpu_data);
        return PyErr_Format(PyExc_RuntimeError,
-                "CudaNdarray_ptr_int_size: error when calling the gpu code.");
+                "CudaNdarray_ptr_int_size: error when calling the gpu code. (%s)",
+                cudaGetErrorString(cudaErr));
    }
    // Transfer the result to cpu
......
@@ -586,6 +586,31 @@ def test_dnn_valid():
        yield t


+def test_default_conv():
+    """Just test that we introduce the right GPU convolution
+    version.
+
+    """
+    img = theano.tensor.ftensor4()
+    fil = theano.tensor.ftensor4()
+
+    c = theano.tensor.nnet.conv2d(img, fil)
+    f = theano.function([img, fil], c, mode=theano_mode)
+    if cuda.dnn.dnn_available():
+        assert any([isinstance(a.op, GpuDnnConv)
+                    for a in f.maker.fgraph.apply_nodes])
+    else:
+        assert any([isinstance(a.op, cuda.blas.GpuCorrMM)
+                    for a in f.maker.fgraph.apply_nodes])
+
+    mode = theano_mode.excluding('local_gpu_conv', 'local_conv_gemm')
+    f = theano.function([img, fil], c, mode=mode)
+    assert any([isinstance(a.op, cuda.blas.GpuConv)
+                for a in f.maker.fgraph.apply_nodes])
+
+
def _test_full(cls, mode=None, version=[-1], extra_shapes=[]):
    seed_rng()
    shapes = get_basic_shapes()
@@ -722,6 +747,10 @@ def test_dnn_subsample():


class TestConv2DGPU(unittest.TestCase):
+    conv_ops = (cuda.blas.GpuConv,
+                cuda.dnn.GpuDnnConvBase,
+                cuda.blas.BaseGpuCorrMM)

    def test_logical_shapes(self):
        seed_rng()
        for stride in range(1, 4):
@@ -748,7 +777,7 @@ class TestConv2DGPU(unittest.TestCase):
        func = theano.function([a, A], image_estimate, mode=theano_mode)
        #theano.printing.debugprint(func,)
-        assert any([isinstance(node.op, theano.sandbox.cuda.blas.GpuConv)
+        assert any([isinstance(node.op, self.conv_ops)
                    for node in func.maker.fgraph.toposort()])

        a_in = numpy.random.randn(*featshp).astype("float32")
......
@@ -83,7 +83,7 @@ class TestConv2dFFT(unittest.TestCase):
        # make sure we inserted the fft trickery
        topo = f_fft.maker.fgraph.toposort()
        assert sum(isinstance(n.op, theano.sandbox.cuda.fftconv.CuFFTOp)
-                   for n in topo) == 2
+                   for n in topo) == 2, topo
        res_ref = f_ref()
@@ -112,7 +112,7 @@ class TestConv2dFFT(unittest.TestCase):
        # make sure we inserted the fft trickery
        topo = f_fft.maker.fgraph.toposort()
        assert sum(isinstance(n.op, theano.sandbox.cuda.fftconv.CuFFTOp)
-                   for n in topo) == 2
+                   for n in topo) == 2, topo
        res_ref = f_ref()
        res_fft = f_fft()
......
@@ -396,7 +396,11 @@ def build_conv_nnet2_classif(use_gpu, isize, ksize, n_batch,
    if use_gpu:
        # Check that GpuConv is used
        topo = train.maker.fgraph.toposort()
-        assert len([n for n in topo if isinstance(n.op, tcn.blas.GpuConv)]) > 0
+        conv_ops = (tcn.blas.GpuConv,
+                    tcn.dnn.GpuDnnConvBase,
+                    tcn.blas.BaseGpuCorrMM)
+        assert len([n for n in topo if isinstance(n.op, conv_ops)]) > 0

    shape_target = (n_batch, n_out)
    return train, params, shape_img, shape_target, mode
......
@@ -78,13 +78,17 @@ def safe_to_cpu(x):
        return x


-def op_lifter(OP):
+def op_lifter(OP, cuda_only=False):
    """
    OP(..., host_from_gpu(), ...) -> host_from_gpu(GpuOP(...))
    gpu_from_host(OP(inp0, ...)) -> GpuOP(inp0, ...)
    """
    def f(maker):
        def local_opt(node):
+            dev = theano.sandbox.gpuarray.init_dev.device
+            if cuda_only and not dev.startswith('cuda'):
+                return
+
            if type(node.op) in OP:
                # Either one of our inputs is on the gpu or
@@ -484,25 +488,25 @@ def local_gpua_eye(node):


@register_opt('fast_compile')
-@op_lifter([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias])
+@op_lifter([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias], cuda_only=True)
def local_gpua_crossentropysoftmaxargmax1hotwithbias(node):
    return GpuCrossentropySoftmaxArgmax1HotWithBias()


@register_opt('fast_compile')
-@op_lifter([tensor.nnet.CrossentropySoftmax1HotWithBiasDx])
+@op_lifter([tensor.nnet.CrossentropySoftmax1HotWithBiasDx], cuda_only=True)
def local_gpua_crossentropysoftmax1hotwithbiasdx(node):
    return GpuCrossentropySoftmax1HotWithBiasDx()


@register_opt('fast_compile')
-@op_lifter([tensor.nnet.Softmax])
+@op_lifter([tensor.nnet.Softmax], cuda_only=True)
def local_gpua_softmax(node):
    return GpuSoftmax()


@register_opt('fast_compile')
-@op_lifter([tensor.nnet.SoftmaxWithBias])
+@op_lifter([tensor.nnet.SoftmaxWithBias], cuda_only=True)
def local_gpua_softmaxwithbias(node):
    return GpuSoftmaxWithBias()
......
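The `cuda_only` flag above turns `op_lifter` into a parameterized decorator factory: the configuration is captured by the outer call and consulted inside the generated local optimizer. A stripped-down sketch of the same pattern, with no Theano types and `device` as a made-up stand-in for `init_dev.device`:

```python
def op_lifter(ops, cuda_only=False):
    """Return a decorator that wraps `maker` in a backend guard."""
    def decorator(maker):
        def local_opt(node, device='cuda0'):
            if cuda_only and not device.startswith('cuda'):
                return None          # refuse to lift on non-CUDA backends
            if type(node) in ops:
                return maker(node)
            return None
        return local_opt
    return decorator

@op_lifter([int], cuda_only=True)
def lift_int(node):
    return ('GpuInt', node)

print(lift_int(3))                     # ('GpuInt', 3)
print(lift_int(3, device='opencl0'))   # None: guard rejected the backend
```

Because the guard lives in the shared wrapper, each of the softmax/cross-entropy optimizers above gains the CUDA-only behaviour by changing one decorator argument rather than repeating the check in every function body.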
@@ -734,6 +734,13 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
        unsigned int threads_per_block = std::min((unsigned int)n_streams_used_in_this_call, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
        unsigned int n_blocks = std::min(ceil_intdiv((unsigned int)n_streams_used_in_this_call, threads_per_block), (unsigned int)NUM_VECTOR_OP_BLOCKS);

+        if (n_streams > (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK * (unsigned int)NUM_VECTOR_OP_BLOCKS)
+        {
+            PyErr_Format(PyExc_ValueError, "On GPU, n_streams should be at most %%u",
+                         (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK * (unsigned int)NUM_VECTOR_OP_BLOCKS);
+            %(fail)s;
+        }
+
        if (threads_per_block * n_blocks < n_streams)
        {
            if (! %(nodename)s_printed_warning)
@@ -761,7 +768,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
        """ % locals()

    def c_code_cache_version(self):
-        return (8,)
+        return (9,)


class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
......
@@ -17,6 +17,7 @@ import unittest
from theano.tests import unittest_tools as utt
from nose.plugins.skip import SkipTest
from nose.plugins.attrib import attr
+from nose.tools import assert_raises

#TODO: test gpu
# Done in test_consistency_GPU_{serial,parallel}
@@ -306,6 +307,30 @@ def test_consistency_GPU_parallel():
    assert(numpy.allclose(samples, java_samples))


+def test_GPU_nstreams_limit():
+    """Verify that a ValueError is raised when n_streams
+    is greater than 2**20 on GPU. This is the value of
+    (NUM_VECTOR_OP_THREADS_PER_BLOCK * NUM_VECTOR_OP_BLOCKS).
+
+    """
+    if not cuda_available:
+        raise SkipTest('Optional package cuda not available')
+
+    seed = 12345
+    R = MRG_RandomStreams(seed=seed, use_cuda=True)
+
+    def eval_uniform(size, nstreams):
+        if theano.config.mode == "FAST_COMPILE":
+            mode = "FAST_RUN"
+        else:
+            mode = None
+        out = R.uniform(size=size, nstreams=nstreams, dtype='float32')
+        f = theano.function([], out, mode=mode)
+        return f()
+
+    eval_uniform((10,), 2**20)
+    assert_raises(ValueError, eval_uniform, (10,), 2**20 + 1)
+
+
def test_consistency_GPUA_serial():
    '''Verify that the random numbers generated by GPUA_mrg_uniform, serially,
    are the same as the reference (Java) implementation by L'Ecuyer et al.
......
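The new test exercises the `n_streams` guard added to the CUDA code above: at most `2**20` streams, the product of threads per block and block count, is accepted, and the boundary itself still passes. The boundary behaviour can be sketched in plain Python (`request_streams` is an illustrative stand-in, not a Theano API):

```python
MAX_STREAMS = 2 ** 20  # NUM_VECTOR_OP_THREADS_PER_BLOCK * NUM_VECTOR_OP_BLOCKS

def request_streams(n_streams):
    """Mimic the C-level check: reject anything past the hard limit."""
    if n_streams > MAX_STREAMS:
        raise ValueError("On GPU, n_streams should be at most %d"
                         % MAX_STREAMS)
    return n_streams

print(request_streams(2 ** 20))        # exactly at the limit: accepted
try:
    request_streams(2 ** 20 + 1)
except ValueError as e:
    print("rejected:", e)
```

Testing both `limit` and `limit + 1`, as `test_GPU_nstreams_limit` does, pins the off-by-one behaviour of the guard.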
@@ -65,3 +65,6 @@ from theano.tensor.sort import sort, argsort
from theano.tensor.extra_ops import (DiffOp, bincount, squeeze,
        repeat, bartlett, fill_diagonal, fill_diagonal_offset,
        cumsum, cumprod)
+
+# SpecifyShape is defined in theano.compile, but should be available in tensor
+from theano.compile import SpecifyShape, specify_shape
@@ -1494,11 +1494,11 @@ class GemmOptimizer(Optimizer):
        callbacks_before = fgraph.execute_callbacks_times.copy()
        callback_before = fgraph.execute_callbacks_time

-        class Updater:
-            def on_import(self, fgraph, new_node, reason):
-                if new_node is not node:
-                    nodelist.append(new_node)
+        def on_import(new_node):
+            if new_node is not node:
+                nodelist.append(new_node)

-        u = Updater()
+        u = theano.gof.opt.Updater(on_import, None, None)
        fgraph.attach_feature(u)
        while did_something:
            nb_iter += 1
......
@@ -182,10 +182,20 @@ class DimShuffle(Op):
        input = as_tensor_variable(_input)
        ib = tuple(input.type.broadcastable)
        if not ib == self.input_broadcastable:
-            raise TypeError((
-                "The number of dimensions and/or broadcastable pattern of the "
-                "input is incorrect for this op. Expected %s, got %s."
-                % (self.input_broadcastable, ib)))
+            if len(ib) != len(self.input_broadcastable):
+                raise TypeError((
+                    "The number of dimensions of the "
+                    "input is incorrect for this op. Expected %s, got %s."
+                    % (self.input_broadcastable, ib)))
+            for expected, b in zip(self.input_broadcastable, ib):
+                if expected is True and b is False:
+                    raise TypeError((
+                        "The broadcastable pattern of the "
+                        "input is incorrect for this op. Expected %s, got %s."
+                        % (self.input_broadcastable, ib)))
+                # else, expected == b, or expected is False and b is True.
+                # Both cases are good.
        ob = []
        for value in self.new_order:
            if value == 'x':
......
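The relaxed DimShuffle check above only rejects a dimension that was promised broadcastable but is not; an actually broadcastable dimension where a non-broadcastable one was expected is now accepted. The rule in isolation, as a sketch (`check_broadcastable` is an illustrative helper, not a Theano function):

```python
def check_broadcastable(expected, got):
    if len(expected) != len(got):
        raise TypeError("wrong number of dimensions:"
                        " expected %s, got %s" % (expected, got))
    for e, b in zip(expected, got):
        # only "expected broadcastable, got non-broadcastable" is an error
        if e is True and b is False:
            raise TypeError("bad broadcastable pattern:"
                            " expected %s, got %s" % (expected, got))

check_broadcastable((False, True), (False, True))   # exact match: fine
check_broadcastable((False, True), (True, True))    # False -> True: now fine
try:
    check_broadcastable((True, False), (False, False))
except TypeError as e:
    print("rejected:", e)
```

Splitting the old combined error into a dimension-count branch and a pattern branch also yields messages that name the actual mismatch.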
@@ -65,14 +65,20 @@ def make_constant(args):
    return tuple(map(conv, args))


-def get_idx_list(inputs, idx_list):
+def get_idx_list(inputs, idx_list, get_count=False):
    '''
    Given a list of inputs to the subtensor and its idx_list, reorders
-    the inputs according to the idx list to get the right values
+    the inputs according to the idx list to get the right values.
+
+    If get_count=True, instead returns the number of inputs consumed
+    during this process.
    '''

+    # The number of indices
+    n = len(inputs) - 1

    # The subtensor (or idx_list) does not depend on the inputs.
-    if len(inputs) == 1:
+    if n == 0:
        return tuple(idx_list)
    indices = list(reversed(list(inputs[1:])))
@@ -87,7 +93,10 @@ def get_idx_list(inputs, idx_list):
        else:
            return entry
    cdata = tuple(map(convert, idx_list))
-    return cdata
+    if get_count:
+        return n - len(indices)
+    else:
+        return cdata


def get_canonical_form_slice(theslice, length):
......
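The `get_count` flag above reuses the same traversal to report how many of the symbolic inputs the index list consumed: each placeholder pops one input, so the count is the initial number of inputs minus what remains. A self-contained approximation of that logic (placeholders are modelled by the string `'x'`; the real code matches Theano `Type` entries instead):

```python
def get_idx_list(inputs, idx_list, get_count=False):
    n = len(inputs) - 1                  # number of non-constant indices
    if n == 0:
        return 0 if get_count else tuple(idx_list)
    indices = list(reversed(inputs[1:]))

    def convert(entry):
        # a placeholder consumes the next symbolic input
        return indices.pop() if entry == 'x' else entry

    cdata = tuple(convert(e) for e in idx_list)
    if get_count:
        return n - len(indices)          # how many inputs were popped
    return cdata

print(get_idx_list(['t', 5, 7], ['x', 2, 'x']))                  # (5, 2, 7)
print(get_idx_list(['t', 5, 7], ['x', 2, 'x'], get_count=True))  # 2
```

Note the sketch returns 0 for the no-input case under `get_count`, an assumption on my part; the diff's `n == 0` branch returns `tuple(idx_list)` regardless of the flag.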