Make grad more general (A. Bergeron)

Gradient code is moved from tensor/tensor_grad.py to theano/gradient.py. This makes it work with sparse variables. This commit was originally written by Arnaud Bergeron. I re-authored it to avoid a big merge in repo history.

Make grad more general (A. Bergeron)
f9ca8f9d · Olivier Delalleau · 0e018bc4 · f9ca8f9d · f9ca8f9d · f9ca8f9d
--- a/theano/__init__.py
+++ b/theano/__init__.py
 """
-Theano is an optimizing compiler in Python, built to evaluate complicated expressions
-(especially matrix-valued ones) as quickly as possible.
-Theano compiles expression graphs (see :doc:`graph` ) that are built by Python code.
-The expressions in these graphs are called `Apply` nodes and the variables in these graphs are called `Variable` nodes.
-
-You compile a graph by calling `function`, which takes a graph, and returns a callable object.
-One of theano's most important features is that `function` can transform your graph before
-compiling it.
-It can replace simple expressions with faster or more numerically stable implementations.
+Theano is an optimizing compiler in Python, built to evaluate
+complicated expressions (especially matrix-valued ones) as quickly as
+possible.  Theano compiles expression graphs (see :doc:`graph` ) that
+are built by Python code. The expressions in these graphs are called
+`Apply` nodes and the variables in these graphs are called `Variable`
+nodes.
+
+You compile a graph by calling `function`, which takes a graph, and
+returns a callable object.  One of theano's most important features is
+that `function` can transform your graph before compiling it.  It can
+replace simple expressions with faster or more numerically stable
+implementations.

 To learn more, check out:

@@ -37,7 +40,8 @@ logging_default_handler.setFormatter(logging_default_formatter)
 theano_logger.addHandler(logging_default_handler)
 theano_logger.setLevel(logging.WARNING)

-import configparser, configdefaults
+import configparser
+import configdefaults

 config = configparser.TheanoConfigParser()

@@ -87,8 +91,10 @@ from updates import Updates

 import tensor
 import scalar
-#import sparse #we don't import by default as we don't want to force having scipy installed.
+#we don't import by default as we don't want to force having scipy installed.
+#import sparse
 import gradient
+from gradient import Rop, Lop, grad
 import gof

 if config.device.startswith('gpu') or config.init_gpu_device.startswith('gpu'):
@@ -126,8 +132,10 @@ del _all, _divide, _over, _under, _invalid

 ## import scalar_opt

-### This is defined here because it is designed to work across symbolic datatypes
-#   (Sparse and Tensor)
+### This is defined here because it is designed to work across symbolic
+#   datatypes (Sparse and Tensor)
+
+
 def dot(l, r):
    """Return a symbolic matrix/dot product between l and r """
    rval = NotImplemented
@@ -144,5 +152,6 @@ def dot(l, r):
        except Exception, e1:
            rval = NotImplemented
    if rval == NotImplemented:
-        raise NotImplementedError("Dot failed for the following reasons:", (e0, e1))
+        raise NotImplementedError("Dot failed for the following reasons:",
+                                  (e0, e1))
    return rval
--- a/theano/gradient.py
+++ b/theano/gradient.py
 """Driver for gradient calculations."""

-__authors__   = "James Bergstra"
+__authors__ = "James Bergstra, Razvan Pascanu, Arnaud Bergeron"
 __copyright__ = "(c) 2011, Universite de Montreal"
 __license__ = "3-clause BSD License"
 __contact__ = "theano-dev <theano-dev@googlegroups.com>"

 __docformat__ = "restructuredtext en"

+import __builtin__
 import logging
+import warnings
 _logger = logging.getLogger('theano.gradient')
 import sys

-import numpy #for numeric_grad
+import numpy  # for numeric_grad

-import gof #, gof.variable
-from gof.python25 import all
-import gof.utils
+import theano
+from theano.raise_op import Raise

-from raise_op import Raise
+from theano import gof
+from theano.gof import Variable
+from theano.gof.python25 import all
+import theano.gof.utils

 _msg_retType = 'op.grad(...) returned a non-list'
 _msg_badlen = 'op.grad(...) returned wrong number of gradients'

+
+def format_as(use_list, use_tuple, outputs):
+    """
+    Formats the outputs according to the flags `use_list` and `use_tuple`
+    (at most one may be True). If `use_list` is True, `outputs` is returned
+    as a list; if `use_tuple` is True, as a tuple (an `outputs` that is not
+    a list or a tuple is wrapped into a one element list/tuple).
+    If both flags are False, a list or tuple `outputs` must hold exactly one
+    element, which is returned; any other `outputs` is returned unchanged.
+    """
+    assert not (use_list and use_tuple), \
+        "Both flags cannot be simultaneously True"
+    if (use_list or use_tuple) and not isinstance(outputs, (list, tuple)):
+        if use_list:
+            return [outputs]
+        else:
+            return (outputs,)
+    elif not (use_list or use_tuple) and isinstance(outputs, (list, tuple)):
+        assert len(outputs) == 1, \
+            "Wrong arguments. Expected a one element list"
+        return outputs[0]
+    elif use_list or use_tuple:
+        if use_list:
+            return list(outputs)
+        else:
+            return tuple(outputs)
+    else:
+        return outputs
+
+
 def grad_sources_inputs(sources, graph_inputs, warn_type=True):
    """
    :type sources: list of pairs of Variable: (v, gradient-on-v)
    :param sources: gradients to back-propagate using chain rule
    :type graph_inputs: list of Variable
-    :param graph_inputs: variables considered to be constant (do not backpropagate through
-    them)
+    :param graph_inputs: variables considered to be constant
+        (do not backpropagate through them)

    :rtype: dictionary whose keys and values are of type `Variable`
-    :return: mapping from each Variable encountered in the backward traversal to the gradient with respect to that Variable.
+
+    :return: mapping from each Variable encountered in the backward
+        traversal to the gradient with respect to that Variable.

    It is assumed that there is some objective J shared between all members of
-    sources, so that for each v, gradient-on-v is the gradient of J with respect to v
+    sources, so that for each v, gradient-on-v is the gradient of J with
+    respect to v



@@ -50,24 +87,26 @@ def grad_sources_inputs(sources, graph_inputs, warn_type=True):
            else:
                gmap[r] = g_r

-    graph_outputs = gof.utils.uniq([r for r,g in sources])
+    graph_outputs = gof.utils.uniq([r for r, g in sources])

    if graph_inputs is None:
        graph_inputs = gof.graph.inputs(graph_outputs)

-    for node in gof.graph.io_toposort(graph_inputs, graph_outputs).__reversed__():
-        g_outputs = [gmap.get(o,None) for o in node.outputs]
+    for node in gof.graph.io_toposort(graph_inputs,
+                                      graph_outputs).__reversed__():
+        g_outputs = [gmap.get(o, None) for o in node.outputs]

        #if all output gradients are None, continue
-        if all(map(lambda x:x is None, g_outputs)): continue
+        if all(map(lambda x: x is None, g_outputs)): continue

        output_arg = g_outputs
        input_arg = node.inputs

        # Each Op's grad function requires inputs and output_grads
-        # If the Op destroys any input, but the grad expression uses it, then chances are the
-        # resulting graph will have a dependency cycle.  We avoid this cycle by passing
-        # (symbolic) copies of each destroyed input.
+        # If the Op destroys any input, but the grad expression uses it,
+        # then chances are the resulting graph will have a dependency
+        # cycle.  We avoid this cycle by passing (symbolic) copies of
+        # each destroyed input.
        try:
            dinputs = [node.inputs[x[0]] for x in node.op.destroy_map.values()]
        except AttributeError:
@@ -83,14 +122,14 @@ def grad_sources_inputs(sources, graph_inputs, warn_type=True):

        #note that this function is not in a try-except block
        # the rationale:
-        #  If the op implements grad, then any exception should be passed to the
-        #  caller
+        #  If the op implements grad, then any exception should be passed to
+        #  the caller
        #  If the op doesn't implement grad, this entire function should fail.
        #  Other possibilities:
        #    * return a partial back-prop
        #
        op_grad = node.op.grad(input_arg, output_arg)
-        if not isinstance(op_grad, (list,tuple)):
+        if not isinstance(op_grad, (list, tuple)):
            raise ValueError(_msg_retType, node.op)
        g_inputs = op_grad
        assert isinstance(g_inputs, (list, tuple))
@@ -101,9 +140,9 @@ def grad_sources_inputs(sources, graph_inputs, warn_type=True):
                    len(node.inputs))
        for ii, (r, g_r) in enumerate(zip(node.inputs, g_inputs)):
            if warn_type:
-                if g_r and (getattr(r,'type',0) != getattr(g_r,'type', 1)):
-                    r_type = getattr(r,'type', None)
-                    g_r_type = getattr(g_r,'type', None)
+                if g_r and (getattr(r, 'type', 0) != getattr(g_r, 'type', 1)):
+                    r_type = getattr(r, 'type', None)
+                    g_r_type = getattr(g_r, 'type', None)
                    _logger.warning('%s.grad returned a different type (%s) '
                            'for input %i of type (%s)',
                            node.op, g_r_type, ii, r_type)
@@ -117,25 +156,823 @@ def grad_sources_inputs(sources, graph_inputs, warn_type=True):
                    gmap[r] = g_r
    return gmap

+
 def unimplemented_grad(op, x_pos, x):
    """
-    DO NOT USE. Remove this function after all usage of it has been removed from theano.
+    DO NOT USE. Remove this function after all usage of it has been
+    removed from theano.

    Return an un-computable symbolic variable of type `x.type`.

    If any function tries to compute this un-computable variable, an exception
    (NotImplementedError) will be raised indicating that the gradient on the
    `x_pos`'th input of `op` has not been implemented.
+    """
+    msg = '%s.grad not implemented for input %i' % (op, x_pos)
+    return Raise(msg=msg)(x)

+########################
+# R Operator
+########################
+
+
+def Rop(f, wrt, eval_points):
     """
+    Computes the R operation on `f` with respect to `wrt`, evaluated at the
+    points given in `eval_points`. Mathematically this stands for the
+    jacobian of `f` with respect to `wrt`, right multiplied by the eval points.

-    #raise Exception("""
-    #                    unimplemented_grad is not a safe function to use.
-    #                    It depends on catching errors at the run-time of a theano function.
-    #                    However, it could be removed by the optimization during the compilation
-    #                    of the theano function, for example, if it is multiplied by 0. This
-    #                    results in theano functions returning 0 for gradients that are actually
-    #                    undefined. """)
+    :type f: `Variable` or list of `Variable`s
+        `f` stands for the output of the computational graph to which you
+        want to apply the R operator
+    :type wrt: `Variable` or list of `Variable`s
+        variables for which you compute the R operator of the expression
+        described by `f`
+    :type eval_points: `Variable` or list of `Variable`s
+        evaluation points for each of the variables in `wrt`

-    msg = '%s.grad not implemented for input %i'%(op, x_pos)
-    return Raise(msg=msg)(x)
+    :rtype: `Variable` or list/tuple of `Variable`s depending on type of f
+    :return: symbolic expression such that
+        R_op[i] = sum_j ( d f[i] / d wrt[j]) eval_point[j]
+        where the indices in that expression are magic multidimensional
+        indices that specify both the position within a list and all
+        coordinates of the tensor element in the last.
+        If `f` is a list/tuple, then return a list/tuple with the results.
+        """
+    from theano.tensor import as_tensor_variable
+    using_list = isinstance(f, list)
+    using_tuple = isinstance(f, tuple)
+    if not isinstance(wrt, (list, tuple)):
+        wrt = [wrt]
+
+    if not isinstance(eval_points, (list, tuple)):
+        eval_points = [eval_points]
+
+    if not isinstance(f, (list, tuple)):
+        f = [f]
+
+    assert len(wrt) == len(eval_points)
+
+    # Check that each element of wrt corresponds to an element
+    # of eval_points with the same dimensionality.
+    for pack in enumerate(zip(wrt, eval_points)):
+        i = pack[0]
+        wrt_elem, eval_point = pack[1]
+
+        wrt_elem = as_tensor_variable(wrt_elem)
+        eval_point = as_tensor_variable(eval_point)
+
+        wrt_dim = len(wrt_elem.type.broadcastable)
+        eval_dim = len(eval_point.type.broadcastable)
+
+        if wrt_dim != eval_dim:
+            raise ValueError('Element ' +
+                             str(i) +
+                             ' of wrt/eval_point have mismatched ' +
+                             'dimensionality: ' +
+                             str(wrt_dim) +
+                             ' versus ' +
+                             str(eval_dim))
+
+    seen_nodes = {}
+
+    def _traverse(node):
+        """ Fill seen_nodes[node] with op.R_op, recursing into unseen input owners """
+        if node is None:
+            return None
+        else:
+            op = node.op
+            inputs = node.inputs
+
+            # Compute the evaluation points corresponding to each of the
+            # inputs of the node
+            local_eval_points = []
+            for inp in inputs:
+                if inp in wrt:
+                    local_eval_points.append(eval_points[wrt.index(inp)])
+                elif inp.owner is None:
+                    local_eval_points.append(inp.zeros_like())
+                elif inp.owner in seen_nodes:
+
+                    local_eval_points.append(
+                        seen_nodes[inp.owner][inp.owner.outputs.index(inp)])
+
+                else:
+                    # We actually need to compute the R_op for this node
+
+                    _traverse(inp.owner)
+                    local_eval_points.append(
+                        seen_nodes[inp.owner][inp.owner.outputs.index(inp)])
+            for x, y in zip(inputs, local_eval_points):
+                if y is not None:
+                    assert (as_tensor_variable(x).type ==
+                            as_tensor_variable(y).type)
+
+            seen_nodes[node] = op.R_op(node.inputs, local_eval_points)
+            return None
+
+    # Populate the dictionary
+    for out in f:
+        _traverse(out.owner)
+
+    rval = []
+    for out in f:
+        if out in wrt:
+            rval.append(eval_points[wrt.index(out)])
+        elif seen_nodes[out.owner][out.owner.outputs.index(out)] is None:
+            raise ValueError(('The function is not differentiable with '
+                              'respect to the provided inputs !'))
+        else:
+            rval.append(seen_nodes[out.owner][out.owner.outputs.index(out)])
+
+    return format_as(using_list, using_tuple, rval)
+
+
+def Lop(f, wrt, eval_points, consider_constant=None, warn_type=False,
+         disconnected_inputs='raise'):
+    """
+    Computes the L operation on `f` with respect to `wrt`, evaluated at the
+    points given in `eval_points`. Mathematically this stands for the
+    jacobian of `f` with respect to `wrt`, left multiplied by the eval points.
+
+    :type f: `Variable` or list of `Variable`s
+        `f` stands for the output of the computational graph to which you
+        want to apply the L operator
+    :type wrt: `Variable` or list of `Variable`s
+        variables for which you compute the L operator of the expression
+        described by `f`
+    :type eval_points: `Variable` or list of `Variable`s
+        evaluation points for each of the variables in `f`
+
+    :rtype: `Variable` or list/tuple of `Variable`s depending on type of wrt
+    :return: symbolic expression such that
+        L_op[j] = sum_i ( d f[i] / d wrt[j]) eval_point[i]
+        where the indices in that expression are magic multidimensional
+        indices that specify both the position within a list and all
+        coordinates of the tensor element in the last
+        If `wrt` is a list/tuple, then return a list/tuple with the results.
+    """
+    if consider_constant is None:
+        consider_constant = []
+    # `f` may be one Variable or a list/tuple of them (it is wrapped below).
+    if not isinstance(f, (Variable, list, tuple)):
+        raise TypeError(('In Lop(), f argument should be a Variable '
+                        'or a list/tuple of Variables.'), f)
+
+    if type(eval_points) not in (list, tuple):
+        eval_points = [eval_points]
+
+    using_list = isinstance(wrt, list)
+    using_tuple = isinstance(wrt, tuple)
+
+    if not isinstance(f, (list, tuple)):
+        f = [f]
+
+    inputs = gof.graph.inputs(f)
+    gmap = grad_sources_inputs(
+        zip(f, eval_points),
+        list(inputs) + list(consider_constant),
+        warn_type=warn_type)
+
+    # Note : If p is not in gmap there can be several reasons, among which
+    # is the fact that p might not be part of the computational graph. A
+    # simple example is that for a+b for e.g. a[0] is not part of the graph,
+    # so Theano does not know how to compute TT.grad(TT.sum(a+b), a[0])
+    # such subtle cases can be fixed by a more careful implementation of the
+    # gradient, but for now Theano needs to throw an exception, and make the
+    # user aware that it does not know how to compute that gradient
+    if not isinstance(wrt, (list, tuple)):
+        wrt = [wrt]
+    ret = []
+    for p in wrt:
+        if p in gmap:
+            ret.append(gmap[p])
+        else:
+            message = ("Lop method was asked to compute the gradient "
+                    "with respect to a variable that is not part of "
+                    "the computational graph of the cost, or is used "
+                    "only by a non-differentiable operator: %s" % p)
+            if disconnected_inputs == 'ignore':
+                pass
+            elif disconnected_inputs == 'warn':
+                warnings.warn(message, stacklevel=1)
+            elif disconnected_inputs == 'raise':
+                raise ValueError(message)
+            else:
+                raise ValueError("Invalid value for keyword "
+                        "'disconnected_inputs', valid values are "
+                        "'ignore', 'warn' and 'raise'.")
+            ret.append(p.zeros_like())
+
+    return format_as(using_list, using_tuple, ret)
+
+
+#########################
+# Gradient
+#########################
+
+def grad(cost, wrt, g_cost=None, consider_constant=None, warn_type=False,
+         disconnected_inputs='raise'):
+    """
+    :type cost: Scalar (0-dimensional) `Variable`
+    :type wrt: `Variable` or list of `Variable`s.
+    :type g_cost: Scalar `Variable`, or None
+    :param g_cost: an expression for the gradient through cost.  The default is
+        ``ones_like(cost)``.
+    :param consider_constant: a list of expressions not to backpropagate
+        through
+
+    :param warn_type: a value of True will cause warnings to be logged for any
+        Op that emits a gradient that does not match its input type.
+
+    :type disconnected_inputs: string
+    :param disconnected_inputs: Defines the behaviour if some of the variables
+        in ``wrt`` are not part of the computational graph computing ``cost``
+        (or if all links are non-differentiable). The possible values are:
+        - 'ignore': considers that the gradient on these parameters is zero.
+        - 'warn': consider the gradient zero, and print a warning.
+        - 'raise': raise an exception.
+
+    :rtype: `Variable` or list/tuple of `Variable`s (depending upon `wrt`)
+
+    :return: symbolic expression of gradient of `cost` with respect to `wrt`.
+             If an element of `wrt` is not differentiable with respect to
+             the output, a zero variable is returned for it (or ValueError
+             is raised when `disconnected_inputs` is 'raise', the default).
+             The result has the same container type (list/tuple/bare) as `wrt`.
+
+    This function is a wrapper around the more general function
+    `theano.gradient.grad_sources_inputs`.
+
+    """
+    if consider_constant is None:
+        consider_constant = []
+    else:
+        #error checking on consider_constant: verify that it is a collection
+        # of theano variables
+        # this is important, if someone accidentally passes a nested data
+        # structure with theano variables at the leaves, only the root will
+        # be properly considered constant
+        if not hasattr(consider_constant, '__iter__'):
+            raise TypeError('consider_constant must be an iterable collection,'
+                    ' got ' + str(type(consider_constant)))
+        for elem in consider_constant:
+            if not isinstance(elem, gof.Variable):
+                raise TypeError('Elements of consider_constant must be '
+                                'variables, but got ' + str(type(elem)))
+
+    if not isinstance(cost, Variable):
+        raise TypeError(('In grad(), cost argument should be '
+                         'a Variable.'), cost)
+
+    if cost.type.ndim:
+        raise TypeError(
+                'In theano.gradient.grad, "cost" argument should be a scalar,'
+                ' but ndim is %i (should be 0). If you want to compute the'
+                ' gradient of the sum of cost, you should use cost.sum().'
+                % cost.type.ndim)
+
+    if g_cost is None:
+        from theano import tensor
+        g_cost = tensor.ones_like(cost)
+    inputs = gof.graph.inputs([cost])
+    gmap = grad_sources_inputs(
+        [(cost, g_cost)],
+        list(inputs) + list(consider_constant),
+        warn_type=warn_type)
+
+    # Note : If p is not in gmap there can be several reasons, among which
+    # is the fact that p might not be part of the computational graph. A
+    # simple example is that for a+b for e.g. a[0] is not part of the graph,
+    # so Theano does not know how to compute TT.grad(TT.sum(a+b), a[0])
+    # such subtle cases can be fixed by a more careful implementation of the
+    # gradient, but for now Theano needs to throw an exception, and make the
+    # user aware that it does not know how to compute that gradient
+    using_list = isinstance(wrt, list)
+    using_tuple = isinstance(wrt, tuple)
+
+    if not isinstance(wrt, (list, tuple)):
+        wrt = [wrt]
+    ret = []
+    for p in wrt:
+        if p in gmap:
+            ret.append(gmap[p])
+        else:
+            message = ("grad method was asked to compute the gradient "
+                    "with respect to a variable that is not part of "
+                    "the computational graph of the cost, or is used "
+                    "only by a non-differentiable operator: %s" % p)
+            if disconnected_inputs == 'ignore':
+                pass
+            elif disconnected_inputs == 'warn':
+                warnings.warn(message, stacklevel=1)
+            elif disconnected_inputs == 'raise':
+                raise ValueError(message)
+            else:
+                raise ValueError("Invalid value for keyword "
+                        "'disconnected_inputs', valid values are "
+                        "'ignore', 'warn' and 'raise'.")
+            ret.append(p.zeros_like())
+
+    return format_as(using_list, using_tuple, ret)
+
+
+class numeric_grad(object):
+    """
+    Compute the numeric derivative of a scalar-valued function at a particular
+    point.
+    """
+
+    # Note on step sizes and tolerances:
+    #
+    # There is a relationship between the step size and the function value and
+    # the measurement error that is incurred due to rounding.  The finite
+    # difference we measure is
+    # delta = f(x0+eps) - f(x0)
+    #
+    # For maximum precision, f should be close to zero.
+    # For every power of 2 that f departs from zero, we lose a bit of precision
+    # in delta.
+    #
+    # Even in this case of maximum accuracy, there is a tradeoff between
+    # stepsize and measurement error.
+    # Taking small steps allows us to measure large derivatives accurately,
+    # but longer steps are required to measure small derivatives accurately.
+    # However longer steps introduce bias into our measurement in general
+    # for non-linear functions.
+    #
+    # It would be interesting to have a version of numeric grad that used an
+    # adaptive stepsize.
+    #
+    # For now, we use a heuristic that catches very bad gradients, but is not
+    # perfectly accurate.
+    type_eps = {'float64': 1e-7,
+            'float32': 3e-4,
+            numpy.dtype('float64'): 1e-7,
+            numpy.dtype('float32'): 3e-4}
+
+    def __init__(self, f, pt, eps=None):
+        """Estimate the gradient of f at pt; the result is stored in self.gf.
+
+        :param f: a differentiable function such that f(*pt) is a scalar
+        :param pt: an ndarray, a list of ndarrays or tuple of ndarrays
+
+        This function computes the gradient by one-sided finite
+        differences of a fixed step size (eps).
+
+        It is assumed that f(...) will return a scalar.
+        It is assumed that all f's inputs are numpy.ndarray objects.
+
+        :param eps: the stepsize for the finite differencing.  None means
+          input dtype-dependent. See `type_eps`.
+        """
+
+        def prod(inputs):
+            rval = 1
+            for i in inputs:
+                rval *= i
+            return rval
+
+        packed_pt = False
+        if not isinstance(pt, (list, tuple)):
+            pt = [pt]
+            packed_pt = True
+
+        apt = [numpy.array(p) for p in pt]
+
+        shapes = [p.shape for p in apt]
+        dtypes = [str(p.dtype) for p in apt]
+
+        # TODO: remove this eventually (why was this here in the first place ?)
+        # In the case of CSM, the arguments are a mixture of floats and
+        # integers...
+        # if not dtypes == [dtypes[0]] * len(apt):
+        #      raise TypeError('All function arguments must have same dtype')
+
+        total_size = __builtin__.sum(prod(sh) for sh in shapes)
+
+        working_dtype = __builtin__.min((self.type_eps[dt], dt)
+                                        for dt in dtypes)[1]
+
+        #create un-initialized memory
+        x = numpy.ndarray((total_size,), dtype=working_dtype)
+        gx = numpy.ndarray((total_size,), dtype=working_dtype)
+
+        if eps is None:
+            eps = __builtin__.max(self.type_eps[dt] for dt in dtypes)
+
+        #set up aliases so that apt[i] is backed by memory in x
+        # and self.gf is backed by memory in gx
+        cur_pos = 0
+        self.gf = []
+        for i, p in enumerate(apt):
+            p_size = prod(p.shape)
+            # set up alias
+            apt[i] = x[cur_pos: cur_pos + p_size].reshape(p.shape)
+            self.gf.append(gx[cur_pos: cur_pos + p_size].reshape(p.shape))
+            # initialize with p's value
+            apt[i][...] = p
+            cur_pos += p_size
+
+        f_x = f(*[p.copy() for p in apt])
+
+        # now iterate over the elements of x, and call f on apt.
+        x_copy = x.copy()
+        for i in xrange(total_size):
+            x[:] = x_copy
+
+            x[i] += eps
+            f_eps = f(*apt)
+
+            gx[i] = numpy.asarray((f_eps - f_x) / eps)
+
+        if packed_pt:
+            self.gf = self.gf[0]
+
+    @staticmethod
+    def abs_rel_err(a, b):
+        """Return absolute and relative error between a and b.
+
+        The relative error is a small number when a and b are close, relative
+        to how big they are.
+
+        Formulas used:
+            abs_err = abs(a - b)
+            rel_err = abs_err / max(abs(a) + abs(b), 1e-8)
+
+        The denominator is clipped at 1e-8 to avoid dividing by 0 when a and b
+        are both close to 0.
+
+        The tuple (abs_err, rel_err) is returned
+        """
+        abs_err = abs(a - b)
+        rel_err = abs_err / numpy.maximum(abs(a) + abs(b), 1e-8)
+        return (abs_err, rel_err)
+
+    def abs_rel_errors(self, g_pt):
+        """Return the abs and rel error of gradient estimate `g_pt`
+
+        `g_pt` must be a list of ndarrays of the same length as self.gf,
+        otherwise a ValueError is raised.
+
+        Corresponding ndarrays in `g_pt` and `self.gf` must have the same
+        shape or ValueError is raised.
+
+        """
+        if len(g_pt) != len(self.gf):
+            raise ValueError(
+                    'argument has wrong number of elements',
+                    len(g_pt))
+        errs = []
+        for i, (a, b) in enumerate(zip(g_pt, self.gf)):
+            if a.shape != b.shape:
+                raise ValueError(
+                        'argument element %i has wrong shape %s' % (
+                            i, str((a.shape, b.shape))))
+            errs.append(numeric_grad.abs_rel_err(a, b))
+        return errs
+
+    def max_err(self, g_pt, abs_tol, rel_tol):
+        """Find the biggest error between g_pt and self.gf.
+
+        What is measured is the violation of relative and absolute errors,
+        wrt the provided tolerances (abs_tol, rel_tol).
+        A value > 1 means both tolerances are exceeded.
+
+        Return the argmax of min(abs_err / abs_tol, rel_err / rel_tol) over
+        g_pt, as well as abs_err and rel_err at this point.
+        """
+        pos = []
+        errs = []
+        abs_errs = []
+        rel_errs = []
+
+        abs_rel_errs = self.abs_rel_errors(g_pt)
+        for abs_err, rel_err in abs_rel_errs:
+            if not numpy.all(numpy.isfinite(abs_err)):
+                raise ValueError('abs_err not finite', repr(abs_err))
+            if not numpy.all(numpy.isfinite(rel_err)):
+                raise ValueError('rel_err not finite', repr(rel_err))
+            scaled_err = numpy.minimum(abs_err / abs_tol, rel_err / rel_tol)
+            max_i = scaled_err.argmax()
+
+            pos.append(max_i)
+            errs.append(scaled_err.flatten()[max_i])
+            abs_errs.append(abs_err.flatten()[max_i])
+            rel_errs.append(rel_err.flatten()[max_i])
+
+        # max over the arrays in g_pt
+        max_arg = numpy.argmax(errs)
+        max_pos = pos[max_arg]
+        return (max_arg, pos[max_arg], abs_errs[max_arg], rel_errs[max_arg])
+
+
+def verify_grad(fun, pt, n_tests=2, rng=None, eps=None, abs_tol=None,
+                rel_tol=None, mode=None, cast_to_output_type=False):
+    """ Test a gradient by Finite Difference Method. Raise error on failure.
+
+    Example:
+    >>> verify_grad(theano.tensor.tanh,
+                    (numpy.asarray([[2,3,4], [-1, 3.3, 9.9]]),),
+                    rng=numpy.random)
+
+    Raises `verify_grad.E_grad` if the analytic gradient and the numerical
+    gradient (Finite Difference Method) of a random projection of fun's output
+    to a scalar differ by more than both the absolute and relative tolerances.
+
+    :param fun: a Python function that takes Theano variables as inputs,
+        and returns a Theano variable. For instance, an Op instance with
+        a single output.
+    :param pt: the list of numpy.ndarrays to use as input values.
+        These arrays must be either float32 or float64 arrays.
+    :param n_tests: number of times to run the test
+    :param rng: random number generator used to sample u, we test gradient
+        of sum(u * fun) at pt
+    :param eps: stepsize used in the Finite Difference Method (Default
+        None is type-dependent)
+    :param abs_tol: absolute tolerance for the gradient comparison
+    :param rel_tol: relative tolerance for the gradient comparison
+    :param mode: compilation mode passed to `compile.function` (if not None)
+    :param cast_to_output_type: if True, the random projection and `g_cost`
+        are cast to the dtype of fun's output
+
+    :note: WARNING to unit-test writers: if `fun` is a function that builds
+        a graph, try to make it a SMALL graph.  Often verify grad is run
+        in debug mode, which can be very slow if it has to verify a lot of
+        intermediate computations.
+
+    :note: This op does not support multiple outputs. In tests/test_scan.py
+        there is an experimental verify_grad that covers that case as well
+        by using random projections.
+    """
+    from theano import compile, shared
+    import theano.tensor
+    from theano.tensor import as_tensor_variable, cast, TensorType
+    assert isinstance(pt, (list, tuple))
+    pt = [numpy.array(p) for p in pt]
+
+    for i, p in enumerate(pt):
+        if p.dtype not in ('float32', 'float64'):
+            raise TypeError(('verify_grad can work only with floating point '
+                'inputs, but input %i has dtype "%s".') % (i, p.dtype))
+
+    _type_tol = dict(  # default error tolerances for different dtypes
+            float32=1e-2,
+            float64=1e-4)
+
+    if abs_tol is None:
+        abs_tol = __builtin__.max(_type_tol[str(p.dtype)] for p in pt)
+    if rel_tol is None:
+        rel_tol = __builtin__.max(_type_tol[str(p.dtype)] for p in pt)
+
+    if rng is None:
+        raise TypeError(('rng should be a valid instance of '
+                        'numpy.random.RandomState. You may '
+                         'want to use theano.tests.unittest'
+                         '_tools.verify_grad instead of '
+                         'theano.gradient.verify_grad.'))
+
+    # We allow input downcast in function, because numeric_grad works in the
+    # most precise dtype used among the inputs, so we may need to cast some.
+    def function(inputs, output):
+        if mode is None:
+            f = compile.function(inputs, output, accept_inplace=True,
+                    allow_input_downcast=True)
+        else:
+            f = compile.function(inputs, output, accept_inplace=True,
+                    allow_input_downcast=True, mode=mode)
+        return f
+
+    tensor_pt = [TensorType(
+            as_tensor_variable(p).dtype,
+            as_tensor_variable(p).broadcastable)(name='input %i' % i)
+        for i, p in enumerate(pt)]
+
+    #fun can be either a function or an actual Op instance
+    o_output = fun(*tensor_pt)
+
+    if isinstance(o_output, list):
+        raise NotImplementedError(('cant (yet) autotest gradient of fun '
+                                   'with multiple outputs'))
+        # we could make loop over outputs making random projections R for each,
+        # but this doesn't handle the case where not all the outputs are
+        # differentiable... so I leave this as TODO for now -JB.
+
+    o_fn = function(tensor_pt, o_output)
+    o_fn_out = o_fn(*[p.copy() for p in pt])
+
+    if isinstance(o_fn_out, tuple) or isinstance(o_fn_out, list):
+        raise TypeError('It seems like you are trying to use verify_grad '
+                'on an op or a function which outputs a list: there should'
+                ' be a single (array-like) output instead')
+
+    # random_projection should not have elements too small,
+    # otherwise too much precision is lost in numerical gradient
+    def random_projection():
+        plain = rng.rand(*o_fn_out.shape) + 0.5
+        if cast_to_output_type:
+            return numpy.array(plain, o_output.dtype)
+        return plain
+
+    t_r = shared(random_projection())
+
+    # random projection of o onto t_r
+    # Note: this is the symbolic theano.tensor.sum, not the builtin sum.
+    cost = theano.tensor.sum(t_r * o_output)
+
+    cost_fn = function(tensor_pt, cost)
+
+    #todo-- determine if this is actually needed
+    g_cost = as_tensor_variable(1.0, name='g_cost')
+    if cast_to_output_type:
+        g_cost = cast(g_cost, o_output.dtype)
+
+    symbolic_grad = grad(cost, tensor_pt, g_cost,
+                         disconnected_inputs='ignore')
+
+    grad_fn = function(tensor_pt, symbolic_grad)
+
+    for test_num in xrange(n_tests):
+        num_grad = numeric_grad(cost_fn, [p.copy() for p in pt], eps)
+
+        analytic_grad = grad_fn(*[p.copy() for p in pt])
+
+        # Since `tensor_pt` is a list, `analytic_grad` should be one too.
+        assert isinstance(analytic_grad, list)
+
+        max_arg, max_err_pos, max_abs_err, max_rel_err =\
+                num_grad.max_err(analytic_grad, abs_tol, rel_tol)
+
+        if max_abs_err > abs_tol and max_rel_err > rel_tol:
+            raise verify_grad.E_grad(max_arg, max_err_pos,
+                    max_abs_err, max_rel_err, abs_tol, rel_tol)
+
+        #get new random projection for next test
+        if test_num < n_tests - 1:
+            t_r.set_value(random_projection(), borrow=True)
+
+
+class GradientError(Exception):
+    """This error is raised when a gradient is calculated, but incorrect."""
+    def __init__(self, arg, err_pos, abs_err, rel_err, abs_tol, rel_tol):
+        self.arg = arg  # index of the argument with the reported error
+        self.err_pos = err_pos  # position of the error within that argument
+        self.abs_err = abs_err  # absolute error measured at that position
+        self.rel_err = rel_err  # relative error measured at that position
+        self.abs_tol = abs_tol  # absolute tolerance used for the comparison
+        self.rel_tol = rel_tol  # relative tolerance used for the comparison
+
+    def __str__(self):
+        # args may have been inserted by e.g. makeTester
+        args_msg = ", ".join(str(a) for a in self.args)
+        return """\
+GradientError: numeric gradient and analytic gradient exceed tolerance:
+        At position %i of argument %i,
+            abs. error = %f,  abs. tolerance = %f
+            rel. error = %f,  rel. tolerance = %f
+Exception args: %s""" % (self.err_pos, self.arg,
+                         self.abs_err, self.abs_tol,
+                         self.rel_err, self.rel_tol,
+                         args_msg)
+
+verify_grad.E_grad = GradientError  # exception type raised by verify_grad
+
+
+def jacobian(expression, wrt, consider_constant=None, warn_type=False,
+             disconnected_inputs='raise'):
+    """Compute the Jacobian of `expression` with respect to `wrt`.
+
+    :type expression: Vector (1-dimensional) `Variable`
+    :type wrt: `Variable` or list of `Variable`s
+
+    :param consider_constant: a list of expressions not to backpropagate
+        through
+
+    :param warn_type: a value of True will cause warnings to be logged for any
+        Op that emits a gradient that does not match its input type.
+
+    :type disconnected_inputs: string
+    :param disconnected_inputs: Defines the behaviour if some of the variables
+        in ``wrt`` are not part of the graph computing ``expression``
+        (or if all links are non-differentiable). The possible values are:
+        - 'ignore': considers that the gradient on these parameters is zero.
+        - 'warn': consider the gradient zero, and print a warning.
+        - 'raise': raise an exception.
+
+    :return: either an instance of `Variable` or list/tuple of `Variable`s
+            (depending upon `wrt`) representing the jacobian of `expression`
+            with respect to (elements of) `wrt`. If an element of `wrt` is not
+            differentiable with respect to the output, then a zero
+            variable is returned. The return value is of same type
+            as `wrt`: a list/tuple or TensorVariable in all cases.
+    """
+    from theano.tensor import arange
+    # Check inputs have the right format
+    assert isinstance(expression, Variable), \
+            "tensor.jacobian expects a Variable as `expression`"
+    assert expression.ndim < 2, \
+            ("tensor.jacobian expects a 1 dimensional variable as "
+             "`expression`. If not use flatten to make it a vector")
+
+    using_list = isinstance(wrt, list)
+    using_tuple = isinstance(wrt, tuple)
+
+    if isinstance(wrt, (list, tuple)):
+        wrt = list(wrt)
+    else:
+        wrt = [wrt]
+
+    # Options forwarded to every call to grad, so that the scalar shortcut
+    # below honours the caller's settings exactly like the scan branch does.
+    grad_kwargs = dict(consider_constant=consider_constant,
+                       warn_type=warn_type,
+                       disconnected_inputs=disconnected_inputs)
+
+    if expression.ndim == 0:
+        # expression is just a scalar, use grad
+        return format_as(using_list, using_tuple,
+                         grad(expression, wrt, **grad_kwargs))
+
+    def inner_function(idx, expr, *inputs):
+        # One scan step: gradient of the idx-th element of `expr` w.r.t. each
+        # input, i.e. one row of each jacobian.
+        return [grad(expr[idx], inp, **grad_kwargs) for inp in inputs]
+    # Computing the gradients does not affect the random seeds on any random
+    # generator used in expression (because during computing gradients we are
+    # just backtracking over old values). (rp Jan 2012 - if anyone has a
+    # counter example please show me)
+    jacobs, updates = theano.scan(inner_function,
+                            sequences=arange(expression.shape[0]),
+                            non_sequences=[expression] + wrt)
+    assert not updates, \
+            ("Scan has returned a list of updates. This should not "
+             "happen! Report this to theano-users (also include the "
+             "script that generated the error)")
+    return format_as(using_list, using_tuple, jacobs)
+
+
+def hessian(cost, wrt, consider_constant=None, warn_type=False,
+             disconnected_inputs='raise'):
+    """Compute the Hessian of the scalar `cost` with respect to `wrt`.
+
+    :type cost: Scalar (0-dimensional) `Variable`
+    :type wrt: Vector (1-dimensional tensor) `Variable` or list of
+            vectors (1-dimensional tensors) `Variable`s
+
+    :param consider_constant: list of expressions not to backpropagate through
+
+    :param warn_type: a value of True will cause warnings to be logged for any
+        Op that emits a gradient that does not match its input type.
+
+    :type disconnected_inputs: string
+    :param disconnected_inputs: Defines the behaviour if some of the variables
+        in ``wrt`` are not part of the computational graph computing ``cost``
+        (or if all links are non-differentiable). The possible values are:
+        - 'ignore': considers that the gradient on these parameters is zero.
+        - 'warn': consider the gradient zero, and print a warning.
+        - 'raise': raise an exception.
+
+    :return: either an instance of `Variable` or list/tuple of `Variable`s
+            (depending upon `wrt`) representing the Hessian of the `cost`
+            with respect to (elements of) `wrt`. If an element of `wrt` is not
+            differentiable with respect to the output, then a zero
+            variable is returned. The return value is of same type
+            as `wrt`: a list/tuple or TensorVariable in all cases.
+    """
+    from theano.tensor import arange
+    # Check inputs have the right format
+    assert isinstance(cost, Variable), \
+            "tensor.hessian expects a Variable as `cost`"
+    assert cost.ndim == 0, \
+            "tensor.hessian expects a 0 dimensional variable as `cost`"
+
+    using_list = isinstance(wrt, list)
+    using_tuple = isinstance(wrt, tuple)
+
+    if isinstance(wrt, (list, tuple)):
+        wrt = list(wrt)
+    else:
+        wrt = [wrt]
+
+    # Options forwarded to both the first- and the second-order grad calls.
+    grad_kwargs = dict(consider_constant=consider_constant,
+                       warn_type=warn_type,
+                       disconnected_inputs=disconnected_inputs)
+    hessians = []
+    for inp in wrt:
+        assert isinstance(inp, Variable), \
+                "tensor.hessian expects a (list of) Variable as `wrt`"
+        assert inp.ndim == 1, \
+                "tensor.hessian expects a (list of) 1 dimensional variable "\
+                "as `wrt`"
+        expr = grad(cost, inp, **grad_kwargs)
+        hess, updates = theano.scan(
+                lambda i, y, x: grad(y[i], x, **grad_kwargs),
+                sequences=arange(expr.shape[0]),
+                non_sequences=[expr, inp])
+        assert not updates, \
+                ("Scan has returned a list of updates. This should not "
+                 "happen! Report this to theano-users (also include the "
+                 "script that generated the error)")
+        hessians.append(hess)
+    return format_as(using_list, using_tuple, hessians)
--- a/theano/sparse/basic.py
+++ b/theano/sparse/basic.py
@@ -137,9 +137,13 @@ def sp_ones_like(x):
    data, indices, indptr, shape = csm_properties(x) #TODO: don't restrict to CSM formats
    return CSM(format=x.format)(tensor.ones_like(data), indices, indptr, shape)

+
 def sp_zeros_like(x):
-    _, _, indptr, shape = csm_properties(x) #TODO: don't restrict to CSM formats
-    return CSM(format=x.format)(numpy.array([], dtype=x.type.dtype), numpy.array([]), tensor.zeros_like(indptr), shape)
+    #TODO: don't restrict to CSM formats
+    _, _, indptr, shape = csm_properties(x)
+    return CSM(format=x.format)(numpy.array([], dtype=x.type.dtype),
+                                numpy.array([]), tensor.zeros_like(indptr),
+                                shape)


 class _sparse_py_operators:
@@ -177,6 +181,9 @@ class _sparse_py_operators:
    # that stored zeros *do* count in the size.
    size = property(lambda self: csm_data(self).size)

+    def zeros_like(model, dtype=None):  # TODO: don't ignore dtype
+        return sp_zeros_like(model)  # sparse zeros with the shape of `model`
+

 class SparseVariable(gof.Variable, _sparse_py_operators):
    dtype = property(lambda self: self.type.dtype)
@@ -189,10 +196,6 @@ class SparseVariable(gof.Variable, _sparse_py_operators):
    def __repr__(self):
        return str(self)

-    def zeros_like(model, dtype=None):
-        # TODO: don't ignore dtype
-        return sp_zeros_like(model)
-
 class SparseConstantSignature(tuple):
    def __eq__(self, other):
        (a, b), (x,y) = self, other

--- a/theano/sparse/tests/test_basic.py
+++ b/theano/sparse/tests/test_basic.py
@@ -824,13 +824,16 @@ class test_zeros_like(unittest.TestCase):
    def test(self):
        x = theano.sparse.csr_matrix()
        f = theano.function([x], theano.sparse.sp_zeros_like(x))
-        vx = scipy.sparse.csr_matrix(numpy.asarray(numpy.random.binomial(1, 0.5, (100, 100)), dtype=theano.config.floatX))
+        vx = scipy.sparse.csr_matrix(numpy.asarray(
+                numpy.random.binomial(1, 0.5, (100, 100)),
+                dtype=theano.config.floatX))

        fx = f(vx)

        assert fx.nnz == 0
        assert fx.shape == vx.shape

+
 def test_shape_i():
    sparse_dtype = 'float32'


--- a/theano/tensor/__init__.py
+++ b/theano/tensor/__init__.py
@@ -30,7 +30,6 @@ import sharedvar  # adds shared-variable constructors
 # `theano.shared` and `tensor._shared`.
 from sharedvar import tensor_constructor as _shared

-
 def shared(*args, **kw):
    """
    Backward-compatibility wrapper around `tensor._shared`.
@@ -50,6 +49,5 @@ def shared(*args, **kw):

 import nnet  # used for softmax, sigmoid, etc.

-
-from tensor_grad import Rop, Lop, grad, numeric_grad, verify_grad, \
+from theano.gradient import Rop, Lop, grad, numeric_grad, verify_grad, \
    jacobian, hessian
--- a/theano/tensor/basic.py
+++ b/theano/tensor/basic.py
@@ -1450,16 +1450,12 @@ class _tensor_py_operators:

    def get_constant_value(self):
        return get_constant_value(self)
+    def zeros_like(model, dtype=None):  # method form of the module function
+        return zeros_like(model, dtype=dtype)  # dtype=None keeps model's dtype


 class TensorVariable(_tensor_py_operators, Variable):
    """Subclass to add the tensor operators to the basic `Variable` class."""
-    def zeros_like(model, dtype=None):
-        "Used for grad, Lop and Rop"
-        # Tested through the zeros_like method below
-        if dtype is None:
-            dtype = model.type.dtype
-        return fill(model, constant(0.0, dtype=dtype))

 TensorType.Variable = TensorVariable

@@ -2369,7 +2365,9 @@ def ones_like(model, dtype=None):
 @constructor
 def zeros_like(model, dtype=None):
    """equivalent of numpy.zeros_like"""
-    return TensorVariable.zeros_like(model, dtype=None)
+    if dtype is None:
+        dtype = model.type.dtype
+    return fill(model, constant(0.0, dtype=dtype))

 def zeros(shape, dtype=config.floatX):
    """

--- a/theano/tensor/tensor_grad.py
+++ b/theano/tensor/tensor_grad.py
-"""Driver for gradient calculations."""
-
-__authors__ = "James Bergstra, Razvan Pascanu"
-__copyright__ = "(c) 2011, Universite de Montreal"
-__license__ = "3-clause BSD License"
-__contact__ = "theano-dev <theano-dev@googlegroups.com>"
-
-__docformat__ = "restructuredtext en"
-
-import __builtin__
-import logging
-import warnings
-
-import numpy  # for numeric_grad
-
-import theano
-from theano.tensor import TensorType, TensorVariable, ones_like, \
-                zeros_like, as_tensor_variable, cast, arange
-from theano import gradient
-from theano import gof, shared
-from theano import compile
-
-
-_logger = logging.getLogger('theano.tensor.tensor_grad')
-
-
-def format_as(use_list, use_tuple, outputs):
-    """
-    Formats the outputs according to the flags `use_list` and `use_tuple`.
-    If `use_list` is True, `outputs` is returned as a list (if `outputs`
-    is not a list or a tuple then it is converted in a one element list).
-    If `use_tuple` is True, `outputs` is returned as a tuple (if `outputs`
-    is not a list or a tuple then it is converted into a one element tuple).
-    Otherwise (if both flags are false), `outputs` is returned.
-    """
-    assert not (use_list and use_tuple), \
-            "Both flags cannot be simultaneously True"
-    if (use_list or use_tuple) and not isinstance(outputs, (list, tuple)):
-        if use_list:
-            return [outputs]
-        else:
-            return (outputs,)
-    elif not (use_list or use_tuple) and isinstance(outputs, (list, tuple)):
-        assert len(outputs) == 1, \
-                "Wrong arguments. Expected a one element list"
-        return outputs[0]
-    elif use_list or use_tuple:
-        if use_list:
-            return list(outputs)
-        else:
-            return tuple(outputs)
-    else:
-        return outputs
-
-########################
-# R Operator
-########################
-
-
-def Rop(f, wrt, eval_points):
-    """
-    Computes the R operation on `f` wrt to `wrt` evaluated at points given
-    in `eval_points`. Mathematically this stands for the jacobian of `f` wrt
-    to `wrt` right muliplied by the eval points.
-
-    :type f: `Variable` or list of `Variable`s
-        `f` stands for the output of the computational graph to which you
-        want to apply the R operator
-    :type wrt: `Variable` or list of `Variables`s
-        variables for which you compute the R operator of the expression
-        described by `f`
-    :type eval_points: `Variable` or list of `Variable`s
-        evalutation points for each of the variables in `wrt`
-
-    :rtype: `Variable` or list/tuple of `Variable`s depending on type of f
-    :return: symbolic expression such that
-        R_op[i] = sum_j ( d f[i] / d wrt[j]) eval_point[j]
-        where the indices in that expression are magic multidimensional
-        indices that specify both the position within a list and all
-        coordinates of the tensor element in the last.
-        If `wrt` is a list/tuple, then return a list/tuple with the results.
-        """
-
-    using_list = isinstance(f, list)
-    using_tuple = isinstance(f, tuple)
-    if not isinstance(wrt, (list, tuple)):
-        wrt = [wrt]
-
-    if not isinstance(eval_points, (list, tuple)):
-        eval_points = [eval_points]
-
-    if not isinstance(f, (list, tuple)):
-        f = [f]
-
-    assert len(wrt) == len(eval_points)
-
-    # Check that each element of wrt corresponds to an element
-    # of eval_points with the same dimensionality.
-    for pack in enumerate(zip(wrt, eval_points)):
-        i = pack[0]
-        wrt_elem, eval_point = pack[1]
-
-        wrt_elem = as_tensor_variable(wrt_elem)
-        eval_point = as_tensor_variable(eval_point)
-
-        wrt_dim = len(wrt_elem.type.broadcastable)
-        eval_dim = len(eval_point.type.broadcastable)
-
-        if wrt_dim != eval_dim:
-            raise ValueError('Element ' +
-                             str(i) +
-                             ' of wrt/eval_point have mismatched ' +
-                             'dimensionality: ' +
-                             str(wrt_dim) +
-                             ' versus ' +
-                             str(eval_dim))
-
-    seen_nodes = {}
-
-    def _traverse(node):
-        """ TODO: writeme """
-        if node is None:
-            return None
-        else:
-            op = node.op
-            inputs = node.inputs
-
-            # Compute the evaluation points corresponding to each of the
-            # inputs of the node
-            local_eval_points = []
-            for inp in inputs:
-                if inp in wrt:
-                    local_eval_points.append(eval_points[wrt.index(inp)])
-                elif inp.owner is None:
-                    local_eval_points.append(zeros_like(inp))
-                elif inp.owner in seen_nodes:
-
-                    local_eval_points.append(
-                        seen_nodes[inp.owner][inp.owner.outputs.index(inp)])
-
-                else:
-                    # We actually need to compute the R_op for this node
-
-                    _traverse(inp.owner)
-                    local_eval_points.append(
-                        seen_nodes[inp.owner][inp.owner.outputs.index(inp)])
-            for x, y in zip(inputs, local_eval_points):
-                if y is not None:
-                    assert (as_tensor_variable(x).type ==
-                            as_tensor_variable(y).type)
-
-            seen_nodes[node] = op.R_op(node.inputs, local_eval_points)
-            return None
-
-    # Populate the dictionary
-    for out in f:
-        _traverse(out.owner)
-
-    rval = []
-    for out in f:
-        if out in wrt:
-            rval.append(eval_points[wrt.index(out)])
-        elif seen_nodes[out.owner][out.owner.outputs.index(out)] is None:
-            raise ValueError(('The function is not differentiable with '
-                              'respect to the provided inputs !'))
-        else:
-            rval.append(seen_nodes[out.owner][out.owner.outputs.index(out)])
-
-    return format_as(using_list, using_tuple, rval)
-
-
-def Lop(f, wrt, eval_points, consider_constant=None, warn_type=False,
-         disconnected_inputs='raise'):
-    """
-    Computes the L operation on `f` wrt to `wrt` evaluated at points given
-    in `eval_points`. Mathematically this stands for the jacobian of `f` wrt
-    to `wrt` left muliplied by the eval points.
-
-    :type f: `Variable` or list of `Variable`s
-        `f` stands for the output of the computational graph to which you
-        want to apply the L operator
-    :type wrt: `Variable` or list of `Variables`s
-        variables for which you compute the L operator of the expression
-        described by `f`
-    :type eval_points: `Variable` or list of `Variable`s
-        evalutation points for each of the variables in `f`
-
-    :rtype: `Variable` or list/tuple of `Variable`s depending on type of f
-    :return: symbolic expression such that
-        L_op[i] = sum_i ( d f[i] / d wrt[j]) eval_point[i]
-        where the indices in that expression are magic multidimensional
-        indices that specify both the position within a list and all
-        coordinates of the tensor element in the last
-        If `f` is a list/tuple, then return a list/tuple with the results.
-    """
-    if consider_constant is None:
-        consider_constant = []
-
-    if not isinstance(f, TensorVariable):
-        raise TypeError(('In tensor.Lop(), cost argument should be '
-                        'a TensorVariable.'), f)
-
-    if type(eval_points) not in (list, tuple):
-        eval_points = [eval_points]
-
-    using_list = isinstance(wrt, list)
-    using_tuple = isinstance(wrt, tuple)
-
-    if not isinstance(f, (list, tuple)):
-        f = [f]
-
-    inputs = gof.graph.inputs(f)
-    gmap = gradient.grad_sources_inputs(
-            zip(f, eval_points),
-            list(inputs) + list(consider_constant),
-            warn_type=warn_type)
-
-    # Note : If p is not in gmap there can be several reasons, among which
-    # is the fact that p might not be part of the computational graph. A
-    # simple example is that for a+b for e.g. a[0] is not part of the graph,
-    # so Theano does not know how to compute TT.grad(TT.sum(a+b), a[0])
-    # such subtle cases can be fixed by a more careful implementation of the
-    # gradient, but for now Theano needs to throw an exception, and make the
-    # user aware that it does not know how to compute that gradient
-    if not isinstance(wrt, (list, tuple)):
-        wrt = [wrt]
-    ret = []
-    for p in wrt:
-        if p in gmap:
-            ret.append(gmap[p])
-        else:
-            message = ("Lop method was asked to compute the gradient "
-                    "with respect to a variable that is not part of "
-                    "the computational graph of the cost, or is used "
-                    "only by a non-differentiable operator: %s" % p)
-            if disconnected_inputs == 'ignore':
-                pass
-            elif disconnected_inputs == 'warn':
-                warnings.warn(message, stacklevel=1)
-            elif disconnected_inputs == 'raise':
-                raise ValueError(message)
-            else:
-                raise ValueError("Invalid value for keyword "
-                        "'disconnected_inputs', valid values are "
-                        "'ignore', 'warn' and 'raise'.")
-            ret.append(zeros_like(p))
-
-    return format_as(using_list, using_tuple, ret)
-
-
-#########################
-# Gradient
-#########################
-
-def grad(cost, wrt, g_cost=None, consider_constant=None, warn_type=False,
-         disconnected_inputs='raise'):
-    """
-    :type cost: Scalar (0-dimensional) `Variable`
-    :type wrt: `Variable` or list of `Variable`s.
-    :type g_cost: Scalar `Variable`, or None
-    :param g_cost: an expression for the gradient through cost.  The default is
-        ``ones_like(cost)``.
-    :param consider_constant: a list of expressions not to backpropagate
-        through
-
-    :param warn_type: a value of True will cause warnings to be logged for any
-        Op that emits a gradient that does not match its input type.
-
-    :type disconnected_inputs: string
-    :param disconnected_inputs: Defines the behaviour if some of the variables
-        in ``wrt`` are not part of the computational graph computing ``cost``
-        (or if all links are non-differentiable). The possible values are:
-        - 'ignore': considers that the gradient on these parameters is zero.
-        - 'warn': consider the gradient zero, and print a warning.
-        - 'raise': raise an exception.
-
-    :rtype: `Variable` or list/tuple of `Variable`s (depending upon `wrt`)
-
-    :return: symbolic expression of gradient of `cost` with respect to `wrt`.
-             If an element of `wrt` is not differentiable with respect
-             to the output, then a zero variable is returned.
-             It returns an object of same type as `wrt`: a list/tuple
-             or TensorVariable in all cases.
-
-    This function is a wrapper around the more general function
-    `theano.gradient.grad_sources_inputs``.
-
-    """
-    if consider_constant is None:
-        consider_constant = []
-    else:
-        #error checking on consider_constant: verify that it is a collection
-        # of theano variables
-        # this is important, if someone accidentally passes a nested data
-        # structure with theano variables at the leaves, only the root will
-        # be properly considered constant
-        if not hasattr(consider_constant, '__iter__'):
-            raise TypeError('consider_constant must be an iterable collection,'
-                    ' got ' + str(type(consider_constant)))
-        for elem in consider_constant:
-            if not isinstance(elem, gof.Variable):
-                raise TypeError('Elements of consider_constant must be '
-                                'variables, but got ' + str(type(elem)))
-    if not isinstance(cost, TensorVariable):
-        raise TypeError(('In tensor.grad(), cost argument should be '
-                         'a TensorVariable.'), cost)
-
-    if cost.type.ndim:
-        raise TypeError(
-                'In tensor.grad, "cost" argument should be a scalar, but ndim'
-                ' is %i (should be 0). If you want to compute the gradient of'
-                ' the sum of cost, you should use cost.sum().'
-                % cost.type.ndim)
-
-    if g_cost is None:
-        g_cost = ones_like(cost)
-    inputs = gof.graph.inputs([cost])
-    gmap = gradient.grad_sources_inputs(
-            [(cost, g_cost)],
-            list(inputs) + list(consider_constant),
-            warn_type=warn_type)
-
-    # Note : If p is not in gmap there can be several reasons, among which
-    # is the fact that p might not be part of the computational graph. A
-    # simple example is that for a+b for e.g. a[0] is not part of the graph,
-    # so Theano does not know how to compute TT.grad(TT.sum(a+b), a[0])
-    # such subtle cases can be fixed by a more careful implementation of the
-    # gradient, but for now Theano needs to throw an exception, and make the
-    # user aware that it does not know how to compute that gradient
-    using_list = isinstance(wrt, list)
-    using_tuple = isinstance(wrt, tuple)
-
-    if not isinstance(wrt, (list, tuple)):
-        wrt = [wrt]
-    ret = []
-    for p in wrt:
-        if p in gmap:
-            ret.append(gmap[p])
-        else:
-            message = ("grad method was asked to compute the gradient "
-                    "with respect to a variable that is not part of "
-                    "the computational graph of the cost, or is used "
-                    "only by a non-differentiable operator: %s" % p)
-            if disconnected_inputs == 'ignore':
-                pass
-            elif disconnected_inputs == 'warn':
-                warnings.warn(message, stacklevel=1)
-            elif disconnected_inputs == 'raise':
-                raise ValueError(message)
-            else:
-                raise ValueError("Invalid value for keyword "
-                        "'disconnected_inputs', valid values are "
-                        "'ignore', 'warn' and 'raise'.")
-            ret.append(zeros_like(p))
-
-    return format_as(using_list, using_tuple, ret)
-
-
-class numeric_grad(object):
-    """
-    Compute the numeric derivative of a scalar-valued function at a particular
-    point.
-    """
-
-    # Note on step sizes and tolerances:
-    #
-    # There is a relationship between the step size and the function value and
-    # the measurement error that is incurred due to rounding.  The finite
-    # difference we measure is
-    # delta = f(x0) - f(x0+eps)
-    #
-    # For maximum precision, f should be close to zero.
-    # For every power of 2 that f departs from zero, we lose a bit of precision
-    # in delta.
-    #
-    # Even in this case of maximum accuracy, there is a tradeoff between
-    # stepsize and measurement error.
-    # Taking small steps allows us to measure large derivatives accuractly,
-    # but longer steps are required to measure small derivatives accurately.
-    # However longer steps introduce bias into our measurement in general
-    # for non-linear functions.
-    #
-    # It would be interesting to have a version of numeric grad that used an
-    # adaptive stepsize.
-    #
-    # For now, we use a heuristic that catches very bad gradients, but is not
-    # perfectly accurate.
-    type_eps = {'float64': 1e-7,
-            'float32': 3e-4,
-            numpy.dtype('float64'): 1e-7,
-            numpy.dtype('float32'): 3e-4}
-
-    def __init__(self, f, pt, eps=None):
-        """Return the gradient of f at pt.
-
-        :param f: a differentiable function such that f(*pt) is a scalar
-        :param pt: an ndarray, a list of ndarrays or tuple of ndarrays
-
-        This function computes the gradient by a one-sided finite
-        differences of a fixed step size (eps).
-
-        It is assumed that f(...) will return a scalar.
-        It is assumed that all f's inputs are numpy.ndarray objects.
-
-        :param eps: the stepsize for the finite differencing.  None means input
-        dtype-dependent. See `type_eps`.
-        """
-
-        def prod(inputs):
-            rval = 1
-            for i in inputs:
-                rval *= i
-            return rval
-
-        packed_pt = False
-        if not isinstance(pt, (list, tuple)):
-            pt = [pt]
-            packed_pt = True
-
-        apt = [numpy.array(p) for p in pt]
-
-        shapes = [p.shape for p in apt]
-        dtypes = [str(p.dtype) for p in apt]
-
-        # TODO: remove this eventually (why was this here in the first place ?)
-        # In the case of CSM, the arguments are a mixture of floats and
-        # integers...
-        # if not dtypes == [dtypes[0]] * len(apt):
-        #      raise TypeError('All function arguments must have same dtype')
-
-        total_size = __builtin__.sum(prod(sh) for sh in shapes)
-
-        working_dtype = __builtin__.min((self.type_eps[dt], dt)
-                                        for dt in dtypes)[1]
-
-        #create un-initialized memory
-        x = numpy.ndarray((total_size,), dtype=working_dtype)
-        gx = numpy.ndarray((total_size,), dtype=working_dtype)
-
-        if eps is None:
-            eps = __builtin__.max(self.type_eps[dt] for dt in dtypes)
-
-        #set up aliases so that apt[i] is backed by memory in x
-        # and self.gf is backed by memory in gx
-        cur_pos = 0
-        self.gf = []
-        for i, p in enumerate(apt):
-            p_size = prod(p.shape)
-            # set up alias
-            apt[i] = x[cur_pos: cur_pos + p_size].reshape(p.shape)
-            self.gf.append(gx[cur_pos: cur_pos + p_size].reshape(p.shape))
-            # initialize with p's value
-            apt[i][...] = p
-            cur_pos += p_size
-
-        f_x = f(*[p.copy() for p in apt])
-
-        # now iterate over the elements of x, and call f on apt.
-        x_copy = x.copy()
-        for i in xrange(total_size):
-            x[:] = x_copy
-
-            x[i] += eps
-            f_eps = f(*apt)
-
-            gx[i] = numpy.asarray((f_eps - f_x) / eps)
-
-        if packed_pt:
-            self.gf = self.gf[0]
-
-    @staticmethod
-    def abs_rel_err(a, b):
-        """Return absolute and relative error between a and b.
-
-        The relative error is a small number when a and b are close, relative
-        to how big they are.
-
-        Formulas used:
-            abs_err = abs(a - b)
-            rel_err = abs_err / max(abs(a) + abs(b), 1e-8)
-
-        The denominator is clipped at 1e-8 to avoid dividing by 0 when a and b
-        are both close to 0.
-
-        The tuple (abs_err, rel_err) is returned
-        """
-        abs_err = abs(a - b)
-        rel_err = abs_err / numpy.maximum(abs(a) + abs(b), 1e-8)
-        return (abs_err, rel_err)
-
-    def abs_rel_errors(self, g_pt):
-        """Return the abs and rel error of gradient estimate `g_pt`
-
-        `g_pt` must be a list of ndarrays of the same length as self.gf,
-        otherwise a ValueError is raised.
-
-        Corresponding ndarrays in `g_pt` and `self.gf` must have the same
-        shape or ValueError is raised.
-
-        """
-        if len(g_pt) != len(self.gf):
-            raise ValueError(
-                    'argument has wrong number of elements',
-                    len(g_pt))
-        errs = []
-        for i, (a, b) in enumerate(zip(g_pt, self.gf)):
-            if a.shape != b.shape:
-                raise ValueError(
-                        'argument element %i has wrong shape %s' % (
-                            i, str((a.shape, b.shape))))
-            errs.append(numeric_grad.abs_rel_err(a, b))
-        return errs
-
-    def max_err(self, g_pt, abs_tol, rel_tol):
-        """Find the biggest error between g_pt and self.gf.
-
-        What is measured is the violation of relative and absolute errors,
-        wrt the provided tolerances (abs_tol, rel_tol).
-        A value > 1 means both tolerances are exceeded.
-
-        Return the argmax of min(abs_err / abs_tol, rel_err / rel_tol) over
-        g_pt, as well as abs_err and rel_err at this point.
-        """
-        pos = []
-        errs = []
-        abs_errs = []
-        rel_errs = []
-
-        abs_rel_errs = self.abs_rel_errors(g_pt)
-        for abs_err, rel_err in abs_rel_errs:
-            if not numpy.all(numpy.isfinite(abs_err)):
-                raise ValueError('abs_err not finite', repr(abs_err))
-            if not numpy.all(numpy.isfinite(rel_err)):
-                raise ValueError('rel_err not finite', repr(rel_err))
-            scaled_err = numpy.minimum(abs_err / abs_tol, rel_err / rel_tol)
-            max_i = scaled_err.argmax()
-
-            pos.append(max_i)
-            errs.append(scaled_err.flatten()[max_i])
-            abs_errs.append(abs_err.flatten()[max_i])
-            rel_errs.append(rel_err.flatten()[max_i])
-
-        # max over the arrays in g_pt
-        max_arg = numpy.argmax(errs)
-        max_pos = pos[max_arg]
-        return (max_arg, pos[max_arg], abs_errs[max_arg], rel_errs[max_arg])
-
-
-def verify_grad(fun, pt, n_tests=2, rng=None, eps=None, abs_tol=None,
-                rel_tol=None, mode=None, cast_to_output_type=False):
-    """ Test a gradient by Finite Difference Method. Raise error on failure.
-
-    Example:
-    >>> verify_grad(theano.tensor.tanh,
-                    (numpy.asarray([[2,3,4], [-1, 3.3, 9.9]]),),
-                    rng=numpy.random)
-
-    Raises an Exception if the difference between the analytic gradient and
-    numerical gradient (computed through the Finite Difference Method) of a
-    random projection of the fun's output to a scalar exceeds the given
-    tolerance.
-
-    :param fun: a Python function that takes Theano variables as inputs,
-        and returns a Theano variable. For instance, an Op instance with
-        a single output.
-    :param pt: the list of numpy.ndarrays to use as input values.
-        These arrays must be either float32 or float64 arrays.
-    :param n_tests: number of times to run the test
-    :param rng: random number generator used to sample u, we test gradient of
-        sum(u * fun) at pt
-    :param eps: stepsize used in the Finite Difference Method (Default None is
-        type-dependent)
-    :param abs_tol: absolute tolerance used as threshold for gradient
-            comparison
-    :param rel_tol: relative tolerance used as threshold for gradient
-            comparison
-
-    :note: WARNING to unit-test writers: if `op` is a function that builds a
-        graph, try to make it a SMALL graph.  Often verify grad is run in
-        debug mode, which can be very slow if it has to verify a lot of
-        intermediate computations.
-
-    :note: This op does not support multiple outputs. In tests/test_scan.py
-        there is an experimental verify_grad that covers that case as well by
-        using random projections.
-    """
-    assert isinstance(pt, (list, tuple))
-    pt = [numpy.array(p) for p in pt]
-
-    for i, p in enumerate(pt):
-        if p.dtype not in ('float32', 'float64'):
-            raise TypeError(('verify_grad can work only with floating point '
-                'inputs, but input %i has dtype "%s".') % (i, p.dtype))
-
-    _type_tol = dict(  # relativ error tolerances for different types
-            float32=1e-2,
-            float64=1e-4)
-
-    if abs_tol is None:
-        abs_tol = __builtin__.max(_type_tol[str(p.dtype)] for p in pt)
-    if rel_tol is None:
-        rel_tol = __builtin__.max(_type_tol[str(p.dtype)] for p in pt)
-
-    if rng is None:
-        raise TypeError(('rng should be a valid instance of '
-                        'numpy.random.RandomState. You may '
-                         'want to use theano.tests.unittest'
-                         '_tools.verify_grad instead of '
-                         'theano.tensor.verify_grad.'))
-
-    # We allow input downcast in function, because numeric_grad works in the
-    # most precise dtype used among the inputs, so we may need to cast some.
-    def function(inputs, output):
-        if mode is None:
-            f = compile.function(inputs, output, accept_inplace=True,
-                    allow_input_downcast=True)
-        else:
-            f = compile.function(inputs, output, accept_inplace=True,
-                    allow_input_downcast=True, mode=mode)
-        return f
-
-    tensor_pt = [TensorType(
-            as_tensor_variable(p).dtype,
-            as_tensor_variable(p).broadcastable)(name='input %i' % i)
-        for i, p in enumerate(pt)]
-
-    #fun can be either a function or an actual Op instance
-    o_output = fun(*tensor_pt)
-
-    if isinstance(o_output, list):
-        raise NotImplementedError(('cant (yet) autotest gradient of fun '
-                                   'with multiple outputs'))
-        # we could make loop over outputs making random projections R for each,
-        # but this doesn't handle the case where not all the outputs are
-        # differentiable... so I leave this as TODO for now -JB.
-
-    o_fn = function(tensor_pt, o_output)
-    o_fn_out = o_fn(*[p.copy() for p in pt])
-
-    if isinstance(o_fn_out, tuple) or isinstance(o_fn_out, list):
-        raise TypeError('It seems like you are trying to use verify_grad '
-                'on an op or a function which outputs a list: there should'
-                ' be a single (array-like) output instead')
-
-    # random_projection should not have elements too small,
-    # otherwise too much precision is lost in numerical gradient
-    def random_projection():
-        plain = rng.rand(*o_fn_out.shape) + 0.5
-        if cast_to_output_type:
-            return numpy.array(plain, o_output.dtype)
-        return plain
-
-    t_r = shared(random_projection())
-
-    # random projection of o onto t_r
-    # This sum() is defined above, it's not the builtin sum.
-    cost = theano.tensor.sum(t_r * o_output)
-
-    cost_fn = function(tensor_pt, cost)
-
-    #todo-- determine if this is actually needed
-    g_cost = as_tensor_variable(1.0, name='g_cost')
-    if cast_to_output_type:
-        g_cost = cast(g_cost, o_output.dtype)
-
-    symbolic_grad = grad(cost, tensor_pt, g_cost,
-                         disconnected_inputs='ignore')
-
-    grad_fn = function(tensor_pt, symbolic_grad)
-
-    for test_num in xrange(n_tests):
-        num_grad = numeric_grad(cost_fn, [p.copy() for p in pt], eps)
-
-        analytic_grad = grad_fn(*[p.copy() for p in pt])
-
-        # Since `tensor_pt` is a list, `analytic_grad` should be one too.
-        assert isinstance(analytic_grad, list)
-
-        max_arg, max_err_pos, max_abs_err, max_rel_err =\
-                num_grad.max_err(analytic_grad, abs_tol, rel_tol)
-
-        if max_abs_err > abs_tol and max_rel_err > rel_tol:
-            raise verify_grad.E_grad(max_arg, max_err_pos,
-                    max_abs_err, max_rel_err, abs_tol, rel_tol)
-
-        #get new random projection for next test
-        if test_num < n_tests - 1:
-            t_r.set_value(random_projection(), borrow=True)
-
-
-class GradientError(Exception):
-    """This error is raised when a gradient is calculated, but incorrect."""
-    def __init__(self, arg, err_pos, abs_err, rel_err, abs_tol, rel_tol):
-        self.arg = arg
-        self.err_pos = err_pos
-        self.abs_err = abs_err
-        self.rel_err = rel_err
-        self.abs_tol = abs_tol
-        self.rel_tol = rel_tol
-
-    def __str__(self):
-        # args may have been inserted by e.g. makeTester
-        args_msg = ", ".join(str(a) for a in self.args)
-        return """\
-GradientError: numeric gradient and analytic gradient exceed tolerance:
-        At position %i of argument %i,
-            abs. error = %f,  abs. tolerance = %f
-            rel. error = %f,  rel. tolerance = %f
-Exception args: %s""" % (self.err_pos, self.arg,
-                         self.abs_err, self.abs_tol,
-                         self.rel_err, self.rel_tol,
-                         args_msg)
-
-verify_grad.E_grad = GradientError
-
-
-def jacobian(expression, wrt, consider_constant=None, warn_type=False,
-             disconnected_inputs='raise'):
-    """
-    :type expression: Vector (1-dimensional) `Variable`
-    :type wrt: 'Variable' or list of `Variables`s
-
-    :param consider_constant: a list of expressions not to backpropagate
-        through
-
-    :param warn_type: a value of True will cause warnings to be logged for any
-        Op that emits a gradient that does not match its input type.
-
-    :type disconnected_inputs: string
-    :param disconnected_inputs: Defines the behaviour if some of the variables
-        in ``wrt`` are not part of the computational graph computing ``cost``
-        (or if all links are non-differentiable). The possible values are:
-        - 'ignore': considers that the gradient on these parameters is zero.
-        - 'warn': consider the gradient zero, and print a warning.
-        - 'raise': raise an exception.
-
-    :return: either a instance of `Variable` or list/tuple of `Variable`s
-            (depending upon `wrt`) repesenting the jacobian of `expression`
-            with respect to (elements of) `wrt`. If an element of `wrt` is not
-            differentiable with respect to the output, then a zero
-            variable is returned. The return value is of same type
-            as `wrt`: a list/tuple or TensorVariable in all cases.
-    """
-    # Check inputs have the right format
-    assert isinstance(expression, TensorVariable), \
-            "tensor.jacobian expects a Tensor Variable as `expression`"
-    assert expression.ndim < 2, \
-            ("tensor.jacobian expects a 1 dimensional variable as "
-             "`expression`. If not use flatten to make it a vector")
-
-    using_list = isinstance(wrt, list)
-    using_tuple = isinstance(wrt, tuple)
-
-    if isinstance(wrt, (list, tuple)):
-        wrt = list(wrt)
-    else:
-        wrt = [wrt]
-
-    if expression.ndim == 0:
-        # expression is just a scalar, use grad
-        return format_as(using_list, using_tuple, grad(expression, wrt))
-
-    def inner_function(*args):
-        idx = args[0]
-        expr = args[1]
-        rvals = []
-        for inp in args[2:]:
-            rval = grad(expr[idx],
-                     inp,
-                     consider_constant=consider_constant,
-                     warn_type=warn_type,
-                     disconnected_inputs=disconnected_inputs)
-            rvals.append(rval)
-        return rvals
-    # Computing the gradients does not affect the random seeds on any random
-    # generator used n expression (because during computing gradients we are
-    # just backtracking over old values. (rp Jan 2012 - if anyone has a
-    # counter example please show me)
-    jacobs, updates = theano.scan(inner_function,
-                            sequences=arange(expression.shape[0]),
-                            non_sequences=[expression] + wrt)
-    assert not updates, \
-            ("Scan has returned a list of updates. This should not "
-             "happen! Report this to theano-users (also include the "
-             "script that generated the error)")
-    return format_as(using_list, using_tuple, jacobs)
-
-
-def hessian(cost, wrt, consider_constant=None, warn_type=False,
-             disconnected_inputs='raise'):
-    """
-    :type cost: Scalar (0-dimensional) `Variable`
-    :type wrt: Vector (1-dimensional tensor) 'Variable' or list of
-            vectors (1-dimensional tensors) `Variable`s
-
-    :param consider_constant: a list of expressions not to backpropagate
-        through
-
-    :param warn_type: a value of True will cause warnings to be logged for any
-        Op that emits a gradient that does not match its input type.
-
-    :type disconnected_inputs: string
-    :param disconnected_inputs: Defines the behaviour if some of the variables
-        in ``wrt`` are not part of the computational graph computing ``cost``
-        (or if all links are non-differentiable). The possible values are:
-        - 'ignore': considers that the gradient on these parameters is zero.
-        - 'warn': consider the gradient zero, and print a warning.
-        - 'raise': raise an exception.
-
-    :return: either a instance of `Variable` or list/tuple of `Variable`s
-            (depending upon `wrt`) repressenting the Hessian of the `cost`
-            with respect to (elements of) `wrt`. If an element of `wrt` is not
-            differentiable with respect to the output, then a zero
-            variable is returned. The return value is of same type
-            as `wrt`: a list/tuple or TensorVariable in all cases.
-    """
-    # Check inputs have the right format
-    assert isinstance(cost, TensorVariable), \
-            "tensor.hessian expects a Tensor Variable as `cost`"
-    assert cost.ndim == 0, \
-            "tensor.hessian expects a 0 dimensional variable as `cost`"
-
-    using_list = isinstance(wrt, list)
-    using_tuple = isinstance(wrt, tuple)
-
-    if isinstance(wrt, (list, tuple)):
-        wrt = list(wrt)
-    else:
-        wrt = [wrt]
-
-    hessians = []
-    for input in wrt:
-        assert isinstance(input, TensorVariable), \
-                "tensor.hessian expects a (list of) Tensor Variable as `wrt`"
-        assert input.ndim == 1, \
-                "tensor.hessian expects a (list of) 1 dimensional variable "\
-                "as `wrt`"
-        expr = grad(cost, input)
-        hess, updates = theano.scan(lambda i, y, x: grad(
-                            y[i],
-                            x,
-                            consider_constant=consider_constant,
-                            warn_type=warn_type,
-                            disconnected_inputs=disconnected_inputs),
-                       sequences=arange(expr.shape[0]),
-                       non_sequences=[expr, input])
-        assert not updates, \
-                ("Scan has returned a list of updates. This should not "
-                 "happen! Report this to theano-users (also include the "
-                 "script that generated the error)")
-        hessians.append(hess)
-    return format_as(using_list, using_tuple, hessians)
--- a/theano/tensor/tests/test_2nd_order_grads.py
+++ b/theano/tensor/tests/test_2nd_order_grads.py
--- a/theano/tensor/tests/test_rop.py
+++ b/theano/tensor/tests/test_rop.py