Commit 72d839e5 authored by nouiz

Merge pull request #144 from jaberg/fix_div_by_zero_in_tensor_grad

Fix div by zero in tensor grad
@@ -333,28 +333,34 @@ def grad(cost, wrt, g_cost=None, consider_constant=None, warn_type=False,
     return ret

-class numeric_grad:
-    """WRITEME"""
+class numeric_grad(object):
+    """
+    Compute the numeric derivative of a scalar-valued function at a particular
+    point.
+    """

     # Note on step sizes and tolerances:
     #
-    # There is a relationship between the step size and the function value and the measurement
-    # error that is incurred due to rounding. The finite difference we measure is
-    # delta = f(x0) - f(x0+eps)
+    # There is a relationship between the step size and the function value and
+    # the measurement error that is incurred due to rounding. The finite
+    # difference we measure is delta = f(x0) - f(x0+eps)
     #
     # For maximum precision, f should be close to zero.
-    # For every power of 2 that f departs from zero, we lose a bit of precision in delta.
+    # For every power of 2 that f departs from zero, we lose a bit of
+    # precision in delta.
     #
-    # Even in this case of maximum accuracy, there is a tradeoff between stepsize and
-    # measurement error.
-    # Taking small steps allows us to measure large derivatives accurately, but longer steps
-    # are required to measure small derivatives accurately. However longer steps introduce
-    # bias into our measurement in general for non-linear functions.
+    # Even in this case of maximum accuracy, there is a tradeoff between
+    # stepsize and measurement error. Taking small steps allows us to measure
+    # large derivatives accurately, but longer steps are required to measure
+    # small derivatives accurately. However longer steps introduce bias into
+    # our measurement in general for non-linear functions.
     #
-    # It would be interesting to have a version of numeric grad that used an adaptive stepsize.
+    # It would be interesting to have a version of numeric grad that used an
+    # adaptive stepsize.
     #
-    # For now, we use a heuristic that catches very bad gradients, but is not perfectly
-    # accurate.
+    # For now, we use a heuristic that catches very bad gradients, but is not
+    # perfectly accurate.
     type_eps = {'float64': 1e-7,
                 'float32': 3e-4,
                 numpy.dtype('float64'): 1e-7,
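
The step-size tradeoff described in the comment above is easy to see with plain numpy. The following is an illustrative sketch, not part of the commit; the test function and step sizes are made up:

    import numpy

    def one_sided_grad(f, x0, eps):
        # one-sided finite difference: delta = f(x0 + eps) - f(x0)
        return (f(x0 + eps) - f(x0)) / eps

    f = numpy.exp                      # true derivative at 1.0 is e = 2.71828...
    for eps in (1e-3, 1e-7, 1e-12):
        print(eps, one_sided_grad(f, 1.0, eps))
    # eps=1e-3 shows truncation bias, eps=1e-12 shows rounding noise;
    # eps=1e-7 (the float64 default in type_eps above) sits between the two.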
@@ -363,6 +369,9 @@ class numeric_grad:
     def __init__(self, f, pt, eps=None):
         """Return the gradient of f at pt.

+        :param f: a differentiable function such that f(*pt) is a scalar
+        :param pt: an ndarray, a list of ndarrays or tuple of ndarrays
+
         This function computes the gradient by a one-sided finite difference
         with a fixed step size (eps).
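
For orientation, here is a hypothetical call (self.gf, which holds the resulting estimate, appears further down in this diff; shapes and values are illustrative):

    import numpy

    # sketch only: assumes numeric_grad is importable from this module
    g = numeric_grad(lambda x: (x ** 2).sum(), numpy.ones(3))
    print(g.gf)   # numeric gradient estimate, roughly [2., 2., 2.]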
@@ -435,39 +444,46 @@ class numeric_grad:
             self.gf = self.gf[0]

     @staticmethod
-    def abs_rel_err(a,b):
+    def abs_rel_err(a, b):
         """Return absolute and relative error between a and b.

-        The relative error is a small number when a and b are close, relative to how big they are.
+        The relative error is a small number when a and b are close, relative
+        to how big they are.

         Formulas used:
             abs_err = abs(a - b)
-            rel_err = abs_err / (abs(a) + abs(b))
+            rel_err = abs_err / max(abs(a) + abs(b), 1e-8)
+
+        The denominator is clipped at 1e-8 to avoid dividing by 0 when a and b
+        are both close to 0.

         The tuple (abs_err, rel_err) is returned
         """
-        abs_err = abs(a-b)
-        rel_err = abs_err / (abs(a) + abs(b))
+        abs_err = abs(a - b)
+        rel_err = abs_err / numpy.maximum(abs(a) + abs(b), 1e-8)
         return (abs_err, rel_err)
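
This clipped denominator is the "div by zero" fix the merge request is named after. The failure mode it removes can be reproduced in plain numpy (illustrative values):

    import numpy

    a = numpy.zeros(3)
    b = numpy.zeros(3)
    old = abs(a - b) / (abs(a) + abs(b))                     # 0/0 -> [nan nan nan]
    new = abs(a - b) / numpy.maximum(abs(a) + abs(b), 1e-8)  # -> [0. 0. 0.]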
     def abs_rel_errors(self, g_pt):
         """Return the abs and rel error of gradient estimate `g_pt`

-        `g_pt` must be a list of ndarrays of the same length as self.gf, otherwise a ValueError
-        is raised.
+        `g_pt` must be a list of ndarrays of the same length as self.gf,
+        otherwise a ValueError is raised.

-        Corresponding ndarrays in `g_pt` and `self.gf` must have the same shape or ValueError
-        is raised.
+        Corresponding ndarrays in `g_pt` and `self.gf` must have the same
+        shape or ValueError is raised.
         """
         if len(g_pt) != len(self.gf):
-            raise ValueError('argument has wrong number of elements', len(g_pt))
+            raise ValueError(
+                    'argument has wrong number of elements',
+                    len(g_pt))
         errs = []
         for i, (a, b) in enumerate(zip(g_pt, self.gf)):
             if a.shape != b.shape:
-                raise ValueError('argument element %i has wrong shape %s' %(i,str((a.shape,
-                    b.shape))))
-            errs.append(numeric_grad.abs_rel_err(a,b))
+                raise ValueError(
+                        'argument element %i has wrong shape %s' % (
+                            i, str((a.shape, b.shape))))
+            errs.append(numeric_grad.abs_rel_err(a, b))
         return errs
     def max_err(self, g_pt, abs_tol, rel_tol):
@@ -477,8 +493,8 @@ class numeric_grad:
         wrt the provided tolerances (abs_tol, rel_tol).
         A value > 1 means both tolerances are exceeded.

-        Return the argmax of min(abs_err / abs_tol, rel_err / rel_tol) over g_pt,
-        as well as abs_err and rel_err at this point.
+        Return the argmax of min(abs_err / abs_tol, rel_err / rel_tol) over
+        g_pt, as well as abs_err and rel_err at this point.
         """
         pos = []
         errs = []
@@ -487,7 +503,11 @@ class numeric_grad:
         abs_rel_errs = self.abs_rel_errors(g_pt)

         for abs_err, rel_err in abs_rel_errs:
-            scaled_err = numpy.minimum(abs_err/abs_tol, rel_err/rel_tol)
+            if not numpy.all(numpy.isfinite(abs_err)):
+                raise ValueError('abs_err not finite', repr(abs_err))
+            if not numpy.all(numpy.isfinite(rel_err)):
+                raise ValueError('rel_err not finite', repr(rel_err))
+            scaled_err = numpy.minimum(abs_err / abs_tol, rel_err / rel_tol)
             max_i = scaled_err.argmax()
             pos.append(max_i)
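
For intuition on scaled_err: the minimum exceeds 1 only when abs_err and rel_err both exceed their tolerances, which is exactly the failure condition documented above. A sketch with made-up numbers:

    import numpy

    abs_err, rel_err = numpy.array([1e-3]), numpy.array([0.5])
    abs_tol, rel_tol = 1e-4, 1e-2
    scaled_err = numpy.minimum(abs_err / abs_tol, rel_err / rel_tol)
    print(scaled_err > 1)   # [ True]: both tolerances exceeded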
@@ -501,8 +521,8 @@ class numeric_grad:
         return (max_arg, pos[max_arg], abs_errs[max_arg], rel_errs[max_arg])
-def verify_grad(fun, pt, n_tests=2, rng=None, eps=None, abs_tol=None, rel_tol=None,
-                mode=None, cast_to_output_type=False):
+def verify_grad(fun, pt, n_tests=2, rng=None, eps=None, abs_tol=None,
+                rel_tol=None, mode=None, cast_to_output_type=False):
     """ Test a gradient by Finite Difference Method. Raise error on failure.

     Example:
@@ -511,9 +531,9 @@ def verify_grad(fun, pt, n_tests=2, rng=None, eps=None, abs_tol=None, rel_tol=None,
         rng=numpy.random)

     Raises an Exception if the difference between the analytic gradient and
-    numerical gradient (computed through the Finite Difference Method) of a random
-    projection of the fun's output to a scalar exceeds
-    the given tolerance.
+    numerical gradient (computed through the Finite Difference Method) of a
+    random projection of the fun's output to a scalar exceeds the given
+    tolerance.

     :param fun: a Python function that takes Theano variables as inputs,
         and returns a Theano variable. For instance, an Op instance with
@@ -521,19 +541,21 @@ def verify_grad(fun, pt, n_tests=2, rng=None, eps=None, abs_tol=None, rel_tol=None,
     :param pt: the list of numpy.ndarrays to use as input values.
         These arrays must be either float32 or float64 arrays.
     :param n_tests: number of times to run the test
-    :param rng: random number generator used to sample u, we test gradient of sum(u * fun) at pt
-    :param eps: stepsize used in the Finite Difference Method (Default None is type-dependent)
+    :param rng: random number generator used to sample u, we test gradient of
+        sum(u * fun) at pt
+    :param eps: stepsize used in the Finite Difference Method (Default None is
+        type-dependent)
     :param abs_tol: absolute tolerance used as threshold for gradient comparison
     :param rel_tol: relative tolerance used as threshold for gradient comparison

-    :note: WARNING to unit-test writers: if `op` is a function that builds a graph,
-        try to make it a SMALL graph. Often verify grad is run in
-        debug mode, which can be very slow if it has to verify a lot
-        of intermediate computations.
+    :note: WARNING to unit-test writers: if `op` is a function that builds a
+        graph, try to make it a SMALL graph. Often verify grad is run in
+        debug mode, which can be very slow if it has to verify a lot of
+        intermediate computations.

-    :note: This op does not support multiple outputs. In tests/test_scan.py there is
-        an experimental verify_grad that covers that case as well by using random
-        projections.
+    :note: This op does not support multiple outputs. In tests/test_scan.py
+        there is an experimental verify_grad that covers that case as well by
+        using random projections.
     """
     assert isinstance(pt, (list, tuple))
     pt = [numpy.array(p) for p in pt]
@@ -553,8 +575,10 @@ def verify_grad(fun, pt, n_tests=2, rng=None, eps=None, abs_tol=None, rel_tol=None,
     rel_tol = __builtin__.max(_type_tol[str(p.dtype)] for p in pt)

     if rng is None:
-        raise TypeError('rng should be a valid instance of numpy.random.RandomState.',
-                'You may want to use theano.tests.unittest_tools.verify_grad instead of theano.tensor.verify_grad.')
+        raise TypeError('rng should be an instance of numpy.random.RandomState', (
+                ' hint: Maybe you meant to call'
+                ' theano.tests.unittest_tools.verify_grad instead of'
+                ' theano.tensor.verify_grad.'))
     # We allow input downcast in function, because numeric_grad works in the
     # most precise dtype used among the inputs, so we may need to cast some.
@@ -567,15 +591,18 @@ def verify_grad(fun, pt, n_tests=2, rng=None, eps=None, abs_tol=None, rel_tol=None,
                 allow_input_downcast=True, mode=mode)
         return f

-    tensor_pt = [TensorType(as_tensor_variable(p).dtype, as_tensor_variable(p).broadcastable)(name='input %i'%i) for i,p in enumerate(pt)]
+    tensor_pt = [TensorType(
+            as_tensor_variable(p).dtype,
+            as_tensor_variable(p).broadcastable)(name='input %i' % i)
+        for i, p in enumerate(pt)]

     # fun can be either a function or an actual Op instance
     o_output = fun(*tensor_pt)
-    if isinstance(o_output,list):
-        raise NotImplementedError('cant (yet) autotest gradient of fun with multiple outputs')
-        # we could make a loop over outputs making random projections R for each,
-        # but this doesn't handle the case where not all the outputs are
-        # differentiable... so I leave this as TODO for now -JB.
+    if isinstance(o_output, list):
+        raise NotImplementedError('verify gradient on multiple outputs')
+        # we could make a loop over outputs making random projections R for
+        # each, but this doesn't handle the case where not all the outputs are
+        # differentiable... so I leave this as TODO for now -JB.

     o_fn = function(tensor_pt, o_output)
@@ -597,18 +624,16 @@ def verify_grad(fun, pt, n_tests=2, rng=None, eps=None, abs_tol=None, rel_tol=None,
         t_r = shared(random_projection())

         # random projection of o onto t_r
-        cost = theano.tensor.sum(t_r * o_output) #This sum() is defined above, it's not the builtin sum.
+        cost = theano.tensor.sum(t_r * o_output)
         cost_fn = function(tensor_pt, cost)

         #todo-- determine if this is actually needed
-        g_cost = as_tensor_variable(1.0,name='g_cost')
+        g_cost = as_tensor_variable(1.0, name='g_cost')
         if cast_to_output_type:
             g_cost = cast(g_cost, o_output.dtype)

         symbolic_grad = grad(cost, tensor_pt, g_cost,
                              disconnected_inputs='ignore')
-        #if o_output.dtype in ['float32','float64']:
-        #    assert all([x.dtype == o_output.dtype for x in symbolic_grad]),("Expected grad of type %s, got %s "%( symbolic_grad.dtype, o_output.dtyp))

         grad_fn = function(tensor_pt, symbolic_grad)
...
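
Why the random projection: grad needs a scalar cost, and d/dx sum(u * f(x)) = u . J_f(x), so a random u exercises the whole Jacobian without building it column by column. A plain-numpy analogue of the check being wired up above (all values illustrative):

    import numpy

    rng = numpy.random.RandomState(0)
    x = rng.rand(3)
    u = rng.rand(3)                   # plays the role of t_r above
    f = lambda x: x ** 2              # toy elementwise fun
    cost = (u * f(x)).sum()           # scalar cost, as in the diff
    analytic = u * 2 * x              # d cost / dx, to compare with numeric_grad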