Merge branch 'master' of github.com:Theano/Theano

94a52915 · James Bergstra · 73103598 · 03e6e398 · 94a52915 · 94a52915
--- a/theano/compile/debugmode.py
+++ b/theano/compile/debugmode.py
@@ -294,6 +294,9 @@ class BadOptimization(DebugModeError):
        print >> sio, self.old_graph
        print >> sio, "  New Graph:"
        print >> sio, self.new_graph
+        print >> sio, ""
+        print >> sio, "Hint: relax the tolerance by setting tensor.cmp_sloppy=1"
+        print >> sio, "  or even tensor.cmp_sloppy=2 for less-strict comparison"
        return sio.getvalue()
 class BadDestroyMap(DebugModeError):

--- a/theano/sandbox/linalg/ops.py
+++ b/theano/sandbox/linalg/ops.py
+import logging
+logger = logging.getLogger(__name__)
 import numpy
 from theano.gof import Op, Apply
@@ -57,6 +60,7 @@ def hints(variable):
 @local_optimizer([])
 def remove_hint_nodes(node):
    if is_hint_node(node):
+        # transfer hints from graph to Feature
        try:
            for k,v in node.op.hints:
                node.env.hints_feature.add_hint(node.inputs[0], k, v)
@@ -95,7 +99,7 @@ class HintsFeature(object):
    """
    def add_hint(self, r, k, v):
-        print 'adding hint', r, k, v
+        logger.debug('adding hint; %s, %s, %s' % (r, k, v))
        self.hints[r][k] = v
    def ensure_init_r(self, r):
@@ -171,9 +175,8 @@ def is_positive(v):
        return True
    #TODO: how to handle this - a registry?
    #      infer_hints on Ops?
-    print 'is_positive', v
+    logger.debug('is_positive: %s' % str(v))
    if v.owner and v.owner.op == tensor.pow:
-        print 'try for pow', v, v.owner.inputs
        try:
            exponent = tensor.get_constant_value(v.owner.inputs[1])
        except TypeError:
@@ -250,7 +253,6 @@ def local_log_prod_sqr(node):
            # we cannot always make this substitution because
            # the prod might include negative terms
            p = x.owner.inputs[0]
-            print "AAA", p
            # p is the matrix we're reducing with prod
            if is_positive(p):
@@ -316,7 +318,7 @@ class Cholesky(Op):
            destr = 'destructive'
        else:
            destr = 'non-destructive'
-        return 'Cholesky{%s,%s}'% (lu,destr)
+        return 'Cholesky{%s,%s}' % (lu, destr)
    def make_node(self, x):
        x = as_tensor_variable(x)
        return Apply(self, [x], [x.type()])
@@ -378,7 +380,10 @@ class Solve(Op):
    def make_node(self, A, b):
        A = as_tensor_variable(A)
        b = as_tensor_variable(b)
-        return Apply(self, [A,b], [b.type()])
+        otype = tensor.tensor(
+                broadcastable=b.broadcastable,
+                dtype = (A*b).dtype)
+        return Apply(self, [A,b], [otype])
    def perform(self, node, inputs, output_storage):
        A, b = inputs
        #TODO: use the A_structure to go faster
@@ -394,40 +399,46 @@ class ExtractDiag(Op):
        self.view = view
        if self.view:
            self.view_map = {0:[0]}
-            self.perform = self.perform_view
-        else:
-            self.perform = self.perform_noview
    def __eq__(self, other):
        return type(self) == type(other) and self.view == other.view
    def __hash__(self):
        return hash(type(self))^hash(self.view)
    def make_node(self, _x):
        x = as_tensor_variable(_x)
        if x.type.ndim != 2:
            raise TypeError('ExtractDiag only works on matrices', _x)
        return Apply(self, [x], [tensor.vector(dtype=x.type.dtype)])
-    def perform_noview(self, node, (x,), (z,)):
+    def perform(self, node, ins, outs):
+        x, = ins
+        z, = outs
        #for some reason numpy.diag(x) is really slow
        N,M = x.shape
        assert N==M
        rval = x[0]
        rval.strides = (x.strides[0]+x.strides[1],)
-        z[0] = rval.copy()
+        if self.view:
-    def perform_view(self, node, (x,), (z,)):
-        N,M = x.shape
-        a,b = x.strides
-        assert N==M
-        rval = x[0]
-        rval.strides = a+b,
            z[0] = rval
+        else:
+            z[0] = rval.copy()
    def __str__(self):
        return 'ExtractDiag{view=%s}'%self.view
    def grad(self, inputs, g_outputs):
        return [alloc_diag(g_outputs[0])]
-extract_diag = ExtractDiag()
+    def infer_shape(self, node, shapes):
+        x_s, = shapes
+        return [(x_s[0],)]
+extract_diag = ExtractDiag()
 #TODO: optimization to insert ExtractDiag with view=True
 class AllocDiag(Op):
    def __eq__(self, other):
        return type(self) == type(other)
@@ -449,7 +460,10 @@ alloc_diag = AllocDiag()
 def diag(x):
    """Numpy-compatibility method
-    For vector `x`, return a zero matrix except for `x` as diagonal.
+    If `x` is a matrix, return its diagonal.
+    If `x` is a vector return a matrix with it as its diagonal.
+    * This method does not support the `k` argument that numpy supports.
    """
    xx = as_tensor_variable(x)
    if xx.type.ndim == 1:
@@ -461,6 +475,7 @@ def diag(x):
 class Det(Op):
    """matrix determinant
    TODO: move this op to another file that request scipy.
    """
    def make_node(self, x):
@@ -488,6 +503,7 @@ def trace(X):
    """
    return extract_diag(X).sum()
 def spectral_radius_bound(X, log2_exponent):
    """
    Returns upper bound on the largest eigenvalue of square symmetrix matrix X.

--- a/theano/sandbox/linalg/tests/test_linalg.py
+++ b/theano/sandbox/linalg/tests/test_linalg.py
+from pkg_resources import parse_version as V
 import numpy
 import theano
 from theano import tensor, function
 from theano.tensor.basic import _allclose
+from theano.tests import unittest_tools as utt
+from theano import config
+utt.seed_rng()
 try:
    import scipy
-    if scipy.__version__ < '0.7':
+    if V(scipy.__version__) < V('0.7'):
        raise ImportError()
    use_scipy = True
 except ImportError:
@@ -17,11 +23,12 @@ from theano.sandbox.linalg.ops import (cholesky,
                                       matrix_inverse,
                                       #solve,
                                       #diag,
-                                       #extract_diag,
+                                       ExtractDiag,
+                                       extract_diag,
                                       #alloc_diag,
                                       det,
                                       #PSD_hint,
-                                       #trace,
+                                       trace,
                                       #spectral_radius_bound
                                       )
@@ -88,3 +95,61 @@ def test_det_grad():
    r = rng.randn(5,5)
    tensor.verify_grad(det, [r], rng=numpy.random)
+def test_extract_diag():
+    rng = numpy.random.RandomState(utt.fetch_seed())
+    x = theano.tensor.matrix()
+    g = extract_diag(x)
+    f = theano.function([x], g)
+    m = rng.rand(3,3).astype(config.floatX)
+    v = numpy.diag(m)
+    r = f(m)
+    # The right diagonal is extracted
+    assert (r == v).all()
+    m = rng.rand(2, 3).astype(config.floatX)
+    ok = False
+    try:
+        r = f(m)
+    except Exception:
+        ok = True
+    assert ok
+    xx = theano.tensor.vector()
+    ok = False
+    try:
+        extract_diag(xx)
+    except TypeError:
+        ok = True
+    assert ok
+    f = theano.function([x], g.shape)
+    topo = f.maker.env.toposort()
+    assert sum([node.op.__class__ == ExtractDiag for node in topo]) == 0
+    m = rng.rand(3,3).astype(config.floatX)
+    assert f(m) == 3
+# not testing the view=True case since it is not used anywhere.
+def test_trace():
+    rng = numpy.random.RandomState(utt.fetch_seed())
+    x = theano.tensor.matrix()
+    g = trace(x)
+    f = theano.function([x], g)
+    m = rng.rand(4, 4).astype(config.floatX)
+    v = numpy.trace(m)
+    assert v == f(m)
+    xx = theano.tensor.vector()
+    ok = False
+    try:
+        trace(xx)
+    except TypeError:
+        ok = True
+    assert ok
--- a/theano/sparse/__init__.py
+++ b/theano/sparse/__init__.py
+from pkg_resources import parse_version as V
 import sys
 try:
    import scipy
-    enable_sparse = scipy.__version__ >= '0.7'
+    enable_sparse = V(scipy.__version__) >= V('0.7')
    if not enable_sparse:
        sys.stderr.write("WARNING: scipy version = %s."
                " We request version >=0.7.0 for the sparse code as it has"

--- a/theano/tensor/tensor_grad.py
+++ b/theano/tensor/tensor_grad.py
@@ -333,28 +333,34 @@ def grad(cost, wrt, g_cost=None, consider_constant=None, warn_type=False,
            return ret
-class numeric_grad:
+class numeric_grad(object):
-    """WRITEME"""
+    """
+    Compute the numeric derivative of a scalar-valued function at a particular
+    point.
+    """
    # Note on step sizes and tolerances:
    #
-    # There is a relationship between the step size and the function value and the measurement
+    # There is a relationship between the step size and the function value and
-    # error that is incurred due to rounding.  The finite difference we measure is
+    # the measurement error that is incurred due to rounding.  The finite
-    # delta = f(x0) - f(x0+eps)
+    # difference we measure is delta = f(x0) - f(x0+eps)
    #
    # For maximum precision, f should be close to zero.
-    # For every power of 2 that f departs from zero, we lose a bit of precision in delta.
+    # For every power of 2 that f departs from zero, we lose a bit of
+    # precision in delta.
    #
-    # Even in this case of maximum accuracy, there is a tradeoff between stepsize and
+    # Even in this case of maximum accuracy, there is a tradeoff between
-    # measurement error.
+    # stepsize and measurement error.  Taking small steps allows us to measure
-    # Taking small steps allows us to measure large derivatives accuractly, but longer steps
+    # large derivatives accuractly, but longer steps are required to measure
-    # are required to measure small derivatives accurately.  However longer steps introduce
+    # small derivatives accurately.  However longer steps introduce bias into
-    # bias into our measurement in general for non-linear functions.
+    # our measurement in general for non-linear functions.
    #
-    # It would be interesting to have a version of numeric grad that used an adaptive stepsize.
+    # It would be interesting to have a version of numeric grad that used an
+    # adaptive stepsize.
    #
-    # For now, we use a heuristic that catches very bad gradients, but is not perfectly
+    # For now, we use a heuristic that catches very bad gradients, but is not
-    # accurate.
+    # perfectly accurate.
    type_eps = {'float64': 1e-7,
            'float32': 3e-4,
            numpy.dtype('float64'):1e-7,
@@ -363,6 +369,9 @@ class numeric_grad:
    def __init__(self, f, pt, eps=None):
        """Return the gradient of f at pt.
+        :param f: a differentiable function such that f(*pt) is a scalar
+        :param pt: an ndarray, a list of ndarrays or tuple of ndarrays
        This function computes the gradient by a one-sided finite differences of a
        fixed step size (eps).
@@ -435,39 +444,46 @@ class numeric_grad:
            self.gf = self.gf[0]
    @staticmethod
-    def abs_rel_err(a,b):
+    def abs_rel_err(a, b):
        """Return absolute and relative error between a and b.
-        The relative error is a small number when a and b are close, relative to how big they are.
+        The relative error is a small number when a and b are close, relative
+        to how big they are.
        Formulas used:
            abs_err = abs(a - b)
-            rel_err = abs_err / (abs(a) + abs(b))
+            rel_err = abs_err / max(abs(a) + abs(b), 1e-8)
+        The denominator is clipped at 1e-8 to avoid dividing by 0 when a and b
+        are both close to 0.
        The tuple (abs_err, rel_err) is returned
        """
-        abs_err = abs(a-b)
+        abs_err = abs(a - b)
-        rel_err = abs_err / (abs(a) + abs(b))
+        rel_err = abs_err / numpy.maximum(abs(a) + abs(b), 1e-8)
        return (abs_err, rel_err)
    def abs_rel_errors(self, g_pt):
        """Return the abs and rel error of gradient estimate `g_pt`
-        `g_pt` must be a list of ndarrays of the same length as self.gf, otherwise a ValueError
+        `g_pt` must be a list of ndarrays of the same length as self.gf,
-        is raised.
+        otherwise a ValueError is raised.
-        Corresponding ndarrays in `g_pt` and `self.gf` must have the same shape or ValueError
+        Corresponding ndarrays in `g_pt` and `self.gf` must have the same
-        is raised.
+        shape or ValueError is raised.
        """
        if len(g_pt) != len(self.gf):
-            raise ValueError('argument has wrong number of elements', len(g_pt))
+            raise ValueError(
+                    'argument has wrong number of elements',
+                    len(g_pt))
        errs = []
        for i, (a, b) in enumerate(zip(g_pt, self.gf)):
            if a.shape != b.shape:
-                raise ValueError('argument element %i has wrong shape %s' %(i,str((a.shape,
+                raise ValueError(
-                    b.shape))))
+                        'argument element %i has wrong shape %s' % (
-            errs.append(numeric_grad.abs_rel_err(a,b))
+                            i, str((a.shape, b.shape))))
+            errs.append(numeric_grad.abs_rel_err(a, b))
        return errs
    def max_err(self, g_pt, abs_tol, rel_tol):
@@ -477,8 +493,8 @@ class numeric_grad:
        wrt the provided tolerances (abs_tol, rel_tol).
        A value > 1 means both tolerances are exceeded.
-        Return the argmax of min(abs_err / abs_tol, rel_err / rel_tol) over g_pt,
+        Return the argmax of min(abs_err / abs_tol, rel_err / rel_tol) over
-        as well as abs_err and rel_err at this point.
+        g_pt, as well as abs_err and rel_err at this point.
        """
        pos = []
        errs = []
@@ -487,7 +503,11 @@ class numeric_grad:
        abs_rel_errs = self.abs_rel_errors(g_pt)
        for abs_err, rel_err in abs_rel_errs:
-            scaled_err = numpy.minimum(abs_err/abs_tol, rel_err/rel_tol)
+            if not numpy.all(numpy.isfinite(abs_err)):
+                raise ValueError('abs_err not finite', repr(abs_err))
+            if not numpy.all(numpy.isfinite(rel_err)):
+                raise ValueError('rel_err not finite', repr(rel_err))
+            scaled_err = numpy.minimum(abs_err / abs_tol, rel_err / rel_tol)
            max_i = scaled_err.argmax()
            pos.append(max_i)
@@ -501,8 +521,8 @@ class numeric_grad:
        return (max_arg, pos[max_arg], abs_errs[max_arg], rel_errs[max_arg])
-def verify_grad(fun, pt, n_tests=2, rng=None, eps=None, abs_tol=None, rel_tol=None,
+def verify_grad(fun, pt, n_tests=2, rng=None, eps=None, abs_tol=None,
-        mode=None, cast_to_output_type=False):
+        rel_tol=None, mode=None, cast_to_output_type=False):
    """ Test a gradient by Finite Difference Method. Raise error on failure.
    Example:
@@ -511,9 +531,9 @@ def verify_grad(fun, pt, n_tests=2, rng=None, eps=None, abs_tol=None, rel_tol=No
                    rng=numpy.random)
    Raises an Exception if the difference between the analytic gradient and
-    numerical gradient (computed through the Finite Difference Method) of a random
+    numerical gradient (computed through the Finite Difference Method) of a
-    projection of the fun's output to a scalar exceeds
+    random projection of the fun's output to a scalar exceeds the given
-    the given tolerance.
+    tolerance.
    :param fun: a Python function that takes Theano variables as inputs,
        and returns a Theano variable. For instance, an Op instance with
@@ -521,19 +541,21 @@ def verify_grad(fun, pt, n_tests=2, rng=None, eps=None, abs_tol=None, rel_tol=No
    :param pt: the list of numpy.ndarrays to use as input values.
        These arrays must be either float32 or float64 arrays.
    :param n_tests: number of times to run the test
-    :param rng: random number generator used to sample u, we test gradient of sum(u * fun) at pt
+    :param rng: random number generator used to sample u, we test gradient of
-    :param eps: stepsize used in the Finite Difference Method (Default None is type-dependent)
+        sum(u * fun) at pt
+    :param eps: stepsize used in the Finite Difference Method (Default None is
+        type-dependent)
    :param abs_tol: absolute tolerance used as threshold for gradient comparison
    :param rel_tol: relative tolerance used as threshold for gradient comparison
-    :note: WARNING to unit-test writers: if `op` is a function that builds a graph,
+    :note: WARNING to unit-test writers: if `op` is a function that builds a
-           try to make it a SMALL graph.  Often verify grad is run in
+        graph, try to make it a SMALL graph.  Often verify grad is run in
-           debug mode, which can be very slow if it has to verify a lot
+        debug mode, which can be very slow if it has to verify a lot of
-           of intermediate computations.
+        intermediate computations.
-    :note: This op does not support multiple outputs. In tests/test_scan.py there is
+    :note: This op does not support multiple outputs. In tests/test_scan.py
-           an experimental verify_grad that covers that case as well by using random
+        there is an experimental verify_grad that covers that case as well by
-           projections.
+        using random projections.
    """
    assert isinstance(pt, (list,tuple))
    pt = [numpy.array(p) for p in pt]
@@ -553,8 +575,10 @@ def verify_grad(fun, pt, n_tests=2, rng=None, eps=None, abs_tol=None, rel_tol=No
        rel_tol = __builtin__.max(_type_tol[str(p.dtype)] for p in pt)
    if rng is None:
-        raise TypeError('rng should be a valid instance of numpy.random.RandomState.',
+        raise TypeError('rng be instance of numpy.random.RandomState', (
-                'You may want to use theano.tests.unittest_tools.verify_grad instead of theano.tensor.verify_grad.')
+                '  hint: Maybe you meant to call'
+                '        theano.tests.unittest_tools.verify_grad instead of'
+                '        theano.tensor.verify_grad.'))
    # We allow input downcast in function, because numeric_grad works in the
    # most precise dtype used among the inputs, so we may need to cast some.
@@ -567,15 +591,18 @@ def verify_grad(fun, pt, n_tests=2, rng=None, eps=None, abs_tol=None, rel_tol=No
                    allow_input_downcast=True, mode=mode)
        return f
-    tensor_pt = [TensorType(as_tensor_variable(p).dtype, as_tensor_variable(p).broadcastable)(name='input %i'%i) for i,p in enumerate(pt)]
+    tensor_pt = [TensorType(
+            as_tensor_variable(p).dtype,
+            as_tensor_variable(p).broadcastable)(name='input %i'%i)
+        for i, p in enumerate(pt)]
    #fun can be either a function or an actual Op instance
    o_output = fun(*tensor_pt)
-    if isinstance(o_output,list):
+    if isinstance(o_output, list):
-        raise NotImplementedError('cant (yet) autotest gradient of fun with multiple outputs')
+        raise NotImplementedError('verify gradient on multiple outputs')
-        # we could make loop over outputs making random projections R for each,
+        # we could make loop over outputs making random projections R for
-        # but this doesn't handle the case where not all the outputs are
+        # each, but this doesn't handle the case where not all the outputs are
        # differentiable... so I leave this as TODO for now -JB.
    o_fn = function(tensor_pt, o_output)
@@ -597,18 +624,16 @@ def verify_grad(fun, pt, n_tests=2, rng=None, eps=None, abs_tol=None, rel_tol=No
    t_r = shared(random_projection())
    #random projection of o onto t_r
-    cost = theano.tensor.sum(t_r * o_output)  #This sum() is defined above, it's not the builtin sum.
+    cost = theano.tensor.sum(t_r * o_output)
    cost_fn = function(tensor_pt, cost)
    #todo-- determine if this is actually needed
-    g_cost = as_tensor_variable(1.0,name='g_cost')
+    g_cost = as_tensor_variable(1.0, name='g_cost')
    if cast_to_output_type:
        g_cost = cast(g_cost, o_output.dtype)
    symbolic_grad = grad(cost, tensor_pt, g_cost,
                         disconnected_inputs='ignore')
-    #if o_output.dtype in ['float32','float64']:
-    #    assert all([x.dtype == o_output.dtype for x in symbolic_grad]),("Expected grad of type %s, got %s "%( symbolic_grad.dtype, o_output.dtyp))
    grad_fn = function(tensor_pt, symbolic_grad)