Commit 7048569d authored by boulanni

merge

from ops import (cholesky, matrix_inverse, solve,
diag, extract_diag, alloc_diag,
det, PSD_hint,
trace, spectral_radius_bound)
import numpy
from theano.gof import Variable, Op, utils, Type, Constant, Value
from theano.tensor.tsor_apply import Apply
from theano.tensor import as_tensor_variable, dot, DimShuffle
from theano import tensor
import theano.tensor
from theano.tensor.opt import (register_stabilize,
register_specialize, register_canonicalize)
from theano.gof import local_optimizer
from theano.gof.opt import Optimizer
try:
    import scipy.linalg
except ImportError:
    pass  # some ops (e.g. Cholesky) won't work without scipy
class Hint(Op):
"""
Provide arbitrary information to the optimizer
These ops are removed from the graph during canonicalization
in order to not interfere with other optimizations.
The idea is that prior to canonicalization, one or more Features of the env should
register the information contained in any Hint node, and transfer that information out of
the graph.
"""
def __init__(self, **kwargs):
self.hints = tuple(kwargs.items())
self.view_map = {0:[0]}
def __eq__(self, other):
return type(self) == type(other) and self.hints == other.hints
def __hash__(self):
return hash((type(self), self.hints))
def make_node(self, x):
return Apply(self, [x], [x.type()])
def perform(self, node, inputs, outstor):
outstor[0][0] = inputs[0]
def grad(self, inputs, g_out):
return g_out
def is_hint_node(node):
return isinstance(node.op, Hint)
def hints(variable):
if hasattr(variable, 'env'):
try:
return variable.env.hints_feature.hints[variable]
except AttributeError:
return {}
else:
        if variable.owner and is_hint_node(variable.owner):
return dict(variable.owner.op.hints)
else:
return {}
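# Hedged usage sketch (my addition, not part of the original module): wrap a
# variable in a Hint node and read the hints back before any graph is
# compiled. Once the graph is canonicalized, remove_hint_nodes (below) strips
# the Hint node and a HintsFeature carries the information instead.
def _hint_usage_sketch():
    v = Hint(psd=True)(tensor.matrix())
    assert is_hint_node(v.owner)
    assert hints(v) == {'psd': True}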
@register_canonicalize
@local_optimizer([])
def remove_hint_nodes(node):
if is_hint_node(node):
try:
for k,v in node.op.hints:
node.env.hints_feature.add_hint(node.inputs[0], k, v)
except AttributeError:
pass
return node.inputs
class HintsFeature(object):
"""
Env Feature to track matrix properties
This is a similar feature to variable 'tags'. In fact, tags are one way to provide hints.
This class exists because tags were not documented well, and the semantics of how tag
information should be moved around during optimizations was never clearly spelled out.
Hints are assumptions about mathematical properties of variables.
If one variable is substituted for another by an optimization,
then it means that the assumptions should be transferred to the new variable.
    Hints are attached to 'positions in a graph' rather than to variables in particular,
    although Hints are originally attached to a particular position in a graph *via* a
    variable in that original graph.
Examples of hints are:
- shape information
- matrix properties (e.g. symmetry, psd, banded, diagonal)
Hint information is propagated through the graph similarly to graph optimizations,
except that adding a hint does not change the graph. Adding a hint is not something that
debugmode will check.
#TODO: should a Hint be an object that can actually evaluate its truthfulness?
# Should the PSD property be an object that can check the PSD-ness of a variable?
"""
    def add_hint(self, r, k, v):
        self.hints[r][k] = v
def ensure_init_r(self, r):
if r not in self.hints:
self.hints[r] = {}
#
#
    # Feature interface
#
#
def on_attach(self, env):
assert not hasattr(env, 'hints_feature')
env.hints_feature = self
        self.hints = {}  # Variable -> {hint_name: value} dict
for node in env.toposort():
self.on_import(env, node)
def on_import(self, env, node):
if node.outputs[0] in self.hints:
# this is a revert, not really an import
for r in node.outputs + node.inputs:
assert r in self.hints
return
        for r in node.inputs + node.outputs:
            # make sure every variable has a hints dict
            self.ensure_init_r(r)
def update_second_from_first(self, r0, r1):
old_hints = self.hints[r0]
new_hints = self.hints[r1]
for k,v in old_hints.items():
if k in new_hints and new_hints[k] is not v:
raise NotImplementedError()
if k not in new_hints:
new_hints[k] = v
def on_change_input(self, env, node, i, r, new_r):
# TODO:
# This tells us that r and new_r must have the same shape
# if we didn't know that the shapes are related, now we do.
self.ensure_init_r(new_r)
self.update_second_from_first(r, new_r)
self.update_second_from_first(new_r, r)
# change_input happens in two cases:
# 1) we are trying to get rid of r, or
# 2) we are putting things back after a failed transaction.
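# A standalone sketch of the transfer semantics above (my addition;
# hypothetical, with plain strings standing in for Variables): when new_r
# replaces r, hints flow in both directions so neither position loses
# information.
def _hints_transfer_sketch():
    feature = HintsFeature()
    feature.hints = {}  # normally created in on_attach
    feature.ensure_init_r('r')
    feature.add_hint('r', 'psd', True)
    feature.ensure_init_r('new_r')
    feature.update_second_from_first('r', 'new_r')
    feature.update_second_from_first('new_r', 'r')
    assert feature.hints['new_r'] == {'psd': True}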
class HintsOptimizer(Optimizer):
"""Optimizer that serves to add HintsFeature as an env feature.
"""
def __init__(self):
Optimizer.__init__(self)
def add_requirements(self, env):
env.extend(HintsFeature())
def apply(self, env):
pass
# -1 should make it run right before the first merge
theano.compile.mode.optdb.register('HintsOpt', HintsOptimizer(), -1, 'fast_run', 'fast_compile')
def PSD_hint(v):
return Hint(psd=True,symmetric=True)(v)
def is_psd(v):
return hints(v).get('psd', False)
def is_symmetric(v):
return hints(v).get('symmetric', False)
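# Usage sketch (my addition, assumed behaviour): PSD_hint tags a variable so
# that is_psd/is_symmetric answer True on it even before a HintsFeature is
# attached, because hints() falls back to reading the Hint node directly.
def _psd_hint_sketch():
    v = PSD_hint(tensor.matrix())
    assert is_psd(v) and is_symmetric(v)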
def is_positive(v):
if hints(v).get('positive', False):
return True
    #TODO: how to handle this - a registry?
    #      infer_hints on Ops?
    if v.owner and v.owner.op == tensor.pow:
try:
exponent = tensor.get_constant_value(v.owner.inputs[1])
except TypeError:
return False
if 0 == exponent % 2:
return True
return False
@register_stabilize
@local_optimizer([])
def inv_as_solve(node):
if node.op == dot:
l,r = node.inputs
if l.owner and l.owner.op == matrix_inverse:
return [solve(l.owner.inputs[0], r)]
if r.owner and r.owner.op == matrix_inverse:
if is_symmetric(r.owner.inputs[0]):
return [solve(r.owner.inputs[0], l.T).T]
else:
return [solve(r.owner.inputs[0].T, l.T).T]
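# Numeric sanity check (numpy only, my addition) of the identities this
# rewrite relies on: dot(inv(A), b) == solve(A, b), and
# dot(b.T, inv(A)) == solve(A.T, b).T.
def _inv_as_solve_identity_check():
    rng = numpy.random.RandomState(0)
    A = rng.randn(4, 4) + 4 * numpy.eye(4)  # keep A comfortably invertible
    b = rng.randn(4, 3)
    assert numpy.allclose(numpy.dot(numpy.linalg.inv(A), b),
                          numpy.linalg.solve(A, b))
    assert numpy.allclose(numpy.dot(b.T, numpy.linalg.inv(A)),
                          numpy.linalg.solve(A.T, b).T)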
@register_canonicalize
@register_stabilize
@register_specialize
@local_optimizer([])
def no_transpose_symmetric(node):
if isinstance(node.op, DimShuffle):
x = node.inputs[0]
if x.type.ndim==2 and is_symmetric(x):
#print 'UNDOING TRANSPOSE', is_symmetric(x), x.ndim
            if list(node.op.new_order) == [1, 0]:
return [x]
@register_stabilize
@local_optimizer([])
def psd_solve_with_chol(node):
if node.op == solve:
A, b = node.inputs #result is solution Ax=b
if is_psd(A):
L = cholesky(A)
        #N.B. this can be further reduced to a yet-unwritten cho_solve Op
        #     __if__ no other Op makes use of the L matrix during the
        #     stabilization
Li_b = Solve('lower_triangular')(L, b)
x = Solve('upper_triangular')(L.T, Li_b)
return [x]
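# Sketch in plain scipy terms (my addition) of the reduction above: with
# A = L L.T from a lower Cholesky factor, solve(A, b) becomes two triangular
# solves, which is the usual cho_solve decomposition.
def _psd_solve_sketch(A, b):
    L = scipy.linalg.cholesky(A, lower=True)
    Li_b = scipy.linalg.solve_triangular(L, b, lower=True)
    return scipy.linalg.solve_triangular(L.T, Li_b, lower=False)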
@register_stabilize
@register_specialize
@local_optimizer([])
def local_det_chol(node):
"""
If we have det(X) and there is already an L=cholesky(X)
floating around, then we can use prod(diag(L)) to get the determinant.
"""
if node.op == det:
x, = node.inputs
for (cl,xpos) in x.clients:
if isinstance(cl.op, Cholesky):
L = cl.outputs[0]
return [tensor.prod(extract_diag(L)**2)]
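# Numeric check (numpy/scipy, my addition) of the identity behind this
# rewrite: det(X) = prod(diag(L))**2 when X = L L.T.
def _det_chol_check():
    rng = numpy.random.RandomState(0)
    r = rng.randn(5, 5)
    X = numpy.dot(r, r.T) + numpy.eye(5)
    L = scipy.linalg.cholesky(X, lower=True)
    assert numpy.allclose(numpy.linalg.det(X),
                          numpy.prod(numpy.diag(L)) ** 2)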
@register_canonicalize
@register_stabilize
@register_specialize
@local_optimizer([])
def local_log_prod_sqr(node):
if node.op == tensor.log:
x, = node.inputs
if x.owner and isinstance(x.owner.op, tensor.elemwise.Prod):
# we cannot always make this substitution because
# the prod might include negative terms
p = x.owner.inputs[0]
print "AAA", p
# p is the matrix we're reducing with prod
if is_positive(p):
return [tensor.log(p).sum(axis=x.owner.op.axis)]
#TODO: have a reduction like prod and sum that simply returns the sign
# of the prod multiplication.
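# Numeric check (my addition) of the substitution above: for strictly
# positive p, log(prod(p)) == sum(log(p)) along the reduced axis.
def _log_prod_check():
    p = numpy.array([[1.0, 2.0], [3.0, 4.0]])
    assert numpy.allclose(numpy.log(numpy.prod(p, axis=0)),
                          numpy.log(p).sum(axis=0))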
@register_canonicalize
@register_stabilize
@register_specialize
@local_optimizer([])
def local_log_pow(node):
if node.op == tensor.log:
x, = node.inputs
if x.owner and x.owner.op == tensor.pow:
base, exponent = x.owner.inputs
#TODO: reason to be careful with dtypes?
return [exponent * tensor.log(base)]
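# The corresponding numeric identity (my addition): log(base ** exponent)
# equals exponent * log(base) for positive base.
def _log_pow_check():
    base, exponent = 2.5, 3.0
    assert numpy.allclose(numpy.log(base ** exponent),
                          exponent * numpy.log(base))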
def matrix_dot(*args):
rval = args[0]
for a in args[1:]:
rval = theano.tensor.dot(rval, a)
return rval
MATRIX_STRUCTURES = (
'general',
'symmetric',
'lower_triangular',
'upper_triangular',
'hermitian',
'banded',
'diagonal',
'toeplitz',
)
class Cholesky(Op):
"""
Return a triangular matrix square root of positive semi-definite `x`
L = cholesky(X, lower=True) implies dot(L.T,L)==X
"""
#TODO: inplace
#TODO: for specific dtypes
def __init__(self, lower=True):
self.lower = lower
self.destructive = False
def props(self):
return (self.lower,
self.destructive)
def __hash__(self):
return hash((type(self), self.props()))
def __eq__(self, other):
return (type(self)==type(other) and self.props() == other.props())
def __repr__(self):
lu=('lower' if self.lower else 'upper')
destr=('destructive' if self.destructive else 'non-destructive')
return 'Cholesky{%s,%s}'% (lu,destr)
def make_node(self, x):
x = as_tensor_variable(x)
return Apply(self, [x], [x.type()])
def perform(self, node, (x,), (z,)):
z[0] = scipy.linalg.cholesky(x, lower=self.lower).astype(x.dtype)
#def grad(self, (x, y), (gz,)):
#return dot(gz, y), dot(x, gz) #no transposing necessary
cholesky = Cholesky()
class MatrixInverse(Op):
"""Compute a matrix inverse"""
def __init__(self):
pass
def props(self):
return ()
def __hash__(self):
return hash((type(self), self.props()))
def __eq__(self, other):
return (type(self)==type(other) and self.props() == other.props())
def make_node(self, x):
x = as_tensor_variable(x)
return Apply(self, [x], [x.type()])
    def perform(self, node, (x,), (z,)):
        try:
            z[0] = numpy.linalg.inv(x).astype(x.dtype)
        except numpy.linalg.LinAlgError:
            print 'Failed to invert', node.inputs[0]
            raise
def grad(self, inputs, g_outputs):
x, = inputs
xi = self(x)
gz, = g_outputs
        # If Y = inv(X), the gradient wrt X is -Y.T gz Y.T;
        # -(xi gz.T xi).T below is that same expression, transposed once.
        return [-matrix_dot(xi, gz.T, xi).T]
def __str__(self):
return "MatrixInverse"
matrix_inverse = MatrixInverse()
class Solve(Op):
"""Solve a system of linear equations"""
def __init__(self, A_structure='general', lower=False, overwrite_A=False, overwrite_b=False):
if A_structure not in MATRIX_STRUCTURES:
raise ValueError('Invalid matrix structure argument', A_structure)
self.A_structure = A_structure
self.lower=lower
self.overwrite_A=overwrite_A
self.overwrite_b=overwrite_b
def props(self):
return (self.A_structure,
self.lower,
self.overwrite_A,
self.overwrite_b)
def __hash__(self):
return hash((type(self),self.props()))
def __eq__(self, other):
return type(self) == type(other) and self.props() == other.props()
def __repr__(self):
return 'Solve{%s}'%str(self.props())
def make_node(self, A, b):
A = as_tensor_variable(A)
b = as_tensor_variable(b)
return Apply(self, [A,b], [b.type()])
def perform(self, node, inputs, output_storage):
A, b = inputs
#TODO: use the A_structure to go faster
output_storage[0][0] = scipy.linalg.solve(A,b)
solve = Solve() # general solve
#TODO : SolveTriangular
#TODO: Optimizations to replace multiplication by matrix inverse with solve() Op (still unwritten)
class ExtractDiag(Op):
def __init__(self, view=False):
self.view = view
if self.view:
self.view_map = {0:[0]}
self.perform = self.perform_view
else:
self.perform = self.perform_noview
def __eq__(self, other):
return type(self) == type(other) and self.view == other.view
def __hash__(self):
return hash(type(self))^hash(self.view)
def make_node(self, _x):
x = as_tensor_variable(_x)
if x.type.ndim != 2:
raise TypeError('ExtractDiag only works on matrices', _x)
return Apply(self, [x], [tensor.vector(dtype=x.type.dtype)])
    def perform_noview(self, node, (x,), (z,)):
        # for some reason numpy.diag(x) is really slow, so we read the
        # diagonal through a strided view instead: starting at x[0, 0],
        # a step of (row_stride + col_stride) lands on each x[i, i]
        N, M = x.shape
        assert N == M
        rval = x[0]
        rval.strides = (x.strides[0] + x.strides[1],)
        z[0] = rval.copy()
    def perform_view(self, node, (x,), (z,)):
        # same stride trick as perform_noview, but return the view itself
        N, M = x.shape
        a, b = x.strides
        assert N == M
        rval = x[0]
        rval.strides = (a + b,)
        z[0] = rval
def __str__(self):
return 'ExtractDiag{view=%s}'%self.view
def grad(self, inputs, g_outputs):
return [alloc_diag(g_outputs[0])]
extract_diag = ExtractDiag()
#TODO: optimization to insert ExtractDiag with view=True
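# The same diagonal-view idea as the perform methods above, written with the
# supported numpy API (my addition): as_strided builds the length-N view
# whose step is row_stride + col_stride.
def _diag_view_sketch(x):
    N, M = x.shape
    assert N == M
    return numpy.lib.stride_tricks.as_strided(
        x, shape=(N,), strides=(x.strides[0] + x.strides[1],))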
class AllocDiag(Op):
def __eq__(self, other):
return type(self) == type(other)
def __hash__(self):
return hash(type(self))
def make_node(self, _x):
x = as_tensor_variable(_x)
if x.type.ndim != 1:
raise TypeError('AllocDiag only works on vectors', _x)
return Apply(self, [x], [tensor.matrix(dtype=x.type.dtype)])
def perform(self, node, (x,), (z,)):
if x.ndim != 1:
raise TypeError(x)
z[0] = numpy.diag(x)
alloc_diag = AllocDiag()
def diag(x):
"""Numpy-compatibility method
For vector `x`, return a zero matrix except for `x` as diagonal.
"""
xx = as_tensor_variable(x)
if xx.type.ndim == 1:
return alloc_diag(xx)
elif xx.type.ndim ==2:
return extract_diag(xx)
else:
raise TypeError('diag requires vector or matrix argument', x)
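# Usage sketch (my addition), mirroring numpy.diag's two behaviours:
#
#     v = tensor.vector()
#     m = diag(v)        # AllocDiag: vector -> square matrix
#     d = diag(diag(v))  # ExtractDiag(AllocDiag(v)): recovers v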
class Det(Op):
"""matrix determinant"""
def make_node(self, x):
x = as_tensor_variable(x)
o = theano.tensor.scalar(dtype=x.dtype)
return Apply(self, [x], [o])
    def perform(self, node, (x,), (z,)):
        try:
            z[0] = numpy.asarray(scipy.linalg.det(x), dtype=x.dtype)
        except Exception:
            print 'Failed to compute determinant', x
            raise
def grad(self, inputs, g_outputs):
gz, = g_outputs
x, = inputs
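        # Jacobi's formula: d det(X) / dX = det(X) * inv(X).T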
return [gz * self(x) * matrix_inverse(x).T]
def __str__(self):
return "Det"
det = Det()
def trace(X):
"""
Returns the sum of diagonal elements of matrix X.
"""
return extract_diag(X).sum()
def spectral_radius_bound(X, log2_exponent):
"""
Returns upper bound on the largest eigenvalue of square symmetrix matrix X.
log2_exponent must be a positive-valued integer. The larger it is, the
slower and tighter the bound. Values up to 5 should usually suffice. The
algorithm works by multiplying X by itself this many times.
From V.Pan, 1990. "Estimating the Extremal Eigenvalues of a Symmetric
Matrix", Computers Math Applic. Vol 20 n. 2 pp 17-22.
"""
XX = X
for i in xrange(log2_exponent):
XX = tensor.dot(XX, XX)
return tensor.pow(
trace(XX),
2**(-log2_exponent))
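# Numeric sketch (my addition) of why the bound holds: for symmetric X with
# eigenvalues l_i, trace(X^(2^k)) = sum(l_i^(2^k)) >= max|l_i|^(2^k), so the
# 2^-k power of the trace upper-bounds the spectral radius.
def _spectral_bound_check(k=3):
    rng = numpy.random.RandomState(0)
    r = rng.randn(5, 5)
    X = r + r.T  # symmetric
    XX = X
    for _ in range(k):
        XX = numpy.dot(XX, XX)
    bound = numpy.trace(XX) ** (2.0 ** -k)
    assert bound >= numpy.abs(numpy.linalg.eigvalsh(X)).max()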
class A_Xinv_b(Op):
"""Product of form a inv(X) b"""
    def make_node(self, a, X, b):
        a = as_tensor_variable(a)
        b = as_tensor_variable(b)
        X = as_tensor_variable(X)
        o = theano.tensor.matrix(dtype=X.dtype)
        return Apply(self, [a, X, b], [o])
    def perform(self, node, inputs, outstor):
        a, X, b = inputs
        # X is assumed symmetric PSD: factor X = L L.T and split
        # inv(X) = inv(L.T) inv(L) across the two triangular solves,
        # so that dot(xa.T, xb) = a inv(L.T) inv(L) b = a inv(X) b
        L = scipy.linalg.cholesky(X, lower=True)
        xb = scipy.linalg.solve_triangular(L, b, lower=True)
        xa = scipy.linalg.solve_triangular(L, a.T, lower=True)
        outstor[0][0] = numpy.dot(xa.T, xb)
    def grad(self, inputs, g_outputs):
        gz, = g_outputs
        a, X, b = inputs
        iX = matrix_inverse(X)
        ga = matrix_dot(gz, b.T, iX.T)
        gX = -matrix_dot(iX.T, a.T, gz, b.T, iX.T)
        gb = matrix_dot(iX.T, a.T, gz)
        return [ga, gX, gb]
import numpy
from theano import tensor, function
from .ops import *
if 0:
def test_cholesky():
#TODO: test upper and lower triangular
#todo: unittest randomseed
rng = numpy.random.RandomState(1234)
r = rng.randn(5,5)
pd = numpy.dot(r,r.T)
x = tensor.matrix()
chol = Cholesky()(x)
f = function([x], tensor.dot(chol, chol.T)) # an optimization could remove this
ch_f = function([x], chol)
        # quick check that chol is lower-triangular (the Op's default)
        ch = ch_f(pd)
        print ch
        assert ch[0, 4] == 0
        assert ch[4, 0] != 0
        assert numpy.allclose(numpy.dot(ch, ch.T), pd)
        assert not numpy.allclose(numpy.dot(ch.T, ch), pd)
def test_inverse_correctness():
#todo: unittest randomseed
rng = numpy.random.RandomState(12345)
r = rng.randn(4,4)
x = tensor.matrix()
xi = matrix_inverse(x)
ri = function([x], xi)(r)
assert ri.shape == r.shape
assert ri.dtype == r.dtype
rir = numpy.dot(ri,r)
rri = numpy.dot(r,ri)
assert numpy.allclose(numpy.identity(4), rir), rir
assert numpy.allclose(numpy.identity(4), rri), rri
def test_inverse_grad():
rng = numpy.random.RandomState(1234)
r = rng.randn(4,4)
tensor.verify_grad(matrix_inverse, [r], rng=numpy.random)
def test_det_grad():
rng = numpy.random.RandomState(1234)
r = rng.randn(5,5)
tensor.verify_grad(det, [r], rng=numpy.random)