Commit 94a52915 authored by James Bergstra

Merge branch 'master' of github.com:Theano/Theano

......@@ -294,6 +294,9 @@ class BadOptimization(DebugModeError):
print >> sio, self.old_graph
print >> sio, " New Graph:"
print >> sio, self.new_graph
print >> sio, ""
print >> sio, "Hint: relax the tolerance by setting tensor.cmp_sloppy=1"
print >> sio, " or even tensor.cmp_sloppy=2 for less-strict comparison"
return sio.getvalue()
class BadDestroyMap(DebugModeError):
......
import logging
logger = logging.getLogger(__name__)
import numpy
from theano.gof import Op, Apply
......@@ -57,6 +60,7 @@ def hints(variable):
@local_optimizer([])
def remove_hint_nodes(node):
if is_hint_node(node):
# transfer hints from graph to Feature
try:
for k,v in node.op.hints:
node.env.hints_feature.add_hint(node.inputs[0], k, v)
......@@ -95,7 +99,7 @@ class HintsFeature(object):
"""
def add_hint(self, r, k, v):
print 'adding hint', r, k, v
logger.debug('adding hint; %s, %s, %s' % (r, k, v))
self.hints[r][k] = v
def ensure_init_r(self, r):
......@@ -171,9 +175,8 @@ def is_positive(v):
return True
#TODO: how to handle this - a registry?
# infer_hints on Ops?
print 'is_positive', v
logger.debug('is_positive: %s' % str(v))
if v.owner and v.owner.op == tensor.pow:
print 'try for pow', v, v.owner.inputs
try:
exponent = tensor.get_constant_value(v.owner.inputs[1])
except TypeError:
......@@ -250,7 +253,6 @@ def local_log_prod_sqr(node):
# we cannot always make this substitution because
# the prod might include negative terms
p = x.owner.inputs[0]
print "AAA", p
# p is the matrix we're reducing with prod
if is_positive(p):
......@@ -316,7 +318,7 @@ class Cholesky(Op):
destr = 'destructive'
else:
destr = 'non-destructive'
return 'Cholesky{%s,%s}'% (lu,destr)
return 'Cholesky{%s,%s}' % (lu, destr)
def make_node(self, x):
x = as_tensor_variable(x)
return Apply(self, [x], [x.type()])
......@@ -378,7 +380,10 @@ class Solve(Op):
def make_node(self, A, b):
A = as_tensor_variable(A)
b = as_tensor_variable(b)
return Apply(self, [A,b], [b.type()])
otype = tensor.tensor(
broadcastable=b.broadcastable,
dtype = (A*b).dtype)
return Apply(self, [A,b], [otype])
def perform(self, node, inputs, output_storage):
A, b = inputs
#TODO: use the A_structure to go faster
......@@ -394,40 +399,46 @@ class ExtractDiag(Op):
self.view = view
if self.view:
self.view_map = {0:[0]}
self.perform = self.perform_view
else:
self.perform = self.perform_noview
def __eq__(self, other):
    """Two ExtractDiag ops compare equal iff they are the same class
    and share the same `view` flag."""
    if type(self) != type(other):
        return False
    return self.view == other.view
def __hash__(self):
    """Hash consistent with __eq__: xor of class hash and view-flag hash."""
    cls_hash = hash(type(self))
    view_hash = hash(self.view)
    return cls_hash ^ view_hash
def make_node(self, _x):
    """Build an Apply node extracting the diagonal of a matrix variable.

    :param _x: symbolic input; must be 2-dimensional after conversion.
    :raises TypeError: if the input is not a matrix.
    :returns: Apply whose single output is a vector of the input's dtype.
    """
    x = as_tensor_variable(_x)
    if x.type.ndim == 2:
        out = tensor.vector(dtype=x.type.dtype)
        return Apply(self, [x], [out])
    raise TypeError('ExtractDiag only works on matrices', _x)
def perform_noview(self, node, (x,), (z,)):
def perform(self, node, ins, outs):
x, = ins
z, = outs
#for some reason numpy.diag(x) is really slow
N,M = x.shape
assert N==M
rval = x[0]
rval.strides = (x.strides[0]+x.strides[1],)
z[0] = rval.copy()
def perform_view(self, node, (x,), (z,)):
N,M = x.shape
a,b = x.strides
assert N==M
rval = x[0]
rval.strides = a+b,
z[0] = rval
if self.view:
z[0] = rval
else:
z[0] = rval.copy()
def __str__(self):
    """Readable op name including the view flag, e.g. ExtractDiag{view=False}."""
    template = 'ExtractDiag{view=%s}'
    return template % self.view
def grad(self, inputs, g_outputs):
    """Gradient of extract_diag: scatter the output gradient back into
    a diagonal matrix (via the sibling alloc_diag op)."""
    g_out = g_outputs[0]
    return [alloc_diag(g_out)]
extract_diag = ExtractDiag()
def infer_shape(self, node, shapes):
    """Output shape of the diagonal: (n,) for an (n, m) input.

    perform() asserts the input is square, so the first dimension is
    the diagonal length.
    """
    in_shape, = shapes
    diag_len = in_shape[0]
    return [(diag_len,)]
extract_diag = ExtractDiag()
#TODO: optimization to insert ExtractDiag with view=True
class AllocDiag(Op):
def __eq__(self, other):
return type(self) == type(other)
......@@ -448,8 +459,11 @@ alloc_diag = AllocDiag()
def diag(x):
"""Numpy-compatibility method
If `x` is a matrix, return its diagonal.
If `x` is a vector return a matrix with it as its diagonal.
For vector `x`, return a zero matrix except for `x` as diagonal.
* This method does not support the `k` argument that numpy supports.
"""
xx = as_tensor_variable(x)
if xx.type.ndim == 1:
......@@ -461,6 +475,7 @@ def diag(x):
class Det(Op):
"""matrix determinant
TODO: move this op to another file that request scipy.
"""
def make_node(self, x):
......@@ -488,6 +503,7 @@ def trace(X):
"""
return extract_diag(X).sum()
def spectral_radius_bound(X, log2_exponent):
"""
Returns upper bound on the largest eigenvalue of square symmetrix matrix X.
......
from pkg_resources import parse_version as V
import numpy
import theano
from theano import tensor, function
from theano.tensor.basic import _allclose
from theano.tests import unittest_tools as utt
from theano import config
utt.seed_rng()
try:
import scipy
if scipy.__version__ < '0.7':
if V(scipy.__version__) < V('0.7'):
raise ImportError()
use_scipy = True
except ImportError:
......@@ -17,11 +23,12 @@ from theano.sandbox.linalg.ops import (cholesky,
matrix_inverse,
#solve,
#diag,
#extract_diag,
ExtractDiag,
extract_diag,
#alloc_diag,
det,
#PSD_hint,
#trace,
trace,
#spectral_radius_bound
)
......@@ -88,3 +95,61 @@ def test_det_grad():
r = rng.randn(5,5)
tensor.verify_grad(det, [r], rng=numpy.random)
def test_extract_diag():
    """extract_diag matches numpy.diag, rejects non-matrix inputs, and its
    shape is inferred without actually running the op."""
    rng = numpy.random.RandomState(utt.fetch_seed())
    x = theano.tensor.matrix()
    g = extract_diag(x)
    f = theano.function([x], g)

    # The extracted diagonal of a square matrix matches numpy.diag.
    m = rng.rand(3, 3).astype(config.floatX)
    v = numpy.diag(m)
    r = f(m)
    assert (r == v).all()

    # A non-square matrix fails at runtime (perform asserts N == M).
    m = rng.rand(2, 3).astype(config.floatX)
    raised = False
    try:
        r = f(m)
    except Exception:
        raised = True
    assert raised

    # A vector input is rejected at graph-construction time.
    xx = theano.tensor.vector()
    raised = False
    try:
        extract_diag(xx)
    except TypeError:
        raised = True
    assert raised

    # Shape inference should optimize the ExtractDiag node away entirely.
    f = theano.function([x], g.shape)
    topo = f.maker.env.toposort()
    assert sum([node.op.__class__ == ExtractDiag for node in topo]) == 0
    m = rng.rand(3, 3).astype(config.floatX)
    assert f(m) == 3
    # not testing the view=True case since it is not used anywhere.
def test_trace():
    """trace(x) agrees with numpy.trace on a square matrix; vector inputs
    raise TypeError (trace is built on ExtractDiag, which wants a matrix)."""
    rng = numpy.random.RandomState(utt.fetch_seed())
    x = theano.tensor.matrix()
    g = trace(x)
    f = theano.function([x], g)

    m = rng.rand(4, 4).astype(config.floatX)
    v = numpy.trace(m)
    assert v == f(m)

    # trace of a vector must be rejected at graph-construction time.
    xx = theano.tensor.vector()
    raised = False
    try:
        trace(xx)
    except TypeError:
        raised = True
    assert raised
from pkg_resources import parse_version as V
import sys
try:
import scipy
enable_sparse = scipy.__version__ >= '0.7'
enable_sparse = V(scipy.__version__) >= V('0.7')
if not enable_sparse:
sys.stderr.write("WARNING: scipy version = %s."
" We request version >=0.7.0 for the sparse code as it has"
......
......@@ -333,28 +333,34 @@ def grad(cost, wrt, g_cost=None, consider_constant=None, warn_type=False,
return ret
class numeric_grad:
"""WRITEME"""
class numeric_grad(object):
"""
Compute the numeric derivative of a scalar-valued function at a particular
point.
"""
# Note on step sizes and tolerances:
#
# There is a relationship between the step size and the function value and the measurement
# error that is incurred due to rounding. The finite difference we measure is
# delta = f(x0) - f(x0+eps)
# There is a relationship between the step size and the function value and
# the measurement error that is incurred due to rounding. The finite
# difference we measure is delta = f(x0) - f(x0+eps)
#
# For maximum precision, f should be close to zero.
# For every power of 2 that f departs from zero, we lose a bit of precision in delta.
# For every power of 2 that f departs from zero, we lose a bit of
# precision in delta.
#
# Even in this case of maximum accuracy, there is a tradeoff between stepsize and
# measurement error.
# Taking small steps allows us to measure large derivatives accuractly, but longer steps
# are required to measure small derivatives accurately. However longer steps introduce
# bias into our measurement in general for non-linear functions.
# Even in this case of maximum accuracy, there is a tradeoff between
# stepsize and measurement error. Taking small steps allows us to measure
# large derivatives accuractly, but longer steps are required to measure
# small derivatives accurately. However longer steps introduce bias into
# our measurement in general for non-linear functions.
#
# It would be interesting to have a version of numeric grad that used an adaptive stepsize.
# It would be interesting to have a version of numeric grad that used an
# adaptive stepsize.
#
# For now, we use a heuristic that catches very bad gradients, but is not perfectly
# accurate.
# For now, we use a heuristic that catches very bad gradients, but is not
# perfectly accurate.
type_eps = {'float64': 1e-7,
'float32': 3e-4,
numpy.dtype('float64'):1e-7,
......@@ -363,6 +369,9 @@ class numeric_grad:
def __init__(self, f, pt, eps=None):
"""Return the gradient of f at pt.
:param f: a differentiable function such that f(*pt) is a scalar
:param pt: an ndarray, a list of ndarrays or tuple of ndarrays
This function computes the gradient by a one-sided finite differences of a
fixed step size (eps).
......@@ -435,39 +444,46 @@ class numeric_grad:
self.gf = self.gf[0]
@staticmethod
def abs_rel_err(a,b):
def abs_rel_err(a, b):
"""Return absolute and relative error between a and b.
The relative error is a small number when a and b are close, relative to how big they are.
The relative error is a small number when a and b are close, relative
to how big they are.
Formulas used:
abs_err = abs(a - b)
rel_err = abs_err / (abs(a) + abs(b))
rel_err = abs_err / max(abs(a) + abs(b), 1e-8)
The denominator is clipped at 1e-8 to avoid dividing by 0 when a and b
are both close to 0.
The tuple (abs_err, rel_err) is returned
"""
abs_err = abs(a-b)
rel_err = abs_err / (abs(a) + abs(b))
abs_err = abs(a - b)
rel_err = abs_err / numpy.maximum(abs(a) + abs(b), 1e-8)
return (abs_err, rel_err)
def abs_rel_errors(self, g_pt):
"""Return the abs and rel error of gradient estimate `g_pt`
`g_pt` must be a list of ndarrays of the same length as self.gf, otherwise a ValueError
is raised.
`g_pt` must be a list of ndarrays of the same length as self.gf,
otherwise a ValueError is raised.
Corresponding ndarrays in `g_pt` and `self.gf` must have the same shape or ValueError
is raised.
Corresponding ndarrays in `g_pt` and `self.gf` must have the same
shape or ValueError is raised.
"""
if len(g_pt) != len(self.gf):
raise ValueError('argument has wrong number of elements', len(g_pt))
raise ValueError(
'argument has wrong number of elements',
len(g_pt))
errs = []
for i, (a, b) in enumerate(zip(g_pt, self.gf)):
if a.shape != b.shape:
raise ValueError('argument element %i has wrong shape %s' %(i,str((a.shape,
b.shape))))
errs.append(numeric_grad.abs_rel_err(a,b))
raise ValueError(
'argument element %i has wrong shape %s' % (
i, str((a.shape, b.shape))))
errs.append(numeric_grad.abs_rel_err(a, b))
return errs
def max_err(self, g_pt, abs_tol, rel_tol):
......@@ -477,8 +493,8 @@ class numeric_grad:
wrt the provided tolerances (abs_tol, rel_tol).
A value > 1 means both tolerances are exceeded.
Return the argmax of min(abs_err / abs_tol, rel_err / rel_tol) over g_pt,
as well as abs_err and rel_err at this point.
Return the argmax of min(abs_err / abs_tol, rel_err / rel_tol) over
g_pt, as well as abs_err and rel_err at this point.
"""
pos = []
errs = []
......@@ -487,7 +503,11 @@ class numeric_grad:
abs_rel_errs = self.abs_rel_errors(g_pt)
for abs_err, rel_err in abs_rel_errs:
scaled_err = numpy.minimum(abs_err/abs_tol, rel_err/rel_tol)
if not numpy.all(numpy.isfinite(abs_err)):
raise ValueError('abs_err not finite', repr(abs_err))
if not numpy.all(numpy.isfinite(rel_err)):
raise ValueError('rel_err not finite', repr(rel_err))
scaled_err = numpy.minimum(abs_err / abs_tol, rel_err / rel_tol)
max_i = scaled_err.argmax()
pos.append(max_i)
......@@ -501,8 +521,8 @@ class numeric_grad:
return (max_arg, pos[max_arg], abs_errs[max_arg], rel_errs[max_arg])
def verify_grad(fun, pt, n_tests=2, rng=None, eps=None, abs_tol=None, rel_tol=None,
mode=None, cast_to_output_type=False):
def verify_grad(fun, pt, n_tests=2, rng=None, eps=None, abs_tol=None,
rel_tol=None, mode=None, cast_to_output_type=False):
""" Test a gradient by Finite Difference Method. Raise error on failure.
Example:
......@@ -511,9 +531,9 @@ def verify_grad(fun, pt, n_tests=2, rng=None, eps=None, abs_tol=None, rel_tol=No
rng=numpy.random)
Raises an Exception if the difference between the analytic gradient and
numerical gradient (computed through the Finite Difference Method) of a random
projection of the fun's output to a scalar exceeds
the given tolerance.
numerical gradient (computed through the Finite Difference Method) of a
random projection of the fun's output to a scalar exceeds the given
tolerance.
:param fun: a Python function that takes Theano variables as inputs,
and returns a Theano variable. For instance, an Op instance with
......@@ -521,19 +541,21 @@ def verify_grad(fun, pt, n_tests=2, rng=None, eps=None, abs_tol=None, rel_tol=No
:param pt: the list of numpy.ndarrays to use as input values.
These arrays must be either float32 or float64 arrays.
:param n_tests: number of times to run the test
:param rng: random number generator used to sample u, we test gradient of sum(u * fun) at pt
:param eps: stepsize used in the Finite Difference Method (Default None is type-dependent)
:param rng: random number generator used to sample u, we test gradient of
sum(u * fun) at pt
:param eps: stepsize used in the Finite Difference Method (Default None is
type-dependent)
:param abs_tol: absolute tolerance used as threshold for gradient comparison
:param rel_tol: relative tolerance used as threshold for gradient comparison
:note: WARNING to unit-test writers: if `op` is a function that builds a graph,
try to make it a SMALL graph. Often verify grad is run in
debug mode, which can be very slow if it has to verify a lot
of intermediate computations.
:note: WARNING to unit-test writers: if `op` is a function that builds a
graph, try to make it a SMALL graph. Often verify grad is run in
debug mode, which can be very slow if it has to verify a lot of
intermediate computations.
:note: This op does not support multiple outputs. In tests/test_scan.py there is
an experimental verify_grad that covers that case as well by using random
projections.
:note: This op does not support multiple outputs. In tests/test_scan.py
there is an experimental verify_grad that covers that case as well by
using random projections.
"""
assert isinstance(pt, (list,tuple))
pt = [numpy.array(p) for p in pt]
......@@ -553,8 +575,10 @@ def verify_grad(fun, pt, n_tests=2, rng=None, eps=None, abs_tol=None, rel_tol=No
rel_tol = __builtin__.max(_type_tol[str(p.dtype)] for p in pt)
if rng is None:
raise TypeError('rng should be a valid instance of numpy.random.RandomState.',
'You may want to use theano.tests.unittest_tools.verify_grad instead of theano.tensor.verify_grad.')
raise TypeError('rng be instance of numpy.random.RandomState', (
' hint: Maybe you meant to call'
' theano.tests.unittest_tools.verify_grad instead of'
' theano.tensor.verify_grad.'))
# We allow input downcast in function, because numeric_grad works in the
# most precise dtype used among the inputs, so we may need to cast some.
......@@ -567,15 +591,18 @@ def verify_grad(fun, pt, n_tests=2, rng=None, eps=None, abs_tol=None, rel_tol=No
allow_input_downcast=True, mode=mode)
return f
tensor_pt = [TensorType(as_tensor_variable(p).dtype, as_tensor_variable(p).broadcastable)(name='input %i'%i) for i,p in enumerate(pt)]
tensor_pt = [TensorType(
as_tensor_variable(p).dtype,
as_tensor_variable(p).broadcastable)(name='input %i'%i)
for i, p in enumerate(pt)]
#fun can be either a function or an actual Op instance
o_output = fun(*tensor_pt)
if isinstance(o_output,list):
raise NotImplementedError('cant (yet) autotest gradient of fun with multiple outputs')
# we could make loop over outputs making random projections R for each,
# but this doesn't handle the case where not all the outputs are
if isinstance(o_output, list):
raise NotImplementedError('verify gradient on multiple outputs')
# we could make loop over outputs making random projections R for
# each, but this doesn't handle the case where not all the outputs are
# differentiable... so I leave this as TODO for now -JB.
o_fn = function(tensor_pt, o_output)
......@@ -597,18 +624,16 @@ def verify_grad(fun, pt, n_tests=2, rng=None, eps=None, abs_tol=None, rel_tol=No
t_r = shared(random_projection())
#random projection of o onto t_r
cost = theano.tensor.sum(t_r * o_output) #This sum() is defined above, it's not the builtin sum.
cost = theano.tensor.sum(t_r * o_output)
cost_fn = function(tensor_pt, cost)
#todo-- determine if this is actually needed
g_cost = as_tensor_variable(1.0,name='g_cost')
g_cost = as_tensor_variable(1.0, name='g_cost')
if cast_to_output_type:
g_cost = cast(g_cost, o_output.dtype)
symbolic_grad = grad(cost, tensor_pt, g_cost,
disconnected_inputs='ignore')
#if o_output.dtype in ['float32','float64']:
# assert all([x.dtype == o_output.dtype for x in symbolic_grad]),("Expected grad of type %s, got %s "%( symbolic_grad.dtype, o_output.dtyp))
grad_fn = function(tensor_pt, symbolic_grad)
......
Markdown is supported
0%
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Register or sign in to comment