提交 d8323502 authored 作者: James Bergstra's avatar James Bergstra

merge + float32 gradient modifications (sorry for combining the two!)

Guillaume and I just found a bug in some experiment code that was
basically caused by confusing semantics of max(). The same sort of
thing applies to min. This is an FYI email to help others on the list
avoid this mistake, which is (I think) easy to make.
Python's max() function takes multiple arguments and returns the
largest one of them. (I won't go into the details of how it deals with
corner cases.)
IN CONTRAST
numpy's max() function takes multiple arguments and returns the
largest element[s] from the *first* argument. The second argument is
used to identify the axis along which to evaluate the [python-style]
max. The third argument is an array into which the result can be
written.
So for example:
.. code-block:: python
>>> max(3, 4)
4
>>> numpy.max(3, 4)
3
>>> a,b,c = [numpy.asarray(i) for i in [0,1,2]]
>>> numpy.max(a,b,c)
0
>>> c
array(0)
Be careful!
Theano defines a max function (called theano.tensor.max) that is
similar to numpy's max.
Theano also defines a function called theano.tensor.largest that is
closer to python's max, but not identical since it works elemwise for
tensors. There is a corresponding 'smallest' function that likewise
behaves like python's min().
......@@ -24,7 +24,19 @@ We describe the details of the compressed sparse matrix types.
is faster if we are modifying the array. After initial inserts,
we can then convert to the appropriate sparse matrix format.
There are four member variables that comprise a compressed matrix ``sp``:
There are also these other types:
``dok_matrix``
Dictionary of Keys format. From their doc: This is an efficient structure for constructing sparse matrices incrementally.
``coo_matrix``
Coordinate format. From their lil doc: consider using the COO format when constructing large matrices.
There are also new formats planned for scipy 0.7.x:
``bsr_matrix``
Block Compressed Row (BSR). From their doc: The Block Compressed Row (BSR) format is very similar to the Compressed Sparse Row (CSR) format. BSR is appropriate for sparse matrices with dense sub matrices like the last example below. Block matrices often arise in vector-valued finite element discretizations. In such cases, BSR is considerably more efficient than CSR and CSC for many sparse arithmetic operations.
``dia_matrix``
Sparse matrix with DIAgonal storage
There are four member variables that comprise a compressed matrix ``sp``(for at least csc, csr and bsr):
``sp.shape``
gives the shape of the matrix.
......
......@@ -15,6 +15,7 @@ import module
from module import *
import debugmode # register DEBUG_MODE
from debugmode import DebugMode
from profilemode import ProfileMode
......@@ -13,7 +13,7 @@ from itertools import chain
from functools import partial
import io, sys
import function_module as F
from mode import default_mode
import mode as get_mode
def name_join(*args):
......@@ -73,13 +73,15 @@ class Component(object):
"""
raise NotImplementedError()
def make_no_init(self, mode=default_mode):
def make_no_init(self, mode=None):
"""
Allocates the necessary containers using allocate() and uses
build() with the provided mode to make an instance which will
be returned. The initialize() method of the instance will not
be called.
"""
if mode is None:
mode = get_mode.default_mode
memo = {}
self.allocate(memo)
rval = self.build(mode, memo)
......@@ -93,7 +95,7 @@ class Component(object):
arguments and the keyword arguments. If 'mode' is in the
keyword arguments it will be passed to build().
"""
mode = kwargs.pop('mode', default_mode)
mode = kwargs.pop('mode', get_mode.default_mode)
rval = self.make_no_init(mode)
if hasattr(rval, 'initialize'):
rval.initialize(*args, **kwargs)
......@@ -919,8 +921,8 @@ class Module(ComponentDict):
"""
InstanceType = ModuleInstance # By default, we use build ModuleInstance
def __init__(self):
super(Module, self).__init__()
def __init__(self, *args, **kw):
super(Module, self).__init__(*args, **kw)
self.__dict__["local_attr"]={}
self.__dict__["_components"]={}
......@@ -1075,7 +1077,7 @@ class Module(ComponentDict):
"""
self.make_module_instance(args,kwargs)
mode = kwargs.pop('mode', default_mode)
mode = kwargs.pop('mode', get_mode.default_mode)
rval = self.make_no_init(mode)
if hasattr(rval, 'initialize'):
rval.initialize(*args, **kwargs)
......
......@@ -581,12 +581,8 @@ class Test_check_isfinite(unittest.TestCase):
x = theano.tensor.dvector()
f = theano.function([x], (x+2) * 5, mode=debugmode.DebugMode(check_isfinite=False))
# the DestroyMap checker should be triggered by Nan != Nan
try:
f(numpy.log([3, -4, 5]))
assert False
except debugmode.BadDestroyMap:
pass
#nan should go through
f(numpy.log([3, -4, 5]))
#inf should go through
infs = numpy.asarray([1.0,1.,1.])/0
......
......@@ -410,7 +410,7 @@ class CSMGrad(gof.op.Op):
return 82345 ^ hash(type(self)) ^ _kmap_hash(self.kmap)
def make_node(self, data, gout_data, gout_indices):
g_data = data.type()
g_data = gout_data.type()
return gof.Apply(self, [data, gout_data, gout_indices], [g_data])
def perform(self, node, (data, gout_data, gout_indices), (g_data,)):
......@@ -1134,6 +1134,7 @@ class StructuredDotCSR(gof.Op):
// pointers to access actual data in the arrays passed as params.
dtype_%(z)s* __restrict__ Dz = (dtype_%(z)s*)%(z)s->data;
const dtype_%(a_val)s* __restrict__ Dval = (dtype_%(a_val)s*)%(a_val)s->data;
const npy_int32 * __restrict__ Dind = (npy_int32*)%(a_ind)s->data;
const npy_int32 * __restrict__ Dptr = (npy_int32*)%(a_ptr)s->data;
......
......@@ -161,17 +161,11 @@ class test_structureddot(unittest.TestCase):
def test_structuredot(self):
bsize = 2
typenames = 'int8', 'int32', 'int16', 'int64', 'float32', 'float64', 'complex64', 'complex128'
typenames = 'float32', 'int64', 'int8', 'int32', 'int16', 'float64', 'complex64', 'complex128'
for sparse_dtype in typenames:
for dense_dtype in typenames:
output_dtype = theano.scalar.upcast(sparse_dtype, dense_dtype)
#print 'output_dtype = ', output_dtype
#print '** sparse_dtype = ', sparse_dtype
#print '** dense_dtype = ', dense_dtype
for dense_dtype in typenames:
for sparse_dtype in typenames:
print >> sys.stderr, dense_dtype, sparse_dtype
# iterate for a few different random graph patterns
for i in range(10):
spmat = sp.csc_matrix((4,6), dtype=sparse_dtype)
......@@ -182,11 +176,10 @@ class test_structureddot(unittest.TestCase):
spmat[x,y] = numpy.random.rand()*10
spmat = sp.csc_matrix(spmat)
kerns = tensor.Tensor(sparse_dtype, broadcastable=[False])('kerns')
images = tensor.Tensor(dense_dtype, broadcastable=[False, False])('images')
#print 'kerns.dtype = ', kerns.dtype
#print 'images.dtype = ', images.dtype
kerns = tensor.Tensor(broadcastable=[False], dtype=sparse_dtype)('kerns')
images = tensor.Tensor(broadcastable=[False, False], dtype=dense_dtype)('images')
output_dtype = theano.scalar.upcast(sparse_dtype, dense_dtype)
##
# Test compressed-sparse column matrices ###
##
......@@ -195,60 +188,64 @@ class test_structureddot(unittest.TestCase):
def buildgraphCSC(kerns,images):
csc = CSC(kerns, spmat.indices[:spmat.size], spmat.indptr, spmat.shape)
assert csc.type.dtype == sparse_dtype
return structured_dot(csc, images.T)
rval = structured_dot(csc, images.T)
assert rval.type.dtype == output_dtype
return rval
out = buildgraphCSC(kerns,images)
f = theano.function([kerns,images], out)
# compute theano outputs
#print 'spmat.data', spmat.data.dtype.num
kernvals = numpy.array(spmat.data[:spmat.size])
#print 'kdtype', kernvals.dtype, kernvals.shape, kernvals.ndim, kernvals.dtype.num
#print 'type of kernvals = ', kernvals.dtype
imvals = 1.0 * numpy.array(numpy.arange(bsize*spmat.shape[1]).\
kernvals = spmat.data[:spmat.size]
imvals = 1.0 + 1.0 * numpy.array(numpy.arange(bsize*spmat.shape[1]).\
reshape(bsize,spmat.shape[1]), dtype=dense_dtype)
print 'kerntype', str(kernvals.dtype), kernvals.dtype.num
outvals = f(kernvals,imvals)
print 'YAY'
print spmat.todense()
print imvals.T
print "OUT1", outvals
# compare to scipy
c = spmat * (imvals.T)
assert _is_dense(c)
assert str(outvals.dtype) == output_dtype
assert numpy.all(numpy.abs(outvals -
numpy.array(c, dtype=output_dtype)) < 1e-4)
if sparse_dtype.startswith('float') and dense_dtype.startswith('float'):
utt.verify_grad(buildgraphCSC,
[kernvals, imvals])
print 'BBB'
##
# Test compressed-sparse row matrices ###
##
spmat = spmat.tocsr()
# build theano graph
def buildgraphCSR(kerns,images):
csr = CSR(kerns, spmat.indices[:spmat.size], spmat.indptr, spmat.shape)
return structured_dot(csr, images.T)
out = buildgraphCSR(kerns,images)
f = theano.function([kerns,images], out)
# compute theano output
kernvals[:] = spmat.data[:spmat.size]
#kernvals = numpy.empty(spmat.size, dtype=dense_dtype)
imvals = 1.0 * numpy.arange(bsize*spmat.shape[1]).reshape(bsize,spmat.shape[1])
print 'kerntype2', str(kernvals.dtype), kernvals.dtype.num
outvals = f(kernvals,imvals)
print 'YAYAGI'
# compare to scipy
c = spmat * (imvals.T)
assert _is_dense(c)
assert str(outvals.dtype) == output_dtype
assert numpy.all(numpy.abs(outvals -
numpy.array(c, dtype=output_dtype)) < 1e-4)
#if sparse_dtype.startswith('float') and dense_dtype.startswith('float'):
#utt.verify_grad(buildgraphCSC, [kernvals,imvals])
def notest(self):
##
# Test compressed-sparse row matrices ###
##
spmat = spmat.tocsr()
# build theano graph
def buildgraphCSR(kerns,images):
csr = CSR(kerns, spmat.indices[:spmat.size], spmat.indptr, spmat.shape)
assert csr.type.dtype == sparse_dtype
return structured_dot(csr, images.T)
out = buildgraphCSR(kerns,images)
f = theano.function([kerns,images], out)
# compute theano output
kernvals = spmat.data[:spmat.size]
imvals = 1.0 * numpy.arange(bsize*spmat.shape[1]).reshape(bsize,spmat.shape[1])
outvals = f(kernvals,imvals)
# compare to scipy
c = spmat * (imvals.T)
assert _is_dense(c)
assert str(outvals.dtype) == output_dtype
if not numpy.all(numpy.abs(outvals - numpy.array(c, dtype=output_dtype)) < 1e-5):
print numpy.abs(outvals - numpy.array(c, dtype=output_dtype))
assert numpy.all(numpy.abs(outvals -
numpy.array(c, dtype=output_dtype)) < 1e-4)
# we could test more, but hopefully this suffices?
if sparse_dtype.startswith('float') and dense_dtype.startswith('float'):
utt.verify_grad( buildgraphCSR, [kernvals,imvals])
# we could test more, but hopefully this suffices?
if sparse_dtype.startswith('float') and dense_dtype.startswith('float'):
utt.verify_grad( buildgraphCSR, [kernvals,imvals])
def test_opt_unpack(self):
kerns = tensor.Tensor(dtype='int64', broadcastable=[False])('kerns')
......
......@@ -5,6 +5,7 @@ from basic import *
import opt
import blas
import xlogx
import raw_random, randomstreams
from randomstreams import \
......
......@@ -264,13 +264,43 @@ class TensorType(Type):
if 'int' in str(a.dtype):
return numpy.all(a==b)
elif a.shape == (): #for comparing scalars, use broadcasting.
# Note: according to James B, there was a reason for the
# following two lines, that may seem weird at first glance.
# If someone can figure out what it is, please say it here!
ones = numpy.ones(2)
return numpy.allclose(ones * a, ones*b)
#elif str(a.dtype).startswith('complex'):
# print >> sys.stderr, 'WARNING: skipping comparison of complex'
# return True
else:
return numpy.allclose(a,b)
cmp = numpy.allclose(a,b)
if cmp:
# Numpy claims they are close, this is good enough for us.
return True
# Numpy is unhappy, but it does not necessarily mean that a and
# b are different. Indeed, Numpy does not like missing values
# and will return False whenever some are found in a or b.
# The proper way would be to use the MaskArray stuff available
# in Numpy. However, it looks like it has been added to Numpy's
# core recently, so it may not be available to everyone. Thus,
# for now we use a home-made recipe, that should probably be
# revisited in the future.
a_missing = numpy.isnan(a)
if not a_missing.any():
# There are no missing values in a, thus this is not the
# reason why numpy.allclose(a, b) returned False.
return False
# The following line is what numpy.allclose bases its decision
# upon, according to its documentation.
rtol = 1.0000000000000001e-05
atol = 1e-8
cmp_elemwise = (numpy.absolute(a - b) <=
(atol + rtol * numpy.absolute(b)))
# Find places where both a and b have missing values.
both_missing = a_missing * numpy.isnan(b)
# Combine all information.
return (cmp_elemwise + both_missing).all()
return False
def __hash__(self):
......@@ -967,6 +997,11 @@ def smallest(*args):
"""Return the [elementwise] smallest of a variable number of arguments (like python's min)."""
return min(stack(*args), axis=0)
@constructor
def largest(*args):
"""Return the [elementwise] largest of a variable number of arguments (like python's max)."""
return max(stack(*args), axis=0)
##########################
# Comparison
......@@ -2355,7 +2390,10 @@ def grad(cost, wrt, g_cost=None, consider_constant=[]):
class numeric_grad:
"""WRITEME"""
def __init__(self, f, pt, eps=1.0e-7):
type_eps = {'float64': 1e-7,
'float32': 3e-3}
def __init__(self, f, pt, eps=None):
"""Return the gradient of f at pt.
This function computes the gradient by a one-sided finite differences of a
......@@ -2363,6 +2401,9 @@ class numeric_grad:
It is assumed that f(...) will return a scalar.
It is assumed that all f's inputs are numpy.ndarray objects.
:param eps: the stepsize for the finite differencing. None means input
dtype-dependent. See `type_eps`.
"""
def prod(inputs):
......@@ -2388,9 +2429,15 @@ class numeric_grad:
total_size = __builtin__.sum(prod(sh) for sh in shapes)
working_dtype = __builtin__.min((self.type_eps[dt], dt) for dt in dtypes)[1]
#create un-initialized memory
x = numpy.ndarray((total_size,), dtype=dtypes[0])
gx = numpy.ndarray((total_size,), dtype=dtypes[0])
x = numpy.ndarray((total_size,), dtype=working_dtype)
gx = numpy.ndarray((total_size,), dtype=working_dtype)
if eps is None:
eps = __builtin__.max(self.type_eps[dt] for dt in dtypes)
#set up aliases so that apt[i] is backed by memory in x
# and self.gf is backed by memory in gx
......@@ -2430,10 +2477,10 @@ class numeric_grad:
errs = []
for a, b in zip(g_pt, self.gf):
errs.append(numpy.max(numeric_grad.abs_rel_err(a,b)))
return numpy.max(errs)
return numpy.max(errs), numpy.argmax(errs)
def verify_grad(op, pt, n_tests=2, rng=None, eps=1.0e-7, tol=0.0001):
def verify_grad(op, pt, n_tests=2, rng=None, eps=None, tol=None):
""" WRITEME
Raises an Exception if the difference between the analytic gradient and
......@@ -2445,12 +2492,19 @@ def verify_grad(op, pt, n_tests=2, rng=None, eps=1.0e-7, tol=0.0001):
:param pt: the list of numpy.ndarrays to use as inputs to the op
:param n_tests: number of times to run the test
:param rng: random number generator from which to draw random samples
:param eps: stepsize used in the Finite Difference Method
:param eps: stepsize used in the Finite Difference Method (Default None is type-dependent)
:param tol: relative tolerance used as threshold for gradient comparison
"""
pt = [numpy.array(p) for p in pt]
_type_tol = dict( # relativ error tolerances for different types
float32=1e-2,
float64=1e-4)
if tol is None:
tol = __builtin__.max(_type_tol[str(p.dtype)] for p in pt)
if rng is None:
rng = numpy.random
unittest_tools.seed_rng()
......@@ -2493,9 +2547,9 @@ def verify_grad(op, pt, n_tests=2, rng=None, eps=1.0e-7, tol=0.0001):
if not isinstance(analytic_grad, (list, tuple)):
analytic_grad = [analytic_grad]
max_err = num_grad.max_err(analytic_grad)
max_err, max_err_pos = num_grad.max_err(analytic_grad)
if max_err > tol:
raise Exception(verify_grad.E_grad, (max_err, tol))
raise Exception(verify_grad.E_grad, (max_err, tol, max_err_pos))
verify_grad.E_grad = 'gradient error exceeded tolerance'
"""This error is raised when a gradient is calculated, but incorrect."""
......@@ -848,7 +848,13 @@ def binary_crossentropy(output, target):
return -(target * tensor.log(output) + (1.0 - target) * tensor.log(1.0 - output))
def categorical_crossentropy(coding_dist, true_dist, axis=1):
"""Return the cross-entropy between an approximating distribution and a true distribution
"""
WARNING: THIS FUNCTION IS UNNECESSARILY POLYMORPHIC.
We ultimately don't want the polymorphism, and will move this function to pylearn.algorithms.cost.
The 1hot version will be removed.
The length of the documentation here is a form of code smell.
Return the cross-entropy between an approximating distribution and a true distribution
The cross entropy between two probability distributions measures the average number of bits
needed to identify an event from a set of possibilities, if a coding scheme is used based
......@@ -876,6 +882,7 @@ def categorical_crossentropy(coding_dist, true_dist, axis=1):
:returns: the cross entropy between each coding and true distribution.
"""
assert true_dist.ndim in (1,2)
if true_dist.ndim == 2:
return -theano.sum(true_dist * log(coding_dist), axis=axis)
else:
......
import theano
from theano import tensor, scalar
import numpy
from elemwise import Elemwise
from theano import scalar
class XlogX(scalar.UnaryScalarOp):
"""
Compute X * log(X), with special case 0 log(0) = 0.
......@@ -24,7 +26,7 @@ class XlogX(scalar.UnaryScalarOp):
: %(x)s * log(%(x)s);""" % locals()
raise NotImplementedError('only floatingpoint is implemented')
scalar_xlogx = XlogX(scalar.upgrade_to_float, name='scalar_xlogx')
xlogx = tensor.Elemwise(scalar_xlogx, name='xlogx')
xlogx = Elemwise(scalar_xlogx, name='xlogx')
class XlogY0(scalar.BinaryScalarOp):
......@@ -48,5 +50,4 @@ class XlogY0(scalar.BinaryScalarOp):
: %(x)s * log(%(y)s);""" % locals()
raise NotImplementedError('only floatingpoint is implemented')
scalar_xlogy0 = XlogY0(scalar.upgrade_to_float, name='scalar_xlogy0')
xlogy0 = tensor.Elemwise(scalar_xlogy0, name='xlogy0')
xlogy0 = Elemwise(scalar_xlogy0, name='xlogy0')
......@@ -42,7 +42,7 @@ def seed_rng(pseed=None):
numpy.random.seed(seed)
return seed
def verify_grad(op, pt, n_tests=2, rng=None, eps=1.0e-7, tol=0.0001):
def verify_grad(op, pt, n_tests=2, rng=None, *args, **kwargs):
"""
Wrapper for tensor/basic.py:verify_grad
Takes care of seeding the random number generator if None is given
......@@ -50,5 +50,5 @@ def verify_grad(op, pt, n_tests=2, rng=None, eps=1.0e-7, tol=0.0001):
if rng is None:
seed_rng()
rng = numpy.random
T.verify_grad(op, pt, n_tests, rng, eps, tol)
T.verify_grad(op, pt, n_tests, rng, *args, **kwargs)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论