提交 d8323502 authored 作者: James Bergstra's avatar James Bergstra

merge + float32 gradient modifications (sorry for combining the two!)

Guillaume and I just found a bug in some experiment code that was
basically caused by confusing semantics of max(). The same sort of
thing applies to min. This is an FYI email to help others on the list
avoid this mistake, which is (I think) easy to make.
Python's max() function takes multiple arguments and returns the
largest one of them. (I won't go into the details of how it deals with
corner cases.)
IN CONTRAST
numpy's max() function takes multiple arguments and returns the
largest element[s] from the *first* argument. The second argument is
used to identify the axis along which to evaluate the [python-style]
max. The third argument is an array into which the result can be
written.
So for example:
.. code-block:: python
>>> max(3, 4)
4
>>> numpy.max(3, 4)
3
>>> a,b,c = [numpy.asarray(i) for i in [0,1,2]]
>>> numpy.max(a,b,c)
0
>>> c
array(0)
Be careful!
Theano defines a max function (called theano.tensor.max) that is
similar to numpy's max.
Theano also defines a function called theano.tensor.largest that is
closer to python's max, but not identical since it works elemwise for
tensors. There is a corresponding 'smallest' function that likewise
behaves like python's min().
......@@ -24,7 +24,19 @@ We describe the details of the compressed sparse matrix types.
is faster if we are modifying the array. After initial inserts,
we can then convert to the appropriate sparse matrix format.
There are four member variables that comprise a compressed matrix ``sp``:
There are also these other types:
``dok_matrix``
Dictionary of Keys format. From their doc: This is an efficient structure for constructing sparse matrices incrementally.
``coo_matrix``
Coordinate format. From their lil doc: consider using the COO format when constructing large matrices.
There are also new formats planned for scipy 0.7.x:
``bsr_matrix``
Block Compressed Row (BSR). From their doc: The Block Compressed Row (BSR) format is very similar to the Compressed Sparse Row (CSR) format. BSR is appropriate for sparse matrices with dense sub matrices like the last example below. Block matrices often arise in vector-valued finite element discretizations. In such cases, BSR is considerably more efficient than CSR and CSC for many sparse arithmetic operations.
``dia_matrix``
Sparse matrix with DIAgonal storage
There are four member variables that comprise a compressed matrix ``sp``(for at least csc, csr and bsr):
``sp.shape``
gives the shape of the matrix.
......
......@@ -15,6 +15,7 @@ import module
from module import *
import debugmode # register DEBUG_MODE
from debugmode import DebugMode
from profilemode import ProfileMode
......@@ -13,7 +13,7 @@ from itertools import chain
from functools import partial
import io, sys
import function_module as F
from mode import default_mode
import mode as get_mode
def name_join(*args):
......@@ -73,13 +73,15 @@ class Component(object):
"""
raise NotImplementedError()
def make_no_init(self, mode=default_mode):
def make_no_init(self, mode=None):
"""
Allocates the necessary containers using allocate() and uses
build() with the provided mode to make an instance which will
be returned. The initialize() method of the instance will not
be called.
"""
if mode is None:
mode = get_mode.default_mode
memo = {}
self.allocate(memo)
rval = self.build(mode, memo)
......@@ -93,7 +95,7 @@ class Component(object):
arguments and the keyword arguments. If 'mode' is in the
keyword arguments it will be passed to build().
"""
mode = kwargs.pop('mode', default_mode)
mode = kwargs.pop('mode', get_mode.default_mode)
rval = self.make_no_init(mode)
if hasattr(rval, 'initialize'):
rval.initialize(*args, **kwargs)
......@@ -919,8 +921,8 @@ class Module(ComponentDict):
"""
InstanceType = ModuleInstance # By default, we use build ModuleInstance
def __init__(self):
super(Module, self).__init__()
def __init__(self, *args, **kw):
super(Module, self).__init__(*args, **kw)
self.__dict__["local_attr"]={}
self.__dict__["_components"]={}
......@@ -1075,7 +1077,7 @@ class Module(ComponentDict):
"""
self.make_module_instance(args,kwargs)
mode = kwargs.pop('mode', default_mode)
mode = kwargs.pop('mode', get_mode.default_mode)
rval = self.make_no_init(mode)
if hasattr(rval, 'initialize'):
rval.initialize(*args, **kwargs)
......
......@@ -581,12 +581,8 @@ class Test_check_isfinite(unittest.TestCase):
x = theano.tensor.dvector()
f = theano.function([x], (x+2) * 5, mode=debugmode.DebugMode(check_isfinite=False))
# the DestroyMap checker should be triggered by Nan != Nan
try:
f(numpy.log([3, -4, 5]))
assert False
except debugmode.BadDestroyMap:
pass
#nan should go through
f(numpy.log([3, -4, 5]))
#inf should go through
infs = numpy.asarray([1.0,1.,1.])/0
......
......@@ -410,7 +410,7 @@ class CSMGrad(gof.op.Op):
return 82345 ^ hash(type(self)) ^ _kmap_hash(self.kmap)
def make_node(self, data, gout_data, gout_indices):
g_data = data.type()
g_data = gout_data.type()
return gof.Apply(self, [data, gout_data, gout_indices], [g_data])
def perform(self, node, (data, gout_data, gout_indices), (g_data,)):
......@@ -1134,6 +1134,7 @@ class StructuredDotCSR(gof.Op):
// pointers to access actual data in the arrays passed as params.
dtype_%(z)s* __restrict__ Dz = (dtype_%(z)s*)%(z)s->data;
const dtype_%(a_val)s* __restrict__ Dval = (dtype_%(a_val)s*)%(a_val)s->data;
const npy_int32 * __restrict__ Dind = (npy_int32*)%(a_ind)s->data;
const npy_int32 * __restrict__ Dptr = (npy_int32*)%(a_ptr)s->data;
......
......@@ -161,17 +161,11 @@ class test_structureddot(unittest.TestCase):
def test_structuredot(self):
bsize = 2
typenames = 'int8', 'int32', 'int16', 'int64', 'float32', 'float64', 'complex64', 'complex128'
typenames = 'float32', 'int64', 'int8', 'int32', 'int16', 'float64', 'complex64', 'complex128'
for sparse_dtype in typenames:
for dense_dtype in typenames:
output_dtype = theano.scalar.upcast(sparse_dtype, dense_dtype)
#print 'output_dtype = ', output_dtype
#print '** sparse_dtype = ', sparse_dtype
#print '** dense_dtype = ', dense_dtype
for dense_dtype in typenames:
for sparse_dtype in typenames:
print >> sys.stderr, dense_dtype, sparse_dtype
# iterate for a few different random graph patterns
for i in range(10):
spmat = sp.csc_matrix((4,6), dtype=sparse_dtype)
......@@ -182,11 +176,10 @@ class test_structureddot(unittest.TestCase):
spmat[x,y] = numpy.random.rand()*10
spmat = sp.csc_matrix(spmat)
kerns = tensor.Tensor(sparse_dtype, broadcastable=[False])('kerns')
images = tensor.Tensor(dense_dtype, broadcastable=[False, False])('images')
#print 'kerns.dtype = ', kerns.dtype
#print 'images.dtype = ', images.dtype
kerns = tensor.Tensor(broadcastable=[False], dtype=sparse_dtype)('kerns')
images = tensor.Tensor(broadcastable=[False, False], dtype=dense_dtype)('images')
output_dtype = theano.scalar.upcast(sparse_dtype, dense_dtype)
##
# Test compressed-sparse column matrices ###
##
......@@ -195,60 +188,64 @@ class test_structureddot(unittest.TestCase):
def buildgraphCSC(kerns,images):
csc = CSC(kerns, spmat.indices[:spmat.size], spmat.indptr, spmat.shape)
assert csc.type.dtype == sparse_dtype
return structured_dot(csc, images.T)
rval = structured_dot(csc, images.T)
assert rval.type.dtype == output_dtype
return rval
out = buildgraphCSC(kerns,images)
f = theano.function([kerns,images], out)
# compute theano outputs
#print 'spmat.data', spmat.data.dtype.num
kernvals = numpy.array(spmat.data[:spmat.size])
#print 'kdtype', kernvals.dtype, kernvals.shape, kernvals.ndim, kernvals.dtype.num
#print 'type of kernvals = ', kernvals.dtype
imvals = 1.0 * numpy.array(numpy.arange(bsize*spmat.shape[1]).\
kernvals = spmat.data[:spmat.size]
imvals = 1.0 + 1.0 * numpy.array(numpy.arange(bsize*spmat.shape[1]).\
reshape(bsize,spmat.shape[1]), dtype=dense_dtype)
print 'kerntype', str(kernvals.dtype), kernvals.dtype.num
outvals = f(kernvals,imvals)
print 'YAY'
print spmat.todense()
print imvals.T
print "OUT1", outvals
# compare to scipy
c = spmat * (imvals.T)
assert _is_dense(c)
assert str(outvals.dtype) == output_dtype
assert numpy.all(numpy.abs(outvals -
numpy.array(c, dtype=output_dtype)) < 1e-4)
if sparse_dtype.startswith('float') and dense_dtype.startswith('float'):
utt.verify_grad(buildgraphCSC,
[kernvals, imvals])
print 'BBB'
##
# Test compressed-sparse row matrices ###
##
spmat = spmat.tocsr()
# build theano graph
def buildgraphCSR(kerns,images):
csr = CSR(kerns, spmat.indices[:spmat.size], spmat.indptr, spmat.shape)
return structured_dot(csr, images.T)
out = buildgraphCSR(kerns,images)
f = theano.function([kerns,images], out)
# compute theano output
kernvals[:] = spmat.data[:spmat.size]
#kernvals = numpy.empty(spmat.size, dtype=dense_dtype)
imvals = 1.0 * numpy.arange(bsize*spmat.shape[1]).reshape(bsize,spmat.shape[1])
print 'kerntype2', str(kernvals.dtype), kernvals.dtype.num
outvals = f(kernvals,imvals)
print 'YAYAGI'
# compare to scipy
c = spmat * (imvals.T)
assert _is_dense(c)
assert str(outvals.dtype) == output_dtype
assert numpy.all(numpy.abs(outvals -
numpy.array(c, dtype=output_dtype)) < 1e-4)
#if sparse_dtype.startswith('float') and dense_dtype.startswith('float'):
#utt.verify_grad(buildgraphCSC, [kernvals,imvals])
def notest(self):
##
# Test compressed-sparse row matrices ###
##
spmat = spmat.tocsr()
# build theano graph
def buildgraphCSR(kerns,images):
csr = CSR(kerns, spmat.indices[:spmat.size], spmat.indptr, spmat.shape)
assert csr.type.dtype == sparse_dtype
return structured_dot(csr, images.T)
out = buildgraphCSR(kerns,images)
f = theano.function([kerns,images], out)
# compute theano output
kernvals = spmat.data[:spmat.size]
imvals = 1.0 * numpy.arange(bsize*spmat.shape[1]).reshape(bsize,spmat.shape[1])
outvals = f(kernvals,imvals)
# compare to scipy
c = spmat * (imvals.T)
assert _is_dense(c)
assert str(outvals.dtype) == output_dtype
if not numpy.all(numpy.abs(outvals - numpy.array(c, dtype=output_dtype)) < 1e-5):
print numpy.abs(outvals - numpy.array(c, dtype=output_dtype))
assert numpy.all(numpy.abs(outvals -
numpy.array(c, dtype=output_dtype)) < 1e-4)
# we could test more, but hopefully this suffices?
if sparse_dtype.startswith('float') and dense_dtype.startswith('float'):
utt.verify_grad( buildgraphCSR, [kernvals,imvals])
# we could test more, but hopefully this suffices?
if sparse_dtype.startswith('float') and dense_dtype.startswith('float'):
utt.verify_grad( buildgraphCSR, [kernvals,imvals])
def test_opt_unpack(self):
kerns = tensor.Tensor(dtype='int64', broadcastable=[False])('kerns')
......
......@@ -5,6 +5,7 @@ from basic import *
import opt
import blas
import xlogx
import raw_random, randomstreams
from randomstreams import \
......
......@@ -264,13 +264,43 @@ class TensorType(Type):
if 'int' in str(a.dtype):
return numpy.all(a==b)
elif a.shape == (): #for comparing scalars, use broadcasting.
# Note: according to James B, there was a reason for the
# following two lines, that may seem weird at first glance.
# If someone can figure out what it is, please say it here!
ones = numpy.ones(2)
return numpy.allclose(ones * a, ones*b)
#elif str(a.dtype).startswith('complex'):
# print >> sys.stderr, 'WARNING: skipping comparison of complex'
# return True
else:
return numpy.allclose(a,b)
cmp = numpy.allclose(a,b)
if cmp:
# Numpy claims they are close, this is good enough for us.
return True
# Numpy is unhappy, but it does not necessarily mean that a and
# b are different. Indeed, Numpy does not like missing values
# and will return False whenever some are found in a or b.
# The proper way would be to use the MaskArray stuff available
# in Numpy. However, it looks like it has been added to Numpy's
# core recently, so it may not be available to everyone. Thus,
# for now we use a home-made recipe, that should probably be
# revisited in the future.
a_missing = numpy.isnan(a)
if not a_missing.any():
# There are no missing values in a, thus this is not the
# reason why numpy.allclose(a, b) returned False.
return False
# The following line is what numpy.allclose bases its decision
# upon, according to its documentation.
rtol = 1.0000000000000001e-05
atol = 1e-8
cmp_elemwise = (numpy.absolute(a - b) <=
(atol + rtol * numpy.absolute(b)))
# Find places where both a and b have missing values.
both_missing = a_missing * numpy.isnan(b)
# Combine all information.
return (cmp_elemwise + both_missing).all()
return False
def __hash__(self):
......@@ -967,6 +997,11 @@ def smallest(*args):
"""Return the [elementwise] smallest of a variable number of arguments (like python's min)."""
return min(stack(*args), axis=0)
@constructor
def largest(*args):
"""Return the [elementwise] largest of a variable number of arguments (like python's max)."""
return max(stack(*args), axis=0)
##########################
# Comparison
......@@ -2355,7 +2390,10 @@ def grad(cost, wrt, g_cost=None, consider_constant=[]):
class numeric_grad:
"""WRITEME"""
def __init__(self, f, pt, eps=1.0e-7):
type_eps = {'float64': 1e-7,
'float32': 3e-3}
def __init__(self, f, pt, eps=None):
"""Return the gradient of f at pt.
This function computes the gradient by a one-sided finite differences of a
......@@ -2363,6 +2401,9 @@ class numeric_grad:
It is assumed that f(...) will return a scalar.
It is assumed that all f's inputs are numpy.ndarray objects.
:param eps: the stepsize for the finite differencing. None means input
dtype-dependent. See `type_eps`.
"""
def prod(inputs):
......@@ -2388,9 +2429,15 @@ class numeric_grad:
total_size = __builtin__.sum(prod(sh) for sh in shapes)
working_dtype = __builtin__.min((self.type_eps[dt], dt) for dt in dtypes)[1]
#create un-initialized memory
x = numpy.ndarray((total_size,), dtype=dtypes[0])
gx = numpy.ndarray((total_size,), dtype=dtypes[0])
x = numpy.ndarray((total_size,), dtype=working_dtype)
gx = numpy.ndarray((total_size,), dtype=working_dtype)
if eps is None:
eps = __builtin__.max(self.type_eps[dt] for dt in dtypes)
#set up aliases so that apt[i] is backed by memory in x
# and self.gf is backed by memory in gx
......@@ -2430,10 +2477,10 @@ class numeric_grad:
errs = []
for a, b in zip(g_pt, self.gf):
errs.append(numpy.max(numeric_grad.abs_rel_err(a,b)))
return numpy.max(errs)
return numpy.max(errs), numpy.argmax(errs)
def verify_grad(op, pt, n_tests=2, rng=None, eps=1.0e-7, tol=0.0001):
def verify_grad(op, pt, n_tests=2, rng=None, eps=None, tol=None):
""" WRITEME
Raises an Exception if the difference between the analytic gradient and
......@@ -2445,12 +2492,19 @@ def verify_grad(op, pt, n_tests=2, rng=None, eps=1.0e-7, tol=0.0001):
:param pt: the list of numpy.ndarrays to use as inputs to the op
:param n_tests: number of times to run the test
:param rng: random number generator from which to draw random samples
:param eps: stepsize used in the Finite Difference Method
:param eps: stepsize used in the Finite Difference Method (Default None is type-dependent)
:param tol: relative tolerance used as threshold for gradient comparison
"""
pt = [numpy.array(p) for p in pt]
_type_tol = dict( # relativ error tolerances for different types
float32=1e-2,
float64=1e-4)
if tol is None:
tol = __builtin__.max(_type_tol[str(p.dtype)] for p in pt)
if rng is None:
rng = numpy.random
unittest_tools.seed_rng()
......@@ -2493,9 +2547,9 @@ def verify_grad(op, pt, n_tests=2, rng=None, eps=1.0e-7, tol=0.0001):
if not isinstance(analytic_grad, (list, tuple)):
analytic_grad = [analytic_grad]
max_err = num_grad.max_err(analytic_grad)
max_err, max_err_pos = num_grad.max_err(analytic_grad)
if max_err > tol:
raise Exception(verify_grad.E_grad, (max_err, tol))
raise Exception(verify_grad.E_grad, (max_err, tol, max_err_pos))
verify_grad.E_grad = 'gradient error exceeded tolerance'
"""This error is raised when a gradient is calculated, but incorrect."""
......@@ -848,7 +848,13 @@ def binary_crossentropy(output, target):
return -(target * tensor.log(output) + (1.0 - target) * tensor.log(1.0 - output))
def categorical_crossentropy(coding_dist, true_dist, axis=1):
"""Return the cross-entropy between an approximating distribution and a true distribution
"""
WARNING: THIS FUNCTION IS UNNECESSARILY POLYMORPHIC.
We ultimately don't want the polymorphism, and will move this function to pylearn.algorithms.cost.
The 1hot version will be removed.
The length of the documentation here is a form of code smell.
Return the cross-entropy between an approximating distribution and a true distribution
The cross entropy between two probability distributions measures the average number of bits
needed to identify an event from a set of possibilities, if a coding scheme is used based
......@@ -876,6 +882,7 @@ def categorical_crossentropy(coding_dist, true_dist, axis=1):
:returns: the cross entropy between each coding and true distribution.
"""
assert true_dist.ndim in (1,2)
if true_dist.ndim == 2:
return -theano.sum(true_dist * log(coding_dist), axis=axis)
else:
......
import theano
from theano import tensor, scalar
import numpy
from elemwise import Elemwise
from theano import scalar
class XlogX(scalar.UnaryScalarOp):
"""
Compute X * log(X), with special case 0 log(0) = 0.
......@@ -24,7 +26,7 @@ class XlogX(scalar.UnaryScalarOp):
: %(x)s * log(%(x)s);""" % locals()
raise NotImplementedError('only floatingpoint is implemented')
scalar_xlogx = XlogX(scalar.upgrade_to_float, name='scalar_xlogx')
xlogx = tensor.Elemwise(scalar_xlogx, name='xlogx')
xlogx = Elemwise(scalar_xlogx, name='xlogx')
class XlogY0(scalar.BinaryScalarOp):
......@@ -48,5 +50,4 @@ class XlogY0(scalar.BinaryScalarOp):
: %(x)s * log(%(y)s);""" % locals()
raise NotImplementedError('only floatingpoint is implemented')
scalar_xlogy0 = XlogY0(scalar.upgrade_to_float, name='scalar_xlogy0')
xlogy0 = tensor.Elemwise(scalar_xlogy0, name='xlogy0')
xlogy0 = Elemwise(scalar_xlogy0, name='xlogy0')
......@@ -42,7 +42,7 @@ def seed_rng(pseed=None):
numpy.random.seed(seed)
return seed
def verify_grad(op, pt, n_tests=2, rng=None, eps=1.0e-7, tol=0.0001):
def verify_grad(op, pt, n_tests=2, rng=None, *args, **kwargs):
"""
Wrapper for tensor/basic.py:verify_grad
Takes care of seeding the random number generator if None is given
......@@ -50,5 +50,5 @@ def verify_grad(op, pt, n_tests=2, rng=None, eps=1.0e-7, tol=0.0001):
if rng is None:
seed_rng()
rng = numpy.random
T.verify_grad(op, pt, n_tests, rng, eps, tol)
T.verify_grad(op, pt, n_tests, rng, *args, **kwargs)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论