Commit 5c25f307 authored by cocu

Merge branch 'master' into allow_cxx_flag_full_path

Conflicts: theano/gof/cmodule.py
.. _libdoc_cuda_dnn:

================================
:mod:`sandbox.cuda.dnn` -- cuDNN
================================

.. moduleauthor:: LISA

`cuDNN <https://developer.nvidia.com/cuDNN>`_ is an NVIDIA library with
functionality used by deep neural networks. It provides optimized
versions of some operations, such as convolution. cuDNN is not
currently installed with CUDA 6.5; you must download and install it
yourself.

To install it, decompress the downloaded file and make the ``*.h`` and
``*.so*`` files available to the compilation environment. On Linux,
this can be done by setting the environment variables
``LD_LIBRARY_PATH``, ``LIBRARY_PATH`` and ``CPATH`` to the
uncompressed directory path. Separate multiple directories with
``:``, as in the ``PATH`` environment variable. Alternatively, you can
copy the ``*.h`` files to ``/usr/include`` and the ``*.so*`` files to
``/lib64``.

By default, Theano detects whether it can use cuDNN, and uses it if it
can. If not, Theano optimizations will not introduce cuDNN ops, so
Theano will still work if the user did not introduce them manually. To
get an error when Theano cannot use cuDNN, use this Theano flag:
``optimizer_including=cudnn``.

.. note::

    Normally you should not call GPU Ops directly, but the CPU
    interface currently does not allow all options supported by the
    cuDNN ops, so it is possible that you will need to call them
    manually.
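For example, here is a minimal sketch of using the ``dnn_conv`` helper
(assuming a configured GPU with cuDNN available; see the documented
signatures below for the exact arguments):

.. code-block:: python

    import theano
    import theano.tensor as T
    from theano.sandbox.cuda import dnn

    images = T.ftensor4('images')    # (batch, channels, rows, cols)
    filters = T.ftensor4('filters')  # (nfilters, channels, rows, cols)

    if dnn.dnn_available():
        out = dnn.dnn_conv(images, filters, border_mode='valid')
        f = theano.function([images, filters], out)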
Functions
=========

.. automodule:: theano.sandbox.cuda.dnn
   :members: dnn_conv, dnn_pool

Convolution Ops
===============

.. automodule:: theano.sandbox.cuda.dnn
   :members: GpuDnnConvDesc, GpuDnnConv, GpuDnnConvGradW, GpuDnnConvGradI

Pooling Ops
===========

.. automodule:: theano.sandbox.cuda.dnn
   :members: GpuDnnPoolDesc, GpuDnnPool, GpuDnnPoolGrad

Softmax Ops
===========

.. automodule:: theano.sandbox.cuda.dnn
   :members: GpuDnnSoftmax, GpuDnnSoftmaxGrad
@@ -13,6 +13,7 @@
 .. toctree::
     :maxdepth: 1

+    op
     var
     type
-    op
+    dnn
@@ -123,29 +123,13 @@ def git_version():
         git_revision = "unknown-git"
     return git_revision

-# Python 2.4 compatibility: Python versions 2.6 and later support new
-# exception syntax, but for now we have to resort to exec.
-if sys.hexversion >= 0x2070000:
-    exec("""\
-def write_text(filename, text):
-    try:
-        with open(filename, 'w') as a:
-            a.write(text)
-    except Exception as e:
-        print(e)
-""")
-else:
-    exec("""\
-def write_text(filename, text):
-    a = open(filename, 'w')
-    try:
-        try:
-            a.write(text)
-        except Exception, e:
-            print e
-    finally:
-        a.close()
-""")
+def write_text(filename, text):
+    with open(filename, 'w') as a:
+        try:
+            a.write(text)
+        except Exception as e:
+            print(e)


 def write_version_py(filename=os.path.join('theano', 'generated_version.py')):
...
@@ -1795,7 +1795,8 @@ class GCC_compiler(object):
         return cxxflags

     @staticmethod
-    def try_compile_tmp(src_code, tmp_prefix='', flags=(), try_run=False):
+    def try_compile_tmp(src_code, tmp_prefix='', flags=(),
+                        try_run=False, output=False):
         """Try to compile (and run) a test program.

         This is useful in various occasions, to check if libraries
@@ -1806,6 +1807,7 @@ class GCC_compiler(object):
         If try_run is False, returns the compilation status.
         If try_run is True, returns a (compile_status, run_status) pair.
+        If output is True, the subprocess' stdout and stderr are also returned.
         """
         if not theano.config.cxx:
             return False
@@ -1825,14 +1827,14 @@ class GCC_compiler(object):
             os.write(fd, src_code)
             os.close(fd)
             fd = None
-            p_ret = call_subprocess_Popen(
+            out, err, p_ret = output_subprocess_Popen(
                 [theano.config.cxx, path, '-o', exe_path] + flags)
             if p_ret != 0:
                 compilation_ok = False
             elif try_run:
                 # Try to execute the program
                 try:
-                    p_ret = call_subprocess_Popen([exe_path])
+                    out, err, p_ret = output_subprocess_Popen([exe_path])
                     run_ok = (p_ret == 0)
                 finally:
                     os.remove(exe_path)
@@ -1846,13 +1848,18 @@ class GCC_compiler(object):
         except OSError, e:
             compilation_ok = False

-        if not try_run:
+        if not try_run and not output:
             return compilation_ok
-        else:
+        elif not try_run and output:
+            return (compilation_ok, out, err)
+        elif not output:
             return (compilation_ok, run_ok)
+        else:
+            return (compilation_ok, run_ok, out, err)

     @staticmethod
-    def try_flags(flag_list):
+    def try_flags(flag_list, preambule="", body="",
+                  try_run=False, output=False):
         '''
         Try to compile a dummy file with these flags.

@@ -1863,13 +1870,16 @@ class GCC_compiler(object):
             return False

         code = b("""
+        %(preambule)s
         int main(int argc, char** argv)
         {
+            %(body)s
             return 0;
         }
-        """)
+        """ % locals())
         return GCC_compiler.try_compile_tmp(code, tmp_prefix='try_flags_',
-                                            flags=flag_list, try_run=False)
+                                            flags=flag_list, try_run=try_run,
+                                            output=output)

     @staticmethod
     def compile_str(module_name, src_code, location=None,
...
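For reference (not part of this commit), the extended signatures above
can be exercised as in the sketch below; the flag and snippet values
are only examples:

```python
# Sketch: probing the compiler with the new output/try_run options.
from theano.gof.cmodule import GCC_compiler

# Compile-only check of a flag, also capturing the compiler's output
# (with output=True and try_run=False, a 3-tuple is returned):
ok, out, err = GCC_compiler.try_flags(['-mtune=native'], output=True)

# Compile and run a snippet built from preambule/body
# (with both options, a 4-tuple is returned):
ok, run_ok, out, err = GCC_compiler.try_flags(
    [], preambule='#include <cstdio>',
    body='printf("hello\\n");',
    try_run=True, output=True)
```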
Diff collapsed.
@@ -1163,11 +1163,6 @@ def local_conv_fft_full(node):
             return

-# Needs to be registered before local_gpu_conv_legacy. Otherwise, it
-# will have priority over this optimization. We want, if cudnn is
-# available and the GPU supports it, to use it. Otherwise, the gemm
-# version should be used. If the users want the legacy convolution,
-# they should use the Theano flag to disable the dnn and/or gemm version.
 @local_optimizer([GpuConv])
 def local_gpu_conv(node):
     """
@@ -1350,7 +1345,7 @@ conv_groupopt.register("conv_fft_valid", local_conv_fft_valid, 1)
 conv_groupopt.register("conv_fft_full", local_conv_fft_full, 1)
 # Use dnn if avail, so have the dnn tag to be able to disable it.
 conv_groupopt.register('local_gpu_conv', local_gpu_conv, 10,
-                       'fast_compile', 'fast_run', 'dnn')
+                       'fast_compile', 'fast_run', 'cudnn')
 conv_groupopt.register('local_conv_gemm', local_conv_gemm, 12,
                        'fast_compile', 'fast_run')
...
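For reference (not part of this commit): the 'cudnn' tag in the
registration above is what users include or exclude to steer the
convolution choice. A minimal sketch, assuming the standard Theano
mode machinery:

```python
# Sketch: opting in or out of the cuDNN convolution via optimizer tags.
import theano

mode = theano.compile.get_default_mode()
# Require the cudnn-tagged optimizations (combined with
# on_opt_error="raise", this errors out if cuDNN cannot be used):
mode_cudnn = mode.including('cudnn')
# Skip them, falling back to the gemm version registered above:
mode_no_cudnn = mode.excluding('cudnn')
```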
+import logging
+import unittest
 from nose.plugins.skip import SkipTest
 import numpy
-import unittest
 import theano
+from theano.compat.six import StringIO
 from theano.gof.python25 import any
 import theano.tensor as T
 import theano.tests.unittest_tools as utt
@@ -85,7 +88,7 @@ def test_pooling_opt():
     f = theano.function(
         [x],
         max_pool_2d(x, ds=(2, 2)),
-        mode=mode_with_gpu.including("cudnn"))
+        mode=mode_with_gpu)

     assert any([isinstance(n.op, cuda.dnn.GpuDnnPool)
                 for n in f.maker.fgraph.toposort()])
@@ -97,3 +100,36 @@ def test_pooling_opt():
     assert any([isinstance(n.op, cuda.dnn.GpuDnnPoolGrad)
                 for n in f.maker.fgraph.toposort()])


+def test_dnn_tag():
+    """
+    Test that if cudnn isn't available we raise an error, and that if
+    it is available, we use it.
+    """
+    x = T.ftensor4()
+    old = theano.config.on_opt_error
+    theano.config.on_opt_error = "raise"
+
+    sio = StringIO()
+    handler = logging.StreamHandler(sio)
+    logging.getLogger('theano.compile.tests.test_dnn').addHandler(handler)
+    # Silence the original handler when intentionally generating
+    # warning messages.
+    logging.getLogger('theano').removeHandler(theano.logging_default_handler)
+    raised = False
+    try:
+        f = theano.function(
+            [x],
+            max_pool_2d(x, ds=(2, 2)),
+            mode=mode_with_gpu.including("cudnn"))
+    except RuntimeError:
+        assert not cuda.dnn.dnn_available()
+        raised = True
+    finally:
+        theano.config.on_opt_error = old
+        logging.getLogger(
+            'theano.compile.tests.test_dnn').removeHandler(handler)
+        logging.getLogger('theano').addHandler(theano.logging_default_handler)
+
+    if not raised:
+        assert cuda.dnn.dnn_available()
+        assert any([isinstance(n.op, cuda.dnn.GpuDnnPool)
+                    for n in f.maker.fgraph.toposort()])
Diff collapsed.
@@ -10,6 +10,7 @@ If you do want to rewrite these tests, bear in mind:
 """

 import unittest
+import numpy as np

 import theano
 from theano.gof import FunctionGraph
@@ -20,8 +21,12 @@ from theano.scalar.basic import (floats, float32, float64,
                                  ints, int8, int32, complex64,
                                  ComplexError, IntDiv, TrueDiv,
                                  Composite, add, div_proxy, clip,
-                                 and_, eq, neq, invert, mul)
-import numpy
+                                 and_, eq, neq, invert, mul, Scalar)
+from theano.scalar.basic import (
+    true_div, inv, log, log2, log10, log1p, exp, exp2, expm1, sqrt, deg2rad,
+    rad2deg, cos, arccos, sin, arcsin, tan, arctan, arctan2, cosh, arccosh,
+    sinh, arcsinh, tanh, arctanh)


 def inputs():
     return floats('xyz')
@@ -75,7 +80,7 @@ class test_ScalarOps(unittest.TestCase):
         g3 = theano.gradient.grad(a3, x)
         fn3 = gof.DualLinker().accept(FunctionGraph([x], [g3])).make_function()

-        rng = numpy.random.RandomState(utt.fetch_seed())
+        rng = np.random.RandomState(utt.fetch_seed())
         ntests = 50
         for i in xrange(ntests):
@@ -235,6 +240,128 @@ class test_logical(unittest.TestCase):
             self.assertTrue(fn(a,b) == ~a, (a,))


+# This class does not inherit from unittest.TestCase, because that would
+# interfere with the "yield" mechanism that automatically generates tests,
+# see http://stackoverflow.com/questions/6689537/nose-test-generators-inside-class
+# It still needs to be named "test_..." or "Test_...", so that nose can
+# pick it up by name; otherwise, the tests would not be executed.
+class test_upgrade_to_float(object):
+    # Test for Ops whose output has to be floating point, even when all
+    # inputs are ints.
+    # In particular, when the inputs are int8, the output should be
+    # at least float32, not float16.
+    unary_ops_vals = [
+        (inv, range(-127, 0) + range(1, 127)),
+        (sqrt, range(0, 128)),
+        (log, range(1, 128)),
+        (log2, range(1, 128)),
+        (log10, range(1, 128)),
+        (log1p, range(0, 128)),
+        (exp, range(-127, 89)),
+        (exp2, range(-127, 89)),
+        (expm1, range(-127, 89)),
+        (deg2rad, range(-127, 128)),
+        (rad2deg, range(-127, 128)),
+        (cos, range(-127, 128)),
+        (arccos, range(-1, 2)),
+        (cosh, range(-89, 90)),
+        (arccosh, range(1, 128)),
+        (sin, range(-127, 128)),
+        (arcsin, range(-1, 2)),
+        (sinh, range(-89, 90)),
+        (arcsinh, range(-127, 128)),
+        (tan, range(-3, 4)),
+        (arctan, range(-127, 128)),
+        (tanh, range(-127, 128)),
+        (arctanh, [0])]
+
+    binary_ops_vals = [
+        (arctan2, range(-127, 128), range(-127, 128))]
+
+    @staticmethod
+    def _test_unary(unary_op, x_range):
+        xi = int8('xi')
+        xf = float32('xf')
+
+        ei = unary_op(xi)
+        fi = theano.function([xi], ei)
+
+        ef = unary_op(xf)
+        ff = theano.function([xf], ef)
+
+        for x_val in x_range:
+            outi = fi(x_val)
+            outf = ff(x_val)
+
+            assert outi.dtype == outf.dtype, 'incorrect dtype'
+            assert np.allclose(outi, outf), 'insufficient precision'
+
+    @staticmethod
+    def _test_binary(binary_op, x_range, y_range):
+        xi = int8('xi')
+        yi = int8('yi')
+        xf = float32('xf')
+        yf = float32('yf')
+
+        ei = binary_op(xi, yi)
+        fi = theano.function([xi, yi], ei)
+
+        ef = binary_op(xf, yf)
+        ff = theano.function([xf, yf], ef)
+
+        for x_val in x_range:
+            for y_val in y_range:
+                outi = fi(x_val, y_val)
+                outf = ff(x_val, y_val)
+
+                assert outi.dtype == outf.dtype, 'incorrect dtype'
+                assert np.allclose(outi, outf), 'insufficient precision'
+
+    def test_true_div(self):
+        # true_div's upcast policy is not exactly "upgrade_to_float",
+        # so the test is a little bit different
+        x_range = range(-127, 128)
+        y_range = range(-127, 0) + range(1, 127)
+
+        xi = int8('xi')
+        yi = int8('yi')
+        xf = Scalar(theano.config.floatX)('xf')
+        yf = Scalar(theano.config.floatX)('yf')
+
+        ei = true_div(xi, yi)
+        fi = theano.function([xi, yi], ei)
+
+        ef = true_div(xf, yf)
+        ff = theano.function([xf, yf], ef)
+
+        for x_val in x_range:
+            for y_val in y_range:
+                outi = fi(x_val, y_val)
+                outf = ff(x_val, y_val)
+
+                assert outi.dtype == outf.dtype, 'incorrect dtype'
+                assert np.allclose(outi, outf), 'insufficient precision'
+
+    def test_unary(self):
+        # Automatically define all individual unary tests
+        for unary_op, x_range in self.unary_ops_vals:
+            test_name = 'test_%s' % unary_op.name
+            # Use a lambda with default arguments so we can name the
+            # test and bind the current op/range, avoiding the usual
+            # late-binding closure pitfall.
+            test = lambda op=unary_op, rng=x_range: self._test_unary(op, rng)
+            test.description = test_name
+            yield test
+
+    def test_binary(self):
+        # Automatically define all individual binary tests
+        for binary_op, x_range, y_range in self.binary_ops_vals:
+            test_name = 'test_%s' % binary_op.name
+            # Use a lambda with default arguments so we can name the
+            # test and bind the current op/ranges.
+            test = lambda op=binary_op, xr=x_range, yr=y_range: \
+                self._test_binary(op, xr, yr)
+            test.description = test_name
+            yield test
+
+
 class test_complex_mod(unittest.TestCase):
     """Make sure % fails on complex numbers."""
...
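For reference (not part of this commit), the policy these generated
tests pin down can be checked directly on a scalar op; a minimal
sketch:

```python
# Sketch: int8 inputs to float-only scalar ops are upgraded to at
# least float32, never float16.
import theano
from theano.scalar.basic import int8, float32, log

xi = int8('xi')
xf = float32('xf')
assert log(xi).type.dtype == 'float32'
assert log(xf).type.dtype == 'float32'
```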
@@ -1812,7 +1812,7 @@ def round(a, mode="half_away_from_zero"):
         raise Exception("round mode %s is not implemented." % mode)


-@_scal_elemwise_with_nfunc('around', 1, -1)
+@_scal_elemwise_with_nfunc('around', 1, 1)
 def round_half_to_even(a):
     """round_half_to_even(a)"""
@@ -1952,20 +1952,20 @@ def chi2sf(x, k):
 #numpy.real(float32) return a view on the inputs.
-#@_scal_elemwise_with_nfunc('real', 1, -1)
+#@_scal_elemwise_with_nfunc('real', 1, 1)
 @_scal_elemwise
 def real(z):
     """Return real component of complex-valued tensor `z`"""
 _tensor_py_operators.real = property(real)


-@_scal_elemwise_with_nfunc('imag', 1, -1)
+@_scal_elemwise_with_nfunc('imag', 1, 1)
 def imag(z):
     """Return imaginary component of complex-valued tensor `z`"""
 _tensor_py_operators.imag = property(imag)


-@_scal_elemwise_with_nfunc('angle', 1, -1)
+@_scal_elemwise_with_nfunc('angle', 1, 1)
 def angle(z):
     """Return polar-coordinate angle of complex-valued tensor `z`"""
@@ -1975,7 +1975,7 @@ def complex(real, imag):
     """Return complex-valued tensor with `real` and `imag` components"""


-@_scal_elemwise_with_nfunc('conj', 1, -1)
+@_scal_elemwise_with_nfunc('conj', 1, 1)
 def conj(z):
     """Return the complex conjugate of `z`."""
...
@@ -18,9 +18,10 @@ from theano.tensor import elemwise_cgen as cgen

 config = theano.config

-# We cannot import discrete_dtypes from tensor.basic yet,
+# We cannot import discrete_dtypes or float_dtypes from tensor.basic yet,
 # so we redefine them here
 discrete_dtypes = map(str, scalar.discrete_types)
+float_dtypes = map(str, scalar.float_types)

 # tensor depends on elemwise to provide definitions for several ops
@@ -472,14 +473,11 @@ class Elemwise(OpenMPOp):
       the input's storage. (Just like destroymap, but without the lists.)

     * nfunc_spec: either None or a tuple of three elements,
       (nfunc_name, nin, nout) such that getattr(numpy, nfunc_name)
-      implements this operation, takes nin inputs and abs(nout) outputs
-      (nout < 0 if the numpy function does not provide the option of
-      providing a numpy array to store the results in). Note that nin
-      cannot always be inferred from the scalar op's own nin field
-      because that value is sometimes 0 (meaning a variable number of
-      inputs), whereas the numpy function may not have varargs.
-      NOTE: as of now, the sign of the nout field is ignored (some work
-      needs to be done to resize the destinations when needed).
+      implements this operation, takes nin inputs and nout outputs.
+      Note that nin cannot always be inferred from the scalar op's
+      own nin field because that value is sometimes 0 (meaning a
+      variable number of inputs), whereas the numpy function may
+      not have varargs.
     """
     if inplace_pattern is None:
         inplace_pattern = {}
@@ -819,43 +817,24 @@ class Elemwise(OpenMPOp):
                 out_shape.append(max(values))
             out_shape = tuple(out_shape)

-        # Commented as we don't reuse outputs now.
-        #
-        # if not self.inplace_pattern:
-        #     for output, storage in izip(node.outputs, output_storage):
-        #         odat = storage[0]
-        #         if odat is not None:
-        #             if odat.shape != out_shape:
-        #                 # It is unsafe to try to resize odat,
-        #                 # we have to allocate output storage.
-        #                 odat = None
-        #         if odat is None:
-        #             odat = numpy.ndarray(out_shape, dtype=output.type.dtype)
-        #         storage[0] = odat
-        # else:
-        #     for i, (output, storage) in enumerate(
-        #             izip(node.outputs, output_storage)):
-        #         #i is an output idx
-        #         if i in self.inplace_pattern:
-        #             odat = inputs[self.inplace_pattern[i]]
-        #         else:
-        #             odat = storage[0]
-        #         if odat is not None:
-        #             if odat.shape != out_shape:
-        #                 # It is unsafe to try to resize odat,
-        #                 # we have to allocate output storage.
-        #                 odat = None
-        #         if odat is None:
-        #             odat = numpy.ndarray(out_shape,
-        #                                  dtype=output.type.dtype)
-        #         storage[0] = odat
-
-        ufunc_args = inputs  # + output_storage
+        ufunc_args = inputs
+        ufunc_kwargs = {}
         if self.nfunc and len(inputs) == self.nfunc_spec[1]:
             ufunc = self.nfunc
             nout = self.nfunc_spec[2]
-            if nout < 0:
-                nout = -nout
+            # Numpy ufuncs will sometimes perform operations in
+            # float16, in particular when the input is int8.
+            # This is not something that we want, and we do not
+            # do it in the C code, so we specify that the computation
+            # should be carried out in the returned dtype.
+            # This is done via the "sig" kwarg of the ufunc, its value
+            # should be something like "ff->f", where the characters
+            # represent the dtype of the inputs and outputs.
+            out_dtype = node.outputs[0].dtype
+            if out_dtype in float_dtypes and isinstance(ufunc, numpy.ufunc):
+                char = numpy.sctype2char(out_dtype)
+                sig = char * node.nin + '->' + char * node.nout
+                ufunc_kwargs['sig'] = sig
         # Unfortunately, the else case does not allow us to
         # directly feed the destination arguments to the nfunc
         # since it sometimes requires resizing. Doing this
@@ -869,7 +848,7 @@ class Elemwise(OpenMPOp):
                                          self.scalar_op.nout))
             nout = ufunc.nout

-        variables = ufunc(*ufunc_args)
+        variables = ufunc(*ufunc_args, **ufunc_kwargs)
         if nout == 1:
             variables = [variables]
...
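For reference (not part of this commit): the numpy behaviour the `sig`
kwarg works around, as of the numpy versions of that era (newer numpy
spells the keyword `signature`):

```python
# Sketch: numpy promotes int8 inputs of a ufunc to float16 by default;
# forcing the signature keeps the computation in float32.
import numpy

x = numpy.arange(-3, 4, dtype='int8')
print(numpy.exp(x).dtype)           # float16 under default promotion
print(numpy.exp(x, sig='f').dtype)  # float32, matching Theano's C code
```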
@@ -31,6 +31,11 @@ class ScalarSigmoid(scalar.UnaryScalarOp):
             return 0.0
         if x > 30.0:
             return 1.0
+        # If x is an int8 or uint8, numpy.exp will compute the result in
+        # half-precision (float16), where we want float32.
+        x_dtype = str(getattr(x, 'dtype', ''))
+        if x_dtype in ('int8', 'uint8'):
+            return 1.0 / (1.0 + numpy.exp(-x, sig='f'))
         return 1.0 / (1.0 + numpy.exp(-x))

     def impl(self, x):
@@ -268,8 +273,11 @@ def hard_sigmoid(x):
     Removing the slope and shift does not make it faster.

     """
-    slope = 0.2
-    shift = 0.5
+    # Use the same dtype as determined by "upgrade_to_float",
+    # and perform computation in that dtype.
+    out_dtype = scalar.upgrade_to_float(scalar.Scalar(dtype=x.dtype))[0].dtype
+    slope = tensor.constant(0.2, dtype=out_dtype)
+    shift = tensor.constant(0.5, dtype=out_dtype)
     x = (x * slope) + shift
     x = tensor.clip(x, 0, 1)
     return x
@@ -300,6 +308,11 @@ class ScalarSoftplus(scalar.UnaryScalarOp):
             return 0.0
         if x > 30.0:
             return x
+        # If x is an int8 or uint8, numpy.exp will compute the result in
+        # half-precision (float16), where we want float32.
+        x_dtype = str(getattr(x, 'dtype', ''))
+        if x_dtype in ('int8', 'uint8'):
+            return numpy.log1p(numpy.exp(x, sig='f'))
         return numpy.log1p(numpy.exp(x))

     def impl(self, x):
...
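For reference (not part of this commit), the dtype computation
`hard_sigmoid` now performs, shown standalone; a minimal sketch:

```python
# Sketch: how hard_sigmoid picks the dtype of its constants.
from theano import scalar

out_dtype = scalar.upgrade_to_float(scalar.Scalar(dtype='int8'))[0].dtype
print(out_dtype)  # 'float32': int8 upgrades past float16
```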
@@ -16,7 +16,7 @@ from theano.tensor.nnet.sigm import (
     register_local_1msigmoid, simplify_mul,
 )
 from theano.tensor.tests.test_basic import (makeBroadcastTester, rand,
-                                            check_floatX,
+                                            check_floatX, upcast_int8_nfunc,
                                             _good_broadcast_unary_normal_no_complex)
@@ -30,8 +30,8 @@ class T_sigmoid(unittest.TestCase):

 SigmoidTester = makeBroadcastTester(
     op=sigmoid,
-    expected=lambda inputs: check_floatX(
-        inputs, 1/(1+numpy.exp(-inputs))),
+    expected=upcast_int8_nfunc(lambda inputs: check_floatX(
+        inputs, 1 / (1 + numpy.exp(-inputs)))),
     good=_good_broadcast_unary_normal_no_complex,
     #grad=_grad_broadcast_unary_normal,
     name='SigmoidTester',
@@ -39,8 +39,8 @@ SigmoidTester = makeBroadcastTester(

 UltraFastSigmoidTester = makeBroadcastTester(
     op=ultra_fast_sigmoid,
-    expected=lambda inputs: check_floatX(
-        inputs, 1/(1+numpy.exp(-inputs))),
+    expected=upcast_int8_nfunc(lambda inputs: check_floatX(
+        inputs, 1 / (1 + numpy.exp(-inputs)))),
     good=_good_broadcast_unary_normal_no_complex,
     #grad=_grad_broadcast_unary_normal,
     name='UltraFastSigmoidTester',
@@ -49,20 +49,21 @@ UltraFastSigmoidTester = makeBroadcastTester(

 HardSigmoidTester = makeBroadcastTester(
     op=hard_sigmoid,
-    expected=lambda inputs: check_floatX(
-        inputs, 1/(1+numpy.exp(-inputs))),
+    expected=upcast_int8_nfunc(lambda inputs: check_floatX(
+        inputs, 1 / (1 + numpy.exp(-inputs)))),
     good=_good_broadcast_unary_normal_no_complex,
     #grad=_grad_broadcast_unary_normal,
-    name='UltraFastSigmoidTester',
+    name='HardSigmoidTester',
     # This is an approx of the sigmoid. That is why we raise eps
     eps=1e-1)

 SoftplusTester = makeBroadcastTester(
     op=softplus,
-    expected=lambda inputs: check_floatX(
-        inputs, numpy.log1p(numpy.exp(inputs))),
+    expected=upcast_int8_nfunc(lambda inputs: check_floatX(
+        inputs, numpy.log1p(numpy.exp(inputs)))),
-    good=_good_broadcast_unary_normal_no_complex,
+    good=dict(_good_broadcast_unary_normal_no_complex,
+              int8=[numpy.arange(-127, 89, dtype='int8')]),
     #grad=_grad_broadcast_unary_normal,
     name='SoftplusTester',
 )
...
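Note: `upcast_int8_nfunc` is imported from
`theano.tensor.tests.test_basic` and is not shown in this diff;
judging from its use above, it wraps a numpy reference function so
that int8 inputs are evaluated in float32 rather than numpy's default
float16. A hypothetical stand-in, for illustration only:

```python
# Hypothetical stand-in for upcast_int8_nfunc; the real helper lives in
# theano/tensor/tests/test_basic.py and may differ.
import numpy

def upcast_int8_nfunc(fct):
    def stabilized_f(x):
        # Upcast int8/uint8 inputs so the numpy reference computation
        # is done in float32, not float16.
        if str(getattr(x, 'dtype', '')) in ('int8', 'uint8'):
            x = numpy.asarray(x, dtype='float32')
        return fct(x)
    return stabilized_f
```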