Commit 5c25f307 authored by cocu

Merge branch 'master' into allow_cxx_flag_full_path

Conflicts: theano/gof/cmodule.py
.. _libdoc_cuda_dnn:
================================
:mod:`sandbox.cuda.dnn` -- cuDNN
================================
.. moduleauthor:: LISA
`cuDNN <https://developer.nvidia.com/cuDNN>`_ is an NVIDIA library with
functionality used by deep neural networks. It provides optimized versions
of some operations, like the convolution. cuDNN is not currently
installed with CUDA 6.5, so you must download and install it
yourself.
To install it, decompress the downloaded file and make the ``*.h`` and
``*.so*`` files available to the compilation environment. On Linux,
this can be done by setting the environment variables
``LD_LIBRARY_PATH``, ``LIBRARY_PATH`` and ``CPATH`` to the
uncompressed directory path. Separate multiple directories with ``:``,
as in the ``PATH`` environment variable. Alternatively, you can copy the
``*.h`` files to ``/usr/include`` and the ``*.so*`` files to ``/lib64``.
By default, Theano detects whether it can use cuDNN. If so, it will use
it; if not, Theano optimizations will not introduce cuDNN ops, so
Theano will still work as long as the user did not introduce them manually.
To get an error when Theano cannot use cuDNN, use this Theano flag:
``optimizer_including=cudnn``.
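For a quick availability check from Python, the detection helper can be
called directly (a minimal sketch; it assumes the CUDA backend is usable):

.. code-block:: python

    from theano.sandbox.cuda import dnn

    # dnn_available() caches its result and stores a human-readable
    # explanation in dnn_available.msg.
    if dnn.dnn_available():
        print("cuDNN should work")
    else:
        print("cuDNN unavailable: " + dnn.dnn_available.msg)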
.. note::
Normally you should not call GPU Ops directly, but the CPU interface
currently does not expose all options supported by the cuDNN ops, so
you may need to call them manually.
Functions
=========
.. automodule:: theano.sandbox.cuda.dnn
:members: dnn_conv, dnn_pool
Convolution Ops
===============
.. automodule:: theano.sandbox.cuda.dnn
:members: GpuDnnConvDesc, GpuDnnConv, GpuDnnConvGradW, GpuDnnConvGradI,
Pooling Ops
===========
.. automodule:: theano.sandbox.cuda.dnn
:members: GpuDnnPoolDesc, GpuDnnPool, GpuDnnPoolGrad,
Softmax Ops
===========
.. automodule:: theano.sandbox.cuda.dnn
:members: GpuDnnSoftmax, GpuDnnSoftmaxGrad
......@@ -13,6 +13,7 @@
.. toctree::
:maxdepth: 1
var
type
op
dnn
......@@ -123,29 +123,13 @@ def git_version():
git_revision = "unknown-git"
return git_revision
# Python 2.4 compatibility: Python versions 2.6 and later support the new
# exception syntax, but for now we have to resort to exec.
if sys.hexversion >= 0x2070000:
exec("""\
def write_text(filename, text):
with open(filename, 'w') as a:
try:
a.write(text)
except Exception as e:
print(e)
""")
else:
exec("""\
def write_text(filename, text):
a = open(filename, 'w')
try:
try:
a.write(text)
except Exception, e:
print e
finally:
a.close()
""")
def write_text(filename, text):
try:
with open(filename, 'w') as a:
a.write(text)
except Exception as e:
print(e)
def write_version_py(filename=os.path.join('theano', 'generated_version.py')):
......
......@@ -1795,7 +1795,8 @@ class GCC_compiler(object):
return cxxflags
@staticmethod
def try_compile_tmp(src_code, tmp_prefix='', flags=(), try_run=False):
def try_compile_tmp(src_code, tmp_prefix='', flags=(),
try_run=False, output=False):
"""Try to compile (and run) a test program.
This is useful on various occasions, to check if libraries
......@@ -1806,6 +1807,7 @@ class GCC_compiler(object):
If try_run is False, returns the compilation status.
If try_run is True, returns a (compile_status, run_status) pair.
If output is True, we append the stdout and stderr to the returned value.
"""
if not theano.config.cxx:
return False
......@@ -1825,14 +1827,14 @@ class GCC_compiler(object):
os.write(fd, src_code)
os.close(fd)
fd = None
p_ret = call_subprocess_Popen(
out, err, p_ret = output_subprocess_Popen(
[theano.config.cxx, path, '-o', exe_path] + flags)
if p_ret != 0:
compilation_ok = False
elif try_run:
# Try to execute the program
try:
p_ret = call_subprocess_Popen([exe_path])
out, err, p_ret = output_subprocess_Popen([exe_path])
run_ok = (p_ret == 0)
finally:
os.remove(exe_path)
......@@ -1846,13 +1848,18 @@ class GCC_compiler(object):
except OSError, e:
compilation_ok = False
if not try_run:
if not try_run and not output:
return compilation_ok
else:
elif not try_run and output:
return (compilation_ok, out, err)
elif not output:
return (compilation_ok, run_ok)
else:
return (compilation_ok, run_ok, out, err)
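A hedged usage sketch of the extended return contract (the source, flags
and prefix here are illustrative, not part of the commit): with
output=True and try_run=False, the branches above return a
(compile_status, stdout, stderr) triple.

    # Hypothetical probe; any valid C program and flags would do.
    ok, out, err = GCC_compiler.try_compile_tmp(
        "int main(int argc, char** argv) { return 0; }",
        tmp_prefix='doc_probe_', flags=['-O2'],
        try_run=False, output=True)
    if not ok:
        print(err)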
@staticmethod
def try_flags(flag_list):
def try_flags(flag_list, preambule="", body="",
try_run=False, output=False):
'''
Try to compile a dummy file with these flags.
......@@ -1863,13 +1870,16 @@ class GCC_compiler(object):
return False
code = b("""
%(preambule)s
int main(int argc, char** argv)
{
%(body)s
return 0;
}
""")
""" % locals())
return GCC_compiler.try_compile_tmp(code, tmp_prefix='try_flags_',
flags=flag_list, try_run=False)
flags=flag_list, try_run=try_run,
output=output)
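For instance, dnn_available() further below probes for cuDNN through
exactly this interface; a generic sketch (the math-library probe is
illustrative, not from the commit):

    comp, run, out, err = GCC_compiler.try_flags(
        ["-lm"],
        preambule="#include <math.h>",
        body="double d = sqrt(4.0); (void) d;",
        try_run=True, output=True)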
@staticmethod
def compile_str(module_name, src_code, location=None,
......
import os
import theano
from theano import Apply, tensor
from theano import Apply, gof, tensor
from theano.gof import Optimizer
from theano.gof.type import CDataType
from theano.compat import PY3
from theano.sandbox.cuda.type import CudaNdarrayType
......@@ -12,6 +13,7 @@ from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
from theano.sandbox.cuda.blas import (GpuConv, GpuDownsampleFactorMax,
GpuDownsampleFactorMaxGrad)
from theano.sandbox.cuda.nnet import GpuSoftmax
from theano.sandbox.cuda.opt import register_opt
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
......@@ -23,9 +25,35 @@ def dnn_available():
dnn_available.msg = "Device not supported by cuDNN"
dnn_available.avail = False
else:
dnn_available.msg = "Can not find the cuDNN library"
dnn_available.avail = theano.gof.cmodule.GCC_compiler.try_flags(
["-l", "cudnn"])
preambule = """
#include <cudnn.h>
#include <stdio.h>
#include <cuda.h>
#include <cudnn_helper.h>
"""
body = """
cudnnHandle_t _handle = NULL;
cudnnStatus_t err;
if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
fprintf(stderr, "could not create cuDNN handle: %s",
cudnnGetErrorString(err));
return 1;
}
"""
comp, run, out, err = gof.cmodule.GCC_compiler.try_flags(
["-l", "cudnn", "-I" + os.path.dirname(__file__)],
preambule=preambule, body=body,
try_run=True, output=True)
dnn_available.avail = comp and run
if dnn_available.avail:
dnn_available.msg = "cuDNN should work"
else:
dnn_available.msg = (
"Theano is not able to use cuDNN. We got this error: \n" +
err)
return dnn_available.avail
......@@ -54,14 +82,6 @@ if (%(err)s != CUDNN_STATUS_SUCCESS) {
""" % dict(var=var, err=err, desc=desc, fail=fail)
def raise_no_dnn():
""" Raise a RuntimeError if cudnn can't be used"""
if not dnn_available():
raise RuntimeError(
"cuDNN optimization was enabled, but cuDNN is not available. " +
dnn_available.msg)
class DnnBase(GpuOp):
"""
Creates a handle for cudnn and pulls in the cudnn libraries and headers.
......@@ -88,7 +108,7 @@ cudnnHandle_t _handle = NULL;
return ["""{
cudnnStatus_t err;
if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not create cudnn handle: %%s",
PyErr_Format(PyExc_RuntimeError, "could not create cuDNN handle: %%s",
cudnnGetErrorString(err));
return %s;
}
......@@ -96,6 +116,14 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
class GpuDnnConvDesc(GpuOp):
"""This Op builds a convolution descriptor for use in the other
convolution operations.
:param border_mode: 'valid' or 'full'
:param subsample: the subsample factors, a tuple like (dx, dy)
:param conv_mode: 'conv' or 'cross'
"""
__props__ = ('border_mode', 'subsample', 'conv_mode')
def c_headers(self):
......@@ -266,6 +294,9 @@ if (%(err)s != CUDNN_STATUS_SUCCESS) {
}
""" % dict(var=var, desc=desc, err=err, fail=fail)
def c_set_tensor4d(self, *arg):
return c_set_tensor4d(*arg)
def c_code(self, node, name, inputs, outputs, sub):
desc = inputs[2]
out, = outputs
......@@ -351,6 +382,14 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
class GpuDnnConv(GpuDnnConvBase):
"""
The forward convolution.
:param image: the input image, a 4d tensor
:param kernel: the convolution filters, a 4d tensor
:param descr: the convolution descriptor
"""
conv_inputs = 'input', 'kerns'
conv_output = 'output'
conv_types = 'tensor4d', 'filter', 'tensor4d'
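A hedged usage sketch of the forward convolution (shapes are
illustrative and a GPU-enabled configuration is assumed; the call
mirrors local_conv_dnn further below):

    import theano.tensor as T
    from theano.sandbox.cuda.basic_ops import gpu_contiguous
    from theano.sandbox.cuda.dnn import dnn_conv

    img = T.ftensor4()    # (batch, channels, rows, cols)
    kerns = T.ftensor4()  # (nfilters, channels, krows, kcols)
    out = dnn_conv(gpu_contiguous(img), gpu_contiguous(kerns),
                   border_mode='valid', subsample=(1, 1))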
......@@ -374,6 +413,15 @@ class GpuDnnConv(GpuDnnConvBase):
class GpuDnnConvGradW(GpuDnnConvBase):
"""
The convolution gradient with respect to the weights.
:param image:
:param kernel:
:param descr: the convolution descriptor
"""
conv_inputs = 'input', 'output',
conv_output = 'kerns'
conv_types = 'tensor4d', 'tensor4d', 'filter'
......@@ -382,6 +430,15 @@ class GpuDnnConvGradW(GpuDnnConvBase):
class GpuDnnConvGradI(GpuDnnConvBase):
"""
The convolution gradient with respect to the inputs.
:param image:
:param kernel:
:param descr: the convolution descriptor
"""
conv_inputs = 'kerns', 'output',
conv_output = 'input'
conv_types = 'filter', 'tensor4d', 'tensor4d'
......@@ -415,7 +472,15 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
class GpuDnnPoolDesc(GpuOp):
__props__ = ('mode', 'ws', 'stride')
"""
This Op builds a pooling descriptor for use in the other
pooling operations.
:param ws: the window size, a tuple (wx, wy)
:param stride: the stride, a tuple (dx, dy)
:param mode: 'max' or 'average'
"""
__props__ = ('ws', 'stride', 'mode')
def c_headers(self):
return ['cudnn.h', 'cudnn_helper.h']
......@@ -486,13 +551,19 @@ class GpuDnnPoolDesc(GpuOp):
class GpuDnnPool(DnnBase):
"""
Pooling.
:param img: the image 4d tensor.
:param desc: the pooling descriptor.
"""
__props__ = ()
def make_node(self, img, desc):
img = as_cuda_ndarray_variable(img)
if img.type.ndim != 4:
raise TypeError('img must be 4D tensor')
if not isinstance(desc.type, CDataType) \
or desc.type.ctype != 'cudnnPoolingDescriptor_t':
raise TypeError('desc must be cudnnPoolingDescriptor_t')
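A minimal sketch of the dnn_pool wrapper listed in the Functions section
(positional arguments mirror local_pool_dnn further below; a GPU-enabled
configuration is assumed):

    import theano.tensor as T
    from theano.sandbox.cuda.basic_ops import gpu_contiguous
    from theano.sandbox.cuda.dnn import dnn_pool

    img = T.ftensor4()
    # 2x2 max pooling with a matching stride, as the optimizer builds it.
    pooled = dnn_pool(gpu_contiguous(img), (2, 2), (2, 2))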
......@@ -534,10 +605,10 @@ if (output%(id)d != NULL) { cudnnDestroyTensor4dDescriptor(output%(id)d); }
out, = outputs
set_in = c_set_tensor4d(inputs[0], "input" + str(sub['struct_id']),
'err' + name, sub['fail'])
'err' + name, sub['fail'])
set_out = c_set_tensor4d(out, "output" + str(sub['struct_id']),
'err' + name, sub['fail'])
'err' + name, sub['fail'])
return """
cudnnStatus_t err%(name)s;
......@@ -612,6 +683,14 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
class GpuDnnPoolGrad(DnnBase):
"""
The pooling gradient.
:param inp: the input of the pooling.
:param inp_grad: the gradient with respect to the output of the pooling,
with the same shape as out.
:param out: the output of the pooling in the forward pass.
:param desc: the pooling descriptor.
"""
__props__ = ()
def make_node(self, inp, inp_grad, out, desc):
......@@ -622,7 +701,7 @@ class GpuDnnPoolGrad(DnnBase):
inp_grad = as_cuda_ndarray_variable(inp_grad)
if inp_grad.type.ndim != 4:
raise TypeError('inp_grad must be 4D tensor')
out = as_cuda_ndarray_variable(out)
if out.type.ndim != 4:
raise TypeError('out must be 4D tensor')
......@@ -685,15 +764,15 @@ if (output_grad%(id)d != NULL) { cudnnDestroyTensor4dDescriptor(output_grad%(id)
set_in = "\n".join([
c_set_tensor4d(inp, "input" + str(sub['struct_id']),
'err' + name, sub['fail']),
'err' + name, sub['fail']),
c_set_tensor4d(inp_grad, "input_grad" + str(sub['struct_id']),
'err' + name, sub['fail']),
'err' + name, sub['fail']),
c_set_tensor4d(out, "output" + str(sub['struct_id']),
'err' + name, sub['fail'])
'err' + name, sub['fail'])
])
set_out = c_set_tensor4d(out, "output_grad" + str(sub['struct_id']),
'err' + name, sub['fail'])
'err' + name, sub['fail'])
return """
cudnnStatus_t err%(name)s;
......@@ -735,7 +814,8 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
cudnnGetErrorString(err%(name)s));
%(fail)s
}
""" % dict(output_grad=out_grad, desc=desc, fail=sub['fail'], id=sub['struct_id'],
""" % dict(output_grad=out_grad, desc=desc,
fail=sub['fail'], id=sub['struct_id'],
name=name, set_in=set_in,
set_out=set_out, input=inp, input_grad=inp_grad, output=out,
input_desc="input"+str(sub['struct_id']),
......@@ -773,13 +853,12 @@ class GpuDnnSoftmax(DnnBase):
"""
Op for the cuDNN Softmax.
Parameters''
-tensor_format: Whether the data format is 'bc01' or 'b01c'
-algo: 'fast' or 'accurate' indicating whether computations should be
optimized for speed or accuracy respectively.
-mode: 'instance' or 'channel' indicating whether the softmax should be
computed per image across 'c01' or per spationali location '01' per image
across 'c'.
:param tensor_format: Whether the data format is 'bc01' or 'b01c'
:param algo: 'fast' or 'accurate' indicating whether computations should be
optimized for speed or accuracy respectively.
:param mode: 'instance' or 'channel' indicating whether the softmax should
be computed per image across 'c01' or per spatial location '01' per
image across 'c'.
"""
__props__ = ('tensor_format', 'mode', 'algo')
......@@ -924,11 +1003,14 @@ err%(name)s = cudnnSoftmaxForward(
# We need this since other stuff from opt is not importable.
if cuda_available:
from theano.sandbox.cuda.opt import local_optimizer, gpu_optimizer
from theano.sandbox.cuda.opt import (
local_optimizer, gpu_optimizer, gpu_seqopt)
@register_opt('cudnn')
@local_optimizer([GpuConv])
def local_conv_dnn(node):
raise_no_dnn()
if not dnn_available():
return
if isinstance(node.op, GpuConv):
if node.op.border_mode not in ['full', 'valid']:
return
......@@ -938,11 +1020,11 @@ if cuda_available:
return [dnn_conv(gpu_contiguous(img), gpu_contiguous(kern),
border_mode=border_mode, subsample=subsample)]
gpu_optimizer.register("conv_cudnn", local_conv_dnn, 'cudnn')
@register_opt('cudnn')
@local_optimizer([GpuDownsampleFactorMax])
def local_pool_dnn(node):
if not dnn_available():
return
if isinstance(node.op, GpuDownsampleFactorMax):
if node.op.ignore_border:
return
......@@ -950,32 +1032,43 @@ if cuda_available:
ds = node.op.ds
return [dnn_pool(gpu_contiguous(img), ds, ds)]
gpu_optimizer.register("pool_cudnn", local_pool_dnn, 'cudnn')
@register_opt('cudnn')
@local_optimizer([GpuDownsampleFactorMaxGrad])
def local_pool_dnn_grad(node):
if not dnn_available():
return
if isinstance(node.op, GpuDownsampleFactorMaxGrad):
if node.op.ignore_border:
return
inp, out, inp_grad = node.inputs
ds = node.op.ds
desc = GpuDnnPoolDesc(ws=ds, stride=ds, mode="max")()
return [GpuDnnPoolGrad()(gpu_contiguous(inp),
gpu_contiguous(inp_grad), gpu_contiguous(out), desc)]
gpu_optimizer.register("pool_cudnn_grad", local_pool_dnn_grad, 'cudnn')
desc = GpuDnnPoolDesc(ws=ds, stride=ds, mode="max")()
return [GpuDnnPoolGrad()(gpu_contiguous(inp),
gpu_contiguous(inp_grad),
gpu_contiguous(out), desc)]
@register_opt('cudnn')
@local_optimizer([GpuSoftmax])
def local_softmax_dnn(node):
raise_no_dnn()
if not dnn_available():
return
if isinstance(node.op, GpuSoftmax):
ins = node.inputs[0].dimshuffle(0, 1, 'x', 'x')
out = GpuDnnSoftmax('bc01', 'accurate', 'channel')(gpu_contiguous(ins))
ins = gpu_contiguous(ins)
out = GpuDnnSoftmax('bc01', 'accurate', 'channel')(ins)
out = as_cuda_ndarray_variable(out.dimshuffle(0, 1))
return [out]
gpu_optimizer.register("softmax_cudnn", local_softmax_dnn, 'cudnn')
class NoCuDNNRaise(Optimizer):
def apply(self, fgraph):
""" Raise a RuntimeError if cudnn can't be used"""
if not dnn_available():
# Make an assert error as we want Theano to fail, not
# just skip this optimization.
raise AssertionError(
"cuDNN optimization was enabled, but Theano was not able"
" to use it. We got this error: \n" +
dnn_available.msg)
gpu_seqopt.register("NoCuDNNRaise", NoCuDNNRaise(), 0, 'cudnn')
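From user code, this failure check can be triggered by including the
'cudnn' tag in the compilation mode, much as test_dnn_tag does further
below (a sketch; theano.compile.get_default_mode is Theano's standard
mode accessor):

    import theano
    import theano.tensor as T

    x = T.fmatrix()
    mode = theano.compile.get_default_mode().including('cudnn')
    f = theano.function([x], T.nnet.softmax(x), mode=mode)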
......@@ -1163,11 +1163,6 @@ def local_conv_fft_full(node):
return
# Needs to be registered before local_gpu_conv_legacy. Otherwise, it
# will have priority over this optimization. We want, if cudnn is
# available and the GPU supports it, to use it. Otherwise, the gemm
# version should be used. If the users want the legacy convolution,
# they should use the Theano flag to disable the dnn and/or gemm version.
@local_optimizer([GpuConv])
def local_gpu_conv(node):
"""
......@@ -1350,7 +1345,7 @@ conv_groupopt.register("conv_fft_valid", local_conv_fft_valid, 1)
conv_groupopt.register("conv_fft_full", local_conv_fft_full, 1)
# Use cuDNN if available, and keep the 'cudnn' tag so it can be disabled.
conv_groupopt.register('local_gpu_conv', local_gpu_conv, 10,
'fast_compile', 'fast_run', 'dnn')
'fast_compile', 'fast_run', 'cudnn')
conv_groupopt.register('local_conv_gemm', local_conv_gemm, 12,
'fast_compile', 'fast_run')
......
import logging
import unittest
from nose.plugins.skip import SkipTest
import numpy
import theano
from theano.compat.six import StringIO
from theano.gof.python25 import any
import theano.tensor as T
import theano.tests.unittest_tools as utt
......@@ -85,7 +88,7 @@ def test_pooling_opt():
f = theano.function(
[x],
max_pool_2d(x, ds=(2, 2)),
mode=mode_with_gpu.including("cudnn"))
mode=mode_with_gpu)
assert any([isinstance(n.op, cuda.dnn.GpuDnnPool)
for n in f.maker.fgraph.toposort()])
......@@ -97,3 +100,36 @@ def test_pooling_opt():
assert any([isinstance(n.op, cuda.dnn.GpuDnnPoolGrad)
for n in f.maker.fgraph.toposort()])
def test_dnn_tag():
"""
We test that we crash if cuDNN is not available, and that we use it when it is.
"""
x = T.ftensor4()
old = theano.config.on_opt_error
theano.config.on_opt_error = "raise"
sio = StringIO()
handler = logging.StreamHandler(sio)
logging.getLogger('theano.compile.tests.test_dnn').addHandler(handler)
# Silence original handler when intentionally generating warning messages
logging.getLogger('theano').removeHandler(theano.logging_default_handler)
raised = False
try:
f = theano.function(
[x],
max_pool_2d(x, ds=(2, 2)),
mode=mode_with_gpu.including("cudnn"))
except RuntimeError, e:
assert not cuda.dnn.dnn_available()
raised = True
finally:
theano.config.on_opt_error = old
logging.getLogger('theano.compile.tests.test_dnn').removeHandler(handler)
logging.getLogger('theano').addHandler(theano.logging_default_handler)
if not raised:
assert cuda.dnn.dnn_available()
assert any([isinstance(n.op, cuda.dnn.GpuDnnPool)
for n in f.maker.fgraph.toposort()])
......@@ -1504,7 +1504,7 @@ class TrueDiv(BinaryScalarOp):
x = numpy.asarray(x)
y = numpy.asarray(y)
if all(a.dtype in discrete_types for a in (x, y)):
return numpy.array(float(x) / y, dtype=config.floatX)
return numpy.sctypeDict[config.floatX](float(x) / y)
else:
return x / y
......@@ -2166,7 +2166,7 @@ neg = Neg(same_out, name='neg')
class Inv(UnaryScalarOp):
""" multiplicative inverse. Also called reciprocal"""
def impl(self, x):
return 1.0 / x
return numpy.float32(1.0) / x
def grad(self, (x,), (gz,)):
if x.type in complex_types:
......@@ -2180,6 +2180,8 @@ class Inv(UnaryScalarOp):
return -gz / (x * x),
def c_code(self, node, name, (x,), (z,), sub):
if node.inputs[0].type in complex_types:
raise NotImplementedError()
return "%(z)s = 1.0 / %(x)s;" % locals()
inv = Inv(upgrade_to_float, name='inv')
......@@ -2190,6 +2192,11 @@ class Log(UnaryScalarOp):
amd_float64 = "amd_vrda_log"
def impl(self, x):
# If x is an int8 or uint8, numpy.log will compute the result in
# half-precision (float16), where we want float32.
x_dtype = str(getattr(x, 'dtype', ''))
if x_dtype in ('int8', 'uint8'):
return numpy.log(x, sig='f')
return numpy.log(x)
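A quick illustration of the behaviour this pattern works around (the
dtypes shown are what NumPy's ufunc loop selection gives on builds of
this era):

    import numpy
    x = numpy.arange(1, 5, dtype='int8')
    assert numpy.log(x).dtype == numpy.float16        # unwanted half precision
    assert numpy.log(x, sig='f').dtype == numpy.float32  # forced float32 loop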
def grad(self, (x,), (gz,)):
......@@ -2219,6 +2226,11 @@ class Log2(UnaryScalarOp):
amd_float64 = "amd_vrda_log2"
def impl(self, x):
# If x is an int8 or uint8, numpy.log2 will compute the result in
# half-precision (float16), where we want float32.
x_dtype = str(getattr(x, 'dtype', ''))
if x_dtype in ('int8', 'uint8'):
return numpy.log2(x, sig='f')
return numpy.log2(x)
def grad(self, (x,), (gz,)):
......@@ -2245,6 +2257,11 @@ class Log10(UnaryScalarOp):
amd_float64 = "amd_vrda_log10"
def impl(self, x):
# If x is an int8 or uint8, numpy.log10 will compute the result in
# half-precision (float16), where we want float32.
x_dtype = str(getattr(x, 'dtype', ''))
if x_dtype in ('int8', 'uint8'):
return numpy.log10(x, sig='f')
return numpy.log10(x)
def grad(self, (x,), (gz,)):
......@@ -2268,6 +2285,11 @@ log10 = Log10(upgrade_to_float, name='log10')
class Log1p(UnaryScalarOp):
""" log(1+x) """
def impl(self, x):
# If x is an int8 or uint8, numpy.log1p will compute the result in
# half-precision (float16), where we want float32.
x_dtype = str(getattr(x, 'dtype', ''))
if x_dtype in ('int8', 'uint8'):
return numpy.log1p(x, sig='f')
return numpy.log1p(x)
def grad(self, (x,), (gz,)):
......@@ -2293,6 +2315,11 @@ class Exp(UnaryScalarOp):
amd_float64 = "amd_vrda_exp"
def impl(self, x):
# If x is an int8 or uint8, numpy.exp will compute the result in
# half-precision (float16), where we want float32.
x_dtype = str(getattr(x, 'dtype', ''))
if x_dtype in ('int8', 'uint8'):
return numpy.exp(x, sig='f')
return numpy.exp(x)
def grad(self, (x, ), (gz, )):
......@@ -2315,6 +2342,11 @@ exp = Exp(upgrade_to_float, name='exp')
class Exp2(UnaryScalarOp):
def impl(self, x):
# If x is an int8 or uint8, numpy.exp2 will compute the result in
# half-precision (float16), where we want float32.
x_dtype = str(getattr(x, 'dtype', ''))
if x_dtype in ('int8', 'uint8'):
return numpy.exp2(x, sig='f')
return numpy.exp2(x)
def grad(self, (x, ), (gz, )):
......@@ -2337,6 +2369,11 @@ exp2 = Exp2(upgrade_to_float, name='exp2')
class Expm1(UnaryScalarOp):
def impl(self, x):
# If x is an int8 or uint8, numpy.expm1 will compute the result in
# half-precision (float16), where we want float32.
x_dtype = str(getattr(x, 'dtype', ''))
if x_dtype in ('int8', 'uint8'):
return numpy.expm1(x, sig='f')
return numpy.expm1(x)
def grad(self, (x, ), (gz, )):
......@@ -2382,6 +2419,11 @@ sqr = Sqr(same_out, name='sqr')
class Sqrt(UnaryScalarOp):
def impl(self, x):
# If x is an int8 or uint8, numpy.sqrt will compute the result in
# half-precision (float16), where we want float32.
x_dtype = str(getattr(x, 'dtype', ''))
if x_dtype in ('int8', 'uint8'):
return numpy.sqrt(x, sig='f')
return numpy.sqrt(x)
def grad(self, (x,), (gz,)):
......@@ -2404,6 +2446,11 @@ sqrt = Sqrt(upgrade_to_float, name='sqrt')
class Deg2Rad(UnaryScalarOp):
def impl(self, x):
# If x is an int8 or uint8, numpy.deg2rad will compute the result in
# half-precision (float16), where we want float32.
x_dtype = str(getattr(x, 'dtype', ''))
if x_dtype in ('int8', 'uint8'):
return numpy.deg2rad(x, sig='f')
return numpy.deg2rad(x)
def grad(self, (x,), (gz,)):
......@@ -2426,6 +2473,11 @@ deg2rad = Deg2Rad(upgrade_to_float, name='deg2rad')
class Rad2Deg(UnaryScalarOp):
def impl(self, x):
# If x is an int8 or uint8, numpy.rad2deg will compute the result in
# half-precision (float16), where we want float32.
x_dtype = str(getattr(x, 'dtype', ''))
if x_dtype in ('int8', 'uint8'):
return numpy.rad2deg(x, sig='f')
return numpy.rad2deg(x)
def grad(self, (x,), (gz,)):
......@@ -2451,6 +2503,11 @@ class Cos(UnaryScalarOp):
amd_float64 = "amd_vrda_cos"
def impl(self, x):
# If x is an int8 or uint8, numpy.cos will compute the result in
# half-precision (float16), where we want float32.
x_dtype = str(getattr(x, 'dtype', ''))
if x_dtype in ('int8', 'uint8'):
return numpy.cos(x, sig='f')
return numpy.cos(x)
def grad(self, (x, ), (gz, )):
......@@ -2473,6 +2530,11 @@ cos = Cos(upgrade_to_float, name='cos')
class ArcCos(UnaryScalarOp):
def impl(self, x):
# If x is an int8 or uint8, numpy.arccos will compute the result in
# half-precision (float16), where we want float32.
x_dtype = str(getattr(x, 'dtype', ''))
if x_dtype in ('int8', 'uint8'):
return numpy.arccos(x, sig='f')
return numpy.arccos(x)
def grad(self, (x,), (gz,)):
......@@ -2498,6 +2560,11 @@ class Sin(UnaryScalarOp):
amd_float64 = "amd_vrda_sin"
def impl(self, x):
# If x is an int8 or uint8, numpy.sin will compute the result in
# half-precision (float16), where we want float32.
x_dtype = str(getattr(x, 'dtype', ''))
if x_dtype in ('int8', 'uint8'):
return numpy.sin(x, sig='f')
return numpy.sin(x)
def grad(self, (x, ), (gz, )):
......@@ -2520,6 +2587,11 @@ sin = Sin(upgrade_to_float, name='sin')
class ArcSin(UnaryScalarOp):
def impl(self, x):
# If x is an int8 or uint8, numpy.arcsin will compute the result in
# half-precision (float16), where we want float32.
x_dtype = str(getattr(x, 'dtype', ''))
if x_dtype in ('int8', 'uint8'):
return numpy.arcsin(x, sig='f')
return numpy.arcsin(x)
def grad(self, (x,), (gz,)):
......@@ -2542,6 +2614,11 @@ arcsin = ArcSin(upgrade_to_float, name='arcsin')
class Tan(UnaryScalarOp):
def impl(self, x):
# If x is an int8 or uint8, numpy.tan will compute the result in
# half-precision (float16), where we want float32.
x_dtype = str(getattr(x, 'dtype', ''))
if x_dtype in ('int8', 'uint8'):
return numpy.tan(x, sig='f')
return numpy.tan(x)
def grad(self, (x,), (gz,)):
......@@ -2564,6 +2641,11 @@ tan = Tan(upgrade_to_float, name='tan')
class ArcTan(UnaryScalarOp):
def impl(self, x):
# If x is an int8 or uint8, numpy.arctan will compute the result in
# half-precision (float16), where we want float32.
x_dtype = str(getattr(x, 'dtype', ''))
if x_dtype in ('int8', 'uint8'):
return numpy.arctan(x, sig='f')
return numpy.arctan(x)
def grad(self, (x,), (gz,)):
......@@ -2586,6 +2668,13 @@ arctan = ArcTan(upgrade_to_float, name='arctan')
class ArcTan2(BinaryScalarOp):
def impl(self, y, x):
# If x and y are int8 or uint8, numpy.arctan2 will compute the result
# in half-precision (float16), where we want float32.
x_dtype = str(getattr(x, 'dtype', ''))
if x_dtype in ('int8', 'uint8'):
y_dtype = str(getattr(y, 'dtype', ''))
if y_dtype in ('int8', 'uint8'):
return numpy.arctan2(y, x, sig='f')
return numpy.arctan2(y, x)
def grad(self, (y, x), (gz,)):
......@@ -2621,6 +2710,11 @@ class Cosh(UnaryScalarOp):
cosh(x) = (exp(x) + exp(-x)) / 2
"""
def impl(self, x):
# If x is an int8 or uint8, numpy.cosh will compute the result in
# half-precision (float16), where we want float32.
x_dtype = str(getattr(x, 'dtype', ''))
if x_dtype in ('int8', 'uint8'):
return numpy.cosh(x, sig='f')
return numpy.cosh(x)
def grad(self, (x, ), (gz, )):
......@@ -2643,6 +2737,11 @@ cosh = Cosh(upgrade_to_float, name='cosh')
class ArcCosh(UnaryScalarOp):
def impl(self, x):
# If x is an int8 or uint8, numpy.arccosh will compute the result in
# half-precision (float16), where we want float32.
x_dtype = str(getattr(x, 'dtype', ''))
if x_dtype in ('int8', 'uint8'):
return numpy.arccosh(x, sig='f')
return numpy.arccosh(x)
def grad(self, (x, ), (gz, )):
......@@ -2668,6 +2767,11 @@ class Sinh(UnaryScalarOp):
sinh(x) = (exp(x) - exp(-x)) / 2
"""
def impl(self, x):
# If x is an int8 or uint8, numpy.sinh will compute the result in
# half-precision (float16), where we want float32.
x_dtype = str(getattr(x, 'dtype', ''))
if x_dtype in ('int8', 'uint8'):
return numpy.sinh(x, sig='f')
return numpy.sinh(x)
def grad(self, (x, ), (gz, )):
......@@ -2690,6 +2794,11 @@ sinh = Sinh(upgrade_to_float, name='sinh')
class ArcSinh(UnaryScalarOp):
def impl(self, x):
# If x is an int8 or uint8, numpy.arcsinh will compute the result in
# half-precision (float16), where we want float32.
x_dtype = str(getattr(x, 'dtype', ''))
if x_dtype in ('int8', 'uint8'):
return numpy.arcsinh(x, sig='f')
return numpy.arcsinh(x)
def grad(self, (x, ), (gz, )):
......@@ -2716,6 +2825,11 @@ class Tanh(UnaryScalarOp):
= (exp(2*x) - 1) / (exp(2*x) + 1)
"""
def impl(self, x):
# If x is an int8 or uint8, numpy.tanh will compute the result in
# half-precision (float16), where we want float32.
x_dtype = str(getattr(x, 'dtype', ''))
if x_dtype in ('int8', 'uint8'):
return numpy.tanh(x, sig='f')
return numpy.tanh(x)
def grad(self, (x, ), (gz, )):
......@@ -2738,6 +2852,11 @@ tanh = Tanh(upgrade_to_float, name='tanh')
class ArcTanh(UnaryScalarOp):
def impl(self, x):
# If x is an int8 or uint8, numpy.arctanh will compute the result in
# half-precision (float16), where we want float32.
x_dtype = str(getattr(x, 'dtype', ''))
if x_dtype in ('int8', 'uint8'):
return numpy.arctanh(x, sig='f')
return numpy.arctanh(x)
def grad(self, (x, ), (gz, )):
......
......@@ -10,6 +10,7 @@ If you do want to rewrite these tests, bear in mind:
"""
import unittest
import numpy as np
import theano
from theano.gof import FunctionGraph
......@@ -20,8 +21,12 @@ from theano.scalar.basic import (floats, float32, float64,
ints, int8, int32, complex64,
ComplexError, IntDiv, TrueDiv,
Composite, add, div_proxy, clip,
and_, eq, neq, invert, mul)
import numpy
and_, eq, neq, invert, mul, Scalar)
from theano.scalar.basic import (
true_div, inv, log, log2, log10, log1p, exp, exp2, expm1, sqrt, deg2rad,
rad2deg, cos, arccos, sin, arcsin, tan, arctan, arctan2, cosh, arccosh,
sinh, arcsinh, tanh, arctanh)
def inputs():
return floats('xyz')
......@@ -75,7 +80,7 @@ class test_ScalarOps(unittest.TestCase):
g3 = theano.gradient.grad(a3, x)
fn3 = gof.DualLinker().accept(FunctionGraph([x], [g3])).make_function()
rng = numpy.random.RandomState(utt.fetch_seed())
rng = np.random.RandomState(utt.fetch_seed())
ntests = 50
for i in xrange(ntests):
......@@ -235,6 +240,128 @@ class test_logical(unittest.TestCase):
self.assertTrue(fn(a,b) == ~a, (a,))
# This class does not inherit from unittest.TestCase, because it would
# interfere with the "yield" mechanism that automatically generates test, see
# http://stackoverflow.com/questions/6689537/nose-test-generators-inside-class
# Therefore, it needs to be named "test_..." or "Test_...", so nose can pick
# it up by name, otherwise the tests would not be executed.
class test_upgrade_to_float(object):
# Test for Ops whose output has to be floating point, even when all
# inputs are ints.
# In particular, when the inputs are int8, the output should be
# at least float32, not float16.
unary_ops_vals = [
(inv, range(-127, 0) + range(1, 127)),
(sqrt, range(0, 128)),
(log, range(1, 128)),
(log2, range(1, 128)),
(log10, range(1, 128)),
(log1p, range(0, 128)),
(exp, range(-127, 89)),
(exp2, range(-127, 89)),
(expm1, range(-127, 89)),
(deg2rad, range(-127, 128)),
(rad2deg, range(-127, 128)),
(cos, range(-127, 128)),
(arccos, range(-1, 2)),
(cosh, range(-89, 90)),
(arccosh, range(1, 128)),
(sin, range(-127, 128)),
(arcsin, range(-1, 2)),
(sinh, range(-89, 90)),
(arcsinh, range(-127, 128)),
(tan, range(-3, 4)),
(arctan, range(-127, 128)),
(tanh, range(-127, 128)),
(arctanh, [0])]
binary_ops_vals = [
(arctan2, range(-127, 128), range(-127, 128))]
@staticmethod
def _test_unary(unary_op, x_range):
xi = int8('xi')
xf = float32('xf')
ei = unary_op(xi)
fi = theano.function([xi], ei)
ef = unary_op(xf)
ff = theano.function([xf], ef)
for x_val in x_range:
outi = fi(x_val)
outf = ff(x_val)
assert outi.dtype == outf.dtype, 'incorrect dtype'
assert np.allclose(outi, outf), 'insufficient precision'
@staticmethod
def _test_binary(binary_op, x_range, y_range):
xi = int8('xi')
yi = int8('yi')
xf = float32('xf')
yf = float32('yf')
ei = binary_op(xi, yi)
fi = theano.function([xi, yi], ei)
ef = binary_op(xf, yf)
ff = theano.function([xf, yf], ef)
for x_val in x_range:
for y_val in y_range:
outi = fi(x_val, y_val)
outf = ff(x_val, y_val)
assert outi.dtype == outf.dtype, 'incorrect dtype'
assert np.allclose(outi, outf), 'insufficient precision'
def test_true_div(self):
# true_div's upcast policy is not exactly "upgrade_to_float",
# so the test is a little bit different
x_range = range(-127, 128)
y_range = range(-127, 0) + range(1, 127)
xi = int8('xi')
yi = int8('yi')
xf = Scalar(theano.config.floatX)('xf')
yf = Scalar(theano.config.floatX)('yf')
ei = true_div(xi, yi)
fi = theano.function([xi, yi], ei)
ef = true_div(xf, yf)
ff = theano.function([xf, yf], ef)
for x_val in x_range:
for y_val in y_range:
outi = fi(x_val, y_val)
outf = ff(x_val, y_val)
assert outi.dtype == outf.dtype, 'incorrect dtype'
assert np.allclose(outi, outf), 'insufficient precision'
def test_unary(self):
# Automatically define all individual unary tests
for unary_op, x_range in self.unary_ops_vals:
test_name = 'test_%s' % unary_op.name
# Make a lambda function so we can name the test; bind the loop
# variables as default arguments so each yielded test keeps its own op.
test = lambda op=unary_op, r=x_range: self._test_unary(op, r)
test.description = test_name
yield test
def test_binary(self):
# Automatically define all individual binary tests
for binary_op, x_range, y_range in self.binary_ops_vals:
test_name = 'test_%s' % binary_op.name
# Make a lambda function so we can name the test; bind the loop
# variables as default arguments so each yielded test keeps its own op.
test = lambda op=binary_op, xr=x_range, yr=y_range: \
self._test_binary(op, xr, yr)
test.description = test_name
yield test
class test_complex_mod(unittest.TestCase):
"""Make sure % fails on complex numbers."""
......
......@@ -1812,7 +1812,7 @@ def round(a, mode="half_away_from_zero"):
raise Exception("round mode %s is not implemented." % mode)
@_scal_elemwise_with_nfunc('around', 1, -1)
@_scal_elemwise_with_nfunc('around', 1, 1)
def round_half_to_even(a):
"""round_half_to_even(a)"""
......@@ -1952,20 +1952,20 @@ def chi2sf(x, k):
#numpy.real(float32) return a view on the inputs.
#@_scal_elemwise_with_nfunc('real', 1, -1)
#@_scal_elemwise_with_nfunc('real', 1, 1)
@_scal_elemwise
def real(z):
"""Return real component of complex-valued tensor `z`"""
_tensor_py_operators.real = property(real)
@_scal_elemwise_with_nfunc('imag', 1, -1)
@_scal_elemwise_with_nfunc('imag', 1, 1)
def imag(z):
"""Return imaginary component of complex-valued tensor `z`"""
_tensor_py_operators.imag = property(imag)
@_scal_elemwise_with_nfunc('angle', 1, -1)
@_scal_elemwise_with_nfunc('angle', 1, 1)
def angle(z):
"""Return polar-coordinate angle of complex-valued tensor `z`"""
......@@ -1975,7 +1975,7 @@ def complex(real, imag):
"""Return complex-valued tensor with `real` and `imag` components"""
@_scal_elemwise_with_nfunc('conj', 1, -1)
@_scal_elemwise_with_nfunc('conj', 1, 1)
def conj(z):
"""Return the complex conjugate of `z`."""
......
......@@ -18,9 +18,10 @@ from theano.tensor import elemwise_cgen as cgen
config = theano.config
# We cannot import discrete_dtypes from tensor.basic yet,
# We cannot import discrete_dtypes or float_dtypes from tensor.basic yet,
# so we redefine them here
discrete_dtypes = map(str, scalar.discrete_types)
float_dtypes = map(str, scalar.float_types)
# tensor depends on elemwise to provide definitions for several ops
......@@ -472,14 +473,11 @@ class Elemwise(OpenMPOp):
the input's storage. (Just like destroymap, but without the lists.)
* nfunc_spec: either None or a tuple of three elements,
(nfunc_name, nin, nout) such that getattr(numpy, nfunc_name)
implements this operation, takes nin inputs and abs(nout) outputs
(nout < 0 if the numpy function does not provide the option of
providing a numpy array to store the results in). Note that nin
cannot always be inferred from the scalar op's own nin field
because that value is sometimes 0 (meaning a variable number of
inputs), whereas the numpy function may not have varargs.
NOTE: as of now, the sign of the nout field is ignored (some work
needs to be done to resize the destinations when needed).
implements this operation, takes nin inputs and nout outputs.
Note that nin cannot always be inferred from the scalar op's
own nin field because that value is sometimes 0 (meaning a
variable number of inputs), whereas the numpy function may
not have varargs.
"""
if inplace_pattern is None:
inplace_pattern = {}
......@@ -819,43 +817,24 @@ class Elemwise(OpenMPOp):
out_shape.append(max(values))
out_shape = tuple(out_shape)
# Commented as we don't reuse outputs now.
#
# if not self.inplace_pattern:
# for output, storage in izip(node.outputs, output_storage):
# odat = storage[0]
# if odat is not None:
# if odat.shape != out_shape:
# # It is unsafe to try to resize odat,
# # we have to allocate output storage.
# odat = None
# if odat is None:
# odat = numpy.ndarray(out_shape, dtype=output.type.dtype)
# storage[0] = odat
# else:
# for i, (output, storage) in enumerate(
# izip(node.outputs, output_storage)):
# #i is an output idx
# if i in self.inplace_pattern:
# odat = inputs[self.inplace_pattern[i]]
# else:
# odat = storage[0]
# if odat is not None:
# if odat.shape != out_shape:
# # It is unsafe to try to resize odat,
# # we have to allocate output storage.
# odat = None
# if odat is None:
# odat = numpy.ndarray(out_shape,
# dtype=output.type.dtype)
# storage[0] = odat
ufunc_args = inputs # + output_storage
ufunc_args = inputs
ufunc_kwargs = {}
if self.nfunc and len(inputs) == self.nfunc_spec[1]:
ufunc = self.nfunc
nout = self.nfunc_spec[2]
if nout < 0:
nout = -nout
# Numpy ufuncs will sometimes perform operations in
# float16, in particular when the input is int8.
# This is not something that we want, and we do not
# do it in the C code, so we specify that the computation
# should be carried out in the returned dtype.
# This is done via the "sig" kwarg of the ufunc, its value
# should be something like "ff->f", where the characters
# represent the dtype of the inputs and outputs.
out_dtype = node.outputs[0].dtype
if out_dtype in float_dtypes and isinstance(ufunc, numpy.ufunc):
char = numpy.sctype2char(out_dtype)
sig = char * node.nin + '->' + char * node.nout
ufunc_kwargs['sig'] = sig
# Unfortunately, the else case does not allow us to
# directly feed the destination arguments to the nfunc
# since it sometimes requires resizing. Doing this
......@@ -869,7 +848,7 @@ class Elemwise(OpenMPOp):
self.scalar_op.nout))
nout = ufunc.nout
variables = ufunc(*ufunc_args)
variables = ufunc(*ufunc_args, **ufunc_kwargs)
if nout == 1:
variables = [variables]
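For example, for an elemwise exp on an int8 input whose output dtype is
float32 (nin=1, nout=1), the signature built above works out to:

    char = numpy.sctype2char('float32')  # 'f'
    sig = char * 1 + '->' + char * 1     # 'f->f'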
......
......@@ -31,6 +31,11 @@ class ScalarSigmoid(scalar.UnaryScalarOp):
return 0.0
if x > 30.0:
return 1.0
# If x is an int8 or uint8, numpy.exp will compute the result in
# half-precision (float16), where we want float32.
x_dtype = str(getattr(x, 'dtype', ''))
if x_dtype in ('int8', 'uint8'):
return 1.0 / (1.0 + numpy.exp(-x, sig='f'))
return 1.0 / (1.0 + numpy.exp(-x))
def impl(self, x):
......@@ -268,8 +273,11 @@ def hard_sigmoid(x):
Removing the slope and shift does not make it faster.
"""
slope = 0.2
shift = 0.5
# Use the same dtype as determined by "upgrade_to_float",
# and perform computation in that dtype.
out_dtype = scalar.upgrade_to_float(scalar.Scalar(dtype=x.dtype))[0].dtype
slope = tensor.constant(0.2, dtype=out_dtype)
shift = tensor.constant(0.5, dtype=out_dtype)
x = (x * slope) + shift
x = tensor.clip(x, 0, 1)
return x
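A sketch of what upgrade_to_float resolves to here (this mirrors the
contract exercised by test_upgrade_to_float elsewhere in this commit):

    from theano import scalar
    # int8 inputs must upgrade to at least float32, never float16.
    assert scalar.upgrade_to_float(
        scalar.Scalar(dtype='int8'))[0].dtype == 'float32'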
......@@ -300,6 +308,11 @@ class ScalarSoftplus(scalar.UnaryScalarOp):
return 0.0
if x > 30.0:
return x
# If x is an int8 or uint8, numpy.exp will compute the result in
# half-precision (float16), where we want float32.
x_dtype = str(getattr(x, 'dtype', ''))
if x_dtype in ('int8', 'uint8'):
return numpy.log1p(numpy.exp(x, sig='f'))
return numpy.log1p(numpy.exp(x))
def impl(self, x):
......
......@@ -16,7 +16,7 @@ from theano.tensor.nnet.sigm import (
register_local_1msigmoid, simplify_mul,
)
from theano.tensor.tests.test_basic import (makeBroadcastTester, rand,
check_floatX,
check_floatX, upcast_int8_nfunc,
_good_broadcast_unary_normal_no_complex)
......@@ -30,8 +30,8 @@ class T_sigmoid(unittest.TestCase):
SigmoidTester = makeBroadcastTester(
op=sigmoid,
expected=lambda inputs: check_floatX(
inputs, 1/(1+numpy.exp(-inputs))),
expected=upcast_int8_nfunc(lambda inputs: check_floatX(
inputs, 1 / (1 + numpy.exp(-inputs)))),
good=_good_broadcast_unary_normal_no_complex,
#grad=_grad_broadcast_unary_normal,
name='SigmoidTester',
......@@ -39,8 +39,8 @@ SigmoidTester = makeBroadcastTester(
UltraFastSigmoidTester = makeBroadcastTester(
op=ultra_fast_sigmoid,
expected=lambda inputs: check_floatX(
inputs, 1/(1+numpy.exp(-inputs))),
expected=upcast_int8_nfunc(lambda inputs: check_floatX(
inputs, 1 / (1 + numpy.exp(-inputs)))),
good=_good_broadcast_unary_normal_no_complex,
#grad=_grad_broadcast_unary_normal,
name='UltraFastSigmoidTester',
......@@ -49,20 +49,21 @@ UltraFastSigmoidTester = makeBroadcastTester(
HardSigmoidTester = makeBroadcastTester(
op=hard_sigmoid,
expected=lambda inputs: check_floatX(
inputs, 1/(1+numpy.exp(-inputs))),
expected=upcast_int8_nfunc(lambda inputs: check_floatX(
inputs, 1 / (1 + numpy.exp(-inputs)))),
good=_good_broadcast_unary_normal_no_complex,
#grad=_grad_broadcast_unary_normal,
name='UltraFastSigmoidTester',
name='HardSigmoidTester',
# This is an approximation of the sigmoid. That is why we raise eps.
eps=1e-1)
SoftplusTester = makeBroadcastTester(
op=softplus,
expected=lambda inputs: check_floatX(
inputs, numpy.log1p(numpy.exp(inputs))),
good=_good_broadcast_unary_normal_no_complex,
expected=upcast_int8_nfunc(lambda inputs: check_floatX(
inputs, numpy.log1p(numpy.exp(inputs)))),
good=dict(_good_broadcast_unary_normal_no_complex,
int8=[numpy.arange(-127, 89, dtype='int8')]),
#grad=_grad_broadcast_unary_normal,
name='SoftplusTester',
)
......
......@@ -189,6 +189,50 @@ def safe_make_node(op, *inputs):
return node.owner
def upcast_float16_ufunc(fn):
"""Decorator that enforces computation is not done in float16 by NumPy.
Some ufuncs in NumPy will compute float values on int8 and uint8
in half-precision (float16), which is not enough, and not compatible
with the C code.
:param fn: numpy ufunc
:returns: function similar to fn.__call__, computing the same
value with a minimum floating-point precision of float32
"""
def ret(*args, **kwargs):
out_dtype = numpy.find_common_type(
[a.dtype for a in args], [numpy.float16])
if out_dtype == 'float16':
# Force everything to float32
sig = 'f' * fn.nin + '->' + 'f' * fn.nout
kwargs.update(sig=sig)
return fn(*args, **kwargs)
return ret
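Usage sketch (assumes NumPy would otherwise pick a float16 loop for the
int8 input, as described above):

    exp32 = upcast_float16_ufunc(numpy.exp)
    x = numpy.arange(-5, 5, dtype='int8')
    assert exp32(x).dtype == numpy.float32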
def upcast_int8_nfunc(fn):
"""Decorator that upcasts input of dtype int8 to float32.
This is so that floating-point computation is not carried using
half-precision (float16), as some NumPy functions do.
:param fn: function computing a floating-point value from inputs
:returns: function similar to fn, but upcasting its uint8 and int8
inputs before carrying out the computation.
"""
def ret(*args, **kwargs):
args = list(args)
for i, a in enumerate(args):
if getattr(a, 'dtype', None) in ('int8', 'uint8'):
args[i] = a.astype('float32')
return fn(*args, **kwargs)
return ret
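Usage sketch (the wrapped function sees float32 inputs instead of int8):

    log32 = upcast_int8_nfunc(numpy.log)
    assert log32(numpy.arange(1, 5, dtype='int8')).dtype == numpy.float32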
def makeTester(name, op, expected, checks=None, good=None, bad_build=None,
bad_runtime=None, grad=None, mode=None, grad_rtol=None,
eps=1e-10, skip=False, test_memmap=True, check_name=True):
......@@ -321,7 +365,8 @@ def makeTester(name, op, expected, checks=None, good=None, bad_build=None,
expecteds = self.expected(*inputs)
eps = 1e-10
if any([i.dtype == 'float32' for i in inputs]):
if any([i.dtype in ('float32', 'int8', 'uint8')
for i in inputs]):
eps = 1e-6
eps = numpy.max([eps, _eps])
......@@ -788,6 +833,9 @@ _good_broadcast_div_mod_normal_float_no_complex = dict(
integer=(randint(2, 3), randint_nonzero(2, 3)),
uinteger=(randint(2, 3).astype("uint8"),
randint_nonzero(2, 3).astype("uint8")),
int8=[numpy.tile(numpy.arange(-127, 128, dtype='int8'), [254, 1]).T,
numpy.tile(numpy.array(range(-127, 0) + range(1, 128), dtype='int8'),
[255, 1])],
# This empty2 doesn't work for some tests. I don't remember why
#empty2=(numpy.asarray([0]), numpy.asarray([])),
)
......@@ -853,7 +901,7 @@ def _numpy_true_div(x, y):
TrueDivTester = makeBroadcastTester(
op=tensor.true_div,
expected=_numpy_true_div,
good=_good_broadcast_div_mod_normal_float,
good=_good_broadcast_div_mod_normal_float_no_complex,
grad=_grad_broadcast_div_mod_normal,
grad_rtol=div_grad_rtol,
)
......@@ -864,12 +912,48 @@ TrueDivInplaceTester = makeBroadcastTester(
good=copymod(
_good_broadcast_div_mod_normal_float_inplace,
# The output is now in float, we cannot work inplace on an int.
without=['integer', 'uinteger']),
without=['integer', 'uinteger', 'int8']),
grad=_grad_broadcast_div_mod_normal,
grad_rtol=div_grad_rtol,
inplace=True)
_good_inv = dict(
normal=[5 * rand_nonzero((2, 3))],
integers=[randint_nonzero(2, 3)],
int8=[numpy.array(range(-127, 0) + range(1, 127), dtype='int8')],
complex=[randcomplex_nonzero((2, 3))],
empty=[numpy.asarray([], dtype=config.floatX)])
_good_inv_inplace = copymod(_good_inv, without=['integers', 'int8', 'complex'])
_grad_inv = copymod(_good_inv,
without=['integers', 'int8', 'complex', 'empty'])
_bad_runtime_inv = dict(
float=[numpy.zeros((2, 3))],
integers=[numpy.zeros((2, 3), dtype='int64')],
int8=[numpy.zeros((2, 3), dtype='int8')],
complex=[numpy.zeros((2, 3), dtype='complex128')])
InvTester = makeBroadcastTester(
op=tensor.inv,
expected=lambda x: upcast_int8_nfunc(numpy.true_divide)(numpy.int8(1), x),
good=_good_inv,
bad_runtime=_bad_runtime_inv,
grad=_grad_inv,
grad_rtol=div_grad_rtol)
InvInplaceTester = makeBroadcastTester(
op=inplace.inv_inplace,
expected=lambda x: _numpy_true_div(numpy.int8(1), x),
good=_good_inv_inplace,
bad_runtime=_bad_runtime_inv,
grad=_grad_inv,
grad_rtol=div_grad_rtol,
inplace=True)
CeilIntDivTester = makeBroadcastTester(
op=tensor.ceil_intdiv,
expected=lambda x, y: check_floatX((x, y), (x // y) + ((x % y) != 0)),
......@@ -990,6 +1074,8 @@ _good_broadcast_unary_normal = dict(
normal=[numpy.asarray(rand_ranged(-5, 5, (2, 3)),
dtype=config.floatX)],
integers=[randint_ranged(-5, 5, (2, 3))],
# not using -128 because numpy.allclose would return False
int8=[numpy.arange(-127, 128, dtype='int8')],
corner_case=[corner_case],
complex=[randcomplex(2, 3)],
empty=[numpy.asarray([], dtype=config.floatX)],
......@@ -998,6 +1084,7 @@ _good_broadcast_unary_normal = dict(
_good_broadcast_unary_normal_no_complex = dict(
normal=[numpy.asarray(rand_ranged(-5, 5, (2, 3)), dtype=floatX)],
integers=[randint_ranged(-5, 5, (2, 3))],
int8=[numpy.arange(-127, 128, dtype='int8')],
corner_case=[corner_case],
empty=[numpy.asarray([], dtype=config.floatX)],
)
......@@ -1020,6 +1107,8 @@ _grad_broadcast_unary_0_2_no_complex = dict(
normal=[numpy.asarray(rand_ranged(0, 2, (2, 3)), dtype=floatX)],
)
#inplace ops when the input is integer and the output is float*
# don't have a well defined behavior. We don't test that case.
AbsTester = makeBroadcastTester(op=tensor.abs_,
expected=lambda x: abs(x),
......@@ -1160,112 +1249,123 @@ SqrInplaceTester = makeBroadcastTester(op=inplace.sqr_inplace,
grad=_grad_broadcast_unary_normal,
inplace=True)
ExpTester = makeBroadcastTester(op=tensor.exp,
expected=numpy.exp,
good=_good_broadcast_unary_normal,
grad=_grad_broadcast_unary_normal)
ExpInplaceTester = makeBroadcastTester(op=inplace.exp_inplace,
expected=numpy.exp,
good=_good_broadcast_unary_normal,
grad=_grad_broadcast_unary_normal,
inplace=True)
def _numpy_exp2_round_int(x):
# Make sure exp2 on an int returns a value that can be correctly casted
# to an int. For instance, numpy.exp2(4) sometimes returns
# 15.999999999999998, we make sure we return 16. instead.
# This is used in Exp2InplaceTester.
out = numpy.exp2(x)
if x.dtype in tensor.discrete_dtypes:
out = numpy.round(out)
return out
ExpTester = makeBroadcastTester(
op=tensor.exp,
expected=upcast_float16_ufunc(numpy.exp),
good=dict(_good_broadcast_unary_normal,
int8=[numpy.arange(-127, 89, dtype='int8')]),
grad=_grad_broadcast_unary_normal)
ExpInplaceTester = makeBroadcastTester(
op=inplace.exp_inplace,
expected=numpy.exp,
good=_good_broadcast_unary_normal_float,
grad=_grad_broadcast_unary_normal,
inplace=True)
Exp2Tester = makeBroadcastTester(op=tensor.exp2,
expected=numpy.exp2,
expected=upcast_float16_ufunc(numpy.exp2),
good=_good_broadcast_unary_normal,
grad=_grad_broadcast_unary_normal)
Exp2InplaceTester = makeBroadcastTester(op=inplace.exp2_inplace,
expected=_numpy_exp2_round_int,
good=_good_broadcast_unary_normal,
grad=_grad_broadcast_unary_normal,
inplace=True)
Exp2InplaceTester = makeBroadcastTester(
op=inplace.exp2_inplace,
expected=numpy.exp2,
good=_good_broadcast_unary_normal_float,
grad=_grad_broadcast_unary_normal,
inplace=True)
Expm1Tester = makeBroadcastTester(op=tensor.expm1,
expected=numpy.expm1,
good=_good_broadcast_unary_normal,
grad=_grad_broadcast_unary_normal)
Expm1InplaceTester = makeBroadcastTester(op=inplace.expm1_inplace,
expected=numpy.expm1,
good=_good_broadcast_unary_normal,
grad=_grad_broadcast_unary_normal,
inplace=True)
Expm1Tester = makeBroadcastTester(
op=tensor.expm1,
expected=upcast_float16_ufunc(numpy.expm1),
good=dict(_good_broadcast_unary_normal,
int8=[numpy.arange(-127, 89, dtype='int8')]),
grad=_grad_broadcast_unary_normal)
Expm1InplaceTester = makeBroadcastTester(
op=inplace.expm1_inplace,
expected=numpy.expm1,
good=_good_broadcast_unary_normal_float,
grad=_grad_broadcast_unary_normal,
inplace=True)
_good_broadcast_unary_positive = dict(
normal=(rand_ranged(0.001, 5, (2, 3)),),
integers=(randint_ranged(1, 5, (2, 3)),),
uint8=[numpy.arange(1, 256, dtype='uint8')],
complex=(randc128_ranged(1, 5, (2, 3)),),
empty=(numpy.asarray([], dtype=config.floatX),),
)
_good_broadcast_unary_positive = dict(normal=(rand_ranged(0.001, 5, (2, 3)),),
integers=(randint_ranged(1, 5, (2, 3)),),
complex=(randc128_ranged(1, 5, (2, 3)),),
empty=(numpy.asarray([], dtype=config.floatX),),
)
_good_broadcast_unary_positive_float = copymod(
_good_broadcast_unary_positive,
without=['integers', 'uint8'])
_grad_broadcast_unary_positive = dict(normal=(rand_ranged(0.001, 5, (2, 3)),),)
LogTester = makeBroadcastTester(op=tensor.log,
expected=numpy.log,
expected=upcast_float16_ufunc(numpy.log),
good=_good_broadcast_unary_positive,
grad=_grad_broadcast_unary_positive)
LogInplaceTester = makeBroadcastTester(op=inplace.log_inplace,
expected=numpy.log,
good=_good_broadcast_unary_positive,
grad=_grad_broadcast_unary_positive,
inplace=True)
LogInplaceTester = makeBroadcastTester(
op=inplace.log_inplace,
expected=numpy.log,
good=_good_broadcast_unary_positive_float,
grad=_grad_broadcast_unary_positive,
inplace=True)
Log2Tester = makeBroadcastTester(op=tensor.log2,
expected=numpy.log2,
expected=upcast_float16_ufunc(numpy.log2),
good=_good_broadcast_unary_positive,
grad=_grad_broadcast_unary_positive)
Log2InplaceTester = makeBroadcastTester(op=inplace.log2_inplace,
expected=numpy.log2,
good=_good_broadcast_unary_positive,
grad=_grad_broadcast_unary_positive,
inplace=True)
Log2InplaceTester = makeBroadcastTester(
op=inplace.log2_inplace,
expected=numpy.log2,
good=_good_broadcast_unary_positive_float,
grad=_grad_broadcast_unary_positive,
inplace=True)
Log10Tester = makeBroadcastTester(op=tensor.log10,
expected=numpy.log10,
expected=upcast_float16_ufunc(numpy.log10),
good=_good_broadcast_unary_positive,
grad=_grad_broadcast_unary_positive)
Log10InplaceTester = makeBroadcastTester(op=inplace.log10_inplace,
expected=numpy.log10,
good=_good_broadcast_unary_positive,
grad=_grad_broadcast_unary_positive,
inplace=True)
Log10InplaceTester = makeBroadcastTester(
op=inplace.log10_inplace,
expected=numpy.log10,
good=_good_broadcast_unary_positive_float,
grad=_grad_broadcast_unary_positive,
inplace=True)
Log1pTester = makeBroadcastTester(op=tensor.log1p,
expected=numpy.log1p,
expected=upcast_float16_ufunc(numpy.log1p),
good=_good_broadcast_unary_positive,
grad=_grad_broadcast_unary_positive)
Log1pInplaceTester = makeBroadcastTester(op=inplace.log1p_inplace,
expected=numpy.log1p,
good=_good_broadcast_unary_positive,
grad=_grad_broadcast_unary_positive,
inplace=True)
Log1pInplaceTester = makeBroadcastTester(
op=inplace.log1p_inplace,
expected=numpy.log1p,
good=_good_broadcast_unary_positive_float,
grad=_grad_broadcast_unary_positive,
inplace=True)
SqrtTester = makeBroadcastTester(op=tensor.sqrt,
expected=numpy.sqrt,
expected=upcast_float16_ufunc(numpy.sqrt),
good=_good_broadcast_unary_positive,
grad=_grad_broadcast_unary_positive)
SqrtInplaceTester = makeBroadcastTester(op=inplace.sqrt_inplace,
expected=numpy.sqrt,
good=_good_broadcast_unary_positive,
grad=_grad_broadcast_unary_positive,
inplace=True)
SqrtInplaceTester = makeBroadcastTester(
op=inplace.sqrt_inplace,
expected=numpy.sqrt,
good=_good_broadcast_unary_positive_float,
grad=_grad_broadcast_unary_positive,
inplace=True)
_good_broadcast_unary_wide = dict(
normal=(rand_ranged(-1000, 1000, (2, 3)),),
integers=(randint_ranged(-1000, 1000, (2, 3)),),
int8=[numpy.arange(-127, 128, dtype='int8')],
complex=(randc128_ranged(-1000, 1000, (2, 3)),),
empty=(numpy.asarray([], dtype=config.floatX),),)
_good_broadcast_unary_wide_float = copymod(
_good_broadcast_unary_wide,
without=['integers', 'int8'])
_grad_broadcast_unary_wide = dict(normal=(rand_ranged(-1000, 1000, (2, 3)),),)
if theano.config.floatX == 'float32':
......@@ -1275,82 +1375,92 @@ else:
Deg2radTester = makeBroadcastTester(
op=tensor.deg2rad,
expected=numpy.deg2rad,
expected=upcast_float16_ufunc(numpy.deg2rad),
good=_good_broadcast_unary_normal_no_complex,
grad=_grad_broadcast_unary_normal_no_complex,
eps=angle_eps)
Deg2radInplaceTester = makeBroadcastTester(
op=inplace.deg2rad_inplace,
expected=numpy.deg2rad,
good=_good_broadcast_unary_normal_no_complex,
good=_good_broadcast_unary_normal_float_no_complex,
grad=_grad_broadcast_unary_normal_no_complex,
inplace=True,
eps=angle_eps)
Rad2degTester = makeBroadcastTester(
op=tensor.rad2deg,
expected=numpy.rad2deg,
expected=upcast_float16_ufunc(numpy.rad2deg),
good=_good_broadcast_unary_normal_no_complex,
grad=_grad_broadcast_unary_normal_no_complex,
eps=angle_eps)
Rad2degInplaceTester = makeBroadcastTester(
op=inplace.rad2deg_inplace,
expected=numpy.rad2deg,
good=_good_broadcast_unary_normal_no_complex,
good=_good_broadcast_unary_normal_float_no_complex,
grad=_grad_broadcast_unary_normal_no_complex,
inplace=True,
eps=angle_eps)
SinTester = makeBroadcastTester(op=tensor.sin,
expected=numpy.sin,
expected=upcast_float16_ufunc(numpy.sin),
good=_good_broadcast_unary_wide,
grad=_grad_broadcast_unary_wide)
SinInplaceTester = makeBroadcastTester(op=inplace.sin_inplace,
expected=numpy.sin,
good=_good_broadcast_unary_wide,
grad=_grad_broadcast_unary_wide,
inplace=True)
SinInplaceTester = makeBroadcastTester(
op=inplace.sin_inplace,
expected=numpy.sin,
good=_good_broadcast_unary_wide_float,
grad=_grad_broadcast_unary_wide,
inplace=True)
_good_broadcast_unary_arcsin = dict(normal=(rand_ranged(-1, 1, (2, 3)),),
integers=(randint_ranged(-1, 1, (2, 3)),),
complex=(randc128_ranged(-1, 1, (2, 3)),),
empty=(numpy.asarray([], dtype=config.floatX),),)
_good_broadcast_unary_arcsin = dict(
normal=(rand_ranged(-1, 1, (2, 3)),),
integers=(randint_ranged(-1, 1, (2, 3)),),
int8=[numpy.arange(-1, 2, dtype='int8')],
complex=(randc128_ranged(-1, 1, (2, 3)),),
empty=(numpy.asarray([], dtype=config.floatX),),)
_good_broadcast_unary_arcsin_float = copymod(
_good_broadcast_unary_arcsin,
without=['integers', 'int8'])
_grad_broadcast_unary_arcsin = dict(normal=(rand_ranged(-1, 1, (2, 3)),),)
ArcsinTester = makeBroadcastTester(op=tensor.arcsin,
expected=numpy.arcsin,
expected=upcast_float16_ufunc(numpy.arcsin),
good=_good_broadcast_unary_arcsin,
grad=_grad_broadcast_unary_arcsin)
ArcsinInplaceTester = makeBroadcastTester(op=inplace.arcsin_inplace,
expected=numpy.arcsin,
good=_good_broadcast_unary_arcsin,
grad=_grad_broadcast_unary_arcsin,
inplace=True)
ArcsinInplaceTester = makeBroadcastTester(
op=inplace.arcsin_inplace,
expected=numpy.arcsin,
good=_good_broadcast_unary_arcsin_float,
grad=_grad_broadcast_unary_arcsin,
inplace=True)
CosTester = makeBroadcastTester(op=tensor.cos,
expected=numpy.cos,
expected=upcast_float16_ufunc(numpy.cos),
good=_good_broadcast_unary_wide,
grad=_grad_broadcast_unary_wide)
CosInplaceTester = makeBroadcastTester(op=inplace.cos_inplace,
expected=numpy.cos,
good=_good_broadcast_unary_wide,
grad=_grad_broadcast_unary_wide,
inplace=True)
CosInplaceTester = makeBroadcastTester(
op=inplace.cos_inplace,
expected=numpy.cos,
good=_good_broadcast_unary_wide_float,
grad=_grad_broadcast_unary_wide,
inplace=True)
ArccosTester = makeBroadcastTester(op=tensor.arccos,
expected=numpy.arccos,
expected=upcast_float16_ufunc(numpy.arccos),
good=_good_broadcast_unary_arcsin,
grad=_grad_broadcast_unary_arcsin)
ArccosInplaceTester = makeBroadcastTester(op=inplace.arccos_inplace,
expected=numpy.arccos,
good=_good_broadcast_unary_arcsin,
grad=_grad_broadcast_unary_arcsin,
inplace=True)
ArccosInplaceTester = makeBroadcastTester(
op=inplace.arccos_inplace,
expected=numpy.arccos,
good=_good_broadcast_unary_arcsin_float,
grad=_grad_broadcast_unary_arcsin,
inplace=True)
_good_broadcast_unary_tan = dict(
normal=(rand_ranged(-3.14, 3.14, (2, 3)),),
shifted=(rand_ranged(3.15, 6.28, (2, 3)),),
integers=(randint_ranged(-3, 3, (2, 3)),),
int8=[numpy.arange(-3, 4, dtype='int8')],
complex=(randc128_ranged(-3.14, 3.14, (2, 3)),),
empty=(numpy.asarray([], dtype=config.floatX),),)
#We do not want to test around the discontinuity.
......@@ -1358,25 +1468,27 @@ _grad_broadcast_unary_tan = dict(normal=(rand_ranged(-1.5, 1.5, (2, 3)),),
shifted=(rand_ranged(1.6, 4.6, (2, 3)),))
TanTester = makeBroadcastTester(op=tensor.tan,
expected=numpy.tan,
expected=upcast_float16_ufunc(numpy.tan),
good=_good_broadcast_unary_tan,
grad=_grad_broadcast_unary_tan)
TanInplaceTester = makeBroadcastTester(op=inplace.tan_inplace,
expected=numpy.tan,
good=_good_broadcast_unary_tan,
grad=_grad_broadcast_unary_tan,
inplace=True)
TanInplaceTester = makeBroadcastTester(
op=inplace.tan_inplace,
expected=numpy.tan,
good=copymod(_good_broadcast_unary_tan, without=['integers', 'int8']),
grad=_grad_broadcast_unary_tan,
inplace=True)
ArctanTester = makeBroadcastTester(op=tensor.arctan,
                                   expected=upcast_float16_ufunc(numpy.arctan),
                                   good=_good_broadcast_unary_wide,
                                   grad=_grad_broadcast_unary_wide)

ArctanInplaceTester = makeBroadcastTester(
    op=inplace.arctan_inplace,
    expected=numpy.arctan,
    good=_good_broadcast_unary_wide_float,
    grad=_grad_broadcast_unary_wide,
    inplace=True)
_good_broadcast_binary_arctan2 = dict(
    same_shapes=(rand(2, 3), rand(2, 3)),
@@ -1385,6 +1497,8 @@
    row=(rand(2, 3), rand(1, 3)),
    column=(rand(2, 3), rand(2, 1)),
    integers=(randint(2, 3), randint(2, 3)),
    int8=[numpy.arange(-127, 128, dtype='int8'),
          numpy.arange(-127, 128, dtype='int8')[:, numpy.newaxis]],
    dtype_mixup_1=(rand(2, 3), randint(2, 3)),
    dtype_mixup_2=(randint(2, 3), rand(2, 3)),
    empty=(numpy.asarray([], dtype=config.floatX),
@@ -1398,100 +1512,110 @@ _grad_broadcast_binary_arctan2 = dict(
    column=(rand(2, 3), rand(2, 1)),
)
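# In the int8 case above, indexing with [:, numpy.newaxis] turns the
# second argument into a column, so the (255,) and (255, 1) arrays
# broadcast to a (255, 255) grid covering every (y, x) sign
# combination over the whole int8 range:
#
#     y = numpy.arange(-127, 128, dtype='int8')
#     x = y[:, numpy.newaxis]
#     assert numpy.arctan2(y, x).shape == (255, 255)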
Arctan2Tester = makeBroadcastTester(
    op=tensor.arctan2,
    expected=upcast_float16_ufunc(numpy.arctan2),
    good=_good_broadcast_binary_arctan2,
    grad=_grad_broadcast_binary_arctan2)

Arctan2InplaceTester = makeBroadcastTester(
    op=inplace.arctan2_inplace,
    expected=numpy.arctan2,
    good=copymod(_good_broadcast_binary_arctan2, without=['integers', 'int8']),
    grad=_grad_broadcast_binary_arctan2,
    inplace=True)

CoshTester = makeBroadcastTester(
    op=tensor.cosh,
    expected=upcast_float16_ufunc(numpy.cosh),
    good=dict(_good_broadcast_unary_normal,
              int8=[numpy.arange(-89, 90, dtype='int8')]),
    grad=_grad_broadcast_unary_normal)

CoshInplaceTester = makeBroadcastTester(
    op=inplace.cosh_inplace,
    expected=numpy.cosh,
    good=_good_broadcast_unary_normal_float,
    grad=_grad_broadcast_unary_normal,
    inplace=True)
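# The int8 range (-89, 90) used for cosh (and for sinh below) is
# presumably chosen so the result stays representable in float32:
# cosh(89) ~ 2.2e38 is still below the float32 maximum (~3.4e38),
# while cosh(90) ~ 6.1e38 overflows to inf:
#
#     assert numpy.isfinite(numpy.cosh(numpy.float32(89)))
#     assert numpy.isinf(numpy.cosh(numpy.float32(90)))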
_good_broadcast_unary_arccosh = dict(
    normal=(rand_ranged(1, 1000, (2, 3)),),
    integers=(randint_ranged(1, 1000, (2, 3)),),
    uint8=[numpy.arange(1, 256, dtype='uint8')],
    complex=(randc128_ranged(1, 1000, (2, 3)),),
    empty=(numpy.asarray([], dtype=config.floatX),),)
_grad_broadcast_unary_arccosh = dict(normal=(rand_ranged(1, 1000, (2, 3)),),)
ArccoshTester = makeBroadcastTester(
    op=tensor.arccosh,
    expected=upcast_float16_ufunc(numpy.arccosh),
    good=_good_broadcast_unary_arccosh,
    grad=_grad_broadcast_unary_arccosh)

ArccoshInplaceTester = makeBroadcastTester(
    op=inplace.arccosh_inplace,
    expected=numpy.arccosh,
    good=copymod(_good_broadcast_unary_arccosh, without=['integers', 'uint8']),
    grad=_grad_broadcast_unary_arccosh,
    inplace=True)

SinhTester = makeBroadcastTester(
    op=tensor.sinh,
    expected=upcast_float16_ufunc(numpy.sinh),
    good=dict(_good_broadcast_unary_normal,
              int8=[numpy.arange(-89, 90, dtype='int8')]),
    grad=_grad_broadcast_unary_normal)

SinhInplaceTester = makeBroadcastTester(
    op=inplace.sinh_inplace,
    expected=numpy.sinh,
    good=_good_broadcast_unary_normal_float,
    grad=_grad_broadcast_unary_normal,
    inplace=True)

ArcsinhTester = makeBroadcastTester(
    op=tensor.arcsinh,
    expected=upcast_float16_ufunc(numpy.arcsinh),
    good=_good_broadcast_unary_normal,
    grad=_grad_broadcast_unary_normal)

ArcsinhInplaceTester = makeBroadcastTester(
    op=inplace.arcsinh_inplace,
    expected=numpy.arcsinh,
    good=_good_broadcast_unary_normal_float,
    grad=_grad_broadcast_unary_normal,
    inplace=True)
TanhTester = makeBroadcastTester(op=tensor.tanh,
                                 expected=upcast_float16_ufunc(numpy.tanh),
                                 good=_good_broadcast_unary_normal,
                                 grad=_grad_broadcast_unary_normal)

TanhInplaceTester = makeBroadcastTester(
    op=inplace.tanh_inplace,
    expected=numpy.tanh,
    good=_good_broadcast_unary_normal_float,
    grad=_grad_broadcast_unary_normal,
    inplace=True)
_eps = 1e-10
_good_broadcast_unary_arctanh = dict(
    normal=(rand_ranged(-1 + _eps, 1 - _eps, (2, 3)),),
    integers=(randint_ranged(-1 + _eps, 1 - _eps, (2, 3)),),
    int8=[numpy.arange(0, 1, dtype='int8')],
    complex=(randc128_ranged(-1 + _eps, 1 - _eps, (2, 3)),),
    empty=(numpy.asarray([], dtype=config.floatX),),)
_grad_broadcast_unary_arctanh = dict(
    normal=(rand_ranged(-1 + _eps, 1 - _eps, (2, 3)),),)
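# _eps keeps the samples strictly inside arctanh's open domain (-1, 1);
# the function diverges at the endpoints:
#
#     numpy.arctanh(1.0)           # inf (pole at +/-1)
#     numpy.arctanh(1.0 - 1e-10)   # ~11.86, still finite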
ArctanhTester = makeBroadcastTester(
    op=tensor.arctanh,
    expected=upcast_float16_ufunc(numpy.arctanh),
    good=_good_broadcast_unary_arctanh,
    grad=_grad_broadcast_unary_arctanh)

ArctanhInplaceTester = makeBroadcastTester(
    op=inplace.arctanh_inplace,
    expected=numpy.arctanh,
    good=copymod(_good_broadcast_unary_arctanh, without=['integers', 'int8']),
    grad=_grad_broadcast_unary_arctanh,
    inplace=True)
# In-place ops whose input is an integer type but whose output is a
# float type do not have well-defined behavior, so we do not test
# that case.
_good_broadcast_unary_normal_no_int_no_complex = _good_broadcast_unary_normal_no_complex.copy()
del _good_broadcast_unary_normal_no_int_no_complex['integers']
_good_broadcast_unary_normal_no_int = _good_broadcast_unary_normal.copy()
del _good_broadcast_unary_normal_no_int['integers']
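# The issue: for an integer input the float result cannot live in the
# input's own buffer, so "in-place" has no consistent meaning there:
#
#     x = numpy.arange(-1, 2, dtype='int8')
#     assert numpy.arcsin(x).dtype != x.dtype   # float16 vs. int8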
# We can't test it if SciPy is not installed!
# Precomputing the result is brittle (it has been broken before!):
# if we make any modification to the random numbers here,
@@ -1528,7 +1652,7 @@ ErfTester = makeBroadcastTester(
ErfInplaceTester = makeBroadcastTester(
    op=inplace.erf_inplace,
    expected=expected_erf,
    good=_good_broadcast_unary_normal_float,
    grad=_grad_broadcast_unary_normal,
    mode=mode_no_scipy,
    eps=2e-10,
@@ -1538,7 +1662,7 @@ ErfInplaceTester = makeBroadcastTester(
ErfcTester = makeBroadcastTester(
    op=tensor.erfc,
    expected=expected_erfc,
    good=_good_broadcast_unary_normal_float_no_complex,
    grad=_grad_broadcast_unary_normal,
    eps=2e-10,
    mode=mode_no_scipy,
@@ -1546,7 +1670,7 @@ ErfcTester = makeBroadcastTester(
ErfcInplaceTester = makeBroadcastTester(
    op=inplace.erfc_inplace,
    expected=expected_erfc,
    good=_good_broadcast_unary_normal_float_no_complex,
    grad=_grad_broadcast_unary_normal,
    eps=2e-10,
    mode=mode_no_scipy,
@@ -1556,7 +1680,7 @@ ErfcInplaceTester = makeBroadcastTester(
ErfinvTester = makeBroadcastTester(
    op=tensor.erfinv,
    expected=expected_erfinv,
    good=_good_broadcast_unary_normal_float_no_complex,
    grad=_grad_broadcast_unary_abs1_no_complex,
    eps=2e-10,
    mode=mode_no_scipy,
@@ -1565,7 +1689,7 @@ ErfinvTester = makeBroadcastTester(
ErfcinvTester = makeBroadcastTester(
    op=tensor.erfcinv,
    expected=expected_erfcinv,
    good=_good_broadcast_unary_normal_float_no_complex,
    grad=_grad_broadcast_unary_0_2_no_complex,
    eps=2e-10,
    mode=mode_no_scipy,