提交 edd1c456 authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #5164 from abergeron/dlt_f16_2

Fix some problems in float16.
...@@ -275,6 +275,7 @@ class GpuDot22(BlasOp): ...@@ -275,6 +275,7 @@ class GpuDot22(BlasOp):
Dot22 on the GPU. Dot22 on the GPU.
""" """
_f16_ok = True
__props__ = () __props__ = ()
def make_node(self, x, y): def make_node(self, x, y):
......
...@@ -1134,27 +1134,6 @@ def local_gpua_gemmbatch(op, context_name, inputs, outputs): ...@@ -1134,27 +1134,6 @@ def local_gpua_gemmbatch(op, context_name, inputs, outputs):
return gpugemmbatch_no_inplace(c, 1.0, a, b, 0.0) return gpugemmbatch_no_inplace(c, 1.0, a, b, 0.0)
@register_opt('fast_compile')
@op_lifter([tensor.basic.Dot])
@register_opt2([tensor.basic.Dot], 'fast_compile')
def local_gpua_hgemm(op, context_name, inputs, outputs):
    """Lift a float16 matrix-matrix `Dot` to a GPU GEMM.

    Returns a replacement graph only when both operands are 2-d float16
    tensors and CUDA >= 7.5 is available; otherwise returns None so other
    lifters may apply.
    """
    from theano.sandbox.cuda import nvcc_compiler
    # NOTE(review): plain string comparison of version numbers — e.g.
    # '10.0' < '7.5' lexicographically, so this guard would misfire for
    # CUDA versions >= 10; confirm against how nvcc_version is formatted.
    if nvcc_compiler.nvcc_version < '7.5':
        _logger.warning("Not performing dot of float16 on the GPU since "
                        "cuda 7.5 is not available. Updating could speed up "
                        "your code.")
        return
    A = inputs[0]
    B = inputs[1]
    if (A.ndim == 2 and B.ndim == 2 and
            A.dtype == 'float16' and B.dtype == 'float16'):
        fgraph = outputs[0].fgraph
        # Allocate an uninitialized float16 output C of shape
        # (A.shape[0], B.shape[1]) and compute C = 1.0 * A.B + 0.0 * C.
        C = gpu_alloc_empty(context_name, dtype='float16')(
            shape_i(A, 0, fgraph),
            shape_i(B, 1, fgraph))
        return gpugemm_no_inplace(C, 1.0, A, B, 0.0)
@register_opt() @register_opt()
@alpha_merge(GpuGemm, alpha_in=1, beta_in=4) @alpha_merge(GpuGemm, alpha_in=1, beta_in=4)
def local_gpua_gemm_alpha_merge(node, *inputs): def local_gpua_gemm_alpha_merge(node, *inputs):
......
...@@ -3,8 +3,6 @@ from unittest import TestCase ...@@ -3,8 +3,6 @@ from unittest import TestCase
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
import itertools import itertools
import numpy
import theano import theano
from theano import tensor from theano import tensor
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
...@@ -18,7 +16,7 @@ from .test_basic_ops import makeTester, rand ...@@ -18,7 +16,7 @@ from .test_basic_ops import makeTester, rand
from ..blas import (gpugemv_inplace, gpugemv_no_inplace, from ..blas import (gpugemv_inplace, gpugemv_no_inplace,
gpugemm_inplace, gpugemmbatch_no_inplace, gpugemm_inplace, gpugemmbatch_no_inplace,
gpuger_inplace, gpuger_no_inplace, gpuger_inplace, gpuger_no_inplace,
GpuGer, gpu_dot22, GpuGemm) GpuGer, gpu_dot22)
GpuGemvTester = makeTester( GpuGemvTester = makeTester(
...@@ -130,52 +128,3 @@ GpuDot22Tester = makeTester( ...@@ -130,52 +128,3 @@ GpuDot22Tester = makeTester(
# test9=[rand(0, 0), rand(0, 0)], # test9=[rand(0, 0), rand(0, 0)],
) )
) )
def test_hgemm_swap():
    """float16 dot is lifted to GpuGemm only for matrix x matrix operands.

    Checks that vector x matrix and mixed-dtype (float32 x float16) dots
    are left alone, while float16 matrix x matrix produces exactly one
    GpuGemm node whose result matches numpy.dot.
    """
    from theano.sandbox.cuda import nvcc_compiler
    if nvcc_compiler.nvcc_version < '7.5':
        # Fixed typo in the skip message ("avaialble" -> "available").
        raise SkipTest("SgemmEx is only available on cuda 7.5+")
    v = tensor.vector(dtype='float16')
    m = tensor.matrix(dtype='float16')
    m2 = tensor.matrix(dtype='float16')
    m32 = tensor.matrix(dtype='float32')
    # test that we don't try to replace anything but matrix x matrix in float16
    f = theano.function([v, m], tensor.dot(v, m), mode=mode_with_gpu)
    assert len([node for node in f.maker.fgraph.apply_nodes
                if isinstance(node.op, GpuGemm)]) == 0
    f = theano.function([m32, m], tensor.dot(m32, m), mode=mode_with_gpu)
    assert len([node for node in f.maker.fgraph.apply_nodes
                if isinstance(node.op, GpuGemm)]) == 0
    # matrix x matrix in float16 must be replaced by exactly one GpuGemm
    f = theano.function([m, m2], tensor.dot(m, m2), mode=mode_with_gpu)
    assert len([node for node in f.maker.fgraph.apply_nodes
                if isinstance(node.op, GpuGemm)]) == 1
    v1 = numpy.random.random((3, 4)).astype('float16')
    v2 = numpy.random.random((4, 2)).astype('float16')
    of = f(v1, v2)
    on = numpy.dot(v1, v2)
    utt.assert_allclose(of, on)
def test_hgemm_alpha_output_merge():
    """alpha scaling and the bias addition get merged into one hgemm node.

    After optimization the graph should contain only 3 gpu_from_host,
    1 hgemm and 1 host_from_gpu apply nodes (5 total).
    """
    from theano.sandbox.cuda import nvcc_compiler
    if nvcc_compiler.nvcc_version < '7.5':
        # Fixed typo in the skip message ("avaialble" -> "available").
        raise SkipTest("SgemmEx is only available on cuda 7.5+")
    m1 = tensor.matrix(dtype='float16')
    m2 = tensor.matrix(dtype='float16')
    b = tensor.matrix(dtype='float16')
    hgemm = numpy.asarray(0.05, dtype='float16') * (tensor.dot(m1, m2) + b)
    f = theano.function([m1, m2, b], hgemm, mode=mode_with_gpu)
    # there should be 3 gpu_from_host, 1 hgemm and 1 host_from_gpu
    assert len(f.maker.fgraph.apply_nodes) == 5
...@@ -18,6 +18,7 @@ from copy import copy ...@@ -18,6 +18,7 @@ from copy import copy
from textwrap import dedent from textwrap import dedent
import numpy import numpy
import six
from six.moves import xrange from six.moves import xrange
import theano import theano
...@@ -121,33 +122,165 @@ def as_scalar(x, name=None): ...@@ -121,33 +122,165 @@ def as_scalar(x, name=None):
raise TypeError("Cannot convert %s to Scalar" % x, type(x)) raise TypeError("Cannot convert %s to Scalar" % x, type(x))
def constant(x): class NumpyAutocaster(object):
# pass through numpy scalars, since they are already typed on """
This class is used to cast python ints and floats to numpy arrays.
The behavior when called on scalar `x` depends on `config.cast_policy`:
- 'numpy' will simply use the same type as found by `numpy.asarray(x)`.
- 'numpy+floatX' will do the same, except it will use float32 instead
of float64 if `x` is a Python float and `config.floatX` is set to
'float32' (note that if `x` is a numpy scalar whose data type is
float64, it is not modified since we assume the user is purposely
using float64).
- 'custom' lets one define a tuple of data types such that:
- if `x` is already a numpy scalar and its data type is in this
tuple, then it is returned unchanged;
- otherwise, the first data type in this tuple that can represent
`x` without loss of precision will be used, unless `x` is a float
and 'float32' is in the tuple (in which case `x` is cast as a
float32);
- if no data type can represent `x` without loss of precision, then
the last data type in the tuple will be used.
Parameters
----------
dtypes: tuple of strings
The ordered list of preferred data types (only used when
`config.cast_policy` is set to 'custom', see the `NumpyAutocaster`
help for details).
"""
def __init__(self, dtypes):
self.dtypes = tuple(dtypes)
def __call__(self, x):
# Make sure we only deal with scalars.
assert (isinstance(x, six.integer_types) or
isinstance(x, builtin_float) or
(isinstance(x, numpy.ndarray) and x.ndim == 0))
if config.cast_policy == 'numpy':
return numpy.asarray(x)
elif config.cast_policy == 'numpy+floatX':
rval = numpy.asarray(x)
if ((not hasattr(x, 'dtype') and
rval.dtype in ('float64', 'float32') and
rval.dtype != config.floatX)):
rval = theano._asarray(rval, dtype=config.floatX)
return rval
# The following is the original code, corresponding to the 'custom'
# option for `config.cast_policy`.
assert config.cast_policy == 'custom'
try:
# Pass through numpy scalars, since they are already typed on
# purpose typically. # purpose typically.
if hasattr(x, 'dtype'): if str(x.dtype) in self.dtypes:
assert x.ndim == 0 # No need to cast `x` into a new dtype. Note that we still
return ScalarConstant(get_scalar_type(str(x.dtype)), x) # need to convert it into an array, because it may not be
if isinstance(x, builtin_float): # one already (e.g. if x == numpy.float64(1.1)).
for dtype in ['float32', 'float64']: return numpy.asarray(x)
except AttributeError:
# Means `x` has no 'dtype' attribute.
pass
# unsafe downcast of float64 variables when config.floatX == 'float32'
# recall: float is numpy.float
if ((isinstance(x, float) and
config.floatX in self.dtypes and
config.floatX != 'float64')):
return theano._asarray(x, dtype=config.floatX)
# Don't autocast to float16 unless config.floatX is float16
try_dtypes = [d for d in self.dtypes
if config.floatX == 'float16' or d != 'float16']
for dtype in try_dtypes:
x_ = theano._asarray(x, dtype=dtype) x_ = theano._asarray(x, dtype=dtype)
if numpy.all(x == x_): if numpy.all(x == x_):
break break
x_ = None # returns either an exact x_==x, or the last cast x_
assert x_ is not None return x_
return ScalarConstant(get_scalar_type(str(x_.dtype)), x)
if isinstance(x, builtin_int): autocast_int = NumpyAutocaster(('int8', 'int16', 'int32', 'int64'))
for dtype in ['int8', 'int16', 'int32', 'int64']: # autocast_float dtypes might be manipulated in tensor.*
autocast_float = NumpyAutocaster(('float16', 'float32', 'float64'))
class autocast_float_as(object):
    """
    Temporarily adjust autocasting behavior.

    Context manager that locally overrides the dtype preference tuple of
    the module-level `autocast_float` caster, restoring it on exit.  It is
    only meaningful when `config.cast_policy` is 'custom'; any other
    policy triggers an AssertionError.

    Handy in some code, but above all useful for testing the autocasting
    mechanism itself.

    Examples
    --------
    >>> with autocast_float_as('float32'):
    ...     assert (fvector() + 1.1).dtype == 'float32'  # temporary downcasting
    >>> assert (fvector() + 1.1).dtype == 'float64' # back to default behaviour
    """
    def __init__(self, *dtypes):
        self.dtypes = dtypes
        assert config.cast_policy == 'custom'

    def __enter__(self):
        assert config.cast_policy == 'custom'
        # Remember the module-wide preference so __exit__ can restore it.
        self.old_dtypes = autocast_float.dtypes
        autocast_float.dtypes = self.dtypes

    def __exit__(self, *args):
        assert config.cast_policy == 'custom'
        autocast_float.dtypes = self.old_dtypes
def convert(x, dtype=None):
"""
Convert the input to a properly typed numpy value according to the
current casting policy. Work with scalars and tensors.
"""
if dtype is not None:
# in this case, the semantics are that the caller is forcing the dtype
x_ = theano._asarray(x, dtype=dtype) x_ = theano._asarray(x, dtype=dtype)
if numpy.all(x == x_): else:
break # In this case, this function should infer the dtype according to the
# autocasting rules. See autocasting above.
x_ = None x_ = None
assert x_ is not None if isinstance(x, six.integer_types):
return ScalarConstant(get_scalar_type(str(x_.dtype)), x) try:
if isinstance(x, builtin_complex): x_ = autocast_int(x)
# TODO: We have added the complex type, so this should be tested except OverflowError:
raise NotImplementedError() # This is to imitate numpy behavior which tries to fit
raise TypeError(x) # bigger numbers into a uint64.
# return ScalarConstant(float64, float(x)) x_ = theano._asarray(x, dtype='uint64')
elif isinstance(x, builtin_float):
x_ = autocast_float(x)
elif isinstance(x, numpy.ndarray):
x_ = x
else:
# Here x is probably a list or a tuple. If it contains a
# long, we will behave like the current NumPy version: it
# will work if the long fits in int64 or uint64.
x_ = numpy.asarray(x)
if x_.size == 0 and not hasattr(x, 'dtype'):
x_ = numpy.asarray(x, dtype=config.floatX)
assert type(x_) in [numpy.ndarray, numpy.memmap]
return x_
def constant(x):
    """Return a `ScalarConstant` wrapping `x` cast via the autocasting rules.

    `x` must convert to a 0-d numpy array (a scalar); anything with more
    dimensions fails the assertion below.
    """
    x = convert(x)
    # convert() guarantees a numpy ndarray/memmap; scalars must be 0-d.
    assert x.ndim == 0
    return ScalarConstant(get_scalar_type(str(x.dtype)), x)
class Scalar(Type): class Scalar(Type):
......
...@@ -219,138 +219,6 @@ _as_tensor_variable = as_tensor_variable ...@@ -219,138 +219,6 @@ _as_tensor_variable = as_tensor_variable
as_tensor = as_tensor_variable as_tensor = as_tensor_variable
class NumpyAutocaster(object):
    """
    This class is used to cast python ints and floats to numpy arrays.

    The behavior when called on scalar `x` depends on `config.cast_policy`:
        - 'numpy' will simply use the same type as found by `numpy.asarray(x)`.
        - 'numpy+floatX' will do the same, except it will use float32 instead
          of float64 if `x` is a Python float and `config.floatX` is set to
          'float32' (note that if `x` is a numpy scalar whose data type is
          float64, it is not modified since we assume the user is purposely
          using float64).
        - 'custom' lets one define a tuple of data types such that:
            - if `x` is already a numpy scalar and its data type is in this
              tuple, then it is returned unchanged;
            - otherwise, the first data type in this tuple that can represent
              `x` without loss of precision will be used, unless `x` is a float
              and 'float32' is in the tuple (in which case `x` is cast as a
              float32);
            - if no data type can represent `x` without loss of precision, then
              the last data type in the tuple will be used.

    Parameters
    ----------
    dtypes: tuple of strings
        The ordered list of preferred data types (only used when
        `config.cast_policy` is set to 'custom', see the `NumpyAutocaster`
        help for details).

    """
    def __init__(self, dtypes):
        # Stored as a tuple; `autocast_float_as` swaps this attribute
        # wholesale to adjust behavior temporarily.
        self.dtypes = tuple(dtypes)

    def __call__(self, x):
        # Make sure we only deal with scalars.
        assert (isinstance(x, integer_types) or
                isinstance(x, float) or
                (isinstance(x, numpy.ndarray) and x.ndim == 0))

        if config.cast_policy == 'numpy':
            return numpy.asarray(x)
        elif config.cast_policy == 'numpy+floatX':
            rval = numpy.asarray(x)
            # Only values without an explicit dtype (Python floats) are
            # downcast to floatX; numpy float64 scalars pass through.
            if ((not hasattr(x, 'dtype') and
                 rval.dtype in ('float64', 'float32') and
                 rval.dtype != config.floatX)):
                rval = theano._asarray(rval, dtype=config.floatX)
            return rval

        # The following is the original code, corresponding to the 'custom'
        # option for `config.cast_policy`.
        assert config.cast_policy == 'custom'

        try:
            # Pass through numpy scalars, since they are already typed on
            # purpose typically.
            if str(x.dtype) in self.dtypes:
                # No need to cast `x` into a new dtype. Note that we still
                # need to convert it into an array, because it may not be
                # one already (e.g. if x == numpy.float64(1.1)).
                return numpy.asarray(x)
        except AttributeError:
            # Means `x` has no 'dtype' attribute.
            pass

        # unsafe downcast of float64 variables when config.floatX == 'float32'
        # recall: float is numpy.float
        if ((isinstance(x, float) and
             config.floatX in self.dtypes and
             config.floatX != 'float64')):
            return theano._asarray(x, dtype=config.floatX)

        # Don't autocast to float16 unless config.floatX is float16
        try_dtypes = [d for d in self.dtypes
                      if config.floatX == 'float16' or d != 'float16']

        for dtype in try_dtypes:
            x_ = theano._asarray(x, dtype=dtype)
            if numpy.all(x == x_):
                # First dtype (in preference order) that represents `x`
                # without loss of precision wins.
                break
        # returns either an exact x_==x, or the last cast x_
        return x_
autocast_int = NumpyAutocaster(('int8', 'int16', 'int32', 'int64'))
autocast_float = NumpyAutocaster(('float16', 'float32', 'float64'))
# autocast_float dtypes might be manipulated in tensor.__init__
#
# Note: it's a bit weird for a compiler to automatically downcast
# literals like this, and it might have implications for efficiency
# when mixing types. For example when you add 1.0 + dmatrix(), the
# 1.0 could be converted to float32, and require upcasting for the +
# operation at every position in the dmatrix. using
# theano._asarray(1.0, dtype='float64') will circumvent this
# autocasting, and in future, our ops might be smarter about factoring
# out upcasts. The advantage of this mechanism is to combine it with
# floatX so that 1.0 + xmatrix() will always have the same type as the
# xmatrix().
#
class autocast_float_as(object):
    """
    Temporarily adjust autocasting behavior.

    This class makes it possible to temporarily and locally adjust autocasting
    behavior when `config.cast_policy` is set to 'custom'.
    If `config.cast_policy` is not 'custom', an exception is raised.
    This class might be convenient in some code, but it definitely
    helps to test the autocasting mechanism.

    Examples
    --------
    >>> with autocast_float_as('float32'):
    ...     assert (fvector() + 1.1).dtype == 'float32'  # temporary downcasting
    >>> assert (fvector() + 1.1).dtype == 'float64' # back to default behaviour

    """
    def __init__(self, *dtypes):
        self.dtypes = dtypes
        assert config.cast_policy == 'custom'

    def __enter__(self):
        assert config.cast_policy == 'custom'
        # Save the current module-wide preference so it can be restored.
        self.old_dtypes = autocast_float.dtypes
        autocast_float.dtypes = self.dtypes

    def __exit__(self, *args):
        assert config.cast_policy == 'custom'
        # Restore whatever was in effect before entering the context.
        autocast_float.dtypes = self.old_dtypes
def constant_or_value(x, rtype, name=None, ndim=None, dtype=None): def constant_or_value(x, rtype, name=None, ndim=None, dtype=None):
"""Return a symbolic `Constant` with value `x`. """Return a symbolic `Constant` with value `x`.
...@@ -362,32 +230,7 @@ def constant_or_value(x, rtype, name=None, ndim=None, dtype=None): ...@@ -362,32 +230,7 @@ def constant_or_value(x, rtype, name=None, ndim=None, dtype=None):
`x` could not be expanded to have ndim dimensions. `x` could not be expanded to have ndim dimensions.
""" """
if dtype is not None: x_ = scal.convert(x, dtype=dtype)
# in this case, the semantics are that the caller is forcing the dtype
x_ = theano._asarray(x, dtype=dtype)
else:
# In this case, this function should infer the dtype according to the
# autocasting rules. See autocasting above.
x_ = None
if rtype is TensorConstant and isinstance(x, integer_types):
try:
x_ = autocast_int(x)
except OverflowError:
# This is to imitate numpy behavior which tries to fit
# bigger numbers into a uint64.
x_ = theano._asarray(x, dtype='uint64')
elif rtype is TensorConstant and isinstance(x, float):
x_ = autocast_float(x)
elif isinstance(x, numpy.ndarray):
x_ = x
else:
# Here x is probably a list or a tuple. If it contains a
# long, we will behave like the current NumPy version: it
# will work if the long fits in int64 or uint64.
x_ = numpy.asarray(x)
if x_.size == 0 and not hasattr(x, 'dtype'):
x_ = numpy.asarray(x, dtype=config.floatX)
assert type(x_) in [numpy.ndarray, numpy.memmap]
bcastable = [d == 1 for d in x_.shape] bcastable = [d == 1 for d in x_.shape]
if ndim is not None: if ndim is not None:
...@@ -3155,11 +2998,9 @@ def mean(input, axis=None, dtype=None, op=False, keepdims=False, ...@@ -3155,11 +2998,9 @@ def mean(input, axis=None, dtype=None, op=False, keepdims=False,
sum_dtype = dtype sum_dtype = dtype
else: else:
sum_dtype = None sum_dtype = None
# float16 overflows on the cast way too often
# float16 overflows way too fast for sum if input.dtype == 'float16':
if ((sum_dtype == 'float16' or input.dtype == 'float16') and sum_dtype = 'float32'
acc_dtype != 'float16'):
sum_dtype == 'float32'
s = sum(input, axis=axis, dtype=sum_dtype, keepdims=keepdims, s = sum(input, axis=axis, dtype=sum_dtype, keepdims=keepdims,
acc_dtype=acc_dtype) acc_dtype=acc_dtype)
......
...@@ -1093,14 +1093,14 @@ def _as_scalar(res, dtype=None): ...@@ -1093,14 +1093,14 @@ def _as_scalar(res, dtype=None):
def _is_real_matrix(res): def _is_real_matrix(res):
return (res.type.dtype in ('float32', 'float64') and return (res.type.dtype in ('float16', 'float32', 'float64') and
res.type.ndim == 2 and res.type.ndim == 2 and
res.type.broadcastable[0] is False and res.type.broadcastable[0] is False and
res.type.broadcastable[1] is False) # cope with tuple vs. list res.type.broadcastable[1] is False) # cope with tuple vs. list
def _is_real_vector(res): def _is_real_vector(res):
return (res.type.dtype in ('float32', 'float64') and return (res.type.dtype in ('float16', 'float32', 'float64') and
res.type.ndim == 1 and res.type.ndim == 1 and
res.type.broadcastable[0] is False) res.type.broadcastable[0] is False)
...@@ -1195,7 +1195,7 @@ def _gemm_canonicalize(r, scale, rval, maxclients): ...@@ -1195,7 +1195,7 @@ def _gemm_canonicalize(r, scale, rval, maxclients):
return None return None
if ((r.type.ndim not in (1, 2)) or if ((r.type.ndim not in (1, 2)) or
r.type.dtype not in ('float32', 'float64', r.type.dtype not in ('float16', 'float32', 'float64',
'complex64', 'complex128')): 'complex64', 'complex128')):
rval.append(scaled(r)) rval.append(scaled(r))
return rval return rval
...@@ -1528,7 +1528,7 @@ class Dot22(GemmRelated): ...@@ -1528,7 +1528,7 @@ class Dot22(GemmRelated):
""" """
def make_node(self, x, y): def make_node(self, x, y):
dtypes = ('float32', 'float64', 'complex64', 'complex128') dtypes = ('float16', 'float32', 'float64', 'complex64', 'complex128')
if x.type.ndim != 2 or x.type.dtype not in dtypes: if x.type.ndim != 2 or x.type.dtype not in dtypes:
raise TypeError(x) raise TypeError(x)
if y.type.ndim != 2 or y.type.dtype not in dtypes: if y.type.ndim != 2 or y.type.dtype not in dtypes:
...@@ -1621,7 +1621,7 @@ def local_dot_to_dot22(node): ...@@ -1621,7 +1621,7 @@ def local_dot_to_dot22(node):
x, y, x.type, y.type) x, y, x.type, y.type)
return return
if y.type.dtype in ['float32', 'float64', 'complex64', 'complex128']: if y.type.dtype in ['float16', 'float32', 'float64', 'complex64', 'complex128']:
if x.ndim == 2 and y.ndim == 2: if x.ndim == 2 and y.ndim == 2:
# print "local_dot_to_dot22: MM" # print "local_dot_to_dot22: MM"
return [_dot22(*node.inputs)] return [_dot22(*node.inputs)]
......
...@@ -26,11 +26,12 @@ from six.moves import StringIO, reduce ...@@ -26,11 +26,12 @@ from six.moves import StringIO, reduce
from theano import compile, config, function, gof, tensor, shared from theano import compile, config, function, gof, tensor, shared
from theano.compile import DeepCopyOp from theano.compile import DeepCopyOp
from theano.compile.mode import get_default_mode from theano.compile.mode import get_default_mode
from theano.tensor import (_shared, wvector, bvector, autocast_float_as, from theano.scalar import autocast_float_as, autocast_float
from theano.tensor import (_shared, wvector, bvector,
argmin, max_and_argmax, cscalar, ctensor3, join, argmin, max_and_argmax, cscalar, ctensor3, join,
horizontal_stack, vertical_stack, argmax, get_vector_length, horizontal_stack, vertical_stack, argmax, get_vector_length,
fscalar, zeros_like, sum, tensor3, vector, add, addbroadcast, fscalar, zeros_like, sum, tensor3, vector, add, addbroadcast,
alloc, as_tensor_variable, tensor_from_scalar, ARange, autocast_float, alloc, as_tensor_variable, tensor_from_scalar, ARange,
clip, constant, default, dot, batched_dot, clip, constant, default, dot, batched_dot,
dmatrix, dscalar, dvector, eq, eye, fill, flatten, inverse_permutation, dmatrix, dscalar, dvector, eq, eye, fill, flatten, inverse_permutation,
tensor4, permute_row_elements, Flatten, fmatrix, fscalars, grad, tensor4, permute_row_elements, Flatten, fmatrix, fscalars, grad,
...@@ -4595,6 +4596,12 @@ class T_mean(unittest.TestCase): ...@@ -4595,6 +4596,12 @@ class T_mean(unittest.TestCase):
except AttributeError: except AttributeError:
self.fail() self.fail()
def test_mean_f16(self):
    """mean() of a long float16 vector of ones evaluates close to 1.0."""
    data = numpy.ones((100000,), dtype='float16')
    xv = tensor.vector(dtype='float16')
    fn = theano.function([xv], xv.mean())
    utt.assert_allclose(fn(data), 1.0)
def test0(self): def test0(self):
# Simple test... # Simple test...
x = tensor.vector() x = tensor.vector()
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论