提交 234ffeab authored 作者: Arnaud Bergeron's avatar Arnaud Bergeron

Add the float16 dtype and make sure it isn't lost to casting.

上级 fd7655aa
...@@ -142,7 +142,7 @@ class GpuElemwise(HideC, Elemwise): ...@@ -142,7 +142,7 @@ class GpuElemwise(HideC, Elemwise):
code.append('ga_float %s;' % (f[0],)) code.append('ga_float %s;' % (f[0],))
# XXX: The replace is an ugly hack to make sure temp # XXX: The replace is an ugly hack to make sure temp
# variables in the middle are float32 # variables in the middle are float32
code.append(kop.replace('npy_uint16', 'ga_float')) code.append(kop.replace('npy_float16', 'ga_float'))
for f in scal_f16: for f in scal_f16:
code.append('%s[i] = __float2half_rn(%s);' % (f[1].name, f[0])) code.append('%s[i] = __float2half_rn(%s);' % (f[1].name, f[0]))
code.append('}') code.append('}')
...@@ -195,6 +195,7 @@ class GpuElemwise(HideC, Elemwise): ...@@ -195,6 +195,7 @@ class GpuElemwise(HideC, Elemwise):
("npy_int16", "ga_short"), ("npy_int16", "ga_short"),
("npy_int32", "ga_int"), ("npy_int32", "ga_int"),
("npy_int64", "ga_long"), ("npy_int64", "ga_long"),
("npy_float16", "ga_half"),
("npy_float32", "ga_float"), ("npy_float32", "ga_float"),
("npy_float64", "ga_double"), ("npy_float64", "ga_double"),
]: ]:
......
from theano import scalar
def work_dtype(dtype): def work_dtype(dtype):
if dtype == 'float16': if dtype == 'float16':
...@@ -5,14 +7,21 @@ def work_dtype(dtype): ...@@ -5,14 +7,21 @@ def work_dtype(dtype):
else: else:
return dtype return dtype
def load_w(dtype):
    """Return the CUDA wrapper used when *loading* a value of `dtype`
    into the float32 working precision.

    float16 storage must be widened with `__half2float`; every other
    dtype is used as-is, so the wrapper is the empty string.
    """
    if dtype == 'float16':
        return '__half2float'
    else:
        return ''
def write_w(dtype):
    """Return the CUDA wrapper used when *writing* a float32 working
    value back to `dtype` storage.

    float16 storage needs `__float2half_rn` (round-to-nearest); every
    other dtype is written directly, so the wrapper is the empty string.
    """
    if dtype == 'float16':
        return '__float2half_rn'
    else:
        return ''
class Cast16(scalar.Cast):
    """Cast specialization substituted on GPU when float16 is involved.

    The generic ``scalar.Cast.c_code`` refuses float16 (no C-level half
    type), so for GPU kernels the cast is emitted as a plain assignment:
    the surrounding elemwise kernel already converts float16 storage
    to/from float32 working values (presumably via the load_w/write_w
    wrappers — confirm in the kernel generator), so no explicit C cast
    is needed here.
    """

    def c_code(self, node, name, inputs, outputs, sub):
        # Direct assignment; conversion is handled by the kernel wrappers.
        return "%s = %s;\n" % (outputs[0], inputs[0])
...@@ -31,6 +31,7 @@ from .nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias, ...@@ -31,6 +31,7 @@ from .nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias,
from .elemwise import (GpuElemwise, _is_scalar, from .elemwise import (GpuElemwise, _is_scalar,
GpuDimShuffle, GpuCAReduceCuda, GpuDimShuffle, GpuCAReduceCuda,
GpuCAReduceCPY) GpuCAReduceCPY)
from . import fp16_help
from .subtensor import (GpuIncSubtensor, GpuSubtensor, from .subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1,
GpuAdvancedIncSubtensor1_dev20) GpuAdvancedIncSubtensor1_dev20)
...@@ -253,10 +254,25 @@ def local_gpuflatten(node): ...@@ -253,10 +254,25 @@ def local_gpuflatten(node):
# NOTE(review): this span is a side-by-side diff scrape -- most lines show
# the old and the new revision of the same source line concatenated.  The
# comments below describe the NEW (right-hand) code path.
@op_lifter([tensor.Elemwise]) @op_lifter([tensor.Elemwise])
# Lift a CPU tensor.Elemwise node to its GpuElemwise equivalent,
# substituting fp16_help.Cast16 for plain Cast ops that touch float16.
def local_gpu_elemwise(node): def local_gpu_elemwise(node):
op = node.op op = node.op
scal_op = op.scalar_op
name = op.name name = op.name
if name: if name:
name = 'Gpu'+name name = 'Gpu'+name
# New in this commit: a bare Cast involving float16 on either side is
# replaced by fp16_help.Cast16, whose c_code emits a plain assignment
# instead of a C-level cast (which would fail for float16).
res = GpuElemwise(op.scalar_op, name=name, if (type(scal_op) == scalar.Cast and
(node.inputs[0].dtype == 'float16' or
node.outputs[0].dtype == 'float16')):
scal_op = fp16_help.Cast16(scal_op.o_type, name=scal_op.name)
# NOTE(review): the trailing `True` in this guard looks like either a
# scraping artifact or a leftover debug condition -- confirm against the
# upstream commit before trusting this transcription.
if (type(scal_op) == scalar.Composite and
True):
inputs, outputs = gof.graph.clone(scal_op.inputs, scal_op.outputs)
# NOTE(review): the loop variable comes from `variables(...)` but
# `.op`, `.inputs` and `.outputs` below are Apply-node attributes, not
# Variable attributes; presumably the real code iterates the cloned
# graph's Apply nodes -- verify against the upstream source.
for v in variables(inputs, outputs):
if (type(v.op) == scalar.Cast and
(v.inputs[0].dtype == 'float16' or
v.outputs[0].dtype == 'float16')):
# We cloned the graph before so this is ok
v.op = fp16_help.Cast16(v.op.o_type, name=v.op.name)
scal_op = scalar.Composite(inputs, outputs)
res = GpuElemwise(scal_op, name=name,
inplace_pattern=copy.copy(op.inplace_pattern), inplace_pattern=copy.copy(op.inplace_pattern),
nfunc_spec=op.nfunc_spec) nfunc_spec=op.nfunc_spec)
return res return res
......
...@@ -172,6 +172,7 @@ class GpuArrayType(Type): ...@@ -172,6 +172,7 @@ class GpuArrayType(Type):
# complex64, etc. # complex64, etc.
try: try:
return { return {
'float16': (float, 'npy_float16', 'NPY_FLOAT16'),
'float32': (float, 'npy_float32', 'NPY_FLOAT32'), 'float32': (float, 'npy_float32', 'NPY_FLOAT32'),
'float64': (float, 'npy_float64', 'NPY_FLOAT64'), 'float64': (float, 'npy_float64', 'NPY_FLOAT64'),
'uint8': (int, 'npy_uint8', 'NPY_UINT8'), 'uint8': (int, 'npy_uint8', 'NPY_UINT8'),
......
...@@ -1972,6 +1972,8 @@ class Cast(UnaryScalarOp): ...@@ -1972,6 +1972,8 @@ class Cast(UnaryScalarOp):
def c_code(self, node, name, inputs, outputs, sub):
    """Emit the C assignment casting the input to the output dtype.

    Raises NotImplementedError when float16 appears on either side,
    since the generated C-style cast has no half-precision support.
    """
    (x,) = inputs
    (z,) = outputs
    # BUG FIX: the original compared the output *Variable* itself to the
    # string ('node.outputs[0] == "float16"'), which is always False, so
    # float16 outputs slipped through to the broken C cast.  Compare the
    # dtype instead, matching the input-side check.
    if (node.inputs[0].dtype == 'float16' or
            node.outputs[0].dtype == 'float16'):
        raise NotImplementedError("C code doesn't work for float16")
    return "%s = (%s)%s;" % (z, node.outputs[0].type.dtype_specs()[1], x)
def grad(self, inputs, gout): def grad(self, inputs, gout):
...@@ -1997,6 +1999,7 @@ convert_to_uint8 = Cast(uint8, name='convert_to_uint8') ...@@ -1997,6 +1999,7 @@ convert_to_uint8 = Cast(uint8, name='convert_to_uint8')
convert_to_uint16 = Cast(uint16, name='convert_to_uint16') convert_to_uint16 = Cast(uint16, name='convert_to_uint16')
convert_to_uint32 = Cast(uint32, name='convert_to_uint32') convert_to_uint32 = Cast(uint32, name='convert_to_uint32')
convert_to_uint64 = Cast(uint64, name='convert_to_uint64') convert_to_uint64 = Cast(uint64, name='convert_to_uint64')
convert_to_float16 = Cast(float16, name='convert_to_float16')
convert_to_float32 = Cast(float32, name='convert_to_float32') convert_to_float32 = Cast(float32, name='convert_to_float32')
convert_to_float64 = Cast(float64, name='convert_to_float64') convert_to_float64 = Cast(float64, name='convert_to_float64')
convert_to_complex64 = Cast(complex64, name='convert_to_complex64') convert_to_complex64 = Cast(complex64, name='convert_to_complex64')
...@@ -2011,6 +2014,7 @@ _cast_mapping = { ...@@ -2011,6 +2014,7 @@ _cast_mapping = {
'uint16': convert_to_uint16, 'uint16': convert_to_uint16,
'uint32': convert_to_uint32, 'uint32': convert_to_uint32,
'uint64': convert_to_uint64, 'uint64': convert_to_uint64,
'float16': convert_to_float16,
'float32': convert_to_float32, 'float32': convert_to_float32,
'float64': convert_to_float64, 'float64': convert_to_float64,
'complex64': convert_to_complex64, 'complex64': convert_to_complex64,
......
...@@ -277,10 +277,8 @@ class NumpyAutocaster(object): ...@@ -277,10 +277,8 @@ class NumpyAutocaster(object):
# unsafe downcast of float64 variables when config.floatX == 'float32' # unsafe downcast of float64 variables when config.floatX == 'float32'
# recall: float is numpy.float # recall: float is numpy.float
if ((isinstance(x, float) and if ((isinstance(x, float) and
config.floatX in self.dtypes and config.floatX in self.dtypes)):
config.floatX == 'float32')): return theano._asarray(x, dtype=config.floatX)
return theano._asarray(x, dtype='float32')
for dtype in self.dtypes: for dtype in self.dtypes:
x_ = theano._asarray(x, dtype=dtype) x_ = theano._asarray(x, dtype=dtype)
...@@ -290,7 +288,7 @@ class NumpyAutocaster(object): ...@@ -290,7 +288,7 @@ class NumpyAutocaster(object):
return x_ return x_
autocast_int = NumpyAutocaster(('int8', 'int16', 'int32', 'int64')) autocast_int = NumpyAutocaster(('int8', 'int16', 'int32', 'int64'))
autocast_float = NumpyAutocaster(('float32', 'float64')) autocast_float = NumpyAutocaster(('float16', 'float32', 'float64'))
# autocast_float dtypes might be manipulated in tensor.__init__ # autocast_float dtypes might be manipulated in tensor.__init__
...@@ -313,7 +311,7 @@ class autocast_float_as(object): ...@@ -313,7 +311,7 @@ class autocast_float_as(object):
If `config.cast_policy` is not 'custom', an exception is raised. If `config.cast_policy` is not 'custom', an exception is raised.
For example: For example:
>>> with autocast_float_as('float32') as _dummy: >>> with autocast_float_as('float32'):
... assert (fvector() + 1.1).dtype == 'float32' # temporary downcasting ... assert (fvector() + 1.1).dtype == 'float32' # temporary downcasting
>>> assert (fvector() + 1.1).dtype == 'float64' # back to default behaviour >>> assert (fvector() + 1.1).dtype == 'float64' # back to default behaviour
...@@ -1137,6 +1135,10 @@ _convert_to_uint64 = _conversion( ...@@ -1137,6 +1135,10 @@ _convert_to_uint64 = _conversion(
elemwise.Elemwise(scal.convert_to_uint64), 'uint64') elemwise.Elemwise(scal.convert_to_uint64), 'uint64')
"""Cast to unsigned 64-bit integer""" """Cast to unsigned 64-bit integer"""
_convert_to_float16 = _conversion(
elemwise.Elemwise(scal.convert_to_float16), 'float16')
"""Cast to half-precision floating point"""
_convert_to_float32 = _conversion( _convert_to_float32 = _conversion(
elemwise.Elemwise(scal.convert_to_float32), 'float32') elemwise.Elemwise(scal.convert_to_float32), 'float32')
"""Cast to single-precision floating point""" """Cast to single-precision floating point"""
...@@ -1162,6 +1164,7 @@ _cast_mapping = { ...@@ -1162,6 +1164,7 @@ _cast_mapping = {
'uint16': _convert_to_uint16, 'uint16': _convert_to_uint16,
'uint32': _convert_to_uint32, 'uint32': _convert_to_uint32,
'uint64': _convert_to_uint64, 'uint64': _convert_to_uint64,
'float16': _convert_to_float16,
'float32': _convert_to_float32, 'float32': _convert_to_float32,
'float64': _convert_to_float64, 'float64': _convert_to_float64,
'complex64': _convert_to_complex64, 'complex64': _convert_to_complex64,
...@@ -2752,6 +2755,9 @@ def mean(input, axis=None, dtype=None, op=False, keepdims=False, ...@@ -2752,6 +2755,9 @@ def mean(input, axis=None, dtype=None, op=False, keepdims=False,
out = makeKeepDims(input, out, axis) out = makeKeepDims(input, out, axis)
return out return out
# float16 has very low precision so we do some things differently
f16 = (input.dtype == 'float16')
if dtype is not None: if dtype is not None:
# The summation will be done with the specified dtype. # The summation will be done with the specified dtype.
# sum() will complain if it is not suitable. # sum() will complain if it is not suitable.
...@@ -2760,6 +2766,9 @@ def mean(input, axis=None, dtype=None, op=False, keepdims=False, ...@@ -2760,6 +2766,9 @@ def mean(input, axis=None, dtype=None, op=False, keepdims=False,
# Let sum() infer the appropriate dtype. # Let sum() infer the appropriate dtype.
sum_dtype = None sum_dtype = None
if f16 and sum_dtype is None and acc_dtype != 'float16':
sum_dtype = 'float32'
s = sum(input, axis=axis, dtype=sum_dtype, keepdims=keepdims, s = sum(input, axis=axis, dtype=sum_dtype, keepdims=keepdims,
acc_dtype=acc_dtype) acc_dtype=acc_dtype)
shp = shape(input) shp = shape(input)
...@@ -2785,6 +2794,9 @@ def mean(input, axis=None, dtype=None, op=False, keepdims=False, ...@@ -2785,6 +2794,9 @@ def mean(input, axis=None, dtype=None, op=False, keepdims=False,
for i in axis: for i in axis:
s = true_div(s, shp[i]) s = true_div(s, shp[i])
if f16:
s = cast(s, 'float16')
return s return s
......
...@@ -1806,6 +1806,7 @@ class CAReduceDtype(CAReduce): ...@@ -1806,6 +1806,7 @@ class CAReduceDtype(CAReduce):
uint8='uint64', uint8='uint64',
uint16='uint64', uint16='uint64',
uint32='uint64', uint32='uint64',
float16='float32',
float32='float64', float32='float64',
complex64='complex128', complex64='complex128',
).get(idtype, idtype) ).get(idtype, idtype)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论