提交 edd1c456 authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #5164 from abergeron/dlt_f16_2

Fix some problems in float16.
...@@ -275,6 +275,7 @@ class GpuDot22(BlasOp): ...@@ -275,6 +275,7 @@ class GpuDot22(BlasOp):
Dot22 on the GPU. Dot22 on the GPU.
""" """
_f16_ok = True
__props__ = () __props__ = ()
def make_node(self, x, y): def make_node(self, x, y):
......
...@@ -1134,27 +1134,6 @@ def local_gpua_gemmbatch(op, context_name, inputs, outputs): ...@@ -1134,27 +1134,6 @@ def local_gpua_gemmbatch(op, context_name, inputs, outputs):
return gpugemmbatch_no_inplace(c, 1.0, a, b, 0.0) return gpugemmbatch_no_inplace(c, 1.0, a, b, 0.0)
@register_opt('fast_compile')
@op_lifter([tensor.basic.Dot])
@register_opt2([tensor.basic.Dot], 'fast_compile')
def local_gpua_hgemm(op, context_name, inputs, outputs):
    """Lift a float16 matrix-matrix `Dot` to a GPU GEMM.

    Returns a replacement graph only when both operands are 2-d float16
    tensors and CUDA >= 7.5 is available; otherwise returns None so other
    lifters may apply.
    """
    from theano.sandbox.cuda import nvcc_compiler
    # NOTE(review): plain string comparison of version numbers — e.g.
    # '10.0' < '7.5' lexicographically, so this guard would misfire for
    # CUDA versions >= 10; confirm against how nvcc_version is formatted.
    if nvcc_compiler.nvcc_version < '7.5':
        _logger.warning("Not performing dot of float16 on the GPU since "
                        "cuda 7.5 is not available. Updating could speed up "
                        "your code.")
        return
    A = inputs[0]
    B = inputs[1]
    if (A.ndim == 2 and B.ndim == 2 and
            A.dtype == 'float16' and B.dtype == 'float16'):
        fgraph = outputs[0].fgraph
        # Allocate an uninitialized float16 output C of shape
        # (A.shape[0], B.shape[1]) and compute C = 1.0 * A.B + 0.0 * C.
        C = gpu_alloc_empty(context_name, dtype='float16')(
            shape_i(A, 0, fgraph),
            shape_i(B, 1, fgraph))
        return gpugemm_no_inplace(C, 1.0, A, B, 0.0)
@register_opt() @register_opt()
@alpha_merge(GpuGemm, alpha_in=1, beta_in=4) @alpha_merge(GpuGemm, alpha_in=1, beta_in=4)
def local_gpua_gemm_alpha_merge(node, *inputs): def local_gpua_gemm_alpha_merge(node, *inputs):
......
...@@ -3,8 +3,6 @@ from unittest import TestCase ...@@ -3,8 +3,6 @@ from unittest import TestCase
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
import itertools import itertools
import numpy
import theano import theano
from theano import tensor from theano import tensor
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
...@@ -18,7 +16,7 @@ from .test_basic_ops import makeTester, rand ...@@ -18,7 +16,7 @@ from .test_basic_ops import makeTester, rand
from ..blas import (gpugemv_inplace, gpugemv_no_inplace, from ..blas import (gpugemv_inplace, gpugemv_no_inplace,
gpugemm_inplace, gpugemmbatch_no_inplace, gpugemm_inplace, gpugemmbatch_no_inplace,
gpuger_inplace, gpuger_no_inplace, gpuger_inplace, gpuger_no_inplace,
GpuGer, gpu_dot22, GpuGemm) GpuGer, gpu_dot22)
GpuGemvTester = makeTester( GpuGemvTester = makeTester(
...@@ -130,52 +128,3 @@ GpuDot22Tester = makeTester( ...@@ -130,52 +128,3 @@ GpuDot22Tester = makeTester(
# test9=[rand(0, 0), rand(0, 0)], # test9=[rand(0, 0), rand(0, 0)],
) )
) )
def test_hgemm_swap():
    """float16 dot is lifted to GpuGemm only for matrix x matrix operands.

    Checks that vector x matrix and mixed-dtype (float32 x float16) dots
    are left alone, while float16 matrix x matrix produces exactly one
    GpuGemm node whose result matches numpy.dot.
    """
    from theano.sandbox.cuda import nvcc_compiler
    if nvcc_compiler.nvcc_version < '7.5':
        # Fixed typo in the skip message ("avaialble" -> "available").
        raise SkipTest("SgemmEx is only available on cuda 7.5+")
    v = tensor.vector(dtype='float16')
    m = tensor.matrix(dtype='float16')
    m2 = tensor.matrix(dtype='float16')
    m32 = tensor.matrix(dtype='float32')
    # test that we don't try to replace anything but matrix x matrix in float16
    f = theano.function([v, m], tensor.dot(v, m), mode=mode_with_gpu)
    assert len([node for node in f.maker.fgraph.apply_nodes
                if isinstance(node.op, GpuGemm)]) == 0
    f = theano.function([m32, m], tensor.dot(m32, m), mode=mode_with_gpu)
    assert len([node for node in f.maker.fgraph.apply_nodes
                if isinstance(node.op, GpuGemm)]) == 0
    # matrix x matrix in float16 must be replaced by exactly one GpuGemm
    f = theano.function([m, m2], tensor.dot(m, m2), mode=mode_with_gpu)
    assert len([node for node in f.maker.fgraph.apply_nodes
                if isinstance(node.op, GpuGemm)]) == 1
    v1 = numpy.random.random((3, 4)).astype('float16')
    v2 = numpy.random.random((4, 2)).astype('float16')
    of = f(v1, v2)
    on = numpy.dot(v1, v2)
    utt.assert_allclose(of, on)
def test_hgemm_alpha_output_merge():
    """alpha scaling and the bias addition get merged into one hgemm node.

    After optimization the graph should contain only 3 gpu_from_host,
    1 hgemm and 1 host_from_gpu apply nodes (5 total).
    """
    from theano.sandbox.cuda import nvcc_compiler
    if nvcc_compiler.nvcc_version < '7.5':
        # Fixed typo in the skip message ("avaialble" -> "available").
        raise SkipTest("SgemmEx is only available on cuda 7.5+")
    m1 = tensor.matrix(dtype='float16')
    m2 = tensor.matrix(dtype='float16')
    b = tensor.matrix(dtype='float16')
    hgemm = numpy.asarray(0.05, dtype='float16') * (tensor.dot(m1, m2) + b)
    f = theano.function([m1, m2, b], hgemm, mode=mode_with_gpu)
    # there should be 3 gpu_from_host, 1 hgemm and 1 host_from_gpu
    assert len(f.maker.fgraph.apply_nodes) == 5
...@@ -18,6 +18,7 @@ from copy import copy ...@@ -18,6 +18,7 @@ from copy import copy
from textwrap import dedent from textwrap import dedent
import numpy import numpy
import six
from six.moves import xrange from six.moves import xrange
import theano import theano
...@@ -121,33 +122,165 @@ def as_scalar(x, name=None): ...@@ -121,33 +122,165 @@ def as_scalar(x, name=None):
raise TypeError("Cannot convert %s to Scalar" % x, type(x)) raise TypeError("Cannot convert %s to Scalar" % x, type(x))
def constant(x): class NumpyAutocaster(object):
# pass through numpy scalars, since they are already typed on """
This class is used to cast python ints and floats to numpy arrays.
The behavior when called on scalar `x` depends on `config.cast_policy`:
- 'numpy' will simply use the same type as found by `numpy.asarray(x)`.
- 'numpy+floatX' will do the same, except it will use float32 instead
of float64 if `x` is a Python float and `config.floatX` is set to
'float32' (note that if `x` is a numpy scalar whose data type is
float64, it is not modified since we assume the user is purposely
using float64).
- 'custom' lets one define a tuple of data types such that:
- if `x` is already a numpy scalar and its data type is in this
tuple, then it is returned unchanged;
- otherwise, the first data type in this tuple that can represent
`x` without loss of precision will be used, unless `x` is a float
and 'float32' is in the tuple (in which case `x` is cast as a
float32);
- if no data type can represent `x` without loss of precision, then
the last data type in the tuple will be used.
Parameters
----------
dtypes: tuple of strings
The ordered list of preferred data types (only used when
`config.cast_policy` is set to 'custom', see the `NumpyAutocaster`
help for details).
"""
def __init__(self, dtypes):
self.dtypes = tuple(dtypes)
def __call__(self, x):
# Make sure we only deal with scalars.
assert (isinstance(x, six.integer_types) or
isinstance(x, builtin_float) or
(isinstance(x, numpy.ndarray) and x.ndim == 0))
if config.cast_policy == 'numpy':
return numpy.asarray(x)
elif config.cast_policy == 'numpy+floatX':
rval = numpy.asarray(x)
if ((not hasattr(x, 'dtype') and
rval.dtype in ('float64', 'float32') and
rval.dtype != config.floatX)):
rval = theano._asarray(rval, dtype=config.floatX)
return rval
# The following is the original code, corresponding to the 'custom'
# option for `config.cast_policy`.
assert config.cast_policy == 'custom'
try:
# Pass through numpy scalars, since they are already typed on
# purpose typically. # purpose typically.
if hasattr(x, 'dtype'): if str(x.dtype) in self.dtypes:
assert x.ndim == 0 # No need to cast `x` into a new dtype. Note that we still
return ScalarConstant(get_scalar_type(str(x.dtype)), x) # need to convert it into an array, because it may not be
if isinstance(x, builtin_float): # one already (e.g. if x == numpy.float64(1.1)).
for dtype in ['float32', 'float64']: return numpy.asarray(x)
except AttributeError:
# Means `x` has no 'dtype' attribute.
pass
# unsafe downcast of float64 variables when config.floatX == 'float32'
# recall: float is numpy.float
if ((isinstance(x, float) and
config.floatX in self.dtypes and
config.floatX != 'float64')):
return theano._asarray(x, dtype=config.floatX)
# Don't autocast to float16 unless config.floatX is float16
try_dtypes = [d for d in self.dtypes
if config.floatX == 'float16' or d != 'float16']
for dtype in try_dtypes:
x_ = theano._asarray(x, dtype=dtype) x_ = theano._asarray(x, dtype=dtype)
if numpy.all(x == x_): if numpy.all(x == x_):
break break
x_ = None # returns either an exact x_==x, or the last cast x_
assert x_ is not None return x_
return ScalarConstant(get_scalar_type(str(x_.dtype)), x)
if isinstance(x, builtin_int): autocast_int = NumpyAutocaster(('int8', 'int16', 'int32', 'int64'))
for dtype in ['int8', 'int16', 'int32', 'int64']: # autocast_float dtypes might be manipulated in tensor.*
autocast_float = NumpyAutocaster(('float16', 'float32', 'float64'))
class autocast_float_as(object):
    """
    Temporarily adjust autocasting behavior.

    Context manager that locally overrides the dtype preference tuple of
    the module-level `autocast_float` caster, restoring it on exit.  It is
    only meaningful when `config.cast_policy` is 'custom'; any other
    policy triggers an AssertionError.

    Handy in some code, but above all useful for testing the autocasting
    mechanism itself.

    Examples
    --------
    >>> with autocast_float_as('float32'):
    ...     assert (fvector() + 1.1).dtype == 'float32'  # temporary downcasting
    >>> assert (fvector() + 1.1).dtype == 'float64' # back to default behaviour
    """
    def __init__(self, *dtypes):
        self.dtypes = dtypes
        assert config.cast_policy == 'custom'

    def __enter__(self):
        assert config.cast_policy == 'custom'
        # Remember the module-wide preference so __exit__ can restore it.
        self.old_dtypes = autocast_float.dtypes
        autocast_float.dtypes = self.dtypes

    def __exit__(self, *args):
        assert config.cast_policy == 'custom'
        autocast_float.dtypes = self.old_dtypes
def convert(x, dtype=None):
"""
Convert the input to a properly typed numpy value according to the
current casting policy. Work with scalars and tensors.
"""
if dtype is not None:
# in this case, the semantics are that the caller is forcing the dtype
x_ = theano._asarray(x, dtype=dtype) x_ = theano._asarray(x, dtype=dtype)
if numpy.all(x == x_): else:
break # In this case, this function should infer the dtype according to the
# autocasting rules. See autocasting above.
x_ = None x_ = None
assert x_ is not None if isinstance(x, six.integer_types):
return ScalarConstant(get_scalar_type(str(x_.dtype)), x) try:
if isinstance(x, builtin_complex): x_ = autocast_int(x)
# TODO: We have added the complex type, so this should be tested except OverflowError:
raise NotImplementedError() # This is to imitate numpy behavior which tries to fit
raise TypeError(x) # bigger numbers into a uint64.
# return ScalarConstant(float64, float(x)) x_ = theano._asarray(x, dtype='uint64')
elif isinstance(x, builtin_float):
x_ = autocast_float(x)
elif isinstance(x, numpy.ndarray):
x_ = x
else:
# Here x is probably a list or a tuple. If it contains a
# long, we will behave like the current NumPy version: it
# will work if the long fits in int64 or uint64.
x_ = numpy.asarray(x)
if x_.size == 0 and not hasattr(x, 'dtype'):
x_ = numpy.asarray(x, dtype=config.floatX)
assert type(x_) in [numpy.ndarray, numpy.memmap]
return x_
def constant(x):
    """Return a `ScalarConstant` wrapping `x` cast via the autocasting rules.

    `x` must convert to a 0-d numpy array (a scalar); anything with more
    dimensions fails the assertion below.
    """
    x = convert(x)
    # convert() guarantees a numpy ndarray/memmap; scalars must be 0-d.
    assert x.ndim == 0
    return ScalarConstant(get_scalar_type(str(x.dtype)), x)
class Scalar(Type): class Scalar(Type):
......
...@@ -219,138 +219,6 @@ _as_tensor_variable = as_tensor_variable ...@@ -219,138 +219,6 @@ _as_tensor_variable = as_tensor_variable
as_tensor = as_tensor_variable as_tensor = as_tensor_variable
class NumpyAutocaster(object):
    """
    This class is used to cast python ints and floats to numpy arrays.

    The behavior when called on scalar `x` depends on `config.cast_policy`:
        - 'numpy' will simply use the same type as found by `numpy.asarray(x)`.
        - 'numpy+floatX' will do the same, except it will use float32 instead
          of float64 if `x` is a Python float and `config.floatX` is set to
          'float32' (note that if `x` is a numpy scalar whose data type is
          float64, it is not modified since we assume the user is purposely
          using float64).
        - 'custom' lets one define a tuple of data types such that:
            - if `x` is already a numpy scalar and its data type is in this
              tuple, then it is returned unchanged;
            - otherwise, the first data type in this tuple that can represent
              `x` without loss of precision will be used, unless `x` is a float
              and 'float32' is in the tuple (in which case `x` is cast as a
              float32);
            - if no data type can represent `x` without loss of precision, then
              the last data type in the tuple will be used.

    Parameters
    ----------
    dtypes: tuple of strings
        The ordered list of preferred data types (only used when
        `config.cast_policy` is set to 'custom', see the `NumpyAutocaster`
        help for details).

    """
    def __init__(self, dtypes):
        # Stored as a tuple; `autocast_float_as` swaps this attribute
        # wholesale to adjust behavior temporarily.
        self.dtypes = tuple(dtypes)

    def __call__(self, x):
        # Make sure we only deal with scalars.
        assert (isinstance(x, integer_types) or
                isinstance(x, float) or
                (isinstance(x, numpy.ndarray) and x.ndim == 0))

        if config.cast_policy == 'numpy':
            return numpy.asarray(x)
        elif config.cast_policy == 'numpy+floatX':
            rval = numpy.asarray(x)
            # Only values without an explicit dtype (Python floats) are
            # downcast to floatX; numpy float64 scalars pass through.
            if ((not hasattr(x, 'dtype') and
                 rval.dtype in ('float64', 'float32') and
                 rval.dtype != config.floatX)):
                rval = theano._asarray(rval, dtype=config.floatX)
            return rval

        # The following is the original code, corresponding to the 'custom'
        # option for `config.cast_policy`.
        assert config.cast_policy == 'custom'

        try:
            # Pass through numpy scalars, since they are already typed on
            # purpose typically.
            if str(x.dtype) in self.dtypes:
                # No need to cast `x` into a new dtype. Note that we still
                # need to convert it into an array, because it may not be
                # one already (e.g. if x == numpy.float64(1.1)).
                return numpy.asarray(x)
        except AttributeError:
            # Means `x` has no 'dtype' attribute.
            pass

        # unsafe downcast of float64 variables when config.floatX == 'float32'
        # recall: float is numpy.float
        if ((isinstance(x, float) and
             config.floatX in self.dtypes and
             config.floatX != 'float64')):
            return theano._asarray(x, dtype=config.floatX)

        # Don't autocast to float16 unless config.floatX is float16
        try_dtypes = [d for d in self.dtypes
                      if config.floatX == 'float16' or d != 'float16']

        for dtype in try_dtypes:
            x_ = theano._asarray(x, dtype=dtype)
            if numpy.all(x == x_):
                # First dtype (in preference order) that represents `x`
                # without loss of precision wins.
                break
        # returns either an exact x_==x, or the last cast x_
        return x_
autocast_int = NumpyAutocaster(('int8', 'int16', 'int32', 'int64'))
autocast_float = NumpyAutocaster(('float16', 'float32', 'float64'))
# autocast_float dtypes might be manipulated in tensor.__init__
#
# Note: it's a bit weird for a compiler to automatically downcast
# literals like this, and it might have implications for efficiency
# when mixing types. For example when you add 1.0 + dmatrix(), the
# 1.0 could be converted to float32, and require upcasting for the +
# operation at every position in the dmatrix. using
# theano._asarray(1.0, dtype='float64') will circumvent this
# autocasting, and in future, our ops might be smarter about factoring
# out upcasts. The advantage of this mechanism is to combine it with
# floatX so that 1.0 + xmatrix() will always have the same type as the
# xmatrix().
#
class autocast_float_as(object):
    """
    Temporarily adjust autocasting behavior.

    This class makes it possible to temporarily and locally adjust autocasting
    behavior when `config.cast_policy` is set to 'custom'.
    If `config.cast_policy` is not 'custom', an exception is raised.
    This class might be convenient in some code, but it definitely
    helps to test the autocasting mechanism.

    Examples
    --------
    >>> with autocast_float_as('float32'):
    ...     assert (fvector() + 1.1).dtype == 'float32'  # temporary downcasting
    >>> assert (fvector() + 1.1).dtype == 'float64' # back to default behaviour

    """
    def __init__(self, *dtypes):
        self.dtypes = dtypes
        assert config.cast_policy == 'custom'

    def __enter__(self):
        assert config.cast_policy == 'custom'
        # Save the current module-wide preference so it can be restored.
        self.old_dtypes = autocast_float.dtypes
        autocast_float.dtypes = self.dtypes

    def __exit__(self, *args):
        assert config.cast_policy == 'custom'
        # Restore whatever was in effect before entering the context.
        autocast_float.dtypes = self.old_dtypes
def constant_or_value(x, rtype, name=None, ndim=None, dtype=None): def constant_or_value(x, rtype, name=None, ndim=None, dtype=None):
"""Return a symbolic `Constant` with value `x`. """Return a symbolic `Constant` with value `x`.
...@@ -362,32 +230,7 @@ def constant_or_value(x, rtype, name=None, ndim=None, dtype=None): ...@@ -362,32 +230,7 @@ def constant_or_value(x, rtype, name=None, ndim=None, dtype=None):
`x` could not be expanded to have ndim dimensions. `x` could not be expanded to have ndim dimensions.
""" """
if dtype is not None: x_ = scal.convert(x, dtype=dtype)
# in this case, the semantics are that the caller is forcing the dtype
x_ = theano._asarray(x, dtype=dtype)
else:
# In this case, this function should infer the dtype according to the
# autocasting rules. See autocasting above.
x_ = None
if rtype is TensorConstant and isinstance(x, integer_types):
try:
x_ = autocast_int(x)
except OverflowError:
# This is to imitate numpy behavior which tries to fit
# bigger numbers into a uint64.
x_ = theano._asarray(x, dtype='uint64')
elif rtype is TensorConstant and isinstance(x, float):
x_ = autocast_float(x)
elif isinstance(x, numpy.ndarray):
x_ = x
else:
# Here x is probably a list or a tuple. If it contains a
# long, we will behave like the current NumPy version: it
# will work if the long fits in int64 or uint64.
x_ = numpy.asarray(x)
if x_.size == 0 and not hasattr(x, 'dtype'):
x_ = numpy.asarray(x, dtype=config.floatX)
assert type(x_) in [numpy.ndarray, numpy.memmap]
bcastable = [d == 1 for d in x_.shape] bcastable = [d == 1 for d in x_.shape]
if ndim is not None: if ndim is not None:
...@@ -3155,11 +2998,9 @@ def mean(input, axis=None, dtype=None, op=False, keepdims=False, ...@@ -3155,11 +2998,9 @@ def mean(input, axis=None, dtype=None, op=False, keepdims=False,
sum_dtype = dtype sum_dtype = dtype
else: else:
sum_dtype = None sum_dtype = None
# float16 overflows on the cast way too often
# float16 overflows way too fast for sum if input.dtype == 'float16':
if ((sum_dtype == 'float16' or input.dtype == 'float16') and sum_dtype = 'float32'
acc_dtype != 'float16'):
sum_dtype == 'float32'
s = sum(input, axis=axis, dtype=sum_dtype, keepdims=keepdims, s = sum(input, axis=axis, dtype=sum_dtype, keepdims=keepdims,
acc_dtype=acc_dtype) acc_dtype=acc_dtype)
......
...@@ -1093,14 +1093,14 @@ def _as_scalar(res, dtype=None): ...@@ -1093,14 +1093,14 @@ def _as_scalar(res, dtype=None):
def _is_real_matrix(res): def _is_real_matrix(res):
return (res.type.dtype in ('float32', 'float64') and return (res.type.dtype in ('float16', 'float32', 'float64') and
res.type.ndim == 2 and res.type.ndim == 2 and
res.type.broadcastable[0] is False and res.type.broadcastable[0] is False and
res.type.broadcastable[1] is False) # cope with tuple vs. list res.type.broadcastable[1] is False) # cope with tuple vs. list
def _is_real_vector(res): def _is_real_vector(res):
return (res.type.dtype in ('float32', 'float64') and return (res.type.dtype in ('float16', 'float32', 'float64') and
res.type.ndim == 1 and res.type.ndim == 1 and
res.type.broadcastable[0] is False) res.type.broadcastable[0] is False)
...@@ -1195,7 +1195,7 @@ def _gemm_canonicalize(r, scale, rval, maxclients): ...@@ -1195,7 +1195,7 @@ def _gemm_canonicalize(r, scale, rval, maxclients):
return None return None
if ((r.type.ndim not in (1, 2)) or if ((r.type.ndim not in (1, 2)) or
r.type.dtype not in ('float32', 'float64', r.type.dtype not in ('float16', 'float32', 'float64',
'complex64', 'complex128')): 'complex64', 'complex128')):
rval.append(scaled(r)) rval.append(scaled(r))
return rval return rval
...@@ -1528,7 +1528,7 @@ class Dot22(GemmRelated): ...@@ -1528,7 +1528,7 @@ class Dot22(GemmRelated):
""" """
def make_node(self, x, y): def make_node(self, x, y):
dtypes = ('float32', 'float64', 'complex64', 'complex128') dtypes = ('float16', 'float32', 'float64', 'complex64', 'complex128')
if x.type.ndim != 2 or x.type.dtype not in dtypes: if x.type.ndim != 2 or x.type.dtype not in dtypes:
raise TypeError(x) raise TypeError(x)
if y.type.ndim != 2 or y.type.dtype not in dtypes: if y.type.ndim != 2 or y.type.dtype not in dtypes:
...@@ -1621,7 +1621,7 @@ def local_dot_to_dot22(node): ...@@ -1621,7 +1621,7 @@ def local_dot_to_dot22(node):
x, y, x.type, y.type) x, y, x.type, y.type)
return return
if y.type.dtype in ['float32', 'float64', 'complex64', 'complex128']: if y.type.dtype in ['float16', 'float32', 'float64', 'complex64', 'complex128']:
if x.ndim == 2 and y.ndim == 2: if x.ndim == 2 and y.ndim == 2:
# print "local_dot_to_dot22: MM" # print "local_dot_to_dot22: MM"
return [_dot22(*node.inputs)] return [_dot22(*node.inputs)]
......
...@@ -26,11 +26,12 @@ from six.moves import StringIO, reduce ...@@ -26,11 +26,12 @@ from six.moves import StringIO, reduce
from theano import compile, config, function, gof, tensor, shared from theano import compile, config, function, gof, tensor, shared
from theano.compile import DeepCopyOp from theano.compile import DeepCopyOp
from theano.compile.mode import get_default_mode from theano.compile.mode import get_default_mode
from theano.tensor import (_shared, wvector, bvector, autocast_float_as, from theano.scalar import autocast_float_as, autocast_float
from theano.tensor import (_shared, wvector, bvector,
argmin, max_and_argmax, cscalar, ctensor3, join, argmin, max_and_argmax, cscalar, ctensor3, join,
horizontal_stack, vertical_stack, argmax, get_vector_length, horizontal_stack, vertical_stack, argmax, get_vector_length,
fscalar, zeros_like, sum, tensor3, vector, add, addbroadcast, fscalar, zeros_like, sum, tensor3, vector, add, addbroadcast,
alloc, as_tensor_variable, tensor_from_scalar, ARange, autocast_float, alloc, as_tensor_variable, tensor_from_scalar, ARange,
clip, constant, default, dot, batched_dot, clip, constant, default, dot, batched_dot,
dmatrix, dscalar, dvector, eq, eye, fill, flatten, inverse_permutation, dmatrix, dscalar, dvector, eq, eye, fill, flatten, inverse_permutation,
tensor4, permute_row_elements, Flatten, fmatrix, fscalars, grad, tensor4, permute_row_elements, Flatten, fmatrix, fscalars, grad,
...@@ -4595,6 +4596,12 @@ class T_mean(unittest.TestCase): ...@@ -4595,6 +4596,12 @@ class T_mean(unittest.TestCase):
except AttributeError: except AttributeError:
self.fail() self.fail()
def test_mean_f16(self):
    """mean() of a long float16 vector of ones evaluates close to 1.0."""
    data = numpy.ones((100000,), dtype='float16')
    xv = tensor.vector(dtype='float16')
    fn = theano.function([xv], xv.mean())
    utt.assert_allclose(fn(data), 1.0)
def test0(self): def test0(self):
# Simple test... # Simple test...
x = tensor.vector() x = tensor.vector()
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论