提交 907d1868 authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #2901 from abergeron/f16_lstm

Work to make the DLT LSTM work with float16.
......@@ -1685,16 +1685,18 @@ class _Linker(gof.link.LocalLinker):
node_input_storage = [storage_map[r] for r in node.inputs]
node_output_storage = [storage_map[r] for r in node.outputs]
compute_map = {}
for k in node.inputs:
compute_map[k] = [True]
for k in node.outputs:
compute_map[k] = [False]
# Some Ops define a make_thunk with the expectation that
# it will be called before the C code is compiled, because
# the compilation of some dependency is triggered there.
thunk_other = None
if get_unbound_function(node.op.make_thunk) not in default_make_thunk:
compute_map = {}
for k in node.inputs:
compute_map[k] = [True]
for k in node.outputs:
compute_map[k] = [False]
thunk = node.op.make_thunk(node,
storage_map,
compute_map,
......@@ -1708,24 +1710,13 @@ class _Linker(gof.link.LocalLinker):
raise utils.MethodNotDefined()
# Ops that do not inherit from gof.op.Op don't have certain
# methods defined that the CLinker expects (Scan is an
# exmaple, ifelse is another of such classes that inherit
# example, ifelse is another of such classes that inherit
# directly from PureOp)
if not isinstance(node.op, gof.op.Op):
raise utils.MethodNotDefined()
e = FunctionGraph(node.inputs, node.outputs)
# The toposort isn't a stochastic order as it contains only one node.
e.toposort = lambda: list(e.apply_nodes)
# Specifically... e.nodes is a set, but of only 1 element
cl = CLinker().accept(e, [r for r, r2 in zip(e.outputs,
node.outputs)
if r2 in no_recycling])
thunk, node_input_filters, node_output_filters = cl.make_thunk(
input_storage=node_input_storage,
output_storage=node_output_storage)
thunk.inputs = node_input_storage
thunk.outputs = node_output_storage
thunk = node.op.make_c_thunk(node, storage_map, compute_map,
no_recycling)
thunks_c.append(thunk)
except (NotImplementedError, utils.MethodNotDefined):
thunks_c.append(None)
......@@ -1735,20 +1726,8 @@ class _Linker(gof.link.LocalLinker):
# consider that we don't have a python implementation
if ((self.maker.mode.check_py_code or thunks_c[-1] is None) and
node.op.perform.func_code != gof.op.PureOp.perform.func_code):
p = node.op.perform
ctx = node.run_context()
if ctx is graph.NoContext:
thunk = (lambda p=p, i=node_input_storage,
o=node_output_storage,
n=node: p(n, [x[0] for x in i], o))
else:
ctx_val = node.context_type.filter(ctx)
thunk = (lambda p=p, i=node_input_storage,
o=node_output_storage, ctx=ctx_val,
n=node: p(n, [x[0] for x in i], o, ctx))
thunk.inputs = node_input_storage
thunk.outputs = node_output_storage
thunk.perform = p
thunk = node.op.make_py_thunk(node, storage_map, compute_map,
no_recycling)
thunks_py.append(thunk)
else:
thunks_py.append(None)
......
......@@ -602,6 +602,7 @@ class Rebroadcast(gof.Op):
..note: works inplace and works for CudaNdarrayType
"""
view_map = {0: [0]}
_f16_ok = True
# Mapping from Type to C code (and version) to use.
# In the C code, the name of the input variable is %(iname)s,
# the output variable is %(oname)s.
......
......@@ -699,78 +699,55 @@ class Op(utils.object2, PureOp, CLinkerOp):
else:
return NotImplemented
def make_thunk(self, node, storage_map, compute_map, no_recycling):
def make_c_thunk(self, node, storage_map, compute_map, no_recycling):
"""
:param node: something previously returned by self.make_node
:param storage_map: dict variable -> one-element-list where a computed
value for this variable may be found.
:param compute_map: dict variable -> one-element-list where a boolean
value will be found. The boolean indicates whether the
variable's storage_map container contains a valid value (True)
or if it has not been computed yet (False).
:param no_recycling: list of variables for which it is forbidden to
reuse memory allocated by a previous call.
:note: If the thunk consults the storage_map on every call, it is safe
for it to ignore the no_recycling argument, because elements of the
no_recycling list will have a value of None in the storage map. If
the thunk can potentially cache return values (like CLinker does),
then it must not do so for variables in the no_recycling list.
Like make_thunk, but will only try to make a C thunk.
"""
logger = logging.getLogger('theano.gof.op.Op')
node_input_storage = [storage_map[r] for r in node.inputs]
node_output_storage = [storage_map[r] for r in node.outputs]
node_input_compute = [compute_map[r] for r in node.inputs]
node_output_compute = [compute_map[r] for r in node.outputs]
if self._op_use_c_code:
try:
# float16 get special treatment since running
# unprepared C code will get bad results.
if not getattr(self, '_f16_ok', False):
def is_f16(t):
return getattr(t, 'dtype', '') == 'float16'
if (any(is_f16(i.type) for i in node.inputs) or
any(is_f16(o.type) for o in node.outputs)):
print ("Disabling C code for %s due to unsupported "
"float16" % (self,))
raise NotImplementedError("float16")
e = FunctionGraph(node.inputs, node.outputs)
e_no_recycling = [new_o
for (new_o, old_o) in zip(e.outputs, node.outputs)
if old_o in no_recycling]
cl = theano.gof.cc.CLinker().accept(e,
no_recycling=e_no_recycling)
logger.debug('Trying CLinker.make_thunk')
outputs = cl.make_thunk(input_storage=node_input_storage,
output_storage=node_output_storage)
fill_storage, node_input_filters, node_output_filters = outputs
def rval():
fill_storage()
for o in node.outputs:
compute_map[o][0] = True
rval.cthunk = fill_storage.cthunk
rval.inputs = node_input_storage
rval.outputs = node_output_storage
rval.lazy = False
return rval
# the next line does nothing, but pyflakes is too
# stupid to realize the def rval below is not a
# redefinition unless I include this
del rval
except (NotImplementedError, utils.MethodNotDefined):
logger.debug('Falling back on perform')
# float16 gets special treatment since running
# unprepared C code will get bad results.
if not getattr(self, '_f16_ok', False):
def is_f16(t):
return getattr(t, 'dtype', '') == 'float16'
if (any(is_f16(i.type) for i in node.inputs) or
any(is_f16(o.type) for o in node.outputs)):
print ("Disabling C code for %s due to unsupported "
"float16" % (self,))
raise NotImplementedError("float16")
e = FunctionGraph(node.inputs, node.outputs)
e_no_recycling = [new_o
for (new_o, old_o) in zip(e.outputs, node.outputs)
if old_o in no_recycling]
cl = theano.gof.cc.CLinker().accept(e,
no_recycling=e_no_recycling)
logger.debug('Trying CLinker.make_thunk')
outputs = cl.make_thunk(input_storage=node_input_storage,
output_storage=node_output_storage)
fill_storage, node_input_filters, node_output_filters = outputs
def rval():
fill_storage()
for o in node.outputs:
compute_map[o][0] = True
rval.cthunk = fill_storage.cthunk
rval.inputs = node_input_storage
rval.outputs = node_output_storage
rval.lazy = False
return rval
# condition: either there was no c_code, or it failed
def make_py_thunk(self, node, storage_map, compute_map, no_recycling):
"""
Like make_thunk() but only makes python thunks.
"""
node_input_storage = [storage_map[r] for r in node.inputs]
node_output_storage = [storage_map[r] for r in node.outputs]
p = node.op.perform
......@@ -798,6 +775,39 @@ class Op(utils.object2, PureOp, CLinkerOp):
rval.lazy = False
return rval
def make_thunk(self, node, storage_map, compute_map, no_recycling):
"""
:param node: something previously returned by self.make_node
:param storage_map: dict variable -> one-element-list where a computed
value for this variable may be found.
:param compute_map: dict variable -> one-element-list where a boolean
value will be found. The boolean indicates whether the
variable's storage_map container contains a valid value (True)
or if it has not been computed yet (False).
:param no_recycling: list of variables for which it is forbidden to
reuse memory allocated by a previous call.
:note: If the thunk consults the storage_map on every call, it is safe
for it to ignore the no_recycling argument, because elements of the
no_recycling list will have a value of None in the storage map. If
the thunk can potentially cache return values (like CLinker does),
then it must not do so for variables in the no_recycling list.
"""
logger = logging.getLogger('theano.gof.op.Op')
if self._op_use_c_code:
try:
return self.make_c_thunk(node, storage_map, compute_map,
no_recycling)
except (NotImplementedError, utils.MethodNotDefined):
logger.debug('Falling back on perform')
# condition: either there was no c_code, or it failed
return self.make_py_thunk(node, storage_map, compute_map, no_recycling)
def get_test_value(v):
"""
......
......@@ -165,18 +165,22 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
]
@code_version((1,))
def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
@code_version((2,))
def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
manner_fn, manner_init,
b='', stride_b='', dtype='float32'):
b='', stride_b='', load_b='', dtype='float32'):
"""Return C++ code for a function that reduces a contiguous buffer.
:param N: length of the buffer
:param buf: buffer pointer of size warpSize * sizeof(dtype)
:param x: input data
:param stride_x: input data stride
:param load_x: wrapper to read from x
:param pos: index of executing thread
:param count: number of executing threads
:param b: Optional, pointer to the bias
:param stride_b: Optional, the stride of b if b is provided
:param load_b: Optional, wrapper to read from b if b is provided
:param dtype: Optional, the dtype of the output
:param manner_fn: a function that accepts strings of arguments a
......@@ -193,15 +197,15 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
"""
if b:
init = manner_init("%(x)s[%(pos)s * %(stride_x)s] +"
" %(b)s[%(pos)s * %(stride_b)s]" % locals())
init = manner_init("%(load_x)s(%(x)s[%(pos)s * %(stride_x)s]) +"
" %(load_b)s(%(b)s[%(pos)s * %(stride_b)s])" % locals())
loop_line = manner_fn("red",
manner_init("%(x)s[i * %(stride_x)s] + "
"%(b)s[i * %(stride_b)s]" %
manner_init("%(load_x)s(%(x)s[i * %(stride_x)s]) + "
"%(load_b)s(%(b)s[i * %(stride_b)s])" %
locals()))
else:
init = manner_init("%(x)s[%(pos)s * %(stride_x)s]" % locals())
loop_line = manner_fn("red", manner_init("%(x)s[i * %(stride_x)s]" %
init = manner_init("%(load_x)s(%(x)s[%(pos)s * %(stride_x)s])" % locals())
loop_line = manner_fn("red", manner_init("%(load_x)s(%(x)s[i * %(stride_x)s])" %
locals()))
loop_line2 = manner_fn("%s[%s]" % (buf, pos),
"%s[i]" % buf)
......@@ -248,32 +252,37 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
@code_version(inline_reduce_fixed_shared.code_version)
def inline_reduce_fixed_shared_max(N, buf, x, stride_x, pos, count,
b='', stride_b='', dtype='float32'):
return inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
def inline_reduce_fixed_shared_max(N, buf, x, stride_x, load_x, pos, count,
b='', stride_b='', load_b='',
dtype='float32'):
return inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
lambda a, b: "max(%s, %s)" % (a, b),
lambda a: a,
b, stride_b, dtype)
b, stride_b, load_b, dtype)
@code_version((1,) + inline_reduce_max.code_version +
@code_version((2,) + inline_reduce_max.code_version +
inline_reduce_sum.code_version)
def inline_softmax_fixed_shared(N, buf, x, stride_x,
sm, sm_stride,
def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x,
sm, sm_stride, write_sm,
threadPos, threadCount,
b='', stride_b='', dtype="float32"):
b='', stride_b='', load_b='',
dtype="float32"):
"""
:param N: length of the buffer, at least warpSize (32).
:param buf: a shared memory buffer of size warpSize * sizeof(dtype)
:param x: a ptr to the gpu memory where the row is stored
:param stride_x: the stride between each element in x
:param load_x: wrapper to read from x
:param sm: a ptr to the gpu memory to store the result
:param sm_stride: the stride between each sm element
:param write_sm: wrapper before writing to sm
:param threadPos: index of executing thread
:param threadCount: number of executing threads
:param b: Optional, pointer to the bias
:param stride_b: Optional, the stride of b if b is provided
:param load_b: Optional, wrapper to read from b if b is provided
:param dtype: Optional, the dtype of the softmax's output if not float32
:Precondition: buf is empty
......@@ -286,16 +295,18 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x,
"""
ret = [
# get max of buf (trashing all but buf[0])
inline_reduce_fixed_shared_max(N, buf, x, stride_x,
threadPos, threadCount, b, stride_b,
inline_reduce_fixed_shared_max(N, buf, x, stride_x, load_x,
threadPos, threadCount,
b, stride_b, load_b,
dtype),
'__syncthreads()',
('npy_%s row_max = ' + buf + '[0]') % dtype,
'__syncthreads()',
inline_reduce_fixed_shared(N, buf, x, stride_x, threadPos, threadCount,
inline_reduce_fixed_shared(N, buf, x, stride_x, load_x,
threadPos, threadCount,
lambda a, b: "%s + %s" % (a, b),
lambda a: "exp(%s - row_max)" % a,
b, stride_b, dtype),
b, stride_b, load_b, dtype),
'__syncthreads()',
('npy_%s row_sum = ' + buf + '[0]') % dtype,
'__syncthreads()',
......@@ -305,13 +316,14 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x,
if b:
ret += [
"%(sm)s[tx * %(sm_stride)s] = "
" exp(%(x)s[tx * %(stride_x)s] +"
" %(b)s[tx * %(stride_b)s] - row_max)"
" / row_sum" % locals()]
" %(write_sm)s(exp(%(load_x)s(%(x)s[tx * %(stride_x)s]) +"
" %(load_b)s(%(b)s[tx * %(stride_b)s]) - row_max)"
" / row_sum)" % locals()]
else:
ret += [
"%(sm)s[tx * %(sm_stride)s] = "
"exp(%(x)s[tx * %(stride_x)s] - row_max) / row_sum" % locals()]
"%(write_sm)s(exp(%(load_x)s(%(x)s[tx * %(stride_x)s]) - row_max)"
" / row_sum)" % locals()]
ret += [
"}",
'__syncthreads()',
......
......@@ -169,6 +169,10 @@ class GpuIncSubtensor(IncSubtensor):
The helper methods like do_type_checking, copy_of_x, etc. specialize
the c_code for this Op.
"""
@property
def _f16_ok(self):
return self.iadd_node.op._f16_ok
def c_headers(self):
return self.iadd_node.op.c_headers()
......@@ -325,7 +329,6 @@ class GpuIncSubtensor(IncSubtensor):
PyGpuArrayObject* src){
PyGpuArrayObject* ret = NULL;
""" % locals()
# def c_code(self, node, name, inputs, outputs, sub):
inputs = ["dst", "src"]
outputs = ["ret"]
sub = {"fail": "return NULL;"}
......@@ -337,7 +340,6 @@ class GpuIncSubtensor(IncSubtensor):
return ret
def add_to_zview(self, nodename, x, fail):
# TODO
return """
PyGpuArrayObject * add_result = inc_sub_iadd_%(nodename)s(zview, %(x)s);
......@@ -357,7 +359,7 @@ class GpuIncSubtensor(IncSubtensor):
elemwise_version = self.iadd_node.c_code_cache_version()
if not parent_version or not elemwise_version:
return
return parent_version + elemwise_version + (1,)
return parent_version + elemwise_version + (2,)
class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1):
......@@ -391,6 +393,9 @@ class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1):
return gof.Apply(self, [x_, y_, ilist_], [x_.type()])
def getInplElemwiseAdditionKernel(self, a, b):
if a.dtype == 'float16' or b.dtype == 'float16':
raise NotImplementedError('float16 is not supported by pygpu '
'elemwise')
a_arg = pygpu.tools.as_argument(a, 'a')
b_arg = pygpu.tools.as_argument(b, 'b')
args = [a_arg, b_arg]
......@@ -452,10 +457,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
"""Implement AdvancedIncSubtensor1 on the gpu, but use function
only avail on compute capability 2.0 and more recent.
"""
def __init__(self, inplace=False, set_instead_of_inc=False):
# The python implementation in the parent class is not applicable here
GpuAdvancedIncSubtensor1.__init__(self, inplace, set_instead_of_inc)
_f16_ok = True
def make_node(self, x, y, ilist):
"""It defer from GpuAdvancedIncSubtensor1 in that it make sure
......@@ -542,6 +544,30 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
itemsize_out = numpy.dtype(dtype_out).itemsize
return """
/*
* This is a version of atomicAdd that works for half-floats. It may
* read and write 2 bytes more than the size of the array if the array
* has an uneven number of elements. The actual value at that spot
* will not be modified.
*/
__device__ npy_float16 atomicAdd(npy_float16 *addr, npy_float16 val) {
npy_uint32 *base = (npy_uint32 *)((size_t)addr & ~2);
npy_uint32 old, assumed, sum, new_;
old = *base;
do {
assumed = old;
sum = __float2half_rn(
__half2float(val) +
__half2float((npy_float16)__byte_perm(old, 0,
((size_t)addr & 2) ? 0x4432 : 0x4410)));
new_ = __byte_perm(old, sum, ((size_t)addr & 2) ? 0x5410 : 0x3254);
old = atomicCAS(base, assumed, new_);
} while (assumed != old);
return (npy_float16)__byte_perm(old, 0,
((size_t)addr & 2) ? 0x4432 : 0x4410);
}
__global__ void k_vector_add_fast(int numRowsX,
int numColsX,
int stridesX0,
......
......@@ -136,6 +136,12 @@ class GpuArrayType(Type):
raise NotImplementedError(
"GpuArrayType.values_eq_approx() don't implemented the"
" allow_remove_inf and allow_remove_nan parameter")
if a.dtype == 'float16' or b.dtype == 'float16':
an = numpy.asarray(a)
bn = numpy.asarray(b)
return tensor.TensorType.values_eq_approx(
an, bn, allow_remove_inf=allow_remove_inf,
allow_remove_nan=allow_remove_nan, rtol=rtol, atol=atol)
narrow = 'float32', 'complex64'
if (str(a.dtype) in narrow) or (str(b.dtype) in narrow):
atol_ = theano.tensor.basic.float32_atol
......@@ -153,6 +159,13 @@ class GpuArrayType(Type):
locals())
return numpy.asarray(res).all()
@staticmethod
def may_share_memory(a, b):
if (not isinstance(a, gpuarray.GpuArray) or
not isinstance(b, gpuarray.GpuArray)):
return False
return pygpu.gpuarray.may_share_memory(a, b)
def value_zeros(self, shape):
return pygpu.gpuarray.zeros(shape, dtype=self.typecode)
......
......@@ -28,6 +28,7 @@ if cuda_available:
from theano.sandbox.gpuarray.basic_ops import GpuKernelBase, Kernel
from theano.sandbox.gpuarray.type import GpuArrayType
from theano.sandbox.gpuarray.fp16_help import write_w
def matVecModM(A, s, m):
......@@ -340,15 +341,6 @@ class mrg_uniform(mrg_uniform_base):
def perform(self, node, inp, out):
rstate, size = inp
o_rstate, o_sample = out
numpy_version = numpy.__version__.split('.')
if (not self.warned_numpy_version and
int(numpy_version[0]) <= 1 and
int(numpy_version[1]) < 3):
print("Warning: you must use numpy version 1.3.0 or higher with the python version of this op. Otherwise numpy leak memory. and numpy")
self.warned_numpy_version = True
n_elements = 1
rstate = numpy.asarray(rstate) # bring state from GPU if necessary
......@@ -377,6 +369,10 @@ class mrg_uniform(mrg_uniform_base):
def c_code(self, node, name, inp, out, sub):
rstate, size = inp
# If we try to use the C code here with something else than a
# TensorType, something is wrong (likely one of the GPU ops
# not defining C code correctly).
assert isinstance(node.inputs[0].type, TensorType)
o_rstate, o_sample = out
if self.inplace:
o_rstate_requirement = 'NPY_ARRAY_C_CONTIGUOUS|NPY_ARRAY_ALIGNED'
......@@ -777,6 +773,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
# GpuArray version
_f16_ok = True
@classmethod
def new(cls, rstate, ndim, dtype, size):
......@@ -790,14 +787,27 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
return super(GPUA_mrg_uniform, self).c_headers() + ['numpy_compat.h']
def gpu_kernels(self, node, name):
if self.output_type.dtype == 'float32':
write = write_w(self.output_type.dtype)
if self.output_type.dtype == 'float16':
otype = 'ga_half'
# limit the values of the state that we use.
mask = '& 0x7fff'
NORM = '3.0518e-05f' # numpy.float16(1.0/(2**15+8))
# this was determined by finding the biggest number such that
# numpy.float16(number * (M1 & 0x7fff)) < 1.0
elif self.output_type.dtype == 'float32':
otype = 'float'
mask = ''
NORM = '4.6566126e-10f' # numpy.float32(1.0/(2**31+65))
# this was determined by finding the biggest number such that
# numpy.float32(number * M1) < 1.0
else:
elif self.output_type.dtype == 'float64':
otype = 'double'
mask = ''
NORM = '4.656612873077392578125e-10'
else:
raise ValueError('Unsupported data type for output',
self.output_type.dtype)
code = """
KERNEL void mrg_uniform(
GLOBAL_MEM %(otype)s *sample_data,
......@@ -860,11 +870,11 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
x21 = y2;
if (x11 <= x21) {
sample_data[i] = (x11 - x21 + M1) * %(NORM)s;
sample_data[i] = %(write)s(((x11 - x21 + M1) %(mask)s) * %(NORM)s);
}
else
{
sample_data[i] = (x11 - x21) * %(NORM)s;
sample_data[i] = %(write)s(((x11 - x21) %(mask)s) * %(NORM)s);
}
}
......@@ -896,17 +906,9 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
o_type_num = numpy.asarray(0, dtype=self.output_type.dtype).dtype.num
fail = sub['fail']
kname = self.gpu_kernels(node, nodename)[0].objvar
if self.output_type.dtype == 'float32':
otype = 'float'
otypecode = 'GA_FLOAT'
else:
otype = 'double'
otypecode = 'GA_DOUBLE'
otypecode = str(self.output_type.typecode)
return """
//////// <code generated by mrg_uniform>
size_t odims[%(ndim)s];
unsigned int n_elements = 1;
unsigned int n_streams;
......@@ -1003,12 +1005,10 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
%(fail)s
}
}
//////// </ code generated by mrg_uniform>
""" % locals()
def c_code_cache_version(self):
return (3, self.GpuKernelBase_version)
return (6, self.GpuKernelBase_version)
def guess_n_streams(size, warn=False):
......
......@@ -3842,6 +3842,7 @@ class Reshape(Op):
The number of dimensions to reshape to (ndim) must be
known at graph build time.
view_map = {0: [0]} # output 0 is potentially aliased to inputs [0]
_f16_ok = True
check_input = False
......
......@@ -58,16 +58,21 @@ class ScalarSigmoid(scalar.UnaryScalarOp):
# We add boundary checks to prevent exp from generating inf or
# 0. The rest of the logic always generates 0 or 1 in those
# cases. This is a speed optimization.
# The constants were obtained by looking at the output of python commands like:
"""
import numpy, theano
dt='float32' # or float64
for i in xrange(750):
print i, repr(theano._asarray(1.0, dtype=dt) /
(theano._asarray(1.0, dtype=dt) +
numpy.exp(-theano._asarray([i,-i], dtype=dt))))
"""
if node.inputs[0].type == scalar.float32:
# The constants were obtained by looking at the output of
# python commands like:
#
# import numpy, theano
# dt='float32' # or float64
# for i in xrange(750):
# print i, repr(theano._asarray(1.0, dtype=dt) /
# (theano._asarray(1.0, dtype=dt) +
# numpy.exp(-theano._asarray([i,-i], dtype=dt))))
# float16 limits: -11.0, 7.0f
# We use the float32 limits for float16 for now as the
# computation will happen in float32 anyway.
if (node.inputs[0].type == scalar.float32 or
node.inputs[0].type == scalar.float16):
return """%(z)s = %(x)s < -88.0f ? 0.0 : %(x)s > 15.0f ? 1.0f : 1.0f /(1.0f + exp(-%(x)s));""" % locals()
elif node.inputs[0].type == scalar.float64:
return """%(z)s = %(x)s < -709.0 ? 0.0 : %(x)s > 19.0 ? 1.0 : 1.0 /(1.0+exp(-%(x)s));""" % locals()
......@@ -327,11 +332,17 @@ class ScalarSoftplus(scalar.UnaryScalarOp):
def c_code(self, node, name, inp, out, sub):
x, = inp
z, = out
if node.inputs[0].type == scalar.float32:
# These constants were obtained by looking at the output of python commands like:
# for i in xrange(750):
# print i, repr( numpy.log1p(numpy.exp(theano._asarray([i,-i], dtype=dt))))
# the boundary checks prevent us from generating inf
# These constants were obtained by looking at the output of
# python commands like:
# for i in xrange(750):
# print i, repr(numpy.log1p(numpy.exp(theano._asarray([i,-i], dtype=dt))))
# the boundary checks prevent us from generating inf
# float16 limits: -17.0, 6.0
# We use the float32 limits for float16 for now as the
# computation will happen in float32 anyway.
if (node.inputs[0].type == scalar.float32 or
node.inputs[0].type == scalar.float16):
return """%(z)s = %(x)s < -103.0f ? 0.0 : %(x)s > 14.0f ? %(x)s : log1p(exp(%(x)s));""" % locals()
elif node.inputs[0].type == scalar.float64:
return """%(z)s = %(x)s < -745.0 ? 0.0 : %(x)s > 16.0 ? %(x)s : log1p(exp(%(x)s));""" % locals()
......
......@@ -5151,7 +5151,8 @@ def local_log_erfc(node):
T.log(1 - 1 / (2 * x ** 2) + 3 / (4 * x ** 4)
- 15 / (8 * x ** 6)))
if node.outputs[0].dtype == 'float32':
if (node.outputs[0].dtype == 'float32' or
node.outputs[0].dtype == 'float16'):
threshold = 10.0541949
elif node.outputs[0].dtype == 'float64':
threshold = 26.641747557
......@@ -5298,7 +5299,7 @@ def local_grad_log_erfc_neg(node):
3 / (4 * (x ** 4)) - 15 / (8 * (x ** 6)), -1)
* T.cast(T.sqrt(numpy.pi), dtype=x.dtype))
if x.dtype == 'float32':
if x.dtype == 'float32' or x.dtype == 'float16':
threshold = 9.3
#threshold = 10.1
elif x.dtype == 'float64':
......
......@@ -291,6 +291,7 @@ class Subtensor(Op):
debug = 0
check_input = False
view_map = {0: [0]}
_f16_ok = True
@staticmethod
def collapse(idxs, cond):
......@@ -328,7 +329,7 @@ class Subtensor(Op):
TODO: WRITEME: This method also accepts "entry" already being a Type;
when would that happen?
"""
invalid_scal_types = [scal.float64, scal.float32]
invalid_scal_types = [scal.float64, scal.float32, scal.float16]
scal_types = [scal.int64, scal.int32, scal.int16, scal.int8]
tensor_types = [theano.tensor.lscalar, theano.tensor.iscalar,
theano.tensor.wscalar, theano.tensor.bscalar]
......@@ -1603,6 +1604,7 @@ class AdvancedSubtensor1(Op):
# sparse_grad doesn't go in here since it only affects the output
# of the grad() method.
__props__ = ()
_f16_ok = True
def __init__(self, sparse_grad=False):
self.sparse_grad = sparse_grad
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论