提交 907d1868 authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #2901 from abergeron/f16_lstm

Work to make the DLT LSTM work with float16.
...@@ -1685,16 +1685,18 @@ class _Linker(gof.link.LocalLinker): ...@@ -1685,16 +1685,18 @@ class _Linker(gof.link.LocalLinker):
node_input_storage = [storage_map[r] for r in node.inputs] node_input_storage = [storage_map[r] for r in node.inputs]
node_output_storage = [storage_map[r] for r in node.outputs] node_output_storage = [storage_map[r] for r in node.outputs]
# Some Ops define a make_thunk with the expectation that
# it will be called before the C code is compiled, because
# the compilation of some dependency is triggered there.
thunk_other = None
if get_unbound_function(node.op.make_thunk) not in default_make_thunk:
compute_map = {} compute_map = {}
for k in node.inputs: for k in node.inputs:
compute_map[k] = [True] compute_map[k] = [True]
for k in node.outputs: for k in node.outputs:
compute_map[k] = [False] compute_map[k] = [False]
# Some Ops define a make_thunk with the expectation that
# it will be called before the C code is compiled, because
# the compilation of some dependency is triggered there.
thunk_other = None
if get_unbound_function(node.op.make_thunk) not in default_make_thunk:
thunk = node.op.make_thunk(node, thunk = node.op.make_thunk(node,
storage_map, storage_map,
compute_map, compute_map,
...@@ -1708,24 +1710,13 @@ class _Linker(gof.link.LocalLinker): ...@@ -1708,24 +1710,13 @@ class _Linker(gof.link.LocalLinker):
raise utils.MethodNotDefined() raise utils.MethodNotDefined()
# Ops that do not inherit from gof.op.Op don't have certain # Ops that do not inherit from gof.op.Op don't have certain
# methods defined that the CLinker expects (Scan is an # methods defined that the CLinker expects (Scan is an
# exmaple, ifelse is another of such classes that inherit # example, ifelse is another of such classes that inherit
# directly from PureOp) # directly from PureOp)
if not isinstance(node.op, gof.op.Op): if not isinstance(node.op, gof.op.Op):
raise utils.MethodNotDefined() raise utils.MethodNotDefined()
e = FunctionGraph(node.inputs, node.outputs)
# The toposort isn't a stochastic order as it contain only one node. thunk = node.op.make_c_thunk(node, storage_map, compute_map,
e.toposort = lambda: list(e.apply_nodes) no_recycling)
# Specifically... e.nodes is a set, but of only 1 element
cl = CLinker().accept(e, [r for r, r2 in zip(e.outputs,
node.outputs)
if r2 in no_recycling])
thunk, node_input_filters, node_output_filters = cl.make_thunk(
input_storage=node_input_storage,
output_storage=node_output_storage)
thunk.inputs = node_input_storage
thunk.outputs = node_output_storage
thunks_c.append(thunk) thunks_c.append(thunk)
except (NotImplementedError, utils.MethodNotDefined): except (NotImplementedError, utils.MethodNotDefined):
thunks_c.append(None) thunks_c.append(None)
...@@ -1735,20 +1726,8 @@ class _Linker(gof.link.LocalLinker): ...@@ -1735,20 +1726,8 @@ class _Linker(gof.link.LocalLinker):
# consider that we don't have a python implementation # consider that we don't have a python implementation
if ((self.maker.mode.check_py_code or thunks_c[-1] is None) and if ((self.maker.mode.check_py_code or thunks_c[-1] is None) and
node.op.perform.func_code != gof.op.PureOp.perform.func_code): node.op.perform.func_code != gof.op.PureOp.perform.func_code):
p = node.op.perform thunk = node.op.make_py_thunk(node, storage_map, compute_map,
ctx = node.run_context() no_recycling)
if ctx is graph.NoContext:
thunk = (lambda p=p, i=node_input_storage,
o=node_output_storage,
n=node: p(n, [x[0] for x in i], o))
else:
ctx_val = node.context_type.filter(ctx)
thunk = (lambda p=p, i=node_input_storage,
o=node_output_storage, ctx=ctx_val,
n=node: p(n, [x[0] for x in i], o, ctx))
thunk.inputs = node_input_storage
thunk.outputs = node_output_storage
thunk.perform = p
thunks_py.append(thunk) thunks_py.append(thunk)
else: else:
thunks_py.append(None) thunks_py.append(None)
......
...@@ -602,6 +602,7 @@ class Rebroadcast(gof.Op): ...@@ -602,6 +602,7 @@ class Rebroadcast(gof.Op):
..note: works inplace and works for CudaNdarrayType ..note: works inplace and works for CudaNdarrayType
""" """
view_map = {0: [0]} view_map = {0: [0]}
_f16_ok = True
# Mapping from Type to C code (and version) to use. # Mapping from Type to C code (and version) to use.
# In the C code, the name of the input variable is %(iname)s, # In the C code, the name of the input variable is %(iname)s,
# the output variable is %(oname)s. # the output variable is %(oname)s.
......
...@@ -699,37 +699,16 @@ class Op(utils.object2, PureOp, CLinkerOp): ...@@ -699,37 +699,16 @@ class Op(utils.object2, PureOp, CLinkerOp):
else: else:
return NotImplemented return NotImplemented
def make_thunk(self, node, storage_map, compute_map, no_recycling): def make_c_thunk(self, node, storage_map, compute_map, no_recycling):
""" """
:param node: something previously returned by self.make_node Like make_thunk, but will only try to make a C thunk.
:param storage_map: dict variable -> one-element-list where a computed
value for this variable may be found.
:param compute_map: dict variable -> one-element-list where a boolean
value will be found. The boolean indicates whether the
variable's storage_map container contains a valid value (True)
or if it has not been computed yet (False).
:param no_recycling: list of variables for which it is forbidden to
reuse memory allocated by a previous call.
:note: If the thunk consults the storage_map on every call, it is safe
for it to ignore the no_recycling argument, because elements of the
no_recycling list will have a value of None in the storage map. If
the thunk can potentially cache return values (like CLinker does),
then it must not do so for variables in the no_recycling list.
""" """
logger = logging.getLogger('theano.gof.op.Op') logger = logging.getLogger('theano.gof.op.Op')
node_input_storage = [storage_map[r] for r in node.inputs] node_input_storage = [storage_map[r] for r in node.inputs]
node_output_storage = [storage_map[r] for r in node.outputs] node_output_storage = [storage_map[r] for r in node.outputs]
node_input_compute = [compute_map[r] for r in node.inputs]
node_output_compute = [compute_map[r] for r in node.outputs]
if self._op_use_c_code: # float16 gets special treatment since running
try:
# float16 get special treatment since running
# unprepared C code will get bad results. # unprepared C code will get bad results.
if not getattr(self, '_f16_ok', False): if not getattr(self, '_f16_ok', False):
def is_f16(t): def is_f16(t):
...@@ -741,7 +720,6 @@ class Op(utils.object2, PureOp, CLinkerOp): ...@@ -741,7 +720,6 @@ class Op(utils.object2, PureOp, CLinkerOp):
"float16" % (self,)) "float16" % (self,))
raise NotImplementedError("float16") raise NotImplementedError("float16")
e = FunctionGraph(node.inputs, node.outputs) e = FunctionGraph(node.inputs, node.outputs)
e_no_recycling = [new_o e_no_recycling = [new_o
for (new_o, old_o) in zip(e.outputs, node.outputs) for (new_o, old_o) in zip(e.outputs, node.outputs)
if old_o in no_recycling] if old_o in no_recycling]
...@@ -763,14 +741,13 @@ class Op(utils.object2, PureOp, CLinkerOp): ...@@ -763,14 +741,13 @@ class Op(utils.object2, PureOp, CLinkerOp):
rval.outputs = node_output_storage rval.outputs = node_output_storage
rval.lazy = False rval.lazy = False
return rval return rval
# the next line does nothing, but pyflakes is too
# stupid to realize the def rval below is not a
# redefinition unless I include this
del rval
except (NotImplementedError, utils.MethodNotDefined):
logger.debug('Falling back on perform')
# condition: either there was no c_code, or it failed def make_py_thunk(self, node, storage_map, compute_map, no_recycling):
"""
Like make_thunk() but only makes python thunks.
"""
node_input_storage = [storage_map[r] for r in node.inputs]
node_output_storage = [storage_map[r] for r in node.outputs]
p = node.op.perform p = node.op.perform
...@@ -798,6 +775,39 @@ class Op(utils.object2, PureOp, CLinkerOp): ...@@ -798,6 +775,39 @@ class Op(utils.object2, PureOp, CLinkerOp):
rval.lazy = False rval.lazy = False
return rval return rval
def make_thunk(self, node, storage_map, compute_map, no_recycling):
"""
:param node: something previously returned by self.make_node
:param storage_map: dict variable -> one-element-list where a computed
value for this variable may be found.
:param compute_map: dict variable -> one-element-list where a boolean
value will be found. The boolean indicates whether the
variable's storage_map container contains a valid value (True)
or if it has not been computed yet (False).
:param no_recycling: list of variables for which it is forbidden to
reuse memory allocated by a previous call.
:note: If the thunk consults the storage_map on every call, it is safe
for it to ignore the no_recycling argument, because elements of the
no_recycling list will have a value of None in the storage map. If
the thunk can potentially cache return values (like CLinker does),
then it must not do so for variables in the no_recycling list.
"""
logger = logging.getLogger('theano.gof.op.Op')
if self._op_use_c_code:
try:
return self.make_c_thunk(node, storage_map, compute_map,
no_recycling)
except (NotImplementedError, utils.MethodNotDefined):
logger.debug('Falling back on perform')
# condition: either there was no c_code, or it failed
return self.make_py_thunk(node, storage_map, compute_map, no_recycling)
def get_test_value(v): def get_test_value(v):
""" """
......
...@@ -165,18 +165,22 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"): ...@@ -165,18 +165,22 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
] ]
@code_version((1,)) @code_version((2,))
def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count, def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
manner_fn, manner_init, manner_fn, manner_init,
b='', stride_b='', dtype='float32'): b='', stride_b='', load_b='', dtype='float32'):
"""Return C++ code for a function that reduces a contiguous buffer. """Return C++ code for a function that reduces a contiguous buffer.
:param N: length of the buffer :param N: length of the buffer
:param buf: buffer pointer of size warpSize * sizeof(dtype) :param buf: buffer pointer of size warpSize * sizeof(dtype)
:param x: input data
:param stride_x: input data stride
:param load_x: wrapper to read from x
:param pos: index of executing thread :param pos: index of executing thread
:param count: number of executing threads :param count: number of executing threads
:param b: Optional, pointer to the bias :param b: Optional, pointer to the bias
:param stride_b: Optional, the stride of b if b is provided :param stride_b: Optional, the stride of b if b is provided
:param load_b: Optional, wrapper to read from b if b is provided
:param dtype: Optional, the dtype of the output :param dtype: Optional, the dtype of the output
:param manner_fn: a function that accepts strings of arguments a :param manner_fn: a function that accepts strings of arguments a
...@@ -193,15 +197,15 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count, ...@@ -193,15 +197,15 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
""" """
if b: if b:
init = manner_init("%(x)s[%(pos)s * %(stride_x)s] +" init = manner_init("%(load_x)s(%(x)s[%(pos)s * %(stride_x)s]) +"
" %(b)s[%(pos)s * %(stride_b)s]" % locals()) " %(load_b)s(%(b)s[%(pos)s * %(stride_b)s])" % locals())
loop_line = manner_fn("red", loop_line = manner_fn("red",
manner_init("%(x)s[i * %(stride_x)s] + " manner_init("%(load_x)s(%(x)s[i * %(stride_x)s]) + "
"%(b)s[i * %(stride_b)s]" % "%(load_b)s(%(b)s[i * %(stride_b)s])" %
locals())) locals()))
else: else:
init = manner_init("%(x)s[%(pos)s * %(stride_x)s]" % locals()) init = manner_init("%(load_x)s(%(x)s[%(pos)s * %(stride_x)s])" % locals())
loop_line = manner_fn("red", manner_init("%(x)s[i * %(stride_x)s]" % loop_line = manner_fn("red", manner_init("%(load_x)s(%(x)s[i * %(stride_x)s])" %
locals())) locals()))
loop_line2 = manner_fn("%s[%s]" % (buf, pos), loop_line2 = manner_fn("%s[%s]" % (buf, pos),
"%s[i]" % buf) "%s[i]" % buf)
...@@ -248,32 +252,37 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count, ...@@ -248,32 +252,37 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
@code_version(inline_reduce_fixed_shared.code_version) @code_version(inline_reduce_fixed_shared.code_version)
def inline_reduce_fixed_shared_max(N, buf, x, stride_x, pos, count, def inline_reduce_fixed_shared_max(N, buf, x, stride_x, load_x, pos, count,
b='', stride_b='', dtype='float32'): b='', stride_b='', load_b='',
return inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count, dtype='float32'):
return inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
lambda a, b: "max(%s, %s)" % (a, b), lambda a, b: "max(%s, %s)" % (a, b),
lambda a: a, lambda a: a,
b, stride_b, dtype) b, stride_b, load_b, dtype)
@code_version((1,) + inline_reduce_max.code_version + @code_version((2,) + inline_reduce_max.code_version +
inline_reduce_sum.code_version) inline_reduce_sum.code_version)
def inline_softmax_fixed_shared(N, buf, x, stride_x, def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x,
sm, sm_stride, sm, sm_stride, write_sm,
threadPos, threadCount, threadPos, threadCount,
b='', stride_b='', dtype="float32"): b='', stride_b='', load_b='',
dtype="float32"):
""" """
:param N: length of the buffer, atleast waprSize(32). :param N: length of the buffer, atleast waprSize(32).
:param buf: a shared memory buffer of size warpSize * sizeof(dtype) :param buf: a shared memory buffer of size warpSize * sizeof(dtype)
:param x: a ptr to the gpu memory where the row is stored :param x: a ptr to the gpu memory where the row is stored
:param stride_x: the stride between each element in x :param stride_x: the stride between each element in x
:param load_x: wrapper to read from x
:param sm: a ptr to the gpu memory to store the result :param sm: a ptr to the gpu memory to store the result
:param sm_stride: the stride between eash sm element :param sm_stride: the stride between eash sm element
:param write_sm: wrapper before writing to sm
:param threadPos: index of executing thread :param threadPos: index of executing thread
:param threadCount: number of executing threads :param threadCount: number of executing threads
:param b: Optional, pointer to the bias :param b: Optional, pointer to the bias
:param stride_b: Optional, the stride of b if b is provided :param stride_b: Optional, the stride of b if b is provided
:param load_b: Optional, wrapper to read from b if b is provided
:param dtype: Optional, the dtype of the softmax's output if not float32 :param dtype: Optional, the dtype of the softmax's output if not float32
:Precondition: buf is empty :Precondition: buf is empty
...@@ -286,16 +295,18 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, ...@@ -286,16 +295,18 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x,
""" """
ret = [ ret = [
# get max of buf (trashing all but buf[0]) # get max of buf (trashing all but buf[0])
inline_reduce_fixed_shared_max(N, buf, x, stride_x, inline_reduce_fixed_shared_max(N, buf, x, stride_x, load_x,
threadPos, threadCount, b, stride_b, threadPos, threadCount,
b, stride_b, load_b,
dtype), dtype),
'__syncthreads()', '__syncthreads()',
('npy_%s row_max = ' + buf + '[0]') % dtype, ('npy_%s row_max = ' + buf + '[0]') % dtype,
'__syncthreads()', '__syncthreads()',
inline_reduce_fixed_shared(N, buf, x, stride_x, threadPos, threadCount, inline_reduce_fixed_shared(N, buf, x, stride_x, load_x,
threadPos, threadCount,
lambda a, b: "%s + %s" % (a, b), lambda a, b: "%s + %s" % (a, b),
lambda a: "exp(%s - row_max)" % a, lambda a: "exp(%s - row_max)" % a,
b, stride_b, dtype), b, stride_b, load_b, dtype),
'__syncthreads()', '__syncthreads()',
('npy_%s row_sum = ' + buf + '[0]') % dtype, ('npy_%s row_sum = ' + buf + '[0]') % dtype,
'__syncthreads()', '__syncthreads()',
...@@ -305,13 +316,14 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, ...@@ -305,13 +316,14 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x,
if b: if b:
ret += [ ret += [
"%(sm)s[tx * %(sm_stride)s] = " "%(sm)s[tx * %(sm_stride)s] = "
" exp(%(x)s[tx * %(stride_x)s] +" " %(write_sm)s(exp(%(load_x)s(%(x)s[tx * %(stride_x)s]) +"
" %(b)s[tx * %(stride_b)s] - row_max)" " %(load_b)s(%(b)s[tx * %(stride_b)s]) - row_max)"
" / row_sum" % locals()] " / row_sum)" % locals()]
else: else:
ret += [ ret += [
"%(sm)s[tx * %(sm_stride)s] = " "%(sm)s[tx * %(sm_stride)s] = "
"exp(%(x)s[tx * %(stride_x)s] - row_max) / row_sum" % locals()] "%(write_sm)s(exp(%(load_x)s(%(x)s[tx * %(stride_x)s]) - row_max)"
" / row_sum)" % locals()]
ret += [ ret += [
"}", "}",
'__syncthreads()', '__syncthreads()',
......
...@@ -169,6 +169,10 @@ class GpuIncSubtensor(IncSubtensor): ...@@ -169,6 +169,10 @@ class GpuIncSubtensor(IncSubtensor):
The helper methods like do_type_checking, copy_of_x, etc. specialize The helper methods like do_type_checking, copy_of_x, etc. specialize
the c_code for this Op. the c_code for this Op.
""" """
@property
def _f16_ok(self):
return self.iadd_node.op._f16_ok
def c_headers(self): def c_headers(self):
return self.iadd_node.op.c_headers() return self.iadd_node.op.c_headers()
...@@ -325,7 +329,6 @@ class GpuIncSubtensor(IncSubtensor): ...@@ -325,7 +329,6 @@ class GpuIncSubtensor(IncSubtensor):
PyGpuArrayObject* src){ PyGpuArrayObject* src){
PyGpuArrayObject* ret = NULL; PyGpuArrayObject* ret = NULL;
""" % locals() """ % locals()
# def c_code(self, node, name, inputs, outputs, sub):
inputs = ["dst", "src"] inputs = ["dst", "src"]
outputs = ["ret"] outputs = ["ret"]
sub = {"fail": "return NULL;"} sub = {"fail": "return NULL;"}
...@@ -337,7 +340,6 @@ class GpuIncSubtensor(IncSubtensor): ...@@ -337,7 +340,6 @@ class GpuIncSubtensor(IncSubtensor):
return ret return ret
def add_to_zview(self, nodename, x, fail): def add_to_zview(self, nodename, x, fail):
# TODO
return """ return """
PyGpuArrayObject * add_result = inc_sub_iadd_%(nodename)s(zview, %(x)s); PyGpuArrayObject * add_result = inc_sub_iadd_%(nodename)s(zview, %(x)s);
...@@ -357,7 +359,7 @@ class GpuIncSubtensor(IncSubtensor): ...@@ -357,7 +359,7 @@ class GpuIncSubtensor(IncSubtensor):
elemwise_version = self.iadd_node.c_code_cache_version() elemwise_version = self.iadd_node.c_code_cache_version()
if not parent_version or not elemwise_version: if not parent_version or not elemwise_version:
return return
return parent_version + elemwise_version + (1,) return parent_version + elemwise_version + (2,)
class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1): class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1):
...@@ -391,6 +393,9 @@ class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1): ...@@ -391,6 +393,9 @@ class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1):
return gof.Apply(self, [x_, y_, ilist_], [x_.type()]) return gof.Apply(self, [x_, y_, ilist_], [x_.type()])
def getInplElemwiseAdditionKernel(self, a, b): def getInplElemwiseAdditionKernel(self, a, b):
if a.dtype == 'float16' or b.dtype == 'float16':
raise NotImplementedError('float16 is not supported by pygpu '
'elemwise')
a_arg = pygpu.tools.as_argument(a, 'a') a_arg = pygpu.tools.as_argument(a, 'a')
b_arg = pygpu.tools.as_argument(b, 'b') b_arg = pygpu.tools.as_argument(b, 'b')
args = [a_arg, b_arg] args = [a_arg, b_arg]
...@@ -452,10 +457,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1): ...@@ -452,10 +457,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
"""Implement AdvancedIncSubtensor1 on the gpu, but use function """Implement AdvancedIncSubtensor1 on the gpu, but use function
only avail on compute capability 2.0 and more recent. only avail on compute capability 2.0 and more recent.
""" """
_f16_ok = True
def __init__(self, inplace=False, set_instead_of_inc=False):
# The python implementation in the parent class is not applicable here
GpuAdvancedIncSubtensor1.__init__(self, inplace, set_instead_of_inc)
def make_node(self, x, y, ilist): def make_node(self, x, y, ilist):
"""It defer from GpuAdvancedIncSubtensor1 in that it make sure """It defer from GpuAdvancedIncSubtensor1 in that it make sure
...@@ -542,6 +544,30 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1): ...@@ -542,6 +544,30 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
itemsize_out = numpy.dtype(dtype_out).itemsize itemsize_out = numpy.dtype(dtype_out).itemsize
return """ return """
/*
* This is a version of atomicAdd that works for half-floats. It may
* read and write 2 bytes more than the size of the array if the array
* has an uneven number of elements. The actual value at that spot
* will not be modified.
*/
__device__ npy_float16 atomicAdd(npy_float16 *addr, npy_float16 val) {
npy_uint32 *base = (npy_uint32 *)((size_t)addr & ~2);
npy_uint32 old, assumed, sum, new_;
old = *base;
do {
assumed = old;
sum = __float2half_rn(
__half2float(val) +
__half2float((npy_float16)__byte_perm(old, 0,
((size_t)addr & 2) ? 0x4432 : 0x4410)));
new_ = __byte_perm(old, sum, ((size_t)addr & 2) ? 0x5410 : 0x3254);
old = atomicCAS(base, assumed, new_);
} while (assumed != old);
return (npy_float16)__byte_perm(old, 0,
((size_t)addr & 2) ? 0x4432 : 0x4410);
}
__global__ void k_vector_add_fast(int numRowsX, __global__ void k_vector_add_fast(int numRowsX,
int numColsX, int numColsX,
int stridesX0, int stridesX0,
......
...@@ -136,6 +136,12 @@ class GpuArrayType(Type): ...@@ -136,6 +136,12 @@ class GpuArrayType(Type):
raise NotImplementedError( raise NotImplementedError(
"GpuArrayType.values_eq_approx() don't implemented the" "GpuArrayType.values_eq_approx() don't implemented the"
" allow_remove_inf and allow_remove_nan parameter") " allow_remove_inf and allow_remove_nan parameter")
if a.dtype == 'float16' or b.dtype == 'float16':
an = numpy.asarray(a)
bn = numpy.asarray(b)
return tensor.TensorType.values_eq_approx(
an, bn, allow_remove_inf=allow_remove_inf,
allow_remove_nan=allow_remove_nan, rtol=rtol, atol=atol)
narrow = 'float32', 'complex64' narrow = 'float32', 'complex64'
if (str(a.dtype) in narrow) or (str(b.dtype) in narrow): if (str(a.dtype) in narrow) or (str(b.dtype) in narrow):
atol_ = theano.tensor.basic.float32_atol atol_ = theano.tensor.basic.float32_atol
...@@ -153,6 +159,13 @@ class GpuArrayType(Type): ...@@ -153,6 +159,13 @@ class GpuArrayType(Type):
locals()) locals())
return numpy.asarray(res).all() return numpy.asarray(res).all()
@staticmethod
def may_share_memory(a, b):
if (not isinstance(a, gpuarray.GpuArray) or
not isinstance(b, gpuarray.GpuArray)):
return False
return pygpu.gpuarray.may_share_memory(a, b)
def value_zeros(self, shape): def value_zeros(self, shape):
return pygpu.gpuarray.zeros(shape, dtype=self.typecode) return pygpu.gpuarray.zeros(shape, dtype=self.typecode)
......
...@@ -28,6 +28,7 @@ if cuda_available: ...@@ -28,6 +28,7 @@ if cuda_available:
from theano.sandbox.gpuarray.basic_ops import GpuKernelBase, Kernel from theano.sandbox.gpuarray.basic_ops import GpuKernelBase, Kernel
from theano.sandbox.gpuarray.type import GpuArrayType from theano.sandbox.gpuarray.type import GpuArrayType
from theano.sandbox.gpuarray.fp16_help import write_w
def matVecModM(A, s, m): def matVecModM(A, s, m):
...@@ -340,15 +341,6 @@ class mrg_uniform(mrg_uniform_base): ...@@ -340,15 +341,6 @@ class mrg_uniform(mrg_uniform_base):
def perform(self, node, inp, out): def perform(self, node, inp, out):
rstate, size = inp rstate, size = inp
o_rstate, o_sample = out o_rstate, o_sample = out
numpy_version = numpy.__version__.split('.')
if (not self.warned_numpy_version and
int(numpy_version[0]) <= 1 and
int(numpy_version[1]) < 3):
print("Warning: you must use numpy version 1.3.0 or higher with the python version of this op. Otherwise numpy leak memory. and numpy")
self.warned_numpy_version = True
n_elements = 1 n_elements = 1
rstate = numpy.asarray(rstate) # bring state from GPU if necessary rstate = numpy.asarray(rstate) # bring state from GPU if necessary
...@@ -377,6 +369,10 @@ class mrg_uniform(mrg_uniform_base): ...@@ -377,6 +369,10 @@ class mrg_uniform(mrg_uniform_base):
def c_code(self, node, name, inp, out, sub): def c_code(self, node, name, inp, out, sub):
rstate, size = inp rstate, size = inp
# If we try to use the C code here with something else than a
# TensorType, something is wrong (likely one of the GPU ops
# not defining C code correctly).
assert isinstance(node.inputs[0].type, TensorType)
o_rstate, o_sample = out o_rstate, o_sample = out
if self.inplace: if self.inplace:
o_rstate_requirement = 'NPY_ARRAY_C_CONTIGUOUS|NPY_ARRAY_ALIGNED' o_rstate_requirement = 'NPY_ARRAY_C_CONTIGUOUS|NPY_ARRAY_ALIGNED'
...@@ -777,6 +773,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp): ...@@ -777,6 +773,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base): class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
# GpuArray version # GpuArray version
_f16_ok = True
@classmethod @classmethod
def new(cls, rstate, ndim, dtype, size): def new(cls, rstate, ndim, dtype, size):
...@@ -790,14 +787,27 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base): ...@@ -790,14 +787,27 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
return super(GPUA_mrg_uniform, self).c_headers() + ['numpy_compat.h'] return super(GPUA_mrg_uniform, self).c_headers() + ['numpy_compat.h']
def gpu_kernels(self, node, name): def gpu_kernels(self, node, name):
if self.output_type.dtype == 'float32': write = write_w(self.output_type.dtype)
if self.output_type.dtype == 'float16':
otype = 'ga_half'
# limit the values of the state that we use.
mask = '& 0x7fff'
NORM = '3.0518e-05f' # numpy.float16(1.0/(2**15+8))
# this was determined by finding the biggest number such that
# numpy.float16(number * (M1 & 0x7fff)) < 1.0
elif self.output_type.dtype == 'float32':
otype = 'float' otype = 'float'
mask = ''
NORM = '4.6566126e-10f' # numpy.float32(1.0/(2**31+65)) NORM = '4.6566126e-10f' # numpy.float32(1.0/(2**31+65))
# this was determined by finding the biggest number such that # this was determined by finding the biggest number such that
# numpy.float32(number * M1) < 1.0 # numpy.float32(number * M1) < 1.0
else: elif self.output_type.dtype == 'float64':
otype = 'double' otype = 'double'
mask = ''
NORM = '4.656612873077392578125e-10' NORM = '4.656612873077392578125e-10'
else:
raise ValueError('Unsupported data type for output',
self.output_type.dtype)
code = """ code = """
KERNEL void mrg_uniform( KERNEL void mrg_uniform(
GLOBAL_MEM %(otype)s *sample_data, GLOBAL_MEM %(otype)s *sample_data,
...@@ -860,11 +870,11 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base): ...@@ -860,11 +870,11 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
x21 = y2; x21 = y2;
if (x11 <= x21) { if (x11 <= x21) {
sample_data[i] = (x11 - x21 + M1) * %(NORM)s; sample_data[i] = %(write)s(((x11 - x21 + M1) %(mask)s) * %(NORM)s);
} }
else else
{ {
sample_data[i] = (x11 - x21) * %(NORM)s; sample_data[i] = %(write)s(((x11 - x21) %(mask)s) * %(NORM)s);
} }
} }
...@@ -896,17 +906,9 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base): ...@@ -896,17 +906,9 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
o_type_num = numpy.asarray(0, dtype=self.output_type.dtype).dtype.num o_type_num = numpy.asarray(0, dtype=self.output_type.dtype).dtype.num
fail = sub['fail'] fail = sub['fail']
kname = self.gpu_kernels(node, nodename)[0].objvar kname = self.gpu_kernels(node, nodename)[0].objvar
otypecode = str(self.output_type.typecode)
if self.output_type.dtype == 'float32':
otype = 'float'
otypecode = 'GA_FLOAT'
else:
otype = 'double'
otypecode = 'GA_DOUBLE'
return """ return """
//////// <code generated by mrg_uniform>
size_t odims[%(ndim)s]; size_t odims[%(ndim)s];
unsigned int n_elements = 1; unsigned int n_elements = 1;
unsigned int n_streams; unsigned int n_streams;
...@@ -1003,12 +1005,10 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base): ...@@ -1003,12 +1005,10 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
%(fail)s %(fail)s
} }
} }
//////// </ code generated by mrg_uniform>
""" % locals() """ % locals()
def c_code_cache_version(self): def c_code_cache_version(self):
return (3, self.GpuKernelBase_version) return (6, self.GpuKernelBase_version)
def guess_n_streams(size, warn=False): def guess_n_streams(size, warn=False):
......
...@@ -3842,6 +3842,7 @@ class Reshape(Op): ...@@ -3842,6 +3842,7 @@ class Reshape(Op):
The number of dimensions to which to reshape to (ndim) must be The number of dimensions to which to reshape to (ndim) must be
known at graph build time.""" known at graph build time."""
view_map = {0: [0]} # output 0 is potentially aliased to inputs [0] view_map = {0: [0]} # output 0 is potentially aliased to inputs [0]
_f16_ok = True
check_input = False check_input = False
......
...@@ -58,16 +58,21 @@ class ScalarSigmoid(scalar.UnaryScalarOp): ...@@ -58,16 +58,21 @@ class ScalarSigmoid(scalar.UnaryScalarOp):
# We add boundary checks prevent exp from generating inf or # We add boundary checks prevent exp from generating inf or
# 0. The reset of the logic always generate 0 or 1 in those # 0. The reset of the logic always generate 0 or 1 in those
# cases. This is a speed optimization. # cases. This is a speed optimization.
# The constants were obtained by looking at the output of python commands like: # The constants were obtained by looking at the output of
""" # python commands like:
import numpy, theano #
dt='float32' # or float64 # import numpy, theano
for i in xrange(750): # dt='float32' # or float64
print i, repr(theano._asarray(1.0, dtype=dt) / # for i in xrange(750):
(theano._asarray(1.0, dtype=dt) + # print i, repr(theano._asarray(1.0, dtype=dt) /
numpy.exp(-theano._asarray([i,-i], dtype=dt)))) # (theano._asarray(1.0, dtype=dt) +
""" # numpy.exp(-theano._asarray([i,-i], dtype=dt))))
if node.inputs[0].type == scalar.float32:
# float16 limits: -11.0, 7.0f
# We use the float32 limits for float16 for now as the
# computation will happend in float32 anyway.
if (node.inputs[0].type == scalar.float32 or
node.inputs[0].type == scalar.float16):
return """%(z)s = %(x)s < -88.0f ? 0.0 : %(x)s > 15.0f ? 1.0f : 1.0f /(1.0f + exp(-%(x)s));""" % locals() return """%(z)s = %(x)s < -88.0f ? 0.0 : %(x)s > 15.0f ? 1.0f : 1.0f /(1.0f + exp(-%(x)s));""" % locals()
elif node.inputs[0].type == scalar.float64: elif node.inputs[0].type == scalar.float64:
return """%(z)s = %(x)s < -709.0 ? 0.0 : %(x)s > 19.0 ? 1.0 : 1.0 /(1.0+exp(-%(x)s));""" % locals() return """%(z)s = %(x)s < -709.0 ? 0.0 : %(x)s > 19.0 ? 1.0 : 1.0 /(1.0+exp(-%(x)s));""" % locals()
...@@ -327,11 +332,17 @@ class ScalarSoftplus(scalar.UnaryScalarOp): ...@@ -327,11 +332,17 @@ class ScalarSoftplus(scalar.UnaryScalarOp):
def c_code(self, node, name, inp, out, sub): def c_code(self, node, name, inp, out, sub):
x, = inp x, = inp
z, = out z, = out
if node.inputs[0].type == scalar.float32: # These constants were obtained by looking at the output of
# These constants were obtained by looking at the output of python commands like: # python commands like:
# for i in xrange(750): # for i in xrange(750):
# print i, repr( numpy.log1p(numpy.exp(theano._asarray([i,-i], dtype=dt)))) # print i, repr(numpy.log1p(numpy.exp(theano._asarray([i,-i], dtype=dt))))
# the boundary checks prevent us from generating inf # the boundary checks prevent us from generating inf
# float16 limits: -17.0, 6.0
# We use the float32 limits for float16 for now as the
# computation will happend in float32 anyway.
if (node.inputs[0].type == scalar.float32 or
node.inputs[0].type == scalar.float16):
return """%(z)s = %(x)s < -103.0f ? 0.0 : %(x)s > 14.0f ? %(x)s : log1p(exp(%(x)s));""" % locals() return """%(z)s = %(x)s < -103.0f ? 0.0 : %(x)s > 14.0f ? %(x)s : log1p(exp(%(x)s));""" % locals()
elif node.inputs[0].type == scalar.float64: elif node.inputs[0].type == scalar.float64:
return """%(z)s = %(x)s < -745.0 ? 0.0 : %(x)s > 16.0 ? %(x)s : log1p(exp(%(x)s));""" % locals() return """%(z)s = %(x)s < -745.0 ? 0.0 : %(x)s > 16.0 ? %(x)s : log1p(exp(%(x)s));""" % locals()
......
...@@ -5151,7 +5151,8 @@ def local_log_erfc(node): ...@@ -5151,7 +5151,8 @@ def local_log_erfc(node):
T.log(1 - 1 / (2 * x ** 2) + 3 / (4 * x ** 4) T.log(1 - 1 / (2 * x ** 2) + 3 / (4 * x ** 4)
- 15 / (8 * x ** 6))) - 15 / (8 * x ** 6)))
if node.outputs[0].dtype == 'float32': if (node.outputs[0].dtype == 'float32' or
node.outputs[0].dtype == 'float16'):
threshold = 10.0541949 threshold = 10.0541949
elif node.outputs[0].dtype == 'float64': elif node.outputs[0].dtype == 'float64':
threshold = 26.641747557 threshold = 26.641747557
...@@ -5298,7 +5299,7 @@ def local_grad_log_erfc_neg(node): ...@@ -5298,7 +5299,7 @@ def local_grad_log_erfc_neg(node):
3 / (4 * (x ** 4)) - 15 / (8 * (x ** 6)), -1) 3 / (4 * (x ** 4)) - 15 / (8 * (x ** 6)), -1)
* T.cast(T.sqrt(numpy.pi), dtype=x.dtype)) * T.cast(T.sqrt(numpy.pi), dtype=x.dtype))
if x.dtype == 'float32': if x.dtype == 'float32' or x.dtype == 'float16':
threshold = 9.3 threshold = 9.3
#threshold = 10.1 #threshold = 10.1
elif x.dtype == 'float64': elif x.dtype == 'float64':
......
...@@ -291,6 +291,7 @@ class Subtensor(Op): ...@@ -291,6 +291,7 @@ class Subtensor(Op):
debug = 0 debug = 0
check_input = False check_input = False
view_map = {0: [0]} view_map = {0: [0]}
_f16_ok = True
@staticmethod @staticmethod
def collapse(idxs, cond): def collapse(idxs, cond):
...@@ -328,7 +329,7 @@ class Subtensor(Op): ...@@ -328,7 +329,7 @@ class Subtensor(Op):
TODO: WRITEME: This method also accepts "entry" already being a Type; TODO: WRITEME: This method also accepts "entry" already being a Type;
when would that happen? when would that happen?
""" """
invalid_scal_types = [scal.float64, scal.float32] invalid_scal_types = [scal.float64, scal.float32, scal.float16]
scal_types = [scal.int64, scal.int32, scal.int16, scal.int8] scal_types = [scal.int64, scal.int32, scal.int16, scal.int8]
tensor_types = [theano.tensor.lscalar, theano.tensor.iscalar, tensor_types = [theano.tensor.lscalar, theano.tensor.iscalar,
theano.tensor.wscalar, theano.tensor.bscalar] theano.tensor.wscalar, theano.tensor.bscalar]
...@@ -1603,6 +1604,7 @@ class AdvancedSubtensor1(Op): ...@@ -1603,6 +1604,7 @@ class AdvancedSubtensor1(Op):
# sparse_grad doesn't go in here since it only affects the output # sparse_grad doesn't go in here since it only affects the output
# of the grad() method. # of the grad() method.
__props__ = () __props__ = ()
_f16_ok = True
def __init__(self, sparse_grad=False): def __init__(self, sparse_grad=False):
self.sparse_grad = sparse_grad self.sparse_grad = sparse_grad
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论