提交 37d5f777 作者: Frédéric Bastien 提交者: GitHub

Merge pull request #5049 from abergeron/fix_dlt_f16

Collection of fixes to make the DLT work in float16
...@@ -10,6 +10,15 @@ from theano.configparser import config ...@@ -10,6 +10,15 @@ from theano.configparser import config
import theano.tensor as T import theano.tensor as T
import theano.sandbox.cuda as cuda import theano.sandbox.cuda as cuda
from theano.compile import Mode from theano.compile import Mode
from .mode import get_mode
try:
from theano.gpuarray.type import GpuArrayType, _name_for_ctx
from pygpu.gpuarray import GpuArray
pygpu_available = True
except ImportError:
pygpu_available = False
logger = logging.getLogger("theano.compile.nanguardmode") logger = logging.getLogger("theano.compile.nanguardmode")
...@@ -86,6 +95,8 @@ def contains_nan(arr, node=None, var=None): ...@@ -86,6 +95,8 @@ def contains_nan(arr, node=None, var=None):
else: else:
compile_gpu_func(True, False, False) compile_gpu_func(True, False, False)
return np.isnan(f_gpumin(arr.reshape(arr.size))) return np.isnan(f_gpumin(arr.reshape(arr.size)))
elif pygpu_available and isinstance(arr, GpuArray):
return np.isnan(f_gpua_min(arr.reshape(arr.size)))
return np.isnan(np.min(arr)) return np.isnan(np.min(arr))
...@@ -136,6 +147,9 @@ def contains_inf(arr, node=None, var=None): ...@@ -136,6 +147,9 @@ def contains_inf(arr, node=None, var=None):
compile_gpu_func(False, True, False) compile_gpu_func(False, True, False)
return (np.isinf(f_gpumin(arr.reshape(arr.size))) or return (np.isinf(f_gpumin(arr.reshape(arr.size))) or
np.isinf(f_gpumax(arr.reshape(arr.size)))) np.isinf(f_gpumax(arr.reshape(arr.size))))
elif pygpu_available and isinstance(arr, GpuArray):
return (np.isinf(f_gpua_min(arr.reshape(arr.size))) or
np.isinf(f_gpua_max(arr.reshape(arr.size))))
return np.isinf(np.nanmax(arr)) or np.isinf(np.nanmin(arr)) return np.isinf(np.nanmax(arr)) or np.isinf(np.nanmin(arr))
...@@ -187,6 +201,27 @@ def compile_gpu_func(nan_is_error, inf_is_error, big_is_error): ...@@ -187,6 +201,27 @@ def compile_gpu_func(nan_is_error, inf_is_error, big_is_error):
cuda_compile_failed = True cuda_compile_failed = True
def f_compute(op):
def result(inp):
dtype = inp.dtype
ctx_name = _name_for_ctx(inp.context)
key = (dtype, ctx_name)
f = result.cache.get(key, None)
if f is None:
guard_in = GpuArrayType(str(dtype), (False,), context_name=ctx_name)()
mode = get_mode('FAST_RUN').including('gpuarray')
f = theano.function([guard_in], op(guard_in),
mode=mode, profile=False)
result.cache[key] = f
return f(inp)
result.cache = dict()
return result
f_gpua_min = f_compute(T.min)
f_gpua_max = f_compute(T.max)
f_gpua_absmax = f_compute(lambda x: T.max(T.abs_(x)))
class NanGuardMode(Mode): class NanGuardMode(Mode):
""" """
A Theano compilation Mode that makes the compiled function automatically A Theano compilation Mode that makes the compiled function automatically
...@@ -220,7 +255,9 @@ class NanGuardMode(Mode): ...@@ -220,7 +255,9 @@ class NanGuardMode(Mode):
big_is_error = config.NanGuardMode.big_is_error big_is_error = config.NanGuardMode.big_is_error
assert nan_is_error or inf_is_error or big_is_error assert nan_is_error or inf_is_error or big_is_error
compile_gpu_func(nan_is_error, inf_is_error, big_is_error)
if cuda.cuda_enabled:
compile_gpu_func(nan_is_error, inf_is_error, big_is_error)
def do_check_on(value, nd, var=None): def do_check_on(value, nd, var=None):
""" """
...@@ -260,7 +297,10 @@ class NanGuardMode(Mode): ...@@ -260,7 +297,10 @@ class NanGuardMode(Mode):
elif value.size == 0: elif value.size == 0:
err = False err = False
elif cuda.cuda_available and isinstance(value, cuda.CudaNdarray): elif cuda.cuda_available and isinstance(value, cuda.CudaNdarray):
compile_gpu_func(False, False, True)
err = (f_gpuabsmax(value.reshape(value.size)) > 1e10) err = (f_gpuabsmax(value.reshape(value.size)) > 1e10)
elif pygpu_available and isinstance(value, GpuArray):
err = (f_gpua_absmax(value.reshape(value.size)) > 1e10)
else: else:
err = (np.abs(value).max() > 1e10) err = (np.abs(value).max() > 1e10)
if err: if err:
......
...@@ -445,7 +445,7 @@ def shape_i(var, i, fgraph=None): ...@@ -445,7 +445,7 @@ def shape_i(var, i, fgraph=None):
shape_of = shape_feature.shape_of shape_of = shape_feature.shape_of
def recur(node): def recur(node):
if not hasattr(node.outputs[0], 'fgraph'): if not node.outputs[0] in shape_of:
for inp in node.inputs: for inp in node.inputs:
if inp.owner: if inp.owner:
recur(inp.owner) recur(inp.owner)
......
...@@ -446,7 +446,7 @@ if param and os.name == 'nt': ...@@ -446,7 +446,7 @@ if param and os.name == 'nt':
def warn_cxx(val): def warn_cxx(val):
"""We only support clang++ as otherwise we hit strange g++/OSX bugs.""" """We only support clang++ as otherwise we hit strange g++/OSX bugs."""
if sys.platform == 'darwin' and val != 'clang++': if sys.platform == 'darwin' and 'clang++' not in val:
_logger.warning("Only clang++ is supported. With g++," _logger.warning("Only clang++ is supported. With g++,"
" we end up with strange g++/OSX bugs.") " we end up with strange g++/OSX bugs.")
return True return True
......
...@@ -66,7 +66,9 @@ def init_dev(dev, name=None): ...@@ -66,7 +66,9 @@ def init_dev(dev, name=None):
single_stream=config.gpuarray.single_stream, single_stream=config.gpuarray.single_stream,
sched=config.gpuarray.sched) sched=config.gpuarray.sched)
init_dev.devmap[dev] = ctx init_dev.devmap[dev] = ctx
if config.gpuarray.preallocate > 0: if config.gpuarray.preallocate < 0:
print("Disabling allocation cache on %s" % (dev,))
elif config.gpuarray.preallocate > 0:
MB = (1024 * 1024) MB = (1024 * 1024)
if config.gpuarray.preallocate <= 1: if config.gpuarray.preallocate <= 1:
gmem = min(config.gpuarray.preallocate, 0.95) * ctx.total_gmem gmem = min(config.gpuarray.preallocate, 0.95) * ctx.total_gmem
......
...@@ -1319,8 +1319,6 @@ class GpuDnnSoftmaxBase(DnnBase): ...@@ -1319,8 +1319,6 @@ class GpuDnnSoftmaxBase(DnnBase):
DnnBase.__init__(self, [self.file], self.c_func) DnnBase.__init__(self, [self.file], self.c_func)
assert(algo in ('fast', 'accurate', 'log')) assert(algo in ('fast', 'accurate', 'log'))
if algo == 'log' and version(raises=False) < 3000:
raise RuntimeError("Need cuDNN v3 for log-softmax")
self.algo = algo self.algo = algo
assert(mode in ('instance', 'channel')) assert(mode in ('instance', 'channel'))
...@@ -1361,6 +1359,7 @@ class GpuDnnSoftmax(GpuDnnSoftmaxBase): ...@@ -1361,6 +1359,7 @@ class GpuDnnSoftmax(GpuDnnSoftmaxBase):
or per spatial location '01' per image across 'c'. or per spatial location '01' per image across 'c'.
""" """
_f16_ok = True
direction = "forward" direction = "forward"
file = "dnn_softmax.c" file = "dnn_softmax.c"
c_func = "APPLY_SPECIFIC(softmax)" c_func = "APPLY_SPECIFIC(softmax)"
...@@ -1397,6 +1396,7 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase): ...@@ -1397,6 +1396,7 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
image across 'c'. image across 'c'.
""" """
_f16_ok = True
direction = 'backward' direction = 'backward'
file = "dnn_softmax_grad.c" file = "dnn_softmax_grad.c"
c_func = "APPLY_SPECIFIC(softmax_grad)" c_func = "APPLY_SPECIFIC(softmax_grad)"
......
...@@ -33,6 +33,12 @@ def as_C_string_const(s): ...@@ -33,6 +33,12 @@ def as_C_string_const(s):
for l in s.split('\n')) for l in s.split('\n'))
def get_scal(dt):
if dt == 'float16':
dt = 'float32'
return scalar.get_scalar_type(dt)
class GpuElemwise(HideC, Elemwise): class GpuElemwise(HideC, Elemwise):
""" """
Elemwise on the GPU. Elemwise on the GPU.
...@@ -60,23 +66,18 @@ class GpuElemwise(HideC, Elemwise): ...@@ -60,23 +66,18 @@ class GpuElemwise(HideC, Elemwise):
zip(out_info[0], out_info[1])] zip(out_info[0], out_info[1])]
if len(outputs) > 1: if len(outputs) > 1:
raise NotImplementedError() raise NotImplementedError()
node = Apply(self, inputs, outputs)
# Try to generate the kernel to catch SupportCodeErrors # Try to generate the kernel to catch SupportCodeErrors
scal_ins = [get_scal(i.dtype) for i in inputs]
fake_node = self.scalar_op.make_node(*[i() for i in scal_ins])
try: try:
scal_ins = [scalar.get_scalar_type(i.dtype) for i in node.inputs] code = fake_node.op.c_support_code_apply(fake_node, "test")
scal_out = [scalar.get_scalar_type(o.dtype) for o in node.outputs]
fake_node = Apply(self.scalar_op, [i() for i in scal_ins],
[o() for o in scal_out])
code = self.scalar_op.c_support_code_apply(fake_node, "test")
if code: if code:
raise SupportCodeError(code) raise SupportCodeError(code)
except MethodNotDefined: except MethodNotDefined:
pass pass
try: try:
support_code = self.scalar_op.c_support_code() support_code = fake_node.op.c_support_code()
if "struct" in support_code: if "struct" in support_code:
# The macro is fine, the C++ struct is not. # The macro is fine, the C++ struct is not.
raise SupportCodeError( raise SupportCodeError(
...@@ -85,6 +86,15 @@ class GpuElemwise(HideC, Elemwise): ...@@ -85,6 +86,15 @@ class GpuElemwise(HideC, Elemwise):
except MethodNotDefined: except MethodNotDefined:
pass pass
if fake_node.op != self.scalar_op:
# If the new op is different due to type changes, we make a new
# op for it.
elem = GpuElemwise(fake_node.op, self.inplace_pattern, self.name,
self.nfunc_spec, self.openmp)
else:
elem = self
node = Apply(elem, inputs, outputs)
return node return node
def get_params(self, node): def get_params(self, node):
...@@ -92,59 +102,31 @@ class GpuElemwise(HideC, Elemwise): ...@@ -92,59 +102,31 @@ class GpuElemwise(HideC, Elemwise):
def _get_vnames(self, node): def _get_vnames(self, node):
inps = ['i%d' % (n,) for n, _ in enumerate(node.inputs)] inps = ['i%d' % (n,) for n, _ in enumerate(node.inputs)]
outs = ['o%d' % (n,) for n, _ in enumerate(node.outputs) if n not in self.inplace_pattern] outs = ['o%d' % (n,) if n not in self.inplace_pattern else
inps[self.inplace_pattern[n]]
for n, _ in enumerate(node.outputs)]
return inps, outs return inps, outs
def _generate_op_string(self, node): def _generate_op_string(self, node):
scal_v_ins = [scalar.get_scalar_type(i.dtype) for i in node.inputs]
scal_v_outs = [scalar.get_scalar_type(o.dtype) for o in node.outputs]
inps, outs = self._get_vnames(node) inps, outs = self._get_vnames(node)
scal_v_ins = [get_scal(i.dtype)() for i in node.inputs]
fake_node = Apply(self.scalar_op, [i() for i in scal_v_ins], fake_node = self.scalar_op.make_node(*scal_v_ins)
[o() for o in scal_v_outs]) scal_v_out = fake_node.outputs
assert len(scal_v_out) == len(node.outputs)
scal_in = [i if si.dtype != 'float16' else kop = fake_node.op.c_code(fake_node, 'elem_scalar',
'load_half(&' + i + ')' for i, si in zip(inps, scal_v_ins)] inps, outs,
dict(fail='return;'))
scal_out = [] # Some ops like cast will reintroduce float16 in the internal graph.
oi = 0 kop = kop.replace('npy_float16', 'ga_float')
scal_f16 = []
for n in range(len(node.outputs)):
if n in self.inplace_pattern:
arg = inps[self.inplace_pattern[n]]
else:
arg = outs[oi]
oi += 1
if node.outputs[n].dtype == 'float16':
scal_f16.append(('tmpf16%i' % (len(scal_f16),), arg))
scal_out.append(scal_f16[-1][0])
else:
scal_out.append(arg)
kop = self.scalar_op.c_code(fake_node, 'elem_scalar',
scal_in, scal_out,
dict(fail='return;'))
if scal_f16:
# if we have float16 scalars on output we have to wrap
# them and insert a stand-in float32 variable since
# float16 arithemtic is not available
code = ["{"]
for f in scal_f16:
code.append('ga_float %s;' % (f[0],))
# XXX: The replace is an ugly hack to make sure temp
# variables inthe middle are float32
code.append(kop.replace('npy_float16', 'ga_float'))
for f in scal_f16:
code.append('store_half(&%s, %s);' % (f[1], f[0]))
code.append('}')
kop = '\n'.join(code)
support_code = "" support_code = ""
try: try:
# We accept only some c_support_code(). # We accept only some c_support_code().
# This filter is done in the make_node() # This filter is done in the make_node()
support_code += self.scalar_op.c_support_code() support_code += fake_node.op.c_support_code()
except MethodNotDefined: except MethodNotDefined:
pass pass
for npy, ga in [("npy_uint8", "ga_ubyte"), for npy, ga in [("npy_uint8", "ga_ubyte"),
...@@ -171,7 +153,7 @@ class GpuElemwise(HideC, Elemwise): ...@@ -171,7 +153,7 @@ class GpuElemwise(HideC, Elemwise):
def c_init_code_struct(self, node, name, sub): def c_init_code_struct(self, node, name, sub):
inps, outs = self._get_vnames(node) inps, outs = self._get_vnames(node)
nargs = len(inps) + len(outs) nargs = len(inps) + len(outs) - len(self.inplace_pattern)
support_code, kop = self._generate_op_string(node) support_code, kop = self._generate_op_string(node)
res = """ res = """
gpuelemwise_arg args[%(nargs)s] = {{0}}; gpuelemwise_arg args[%(nargs)s] = {{0}};
...@@ -185,24 +167,22 @@ class GpuElemwise(HideC, Elemwise): ...@@ -185,24 +167,22 @@ class GpuElemwise(HideC, Elemwise):
""" % dict(n=n, name='"%s"' % (name,), """ % dict(n=n, name='"%s"' % (name,),
typecode=i.type.typecode) typecode=i.type.typecode)
p = 0 p = len(inps)
for n, o in enumerate(node.outputs): for n, o in enumerate(node.outputs):
if n in self.inplace_pattern: if n in self.inplace_pattern:
assert(len(node.outputs) == 1) assert(len(node.outputs) == 1)
res += "\nargs[%(n)s].flags |= GE_WRITE;\n" % dict(n=self.inplace_pattern[n]) res += "\nargs[%(n)s].flags |= GE_WRITE;\n" % dict(n=self.inplace_pattern[n])
else: else:
nn = len(inps) + p
name = outs[p]
p += 1
res += """ res += """
args[%(n)s].name = %(name)s; args[%(n)s].name = %(name)s;
args[%(n)s].typecode = %(typecode)s; args[%(n)s].typecode = %(typecode)s;
args[%(n)s].flags = GE_WRITE; args[%(n)s].flags = GE_WRITE;
""" % dict(n=nn, name='"%s"' % (name,), """ % dict(n=p, name='"%s"' % (outs[n],),
typecode=o.type.typecode) typecode=o.type.typecode)
p += 1
res += """ res += """
ge = GpuElemwise_new(%(ctx)s->ctx, %(support)s, %(kop)s, %(nargs)s, args, %(nd)s, 0); ge = GpuElemwise_new(%(ctx)s->ctx, %(support)s, %(kop)s, %(nargs)s, args, %(nd)s, GE_CONVERT_F16);
if (ge == NULL) { if (ge == NULL) {
PyErr_SetString(PyExc_RuntimeError, "Could not initialize elemwise support"); PyErr_SetString(PyExc_RuntimeError, "Could not initialize elemwise support");
%(fail)s %(fail)s
...@@ -363,7 +343,7 @@ class GpuElemwise(HideC, Elemwise): ...@@ -363,7 +343,7 @@ class GpuElemwise(HideC, Elemwise):
def c_code_cache_version(self): def c_code_cache_version(self):
ver = self.scalar_op.c_code_cache_version() ver = self.scalar_op.c_code_cache_version()
if ver: if ver:
return (7, ver) return (8, ver)
else: else:
return ver return ver
......
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
import os
import numpy import numpy
from theano import Op, Apply, config from theano import Op, Apply, config
...@@ -45,7 +46,10 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op): ...@@ -45,7 +46,10 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
return node.inputs[0].type.context return node.inputs[0].type.context
def c_headers(self): def c_headers(self):
return ['<numpy_compat.h>', '<gpuarray/types.h>'] return ['<numpy_compat.h>', '<gpuarray/types.h>', 'gpuarray_helper.h']
def c_header_dirs(self):
return [os.path.dirname(__file__)]
def gpu_kernels(self, node, nodename): def gpu_kernels(self, node, nodename):
dtype_x = node.inputs[0].dtype dtype_x = node.inputs[0].dtype
...@@ -191,9 +195,6 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op): ...@@ -191,9 +195,6 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
def c_code(self, node, nodename, inp, out, sub): def c_code(self, node, nodename, inp, out, sub):
if node.inputs[0].type.context.kind != b'cuda': if node.inputs[0].type.context.kind != b'cuda':
raise NotImplementedError('cuda only') raise NotImplementedError('cuda only')
typecode_x = pygpu.gpuarray.dtype_to_typecode(node.inputs[0].dtype)
typecode_b = pygpu.gpuarray.dtype_to_typecode(node.inputs[1].dtype)
typecode_y_idx = pygpu.gpuarray.dtype_to_typecode(node.inputs[2].dtype)
itemsize_x = numpy.dtype(node.inputs[0].dtype).itemsize itemsize_x = numpy.dtype(node.inputs[0].dtype).itemsize
worksize_x = numpy.dtype(work_dtype(node.inputs[0].dtype)).itemsize worksize_x = numpy.dtype(work_dtype(node.inputs[0].dtype)).itemsize
itemsize_b = numpy.dtype(node.inputs[1].dtype).itemsize itemsize_b = numpy.dtype(node.inputs[1].dtype).itemsize
...@@ -203,13 +204,6 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op): ...@@ -203,13 +204,6 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
itemsize_am = numpy.dtype(node.outputs[2].dtype).itemsize itemsize_am = numpy.dtype(node.outputs[2].dtype).itemsize
x, b, y_idx = inp x, b, y_idx = inp
nll, sm, am = out nll, sm, am = out
dtype_x = node.inputs[0].dtype
dtype_b = node.inputs[1].dtype
dtype_y_idx = node.inputs[2].dtype
dtype_nll = node.outputs[0].dtype
dtype_sm = node.outputs[1].dtype
dtype_am = node.outputs[2].dtype
classname = self.__class__.__name__
fail = sub['fail'] fail = sub['fail']
ctx = sub['params'] ctx = sub['params']
k_var = "k_xent_sm_1hot_bias_%(nodename)s" % locals() k_var = "k_xent_sm_1hot_bias_%(nodename)s" % locals()
...@@ -229,21 +223,6 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op): ...@@ -229,21 +223,6 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
""" % locals() """ % locals()
sio = StringIO() sio = StringIO()
print(""" print("""
if (PyGpuArray_NDIM(%(y_idx)s) != 1)
{
PyErr_SetString(PyExc_ValueError, "y_idx not 1d tensor");
%(fail)s;
}
if (PyGpuArray_NDIM(%(x)s) != 2)
{
PyErr_SetString(PyExc_ValueError, "x not 2d tensor");
%(fail)s;
}
if (PyGpuArray_NDIM(%(b)s) != 1)
{
PyErr_SetString(PyExc_ValueError, "b not 1d tensor");
%(fail)s;
}
if (PyGpuArray_DIMS(%(x)s)[0] != if (PyGpuArray_DIMS(%(x)s)[0] !=
PyGpuArray_DIMS(%(y_idx)s)[0]) PyGpuArray_DIMS(%(y_idx)s)[0])
{ {
...@@ -257,82 +236,32 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op): ...@@ -257,82 +236,32 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
"dimension mismatch in x,b arguments"); "dimension mismatch in x,b arguments");
%(fail)s; %(fail)s;
} }
if ((NULL == %(nll)s) //initial condition if (theano_prep_output(&%(nll)s, 1, PyGpuArray_DIMS(%(y_idx)s), %(x)s->ga.typecode, GA_C_ORDER, %(ctx)s)) %(fail)s
|| (PyGpuArray_DIMS(%(nll)s)[0] != if (theano_prep_output(&%(sm)s, 2, PyGpuArray_DIMS(%(x)s), %(x)s->ga.typecode, GA_C_ORDER, %(ctx)s)) %(fail)s
PyGpuArray_DIMS(%(y_idx)s)[0])) if (theano_prep_output(&%(am)s, 1, PyGpuArray_DIMS(%(y_idx)s), %(y_idx)s->ga.typecode, GA_C_ORDER, %(ctx)s)) %(fail)s
{
Py_XDECREF(%(nll)s);
%(nll)s = pygpu_empty(1, PyGpuArray_DIMS(%(y_idx)s),
%(typecode_x)s, GA_C_ORDER, %(ctx)s,
Py_None);
if (!%(nll)s) {
%(fail)s
}
}
if ((NULL == %(sm)s)
|| (PyGpuArray_DIMS(%(sm)s)[0] !=
PyGpuArray_DIMS(%(x)s)[0])
|| (PyGpuArray_DIMS(%(sm)s)[1] !=
PyGpuArray_DIMS(%(x)s)[1]))
{
Py_XDECREF(%(sm)s);
%(sm)s = pygpu_empty(2, PyGpuArray_DIMS(%(x)s),
%(typecode_b)s, GA_C_ORDER,
%(ctx)s, Py_None);
if(!%(sm)s)
{
PyErr_SetString(PyExc_MemoryError,
"failed to alloc sm output");
// no need to decref cnda_nll, the cleanup code should do it up
%(fail)s;
}
}
if ((NULL == %(am)s)
|| (PyGpuArray_DIMS(%(am)s)[0] !=
PyGpuArray_DIMS(%(y_idx)s)[0]))
{
Py_XDECREF(%(am)s);
%(am)s = pygpu_empty(1, PyGpuArray_DIMS(%(y_idx)s),
%(typecode_y_idx)s, GA_C_ORDER,
%(ctx)s, Py_None);
if(!%(am)s)
{
PyErr_SetString(PyExc_MemoryError,
"failed to alloc am output");
// no need to decref nll and sm,
// the cleanup code should do it up
%(fail)s;
}
}
{ {
size_t n_blocks = std::min(PyGpuArray_DIM(%(x)s, 0), (size_t)4096); size_t n_blocks = std::min(PyGpuArray_DIM(%(x)s, 0), (size_t)4096);
size_t n_threads = std::min(PyGpuArray_DIM(%(x)s, 1), (size_t)256); size_t n_threads = std::min(PyGpuArray_DIM(%(x)s, 1), (size_t)256);
size_t n_shared = n_threads * %(worksize_x)s; size_t n_shared = n_threads * %(worksize_x)s;
ssize_t stride_X0 = PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s;
ssize_t stride_X1 = PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s;
ssize_t stride_B0 = PyGpuArray_STRIDES(%(b)s)[0] / %(itemsize_b)s;
ssize_t stride_YIDX0 = PyGpuArray_STRIDES(%(y_idx)s)[0] / %(itemsize_y_idx)s;
ssize_t stride_NLL0 = PyGpuArray_STRIDES(%(nll)s)[0] / %(itemsize_nll)s;
ssize_t stride_SM0 = PyGpuArray_STRIDES(%(sm)s)[0] / %(itemsize_sm)s;
ssize_t stride_SM1 = PyGpuArray_STRIDES(%(sm)s)[1] / %(itemsize_sm)s;
ssize_t stride_AM0 = PyGpuArray_STRIDES(%(am)s)[0] / %(itemsize_am)s;
//TODO: launch more threads per row and do parallel sum and max reductions //TODO: launch more threads per row and do parallel sum and max reductions
void *kernel_params[] = { int err = k_xent_sm_1hot_bias_call(
(void *)&PyGpuArray_DIMS(%(x)s)[0], 1, &n_blocks, &n_threads, n_shared,
(void *)&PyGpuArray_DIMS(%(x)s)[1], PyGpuArray_DIMS(%(x)s)[0],
(void *)%(x)s->ga.data, (void *)&%(x)s->ga.offset, PyGpuArray_DIMS(%(x)s)[1],
(void *)&stride_X0, (void *)&stride_X1, %(x)s->ga.data, %(x)s->ga.offset,
(void *)%(b)s->ga.data, (void *)&%(b)s->ga.offset, PyGpuArray_STRIDE(%(x)s, 0) / %(itemsize_x)s,
(void *)&stride_B0, PyGpuArray_STRIDE(%(x)s, 1) / %(itemsize_x)s,
(void *)%(y_idx)s->ga.data, (void *)&%(y_idx)s->ga.offset, %(b)s->ga.data, %(b)s->ga.offset,
(void *)&stride_YIDX0, PyGpuArray_STRIDE(%(b)s, 0) / %(itemsize_b)s,
(void *)%(nll)s->ga.data, (void *)&%(nll)s->ga.offset, %(y_idx)s->ga.data, %(y_idx)s->ga.offset,
(void *)&stride_NLL0, PyGpuArray_STRIDE(%(y_idx)s, 0) / %(itemsize_y_idx)s,
(void *)%(sm)s->ga.data, (void *)&%(sm)s->ga.offset, %(nll)s->ga.data, %(nll)s->ga.offset,
(void *)&stride_SM0, (void *)&stride_SM1, PyGpuArray_STRIDE(%(nll)s, 0) / %(itemsize_nll)s,
(void *)%(am)s->ga.data, (void *)&%(am)s->ga.offset, %(sm)s->ga.data, %(sm)s->ga.offset,
(void *)&stride_AM0}; PyGpuArray_STRIDE(%(sm)s, 0) / %(itemsize_sm)s,
int err = GpuKernel_call(&%(k_var)s, 1, &n_threads, &n_blocks, n_shared, kernel_params); PyGpuArray_STRIDE(%(sm)s, 1) / %(itemsize_sm)s,
%(am)s->ga.data, %(am)s->ga.offset,
PyGpuArray_STRIDE(%(am)s, 0) / %(itemsize_am)s);
%(err_check)s %(err_check)s
%(sync)s %(sync)s
} }
...@@ -340,7 +269,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op): ...@@ -340,7 +269,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
return sio.getvalue() return sio.getvalue()
def c_code_cache_version(self): def c_code_cache_version(self):
return (10,) return (12,)
gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1HotWithBias() gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1HotWithBias()
......
...@@ -797,6 +797,25 @@ class test_SoftMax(test_nnet.test_SoftMax): ...@@ -797,6 +797,25 @@ class test_SoftMax(test_nnet.test_SoftMax):
def test_softmax_shape_0(self): def test_softmax_shape_0(self):
raise SkipTest("Cudnn doesn't support 0 shapes") raise SkipTest("Cudnn doesn't support 0 shapes")
def test_softmax_f16(self):
x = T.matrix('x', 'float16')
x_gpu = T.tensor4('x_gpu', 'float16')
f_z = T.nnet.softmax_op
f_gpu = dnn.GpuDnnSoftmax(
'accurate',
'channel'
)
def cmp(n, m, f, f_gpu):
data = numpy.random.random((n, m)).astype('float16')
gdata = numpy.asarray(data)[:, :, None, None]
out = f(data)
gout = numpy.asarray(f_gpu(gdata))[:, :, 0, 0]
utt.assert_allclose(out, gout)
self._test_softmax(x, x_gpu, f_z, f_gpu, cmp)
def test_softmax_grad(self): def test_softmax_grad(self):
def cmp(n, m, f, f_gpu): def cmp(n, m, f, f_gpu):
data = numpy.arange(n * m, dtype='float32').reshape(n, m) data = numpy.arange(n * m, dtype='float32').reshape(n, m)
......
...@@ -1373,10 +1373,10 @@ class numeric_grad(object): ...@@ -1373,10 +1373,10 @@ class numeric_grad(object):
# perfectly accurate. # perfectly accurate.
type_eps = {'float64': 1e-7, type_eps = {'float64': 1e-7,
'float32': 3e-4, 'float32': 3e-4,
'float16': 1e-3, 'float16': 1e-1,
numpy.dtype('float64'): 1e-7, numpy.dtype('float64'): 1e-7,
numpy.dtype('float32'): 3e-4, numpy.dtype('float32'): 3e-4,
numpy.dtype('float16'): 1e-3} numpy.dtype('float16'): 1e-1}
def __init__(self, f, pt, eps=None, out_type=None): def __init__(self, f, pt, eps=None, out_type=None):
"""Return the gradient of f at pt. """Return the gradient of f at pt.
......
...@@ -39,7 +39,7 @@ builtin_int = int ...@@ -39,7 +39,7 @@ builtin_int = int
builtin_float = float builtin_float = float
class ComplexError(Exception): class ComplexError(NotImplementedError):
""" """
Raised if complex numbers are used in an unsupported operation. Raised if complex numbers are used in an unsupported operation.
...@@ -2197,7 +2197,7 @@ class Sgn(UnaryScalarOp): ...@@ -2197,7 +2197,7 @@ class Sgn(UnaryScalarOp):
return '%(z)s = (%(x)s > 0) ? 1. : ((%(x)s < 0) ? -1. : (isnan(%(x)s) ? NAN : 0.));' % locals() return '%(z)s = (%(x)s > 0) ? 1. : ((%(x)s < 0) ? -1. : (isnan(%(x)s) ? NAN : 0.));' % locals()
if type in int_types: if type in int_types:
return "%(z)s = (%(x)s >= 0) ? (%(x)s == 0) ? 0 : 1 : -1;" % locals() return "%(z)s = (%(x)s >= 0) ? (%(x)s == 0) ? 0 : 1 : -1;" % locals()
raise TypeError() # complex has no sgn raise ComplexError('complex has no sgn')
def c_code_cache_version(self): def c_code_cache_version(self):
s = super(Sgn, self).c_code_cache_version() s = super(Sgn, self).c_code_cache_version()
...@@ -2300,7 +2300,7 @@ class RoundHalfToEven(UnaryScalarOp): ...@@ -2300,7 +2300,7 @@ class RoundHalfToEven(UnaryScalarOp):
(z,) = outputs (z,) = outputs
typ = node.outputs[0].type.dtype typ = node.outputs[0].type.dtype
if typ not in ['float32', 'float64']: if typ not in ['float32', 'float64']:
Exception("The output should be float32 or float64") raise NotImplementedError("The output should be float32 or float64")
return dedent(""" return dedent("""
#ifndef ROUNDING_EPSILON #ifndef ROUNDING_EPSILON
...@@ -2398,7 +2398,7 @@ class RoundHalfAwayFromZero(UnaryScalarOp): ...@@ -2398,7 +2398,7 @@ class RoundHalfAwayFromZero(UnaryScalarOp):
if node.outputs[0].type.dtype in ['float32', 'float64']: if node.outputs[0].type.dtype in ['float32', 'float64']:
return "%(z)s = round(%(x)s);" % locals() return "%(z)s = round(%(x)s);" % locals()
else: else:
Exception("The output should be float32 or float64") raise NotImplementedError("The output should be float32 or float64")
round_half_away_from_zero = RoundHalfAwayFromZero(same_out_float_only) round_half_away_from_zero = RoundHalfAwayFromZero(same_out_float_only)
...@@ -3711,8 +3711,7 @@ class Composite(ScalarOp): ...@@ -3711,8 +3711,7 @@ class Composite(ScalarOp):
raise NotImplementedError("grad is not implemented for Composite") raise NotImplementedError("grad is not implemented for Composite")
def c_code(self, node, nodename, inames, onames, sub): def c_code(self, node, nodename, inames, onames, sub):
if not hasattr(self, '_c_code'): self.init_c_code()
self.init_c_code()
d = dict(chain(izip(("i%i" % i for i in xrange(len(inames))), inames), d = dict(chain(izip(("i%i" % i for i in xrange(len(inames))), inames),
izip(("o%i" % i for i in xrange(len(onames))), izip(("o%i" % i for i in xrange(len(onames))),
...@@ -3746,6 +3745,7 @@ class Composite(ScalarOp): ...@@ -3746,6 +3745,7 @@ class Composite(ScalarOp):
return "\n".join(sorted(set(rval))) return "\n".join(sorted(set(rval)))
def c_support_code_apply(self, node, name): def c_support_code_apply(self, node, name):
self.init_c_code()
rval = [] rval = []
for subnode, subnodename in zip(self.fgraph.toposort(), self.nodenames): for subnode, subnodename in zip(self.fgraph.toposort(), self.nodenames):
try: try:
...@@ -3771,13 +3771,11 @@ class Composite(ScalarOp): ...@@ -3771,13 +3771,11 @@ class Composite(ScalarOp):
return False return False
# see __hash__ for comment on why there is no mention of fgraph # see __hash__ for comment on why there is no mention of fgraph
# or module cache key here. # or module cache key here.
if not hasattr(self, '_c_code'): self.init_c_code() # self._c_code and self.nodenames
self.init_c_code() # self._c_code and self.nodenames
return (self._c_code == other._c_code) return (self._c_code == other._c_code)
def __hash__(self): def __hash__(self):
if not hasattr(self, '_c_code'): self.init_c_code() # self._c_code and self.nodenames
self.init_c_code() # self._c_code and self.nodenames
rval = hash((type(self), rval = hash((type(self),
self.nin, self.nin,
self.nout, self.nout,
......
...@@ -2774,6 +2774,7 @@ class Alloc(gof.Op): ...@@ -2774,6 +2774,7 @@ class Alloc(gof.Op):
are lifted, the first argument to fill can often be pruned from the graph. are lifted, the first argument to fill can often be pruned from the graph.
""" """
_f16_ok = True
__props__ = () __props__ = ()
def validate_shape(self, shape): def validate_shape(self, shape):
......
...@@ -352,7 +352,7 @@ class ScalarSoftplus(scalar.UnaryScalarOp): ...@@ -352,7 +352,7 @@ class ScalarSoftplus(scalar.UnaryScalarOp):
# float16 limits: -17.0, 6.0 # float16 limits: -17.0, 6.0
# We use the float32 limits for float16 for now as the # We use the float32 limits for float16 for now as the
# computation will happend in float32 anyway. # computation will happen in float32 anyway.
if (node.inputs[0].type == scalar.float32 or if (node.inputs[0].type == scalar.float32 or
node.inputs[0].type == scalar.float16): node.inputs[0].type == scalar.float16):
return """%(z)s = %(x)s < -103.0f ? 0.0 : %(x)s > 14.0f ? %(x)s : log1p(exp(%(x)s));""" % locals() return """%(z)s = %(x)s < -103.0f ? 0.0 : %(x)s > 14.0f ? %(x)s : log1p(exp(%(x)s));""" % locals()
......
...@@ -2247,7 +2247,7 @@ class Assert(T.Op): ...@@ -2247,7 +2247,7 @@ class Assert(T.Op):
>>> func = theano.function([x], assert_op(x, x.size<2)) >>> func = theano.function([x], assert_op(x, x.size<2))
""" """
_f16_ok = True
__props__ = ('msg',) __props__ = ('msg',)
view_map = {0: [0]} view_map = {0: [0]}
...@@ -6063,20 +6063,24 @@ def local_log1p(node): ...@@ -6063,20 +6063,24 @@ def local_log1p(node):
log_arg.owner.inputs, only_process_constants=True) log_arg.owner.inputs, only_process_constants=True)
# scalar_inputs are potentially dimshuffled and fill'd scalars # scalar_inputs are potentially dimshuffled and fill'd scalars
if scalars and numpy.allclose(numpy.sum(scalars), 1): if scalars and numpy.allclose(numpy.sum(scalars), 1):
if not nonconsts: if nonconsts:
pass # leave for constant-merge if len(nonconsts) > 1:
if len(nonconsts) == 1: ninp = T.add(*nonconsts)
return _fill_chain(T.log1p(nonconsts[0]), scalar_inputs) else:
else: ninp = nonconsts[0]
return _fill_chain(T.log1p(T.add(*nonconsts)), if ninp.dtype != log_arg.type.dtype:
scalar_inputs) ninp = ninp.astype(node.outputs[0].dtype)
return _fill_chain(T.log1p(ninp), scalar_inputs)
elif log_arg.owner and log_arg.owner.op == T.sub: elif log_arg.owner and log_arg.owner.op == T.sub:
one = T.extract_constant(log_arg.owner.inputs[0], one = T.extract_constant(log_arg.owner.inputs[0],
only_process_constants=True) only_process_constants=True)
if one != 1: if one != 1:
return return
return [T.log1p(T.neg(log_arg.owner.inputs[1]))] other = log_arg.owner.inputs[1]
if other.dtype != log_arg.dtype:
other = other.astype(log_arg.dtype)
return [T.log1p(T.neg(other))]
# TODO: in canonicalize, change log10 and log2 -> log # TODO: in canonicalize, change log10 and log2 -> log
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论