提交 37d5f777 作者: Frédéric Bastien 提交者: GitHub

Merge pull request #5049 from abergeron/fix_dlt_f16

Collection of fixes to make the DLT work in float16
......@@ -10,6 +10,15 @@ from theano.configparser import config
import theano.tensor as T
import theano.sandbox.cuda as cuda
from theano.compile import Mode
from .mode import get_mode
try:
from theano.gpuarray.type import GpuArrayType, _name_for_ctx
from pygpu.gpuarray import GpuArray
pygpu_available = True
except ImportError:
pygpu_available = False
logger = logging.getLogger("theano.compile.nanguardmode")
......@@ -86,6 +95,8 @@ def contains_nan(arr, node=None, var=None):
else:
compile_gpu_func(True, False, False)
return np.isnan(f_gpumin(arr.reshape(arr.size)))
elif pygpu_available and isinstance(arr, GpuArray):
return np.isnan(f_gpua_min(arr.reshape(arr.size)))
return np.isnan(np.min(arr))
......@@ -136,6 +147,9 @@ def contains_inf(arr, node=None, var=None):
compile_gpu_func(False, True, False)
return (np.isinf(f_gpumin(arr.reshape(arr.size))) or
np.isinf(f_gpumax(arr.reshape(arr.size))))
elif pygpu_available and isinstance(arr, GpuArray):
return (np.isinf(f_gpua_min(arr.reshape(arr.size))) or
np.isinf(f_gpua_max(arr.reshape(arr.size))))
return np.isinf(np.nanmax(arr)) or np.isinf(np.nanmin(arr))
......@@ -187,6 +201,27 @@ def compile_gpu_func(nan_is_error, inf_is_error, big_is_error):
cuda_compile_failed = True
def f_compute(op):
    """
    Build a callable that applies `op` to a GPU array via a compiled
    Theano function.

    Parameters
    ----------
    op : callable
        Takes a symbolic variable and returns the symbolic expression to
        evaluate (this file uses `T.min`, `T.max` and an abs-max lambda).

    Returns
    -------
    callable
        A function `result(inp)` that returns `f(inp)`, where `f` is the
        Theano function compiled for `inp`. Compiled functions are kept
        in the dict `result.cache`, keyed by `(inp.dtype, context name)`,
        so each combination is compiled only once.

    Notes
    -----
    The symbolic input is declared with the broadcastable pattern
    `(False,)`; the callers visible in this file pass
    `arr.reshape(arr.size)`, i.e. a flattened array.

    """
    def result(inp):
        dtype = inp.dtype
        ctx_name = _name_for_ctx(inp.context)
        key = (dtype, ctx_name)
        f = result.cache.get(key)
        if f is None:
            # First time this (dtype, context) pair is seen: compile a
            # dedicated function and remember it for later calls.
            guard_in = GpuArrayType(str(dtype), (False,),
                                    context_name=ctx_name)()
            mode = get_mode('FAST_RUN').including('gpuarray')
            f = theano.function([guard_in], op(guard_in),
                                mode=mode, profile=False)
            result.cache[key] = f
        return f(inp)

    # The cache lives on the returned function, so every checker built by
    # f_compute owns an independent set of compiled functions.
    result.cache = dict()
    return result
f_gpua_min = f_compute(T.min)
f_gpua_max = f_compute(T.max)
f_gpua_absmax = f_compute(lambda x: T.max(T.abs_(x)))
class NanGuardMode(Mode):
"""
A Theano compilation Mode that makes the compiled function automatically
......@@ -220,7 +255,9 @@ class NanGuardMode(Mode):
big_is_error = config.NanGuardMode.big_is_error
assert nan_is_error or inf_is_error or big_is_error
compile_gpu_func(nan_is_error, inf_is_error, big_is_error)
if cuda.cuda_enabled:
compile_gpu_func(nan_is_error, inf_is_error, big_is_error)
def do_check_on(value, nd, var=None):
"""
......@@ -260,7 +297,10 @@ class NanGuardMode(Mode):
elif value.size == 0:
err = False
elif cuda.cuda_available and isinstance(value, cuda.CudaNdarray):
compile_gpu_func(False, False, True)
err = (f_gpuabsmax(value.reshape(value.size)) > 1e10)
elif pygpu_available and isinstance(value, GpuArray):
err = (f_gpua_absmax(value.reshape(value.size)) > 1e10)
else:
err = (np.abs(value).max() > 1e10)
if err:
......
......@@ -445,7 +445,7 @@ def shape_i(var, i, fgraph=None):
shape_of = shape_feature.shape_of
def recur(node):
if not hasattr(node.outputs[0], 'fgraph'):
if not node.outputs[0] in shape_of:
for inp in node.inputs:
if inp.owner:
recur(inp.owner)
......
......@@ -446,7 +446,7 @@ if param and os.name == 'nt':
def warn_cxx(val):
"""We only support clang++ as otherwise we hit strange g++/OSX bugs."""
if sys.platform == 'darwin' and val != 'clang++':
if sys.platform == 'darwin' and 'clang++' not in val:
_logger.warning("Only clang++ is supported. With g++,"
" we end up with strange g++/OSX bugs.")
return True
......
......@@ -66,7 +66,9 @@ def init_dev(dev, name=None):
single_stream=config.gpuarray.single_stream,
sched=config.gpuarray.sched)
init_dev.devmap[dev] = ctx
if config.gpuarray.preallocate > 0:
if config.gpuarray.preallocate < 0:
print("Disabling allocation cache on %s" % (dev,))
elif config.gpuarray.preallocate > 0:
MB = (1024 * 1024)
if config.gpuarray.preallocate <= 1:
gmem = min(config.gpuarray.preallocate, 0.95) * ctx.total_gmem
......
......@@ -1319,8 +1319,6 @@ class GpuDnnSoftmaxBase(DnnBase):
DnnBase.__init__(self, [self.file], self.c_func)
assert(algo in ('fast', 'accurate', 'log'))
if algo == 'log' and version(raises=False) < 3000:
raise RuntimeError("Need cuDNN v3 for log-softmax")
self.algo = algo
assert(mode in ('instance', 'channel'))
......@@ -1361,6 +1359,7 @@ class GpuDnnSoftmax(GpuDnnSoftmaxBase):
or per spatial location '01' per image across 'c'.
"""
_f16_ok = True
direction = "forward"
file = "dnn_softmax.c"
c_func = "APPLY_SPECIFIC(softmax)"
......@@ -1397,6 +1396,7 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
image across 'c'.
"""
_f16_ok = True
direction = 'backward'
file = "dnn_softmax_grad.c"
c_func = "APPLY_SPECIFIC(softmax_grad)"
......
......@@ -33,6 +33,12 @@ def as_C_string_const(s):
for l in s.split('\n'))
def get_scal(dt):
    """
    Return the scalar type to use for the dtype name `dt`.

    'float16' is replaced by 'float32' before the lookup; every other
    value is passed to `scalar.get_scalar_type` unchanged.

    Parameters
    ----------
    dt : str
        Dtype name, e.g. the `dtype` attribute of a node input.

    Returns
    -------
    The object returned by `scalar.get_scalar_type` for the (possibly
    substituted) dtype.

    """
    if dt == 'float16':
        # NOTE(review): presumably because the scalar computation is
        # carried out in float32 for half-precision data -- confirm.
        dt = 'float32'
    return scalar.get_scalar_type(dt)
class GpuElemwise(HideC, Elemwise):
"""
Elemwise on the GPU.
......@@ -60,23 +66,18 @@ class GpuElemwise(HideC, Elemwise):
zip(out_info[0], out_info[1])]
if len(outputs) > 1:
raise NotImplementedError()
node = Apply(self, inputs, outputs)
# Try to generate the kernel to catch SupportCodeErrors
scal_ins = [get_scal(i.dtype) for i in inputs]
fake_node = self.scalar_op.make_node(*[i() for i in scal_ins])
try:
scal_ins = [scalar.get_scalar_type(i.dtype) for i in node.inputs]
scal_out = [scalar.get_scalar_type(o.dtype) for o in node.outputs]
fake_node = Apply(self.scalar_op, [i() for i in scal_ins],
[o() for o in scal_out])
code = self.scalar_op.c_support_code_apply(fake_node, "test")
code = fake_node.op.c_support_code_apply(fake_node, "test")
if code:
raise SupportCodeError(code)
except MethodNotDefined:
pass
try:
support_code = self.scalar_op.c_support_code()
support_code = fake_node.op.c_support_code()
if "struct" in support_code:
# The macro is fine, the C++ struct is not.
raise SupportCodeError(
......@@ -85,6 +86,15 @@ class GpuElemwise(HideC, Elemwise):
except MethodNotDefined:
pass
if fake_node.op != self.scalar_op:
# If the new op is different due to type changes, we make a new
# op for it.
elem = GpuElemwise(fake_node.op, self.inplace_pattern, self.name,
self.nfunc_spec, self.openmp)
else:
elem = self
node = Apply(elem, inputs, outputs)
return node
def get_params(self, node):
......@@ -92,59 +102,31 @@ class GpuElemwise(HideC, Elemwise):
def _get_vnames(self, node):
inps = ['i%d' % (n,) for n, _ in enumerate(node.inputs)]
outs = ['o%d' % (n,) for n, _ in enumerate(node.outputs) if n not in self.inplace_pattern]
outs = ['o%d' % (n,) if n not in self.inplace_pattern else
inps[self.inplace_pattern[n]]
for n, _ in enumerate(node.outputs)]
return inps, outs
def _generate_op_string(self, node):
scal_v_ins = [scalar.get_scalar_type(i.dtype) for i in node.inputs]
scal_v_outs = [scalar.get_scalar_type(o.dtype) for o in node.outputs]
inps, outs = self._get_vnames(node)
scal_v_ins = [get_scal(i.dtype)() for i in node.inputs]
fake_node = Apply(self.scalar_op, [i() for i in scal_v_ins],
[o() for o in scal_v_outs])
fake_node = self.scalar_op.make_node(*scal_v_ins)
scal_v_out = fake_node.outputs
assert len(scal_v_out) == len(node.outputs)
scal_in = [i if si.dtype != 'float16' else
'load_half(&' + i + ')' for i, si in zip(inps, scal_v_ins)]
kop = fake_node.op.c_code(fake_node, 'elem_scalar',
inps, outs,
dict(fail='return;'))
scal_out = []
oi = 0
scal_f16 = []
for n in range(len(node.outputs)):
if n in self.inplace_pattern:
arg = inps[self.inplace_pattern[n]]
else:
arg = outs[oi]
oi += 1
if node.outputs[n].dtype == 'float16':
scal_f16.append(('tmpf16%i' % (len(scal_f16),), arg))
scal_out.append(scal_f16[-1][0])
else:
scal_out.append(arg)
kop = self.scalar_op.c_code(fake_node, 'elem_scalar',
scal_in, scal_out,
dict(fail='return;'))
if scal_f16:
# if we have float16 scalars on output we have to wrap
# them and insert a stand-in float32 variable since
# float16 arithemtic is not available
code = ["{"]
for f in scal_f16:
code.append('ga_float %s;' % (f[0],))
# XXX: The replace is an ugly hack to make sure temp
# variables inthe middle are float32
code.append(kop.replace('npy_float16', 'ga_float'))
for f in scal_f16:
code.append('store_half(&%s, %s);' % (f[1], f[0]))
code.append('}')
kop = '\n'.join(code)
# Some ops like cast will reintroduce float16 in the internal graph.
kop = kop.replace('npy_float16', 'ga_float')
support_code = ""
try:
# We accept only some c_support_code().
# This filter is done in the make_node()
support_code += self.scalar_op.c_support_code()
support_code += fake_node.op.c_support_code()
except MethodNotDefined:
pass
for npy, ga in [("npy_uint8", "ga_ubyte"),
......@@ -171,7 +153,7 @@ class GpuElemwise(HideC, Elemwise):
def c_init_code_struct(self, node, name, sub):
inps, outs = self._get_vnames(node)
nargs = len(inps) + len(outs)
nargs = len(inps) + len(outs) - len(self.inplace_pattern)
support_code, kop = self._generate_op_string(node)
res = """
gpuelemwise_arg args[%(nargs)s] = {{0}};
......@@ -185,24 +167,22 @@ class GpuElemwise(HideC, Elemwise):
""" % dict(n=n, name='"%s"' % (name,),
typecode=i.type.typecode)
p = 0
p = len(inps)
for n, o in enumerate(node.outputs):
if n in self.inplace_pattern:
assert(len(node.outputs) == 1)
res += "\nargs[%(n)s].flags |= GE_WRITE;\n" % dict(n=self.inplace_pattern[n])
else:
nn = len(inps) + p
name = outs[p]
p += 1
res += """
args[%(n)s].name = %(name)s;
args[%(n)s].typecode = %(typecode)s;
args[%(n)s].flags = GE_WRITE;
""" % dict(n=nn, name='"%s"' % (name,),
""" % dict(n=p, name='"%s"' % (outs[n],),
typecode=o.type.typecode)
p += 1
res += """
ge = GpuElemwise_new(%(ctx)s->ctx, %(support)s, %(kop)s, %(nargs)s, args, %(nd)s, 0);
ge = GpuElemwise_new(%(ctx)s->ctx, %(support)s, %(kop)s, %(nargs)s, args, %(nd)s, GE_CONVERT_F16);
if (ge == NULL) {
PyErr_SetString(PyExc_RuntimeError, "Could not initialize elemwise support");
%(fail)s
......@@ -363,7 +343,7 @@ class GpuElemwise(HideC, Elemwise):
def c_code_cache_version(self):
ver = self.scalar_op.c_code_cache_version()
if ver:
return (7, ver)
return (8, ver)
else:
return ver
......
from __future__ import absolute_import, print_function, division
import os
import numpy
from theano import Op, Apply, config
......@@ -45,7 +46,10 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
return node.inputs[0].type.context
def c_headers(self):
return ['<numpy_compat.h>', '<gpuarray/types.h>']
return ['<numpy_compat.h>', '<gpuarray/types.h>', 'gpuarray_helper.h']
def c_header_dirs(self):
    """
    Return the list of directories to search for the op's C headers.

    The only entry is the directory containing this module.
    NOTE(review): presumably so that 'gpuarray_helper.h', listed by
    `c_headers`, can be found -- confirm that the header lives there.

    """
    return [os.path.dirname(__file__)]
def gpu_kernels(self, node, nodename):
dtype_x = node.inputs[0].dtype
......@@ -191,9 +195,6 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
def c_code(self, node, nodename, inp, out, sub):
if node.inputs[0].type.context.kind != b'cuda':
raise NotImplementedError('cuda only')
typecode_x = pygpu.gpuarray.dtype_to_typecode(node.inputs[0].dtype)
typecode_b = pygpu.gpuarray.dtype_to_typecode(node.inputs[1].dtype)
typecode_y_idx = pygpu.gpuarray.dtype_to_typecode(node.inputs[2].dtype)
itemsize_x = numpy.dtype(node.inputs[0].dtype).itemsize
worksize_x = numpy.dtype(work_dtype(node.inputs[0].dtype)).itemsize
itemsize_b = numpy.dtype(node.inputs[1].dtype).itemsize
......@@ -203,13 +204,6 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
itemsize_am = numpy.dtype(node.outputs[2].dtype).itemsize
x, b, y_idx = inp
nll, sm, am = out
dtype_x = node.inputs[0].dtype
dtype_b = node.inputs[1].dtype
dtype_y_idx = node.inputs[2].dtype
dtype_nll = node.outputs[0].dtype
dtype_sm = node.outputs[1].dtype
dtype_am = node.outputs[2].dtype
classname = self.__class__.__name__
fail = sub['fail']
ctx = sub['params']
k_var = "k_xent_sm_1hot_bias_%(nodename)s" % locals()
......@@ -229,21 +223,6 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
""" % locals()
sio = StringIO()
print("""
if (PyGpuArray_NDIM(%(y_idx)s) != 1)
{
PyErr_SetString(PyExc_ValueError, "y_idx not 1d tensor");
%(fail)s;
}
if (PyGpuArray_NDIM(%(x)s) != 2)
{
PyErr_SetString(PyExc_ValueError, "x not 2d tensor");
%(fail)s;
}
if (PyGpuArray_NDIM(%(b)s) != 1)
{
PyErr_SetString(PyExc_ValueError, "b not 1d tensor");
%(fail)s;
}
if (PyGpuArray_DIMS(%(x)s)[0] !=
PyGpuArray_DIMS(%(y_idx)s)[0])
{
......@@ -257,82 +236,32 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
"dimension mismatch in x,b arguments");
%(fail)s;
}
if ((NULL == %(nll)s) //initial condition
|| (PyGpuArray_DIMS(%(nll)s)[0] !=
PyGpuArray_DIMS(%(y_idx)s)[0]))
{
Py_XDECREF(%(nll)s);
%(nll)s = pygpu_empty(1, PyGpuArray_DIMS(%(y_idx)s),
%(typecode_x)s, GA_C_ORDER, %(ctx)s,
Py_None);
if (!%(nll)s) {
%(fail)s
}
}
if ((NULL == %(sm)s)
|| (PyGpuArray_DIMS(%(sm)s)[0] !=
PyGpuArray_DIMS(%(x)s)[0])
|| (PyGpuArray_DIMS(%(sm)s)[1] !=
PyGpuArray_DIMS(%(x)s)[1]))
{
Py_XDECREF(%(sm)s);
%(sm)s = pygpu_empty(2, PyGpuArray_DIMS(%(x)s),
%(typecode_b)s, GA_C_ORDER,
%(ctx)s, Py_None);
if(!%(sm)s)
{
PyErr_SetString(PyExc_MemoryError,
"failed to alloc sm output");
// no need to decref cnda_nll, the cleanup code should do it up
%(fail)s;
}
}
if ((NULL == %(am)s)
|| (PyGpuArray_DIMS(%(am)s)[0] !=
PyGpuArray_DIMS(%(y_idx)s)[0]))
{
Py_XDECREF(%(am)s);
%(am)s = pygpu_empty(1, PyGpuArray_DIMS(%(y_idx)s),
%(typecode_y_idx)s, GA_C_ORDER,
%(ctx)s, Py_None);
if(!%(am)s)
{
PyErr_SetString(PyExc_MemoryError,
"failed to alloc am output");
// no need to decref nll and sm,
// the cleanup code should do it up
%(fail)s;
}
}
if (theano_prep_output(&%(nll)s, 1, PyGpuArray_DIMS(%(y_idx)s), %(x)s->ga.typecode, GA_C_ORDER, %(ctx)s)) %(fail)s
if (theano_prep_output(&%(sm)s, 2, PyGpuArray_DIMS(%(x)s), %(x)s->ga.typecode, GA_C_ORDER, %(ctx)s)) %(fail)s
if (theano_prep_output(&%(am)s, 1, PyGpuArray_DIMS(%(y_idx)s), %(y_idx)s->ga.typecode, GA_C_ORDER, %(ctx)s)) %(fail)s
{
size_t n_blocks = std::min(PyGpuArray_DIM(%(x)s, 0), (size_t)4096);
size_t n_threads = std::min(PyGpuArray_DIM(%(x)s, 1), (size_t)256);
size_t n_shared = n_threads * %(worksize_x)s;
ssize_t stride_X0 = PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s;
ssize_t stride_X1 = PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s;
ssize_t stride_B0 = PyGpuArray_STRIDES(%(b)s)[0] / %(itemsize_b)s;
ssize_t stride_YIDX0 = PyGpuArray_STRIDES(%(y_idx)s)[0] / %(itemsize_y_idx)s;
ssize_t stride_NLL0 = PyGpuArray_STRIDES(%(nll)s)[0] / %(itemsize_nll)s;
ssize_t stride_SM0 = PyGpuArray_STRIDES(%(sm)s)[0] / %(itemsize_sm)s;
ssize_t stride_SM1 = PyGpuArray_STRIDES(%(sm)s)[1] / %(itemsize_sm)s;
ssize_t stride_AM0 = PyGpuArray_STRIDES(%(am)s)[0] / %(itemsize_am)s;
//TODO: launch more threads per row and do parallel sum and max reductions
void *kernel_params[] = {
(void *)&PyGpuArray_DIMS(%(x)s)[0],
(void *)&PyGpuArray_DIMS(%(x)s)[1],
(void *)%(x)s->ga.data, (void *)&%(x)s->ga.offset,
(void *)&stride_X0, (void *)&stride_X1,
(void *)%(b)s->ga.data, (void *)&%(b)s->ga.offset,
(void *)&stride_B0,
(void *)%(y_idx)s->ga.data, (void *)&%(y_idx)s->ga.offset,
(void *)&stride_YIDX0,
(void *)%(nll)s->ga.data, (void *)&%(nll)s->ga.offset,
(void *)&stride_NLL0,
(void *)%(sm)s->ga.data, (void *)&%(sm)s->ga.offset,
(void *)&stride_SM0, (void *)&stride_SM1,
(void *)%(am)s->ga.data, (void *)&%(am)s->ga.offset,
(void *)&stride_AM0};
int err = GpuKernel_call(&%(k_var)s, 1, &n_threads, &n_blocks, n_shared, kernel_params);
int err = k_xent_sm_1hot_bias_call(
1, &n_blocks, &n_threads, n_shared,
PyGpuArray_DIMS(%(x)s)[0],
PyGpuArray_DIMS(%(x)s)[1],
%(x)s->ga.data, %(x)s->ga.offset,
PyGpuArray_STRIDE(%(x)s, 0) / %(itemsize_x)s,
PyGpuArray_STRIDE(%(x)s, 1) / %(itemsize_x)s,
%(b)s->ga.data, %(b)s->ga.offset,
PyGpuArray_STRIDE(%(b)s, 0) / %(itemsize_b)s,
%(y_idx)s->ga.data, %(y_idx)s->ga.offset,
PyGpuArray_STRIDE(%(y_idx)s, 0) / %(itemsize_y_idx)s,
%(nll)s->ga.data, %(nll)s->ga.offset,
PyGpuArray_STRIDE(%(nll)s, 0) / %(itemsize_nll)s,
%(sm)s->ga.data, %(sm)s->ga.offset,
PyGpuArray_STRIDE(%(sm)s, 0) / %(itemsize_sm)s,
PyGpuArray_STRIDE(%(sm)s, 1) / %(itemsize_sm)s,
%(am)s->ga.data, %(am)s->ga.offset,
PyGpuArray_STRIDE(%(am)s, 0) / %(itemsize_am)s);
%(err_check)s
%(sync)s
}
......@@ -340,7 +269,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
return sio.getvalue()
def c_code_cache_version(self):
return (10,)
return (12,)
gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1HotWithBias()
......
......@@ -797,6 +797,25 @@ class test_SoftMax(test_nnet.test_SoftMax):
def test_softmax_shape_0(self):
raise SkipTest("Cudnn doesn't support 0 shapes")
def test_softmax_f16(self):
    """
    Compare `T.nnet.softmax_op` with `dnn.GpuDnnSoftmax` on float16 data.

    The symbolic inputs, both ops and a comparison helper are handed to
    `self._test_softmax`, which drives the actual checks.

    """
    x = T.matrix('x', 'float16')
    x_gpu = T.tensor4('x_gpu', 'float16')
    f_z = T.nnet.softmax_op
    f_gpu = dnn.GpuDnnSoftmax(
        'accurate',
        'channel'
    )

    def cmp(n, m, f, f_gpu):
        # Random (n, m) float16 input for the reference function.
        data = numpy.random.random((n, m)).astype('float16')
        # The GPU function is fed the same values with two trailing
        # singleton axes appended (4-d input).
        gdata = numpy.asarray(data)[:, :, None, None]

        out = f(data)
        # Drop the two singleton axes again before comparing.
        gout = numpy.asarray(f_gpu(gdata))[:, :, 0, 0]
        utt.assert_allclose(out, gout)

    self._test_softmax(x, x_gpu, f_z, f_gpu, cmp)
def test_softmax_grad(self):
def cmp(n, m, f, f_gpu):
data = numpy.arange(n * m, dtype='float32').reshape(n, m)
......
......@@ -1373,10 +1373,10 @@ class numeric_grad(object):
# perfectly accurate.
type_eps = {'float64': 1e-7,
'float32': 3e-4,
'float16': 1e-3,
'float16': 1e-1,
numpy.dtype('float64'): 1e-7,
numpy.dtype('float32'): 3e-4,
numpy.dtype('float16'): 1e-3}
numpy.dtype('float16'): 1e-1}
def __init__(self, f, pt, eps=None, out_type=None):
"""Return the gradient of f at pt.
......
......@@ -39,7 +39,7 @@ builtin_int = int
builtin_float = float
class ComplexError(Exception):
class ComplexError(NotImplementedError):
"""
Raised if complex numbers are used in an unsupported operation.
......@@ -2197,7 +2197,7 @@ class Sgn(UnaryScalarOp):
return '%(z)s = (%(x)s > 0) ? 1. : ((%(x)s < 0) ? -1. : (isnan(%(x)s) ? NAN : 0.));' % locals()
if type in int_types:
return "%(z)s = (%(x)s >= 0) ? (%(x)s == 0) ? 0 : 1 : -1;" % locals()
raise TypeError() # complex has no sgn
raise ComplexError('complex has no sgn')
def c_code_cache_version(self):
s = super(Sgn, self).c_code_cache_version()
......@@ -2300,7 +2300,7 @@ class RoundHalfToEven(UnaryScalarOp):
(z,) = outputs
typ = node.outputs[0].type.dtype
if typ not in ['float32', 'float64']:
Exception("The output should be float32 or float64")
raise NotImplementedError("The output should be float32 or float64")
return dedent("""
#ifndef ROUNDING_EPSILON
......@@ -2398,7 +2398,7 @@ class RoundHalfAwayFromZero(UnaryScalarOp):
if node.outputs[0].type.dtype in ['float32', 'float64']:
return "%(z)s = round(%(x)s);" % locals()
else:
Exception("The output should be float32 or float64")
raise NotImplementedError("The output should be float32 or float64")
round_half_away_from_zero = RoundHalfAwayFromZero(same_out_float_only)
......@@ -3711,8 +3711,7 @@ class Composite(ScalarOp):
raise NotImplementedError("grad is not implemented for Composite")
def c_code(self, node, nodename, inames, onames, sub):
if not hasattr(self, '_c_code'):
self.init_c_code()
self.init_c_code()
d = dict(chain(izip(("i%i" % i for i in xrange(len(inames))), inames),
izip(("o%i" % i for i in xrange(len(onames))),
......@@ -3746,6 +3745,7 @@ class Composite(ScalarOp):
return "\n".join(sorted(set(rval)))
def c_support_code_apply(self, node, name):
self.init_c_code()
rval = []
for subnode, subnodename in zip(self.fgraph.toposort(), self.nodenames):
try:
......@@ -3771,13 +3771,11 @@ class Composite(ScalarOp):
return False
# see __hash__ for comment on why there is no mention of fgraph
# or module cache key here.
if not hasattr(self, '_c_code'):
self.init_c_code() # self._c_code and self.nodenames
self.init_c_code() # self._c_code and self.nodenames
return (self._c_code == other._c_code)
def __hash__(self):
if not hasattr(self, '_c_code'):
self.init_c_code() # self._c_code and self.nodenames
self.init_c_code() # self._c_code and self.nodenames
rval = hash((type(self),
self.nin,
self.nout,
......
......@@ -2774,6 +2774,7 @@ class Alloc(gof.Op):
are lifted, the first argument to fill can often be pruned from the graph.
"""
_f16_ok = True
__props__ = ()
def validate_shape(self, shape):
......
......@@ -352,7 +352,7 @@ class ScalarSoftplus(scalar.UnaryScalarOp):
# float16 limits: -17.0, 6.0
# We use the float32 limits for float16 for now as the
# computation will happend in float32 anyway.
# computation will happen in float32 anyway.
if (node.inputs[0].type == scalar.float32 or
node.inputs[0].type == scalar.float16):
return """%(z)s = %(x)s < -103.0f ? 0.0 : %(x)s > 14.0f ? %(x)s : log1p(exp(%(x)s));""" % locals()
......
......@@ -2247,7 +2247,7 @@ class Assert(T.Op):
>>> func = theano.function([x], assert_op(x, x.size<2))
"""
_f16_ok = True
__props__ = ('msg',)
view_map = {0: [0]}
......@@ -6063,20 +6063,24 @@ def local_log1p(node):
log_arg.owner.inputs, only_process_constants=True)
# scalar_inputs are potentially dimshuffled and fill'd scalars
if scalars and numpy.allclose(numpy.sum(scalars), 1):
if not nonconsts:
pass # leave for constant-merge
if len(nonconsts) == 1:
return _fill_chain(T.log1p(nonconsts[0]), scalar_inputs)
else:
return _fill_chain(T.log1p(T.add(*nonconsts)),
scalar_inputs)
if nonconsts:
if len(nonconsts) > 1:
ninp = T.add(*nonconsts)
else:
ninp = nonconsts[0]
if ninp.dtype != log_arg.type.dtype:
ninp = ninp.astype(node.outputs[0].dtype)
return _fill_chain(T.log1p(ninp), scalar_inputs)
elif log_arg.owner and log_arg.owner.op == T.sub:
one = T.extract_constant(log_arg.owner.inputs[0],
only_process_constants=True)
if one != 1:
return
return [T.log1p(T.neg(log_arg.owner.inputs[1]))]
other = log_arg.owner.inputs[1]
if other.dtype != log_arg.dtype:
other = other.astype(log_arg.dtype)
return [T.log1p(T.neg(other))]
# TODO: in canonicalize, change log10 and log2 -> log
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论