Merge pull request #5049 from abergeron/fix_dlt_f16

Collection of fixes to make the DLT (Deep Learning Tutorials) work in float16

Merge pull request #5049 from abergeron/fix_dlt_f16
37d5f777 · Frédéric Bastien · GitHub · 1dabf854 · 22b5da98 · 37d5f777
--- a/theano/compile/nanguardmode.py
+++ b/theano/compile/nanguardmode.py
@@ -10,6 +10,15 @@ from theano.configparser import config
 import theano.tensor as T
 import theano.sandbox.cuda as cuda
 from theano.compile import Mode
+from .mode import get_mode
+try:
+    from theano.gpuarray.type import GpuArrayType, _name_for_ctx
+    from pygpu.gpuarray import GpuArray
+    pygpu_available = True
+except ImportError:
+    pygpu_available = False
 logger = logging.getLogger("theano.compile.nanguardmode")
@@ -86,6 +95,8 @@ def contains_nan(arr, node=None, var=None):
        else:
            compile_gpu_func(True, False, False)
            return np.isnan(f_gpumin(arr.reshape(arr.size)))
+    elif pygpu_available and isinstance(arr, GpuArray):
+        return np.isnan(f_gpua_min(arr.reshape(arr.size)))
    return np.isnan(np.min(arr))
@@ -136,6 +147,9 @@ def contains_inf(arr, node=None, var=None):
            compile_gpu_func(False, True, False)
            return (np.isinf(f_gpumin(arr.reshape(arr.size))) or
                    np.isinf(f_gpumax(arr.reshape(arr.size))))
+    elif pygpu_available and isinstance(arr, GpuArray):
+        return (np.isinf(f_gpua_min(arr.reshape(arr.size))) or
+                np.isinf(f_gpua_max(arr.reshape(arr.size))))
    return np.isinf(np.nanmax(arr)) or np.isinf(np.nanmin(arr))
@@ -187,6 +201,27 @@ def compile_gpu_func(nan_is_error, inf_is_error, big_is_error):
            cuda_compile_failed = True
+def f_compute(op):
+    def result(inp):
+        dtype = inp.dtype
+        ctx_name = _name_for_ctx(inp.context)
+        key = (dtype, ctx_name)
+        f = result.cache.get(key, None)
+        if f is None:
+            guard_in = GpuArrayType(str(dtype), (False,), context_name=ctx_name)()
+            mode = get_mode('FAST_RUN').including('gpuarray')
+            f = theano.function([guard_in], op(guard_in),
+                                mode=mode, profile=False)
+            result.cache[key] = f
+        return f(inp)
+    result.cache = dict()
+    return result
+f_gpua_min = f_compute(T.min)
+f_gpua_max = f_compute(T.max)
+f_gpua_absmax = f_compute(lambda x: T.max(T.abs_(x)))
 class NanGuardMode(Mode):
    """
    A Theano compilation Mode that makes the compiled function automatically
@@ -220,7 +255,9 @@ class NanGuardMode(Mode):
            big_is_error = config.NanGuardMode.big_is_error
        assert nan_is_error or inf_is_error or big_is_error
-        compile_gpu_func(nan_is_error, inf_is_error, big_is_error)
+        if cuda.cuda_enabled:
+            compile_gpu_func(nan_is_error, inf_is_error, big_is_error)
        def do_check_on(value, nd, var=None):
            """
@@ -260,7 +297,10 @@ class NanGuardMode(Mode):
                elif value.size == 0:
                    err = False
                elif cuda.cuda_available and isinstance(value, cuda.CudaNdarray):
+                    compile_gpu_func(False, False, True)
                    err = (f_gpuabsmax(value.reshape(value.size)) > 1e10)
+                elif pygpu_available and isinstance(value, GpuArray):
+                    err = (f_gpua_absmax(value.reshape(value.size)) > 1e10)
                else:
                    err = (np.abs(value).max() > 1e10)
                if err:

--- a/theano/compile/ops.py
+++ b/theano/compile/ops.py
@@ -445,7 +445,7 @@ def shape_i(var, i, fgraph=None):
        shape_of = shape_feature.shape_of
        def recur(node):
-            if not hasattr(node.outputs[0], 'fgraph'):
+            if not node.outputs[0] in shape_of:
                for inp in node.inputs:
                    if inp.owner:
                        recur(inp.owner)

--- a/theano/configdefaults.py
+++ b/theano/configdefaults.py
@@ -446,7 +446,7 @@ if param and os.name == 'nt':
 def warn_cxx(val):
    """We only support clang++ as otherwise we hit strange g++/OSX bugs."""
-    if sys.platform == 'darwin' and val != 'clang++':
+    if sys.platform == 'darwin' and 'clang++' not in val:
        _logger.warning("Only clang++ is supported. With g++,"
                        " we end up with strange g++/OSX bugs.")
    return True

--- a/theano/gpuarray/__init__.py
+++ b/theano/gpuarray/__init__.py
@@ -66,7 +66,9 @@ def init_dev(dev, name=None):
                         single_stream=config.gpuarray.single_stream,
                         sched=config.gpuarray.sched)
        init_dev.devmap[dev] = ctx
-        if config.gpuarray.preallocate > 0:
+        if config.gpuarray.preallocate < 0:
+            print("Disabling allocation cache on %s" % (dev,))
+        elif config.gpuarray.preallocate > 0:
            MB = (1024 * 1024)
            if config.gpuarray.preallocate <= 1:
                gmem = min(config.gpuarray.preallocate, 0.95) * ctx.total_gmem

--- a/theano/gpuarray/dnn.py
+++ b/theano/gpuarray/dnn.py
@@ -1319,8 +1319,6 @@ class GpuDnnSoftmaxBase(DnnBase):
        DnnBase.__init__(self, [self.file], self.c_func)
        assert(algo in ('fast', 'accurate', 'log'))
-        if algo == 'log' and version(raises=False) < 3000:
-            raise RuntimeError("Need cuDNN v3 for log-softmax")
        self.algo = algo
        assert(mode in ('instance', 'channel'))
@@ -1361,6 +1359,7 @@ class GpuDnnSoftmax(GpuDnnSoftmaxBase):
        or per spatial location '01' per image across 'c'.
    """
+    _f16_ok = True
    direction = "forward"
    file = "dnn_softmax.c"
    c_func = "APPLY_SPECIFIC(softmax)"
@@ -1397,6 +1396,7 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
        image across 'c'.
    """
+    _f16_ok = True
    direction = 'backward'
    file = "dnn_softmax_grad.c"
    c_func = "APPLY_SPECIFIC(softmax_grad)"

--- a/theano/gpuarray/elemwise.py
+++ b/theano/gpuarray/elemwise.py
@@ -33,6 +33,12 @@ def as_C_string_const(s):
                     for l in s.split('\n'))
+def get_scal(dt):
+    if dt == 'float16':
+        dt = 'float32'
+    return scalar.get_scalar_type(dt)
 class GpuElemwise(HideC, Elemwise):
    """
    Elemwise on the GPU.
@@ -60,23 +66,18 @@ class GpuElemwise(HideC, Elemwise):
                   zip(out_info[0], out_info[1])]
        if len(outputs) > 1:
            raise NotImplementedError()
-        node = Apply(self, inputs, outputs)
        # Try to generate the kernel to catch SupportCodeErrors
+        scal_ins = [get_scal(i.dtype) for i in inputs]
+        fake_node = self.scalar_op.make_node(*[i() for i in scal_ins])
        try:
-            scal_ins = [scalar.get_scalar_type(i.dtype) for i in node.inputs]
+            code = fake_node.op.c_support_code_apply(fake_node, "test")
-            scal_out = [scalar.get_scalar_type(o.dtype) for o in node.outputs]
-            fake_node = Apply(self.scalar_op, [i() for i in scal_ins],
-                              [o() for o in scal_out])
-            code = self.scalar_op.c_support_code_apply(fake_node, "test")
            if code:
                raise SupportCodeError(code)
        except MethodNotDefined:
            pass
        try:
-            support_code = self.scalar_op.c_support_code()
+            support_code = fake_node.op.c_support_code()
            if "struct" in support_code:
                # The macro is fine, the C++ struct is not.
                raise SupportCodeError(
@@ -85,6 +86,15 @@ class GpuElemwise(HideC, Elemwise):
        except MethodNotDefined:
            pass
+        if fake_node.op != self.scalar_op:
+            # If the new op is different due to type changes, we make a new
+            # op for it.
+            elem = GpuElemwise(fake_node.op, self.inplace_pattern, self.name,
+                               self.nfunc_spec, self.openmp)
+        else:
+            elem = self
+        node = Apply(elem, inputs, outputs)
        return node
    def get_params(self, node):
@@ -92,59 +102,31 @@ class GpuElemwise(HideC, Elemwise):
    def _get_vnames(self, node):
        inps = ['i%d' % (n,) for n, _ in enumerate(node.inputs)]
-        outs = ['o%d' % (n,) for n, _ in enumerate(node.outputs) if n not in self.inplace_pattern]
+        outs = ['o%d' % (n,) if n not in self.inplace_pattern else
+                inps[self.inplace_pattern[n]]
+                for n, _ in enumerate(node.outputs)]
        return inps, outs
    def _generate_op_string(self, node):
-        scal_v_ins = [scalar.get_scalar_type(i.dtype) for i in node.inputs]
-        scal_v_outs = [scalar.get_scalar_type(o.dtype) for o in node.outputs]
        inps, outs = self._get_vnames(node)
+        scal_v_ins = [get_scal(i.dtype)() for i in node.inputs]
-        fake_node = Apply(self.scalar_op, [i() for i in scal_v_ins],
+        fake_node = self.scalar_op.make_node(*scal_v_ins)
-                          [o() for o in scal_v_outs])
+        scal_v_out = fake_node.outputs
+        assert len(scal_v_out) == len(node.outputs)
-        scal_in = [i if si.dtype != 'float16' else
+        kop = fake_node.op.c_code(fake_node, 'elem_scalar',
-                   'load_half(&' + i + ')' for i, si in zip(inps, scal_v_ins)]
+                                  inps, outs,
+                                  dict(fail='return;'))
-        scal_out = []
+        # Some ops like cast will reintroduce float16 in the internal graph.
-        oi = 0
+        kop = kop.replace('npy_float16', 'ga_float')
-        scal_f16 = []
-        for n in range(len(node.outputs)):
-            if n in self.inplace_pattern:
-                arg = inps[self.inplace_pattern[n]]
-            else:
-                arg = outs[oi]
-                oi += 1
-            if node.outputs[n].dtype == 'float16':
-                scal_f16.append(('tmpf16%i' % (len(scal_f16),), arg))
-                scal_out.append(scal_f16[-1][0])
-            else:
-                scal_out.append(arg)
-        kop = self.scalar_op.c_code(fake_node, 'elem_scalar',
-                                    scal_in, scal_out,
-                                    dict(fail='return;'))
-        if scal_f16:
-            # if we have float16 scalars on output we have to wrap
-            # them and insert a stand-in float32 variable since
-            # float16 arithemtic is not available
-            code = ["{"]
-            for f in scal_f16:
-                code.append('ga_float %s;' % (f[0],))
-            # XXX: The replace is an ugly hack to make sure temp
-            # variables inthe middle are float32
-            code.append(kop.replace('npy_float16', 'ga_float'))
-            for f in scal_f16:
-                code.append('store_half(&%s, %s);' % (f[1], f[0]))
-            code.append('}')
-            kop = '\n'.join(code)
        support_code = ""
        try:
            # We accept only some c_support_code().
            # This filter is done in the make_node()
-            support_code += self.scalar_op.c_support_code()
+            support_code += fake_node.op.c_support_code()
        except MethodNotDefined:
            pass
        for npy, ga in [("npy_uint8", "ga_ubyte"),
@@ -171,7 +153,7 @@ class GpuElemwise(HideC, Elemwise):
    def c_init_code_struct(self, node, name, sub):
        inps, outs = self._get_vnames(node)
-        nargs = len(inps) + len(outs)
+        nargs = len(inps) + len(outs) - len(self.inplace_pattern)
        support_code, kop = self._generate_op_string(node)
        res = """
        gpuelemwise_arg args[%(nargs)s] = {{0}};
@@ -185,24 +167,22 @@ class GpuElemwise(HideC, Elemwise):
            """ % dict(n=n, name='"%s"' % (name,),
                       typecode=i.type.typecode)
-        p = 0
+        p = len(inps)
        for n, o in enumerate(node.outputs):
            if n in self.inplace_pattern:
                assert(len(node.outputs) == 1)
                res += "\nargs[%(n)s].flags |= GE_WRITE;\n" % dict(n=self.inplace_pattern[n])
            else:
-                nn = len(inps) + p
-                name = outs[p]
-                p += 1
                res += """
                args[%(n)s].name = %(name)s;
                args[%(n)s].typecode = %(typecode)s;
                args[%(n)s].flags = GE_WRITE;
-                """ % dict(n=nn, name='"%s"' % (name,),
+                """ % dict(n=p, name='"%s"' % (outs[n],),
                           typecode=o.type.typecode)
+                p += 1
        res += """
-        ge = GpuElemwise_new(%(ctx)s->ctx, %(support)s, %(kop)s, %(nargs)s, args, %(nd)s, 0);
+        ge = GpuElemwise_new(%(ctx)s->ctx, %(support)s, %(kop)s, %(nargs)s, args, %(nd)s, GE_CONVERT_F16);
        if (ge == NULL) {
           PyErr_SetString(PyExc_RuntimeError, "Could not initialize elemwise support");
           %(fail)s
@@ -363,7 +343,7 @@ class GpuElemwise(HideC, Elemwise):
    def c_code_cache_version(self):
        ver = self.scalar_op.c_code_cache_version()
        if ver:
-            return (7, ver)
+            return (8, ver)
        else:
            return ver

--- a/theano/gpuarray/nnet.py
+++ b/theano/gpuarray/nnet.py
 from __future__ import absolute_import, print_function, division
+import os
 import numpy
 from theano import Op, Apply, config
@@ -45,7 +46,10 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
        return node.inputs[0].type.context
    def c_headers(self):
-        return ['<numpy_compat.h>', '<gpuarray/types.h>']
+        return ['<numpy_compat.h>', '<gpuarray/types.h>', 'gpuarray_helper.h']
+    def c_header_dirs(self):
+        return [os.path.dirname(__file__)]
    def gpu_kernels(self, node, nodename):
        dtype_x = node.inputs[0].dtype
@@ -191,9 +195,6 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
    def c_code(self, node, nodename, inp, out, sub):
        if node.inputs[0].type.context.kind != b'cuda':
            raise NotImplementedError('cuda only')
-        typecode_x = pygpu.gpuarray.dtype_to_typecode(node.inputs[0].dtype)
-        typecode_b = pygpu.gpuarray.dtype_to_typecode(node.inputs[1].dtype)
-        typecode_y_idx = pygpu.gpuarray.dtype_to_typecode(node.inputs[2].dtype)
        itemsize_x = numpy.dtype(node.inputs[0].dtype).itemsize
        worksize_x = numpy.dtype(work_dtype(node.inputs[0].dtype)).itemsize
        itemsize_b = numpy.dtype(node.inputs[1].dtype).itemsize
@@ -203,13 +204,6 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
        itemsize_am = numpy.dtype(node.outputs[2].dtype).itemsize
        x, b, y_idx = inp
        nll, sm, am = out
-        dtype_x = node.inputs[0].dtype
-        dtype_b = node.inputs[1].dtype
-        dtype_y_idx = node.inputs[2].dtype
-        dtype_nll = node.outputs[0].dtype
-        dtype_sm = node.outputs[1].dtype
-        dtype_am = node.outputs[2].dtype
-        classname = self.__class__.__name__
        fail = sub['fail']
        ctx = sub['params']
        k_var = "k_xent_sm_1hot_bias_%(nodename)s" % locals()
@@ -229,21 +223,6 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
            """ % locals()
        sio = StringIO()
        print("""
-        if (PyGpuArray_NDIM(%(y_idx)s) != 1)
-        {
-            PyErr_SetString(PyExc_ValueError, "y_idx not 1d tensor");
-            %(fail)s;
-        }
-        if (PyGpuArray_NDIM(%(x)s) != 2)
-        {
-            PyErr_SetString(PyExc_ValueError, "x not 2d tensor");
-            %(fail)s;
-        }
-        if (PyGpuArray_NDIM(%(b)s) != 1)
-        {
-            PyErr_SetString(PyExc_ValueError, "b not 1d tensor");
-            %(fail)s;
-        }
        if (PyGpuArray_DIMS(%(x)s)[0] !=
            PyGpuArray_DIMS(%(y_idx)s)[0])
        {
@@ -257,82 +236,32 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
                            "dimension mismatch in x,b arguments");
            %(fail)s;
        }
-        if ((NULL == %(nll)s) //initial condition
+        if (theano_prep_output(&%(nll)s, 1, PyGpuArray_DIMS(%(y_idx)s), %(x)s->ga.typecode, GA_C_ORDER, %(ctx)s)) %(fail)s
-            || (PyGpuArray_DIMS(%(nll)s)[0] !=
+        if (theano_prep_output(&%(sm)s, 2, PyGpuArray_DIMS(%(x)s), %(x)s->ga.typecode, GA_C_ORDER, %(ctx)s)) %(fail)s
-                PyGpuArray_DIMS(%(y_idx)s)[0]))
+        if (theano_prep_output(&%(am)s, 1, PyGpuArray_DIMS(%(y_idx)s), %(y_idx)s->ga.typecode, GA_C_ORDER, %(ctx)s)) %(fail)s
-        {
-            Py_XDECREF(%(nll)s);
-            %(nll)s = pygpu_empty(1, PyGpuArray_DIMS(%(y_idx)s),
-                                %(typecode_x)s, GA_C_ORDER, %(ctx)s,
-                                Py_None);
-            if (!%(nll)s) {
-                %(fail)s
-            }
-        }
-        if ((NULL == %(sm)s)
-            || (PyGpuArray_DIMS(%(sm)s)[0] !=
-                PyGpuArray_DIMS(%(x)s)[0])
-            || (PyGpuArray_DIMS(%(sm)s)[1] !=
-                PyGpuArray_DIMS(%(x)s)[1]))
-        {
-            Py_XDECREF(%(sm)s);
-            %(sm)s = pygpu_empty(2, PyGpuArray_DIMS(%(x)s),
-                                %(typecode_b)s, GA_C_ORDER,
-                                %(ctx)s, Py_None);
-            if(!%(sm)s)
-            {
-                PyErr_SetString(PyExc_MemoryError,
-                                "failed to alloc sm output");
-                // no need to decref cnda_nll, the cleanup code should do it up
-                %(fail)s;
-            }
-        }
-        if ((NULL == %(am)s)
-            || (PyGpuArray_DIMS(%(am)s)[0] !=
-                PyGpuArray_DIMS(%(y_idx)s)[0]))
-        {
-            Py_XDECREF(%(am)s);
-            %(am)s = pygpu_empty(1, PyGpuArray_DIMS(%(y_idx)s),
-                                %(typecode_y_idx)s, GA_C_ORDER,
-                                %(ctx)s, Py_None);
-            if(!%(am)s)
-            {
-                PyErr_SetString(PyExc_MemoryError,
-                                "failed to alloc am output");
-                // no need to decref nll and sm,
-                // the cleanup code should do it up
-                %(fail)s;
-            }
-        }
        {
            size_t n_blocks = std::min(PyGpuArray_DIM(%(x)s, 0), (size_t)4096);
            size_t n_threads = std::min(PyGpuArray_DIM(%(x)s, 1), (size_t)256);
            size_t n_shared = n_threads * %(worksize_x)s;
-            ssize_t stride_X0 = PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s;
-            ssize_t stride_X1 = PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s;
-            ssize_t stride_B0 = PyGpuArray_STRIDES(%(b)s)[0] / %(itemsize_b)s;
-            ssize_t stride_YIDX0 = PyGpuArray_STRIDES(%(y_idx)s)[0] / %(itemsize_y_idx)s;
-            ssize_t stride_NLL0 = PyGpuArray_STRIDES(%(nll)s)[0] / %(itemsize_nll)s;
-            ssize_t stride_SM0 = PyGpuArray_STRIDES(%(sm)s)[0] / %(itemsize_sm)s;
-            ssize_t stride_SM1 = PyGpuArray_STRIDES(%(sm)s)[1] / %(itemsize_sm)s;
-            ssize_t stride_AM0 = PyGpuArray_STRIDES(%(am)s)[0] / %(itemsize_am)s;
     //TODO: launch more threads per row and do parallel sum and max reductions
-            void *kernel_params[] = {
+            int err = k_xent_sm_1hot_bias_call(
-                (void *)&PyGpuArray_DIMS(%(x)s)[0],
+                1, &n_blocks, &n_threads, n_shared,
-                (void *)&PyGpuArray_DIMS(%(x)s)[1],
+                PyGpuArray_DIMS(%(x)s)[0],
-                (void *)%(x)s->ga.data, (void *)&%(x)s->ga.offset,
+                PyGpuArray_DIMS(%(x)s)[1],
-                (void *)&stride_X0, (void *)&stride_X1,
+                %(x)s->ga.data, %(x)s->ga.offset,
-                (void *)%(b)s->ga.data, (void *)&%(b)s->ga.offset,
+                PyGpuArray_STRIDE(%(x)s, 0) / %(itemsize_x)s,
-                (void *)&stride_B0,
+                PyGpuArray_STRIDE(%(x)s, 1) / %(itemsize_x)s,
-                (void *)%(y_idx)s->ga.data, (void *)&%(y_idx)s->ga.offset,
+                %(b)s->ga.data, %(b)s->ga.offset,
-                (void *)&stride_YIDX0,
+                PyGpuArray_STRIDE(%(b)s, 0) / %(itemsize_b)s,
-                (void *)%(nll)s->ga.data, (void *)&%(nll)s->ga.offset,
+                %(y_idx)s->ga.data, %(y_idx)s->ga.offset,
-                (void *)&stride_NLL0,
+                PyGpuArray_STRIDE(%(y_idx)s, 0) / %(itemsize_y_idx)s,
-                (void *)%(sm)s->ga.data, (void *)&%(sm)s->ga.offset,
+                %(nll)s->ga.data, %(nll)s->ga.offset,
-                (void *)&stride_SM0, (void *)&stride_SM1,
+                PyGpuArray_STRIDE(%(nll)s, 0) / %(itemsize_nll)s,
-                (void *)%(am)s->ga.data, (void *)&%(am)s->ga.offset,
+                %(sm)s->ga.data, %(sm)s->ga.offset,
-                (void *)&stride_AM0};
+                PyGpuArray_STRIDE(%(sm)s, 0) / %(itemsize_sm)s,
-            int err = GpuKernel_call(&%(k_var)s, 1, &n_threads, &n_blocks, n_shared, kernel_params);
+                PyGpuArray_STRIDE(%(sm)s, 1) / %(itemsize_sm)s,
+                %(am)s->ga.data, %(am)s->ga.offset,
+                PyGpuArray_STRIDE(%(am)s, 0) / %(itemsize_am)s);
            %(err_check)s
            %(sync)s
        }
@@ -340,7 +269,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
        return sio.getvalue()
    def c_code_cache_version(self):
-        return (10,)
+        return (12,)
 gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1HotWithBias()

--- a/theano/gpuarray/tests/test_dnn.py
+++ b/theano/gpuarray/tests/test_dnn.py
@@ -797,6 +797,25 @@ class test_SoftMax(test_nnet.test_SoftMax):
    def test_softmax_shape_0(self):
        raise SkipTest("Cudnn doesn't support 0 shapes")
+    def test_softmax_f16(self):
+        x = T.matrix('x', 'float16')
+        x_gpu = T.tensor4('x_gpu', 'float16')
+        f_z = T.nnet.softmax_op
+        f_gpu = dnn.GpuDnnSoftmax(
+            'accurate',
+            'channel'
+        )
+        def cmp(n, m, f, f_gpu):
+            data = numpy.random.random((n, m)).astype('float16')
+            gdata = numpy.asarray(data)[:, :, None, None]
+            out = f(data)
+            gout = numpy.asarray(f_gpu(gdata))[:, :, 0, 0]
+            utt.assert_allclose(out, gout)
+        self._test_softmax(x, x_gpu, f_z, f_gpu, cmp)
    def test_softmax_grad(self):
        def cmp(n, m, f, f_gpu):
            data = numpy.arange(n * m, dtype='float32').reshape(n, m)

--- a/theano/gradient.py
+++ b/theano/gradient.py
@@ -1373,10 +1373,10 @@ class numeric_grad(object):
    # perfectly accurate.
    type_eps = {'float64': 1e-7,
                'float32': 3e-4,
-                'float16': 1e-3,
+                'float16': 1e-1,
                numpy.dtype('float64'): 1e-7,
                numpy.dtype('float32'): 3e-4,
-                numpy.dtype('float16'): 1e-3}
+                numpy.dtype('float16'): 1e-1}
    def __init__(self, f, pt, eps=None, out_type=None):
        """Return the gradient of f at pt.

--- a/theano/scalar/basic.py
+++ b/theano/scalar/basic.py
@@ -39,7 +39,7 @@ builtin_int = int
 builtin_float = float
-class ComplexError(Exception):
+class ComplexError(NotImplementedError):
    """
    Raised if complex numbers are used in an unsupported operation.
@@ -2197,7 +2197,7 @@ class Sgn(UnaryScalarOp):
            return '%(z)s = (%(x)s > 0) ? 1. : ((%(x)s < 0) ? -1. : (isnan(%(x)s) ? NAN : 0.));' % locals()
        if type in int_types:
            return "%(z)s = (%(x)s >= 0) ? (%(x)s == 0) ? 0 : 1 : -1;" % locals()
-        raise TypeError()  # complex has no sgn
+        raise ComplexError('complex has no sgn')
    def c_code_cache_version(self):
        s = super(Sgn, self).c_code_cache_version()
@@ -2300,7 +2300,7 @@ class RoundHalfToEven(UnaryScalarOp):
        (z,) = outputs
        typ = node.outputs[0].type.dtype
        if typ not in ['float32', 'float64']:
-            Exception("The output should be float32 or float64")
+            raise NotImplementedError("The output should be float32 or float64")
        return dedent("""
            #ifndef ROUNDING_EPSILON
@@ -2398,7 +2398,7 @@ class RoundHalfAwayFromZero(UnaryScalarOp):
        if node.outputs[0].type.dtype in ['float32', 'float64']:
            return "%(z)s = round(%(x)s);" % locals()
        else:
-            Exception("The output should be float32 or float64")
+            raise NotImplementedError("The output should be float32 or float64")
 round_half_away_from_zero = RoundHalfAwayFromZero(same_out_float_only)
@@ -3711,8 +3711,7 @@ class Composite(ScalarOp):
        raise NotImplementedError("grad is not implemented for Composite")
    def c_code(self, node, nodename, inames, onames, sub):
-        if not hasattr(self, '_c_code'):
+        self.init_c_code()
-            self.init_c_code()
        d = dict(chain(izip(("i%i" % i for i in xrange(len(inames))), inames),
                       izip(("o%i" % i for i in xrange(len(onames))),
@@ -3746,6 +3745,7 @@ class Composite(ScalarOp):
        return "\n".join(sorted(set(rval)))
    def c_support_code_apply(self, node, name):
+        self.init_c_code()
        rval = []
        for subnode, subnodename in zip(self.fgraph.toposort(), self.nodenames):
            try:
@@ -3771,13 +3771,11 @@ class Composite(ScalarOp):
            return False
        # see __hash__ for comment on why there is no mention of fgraph
        # or module cache key here.
-        if not hasattr(self, '_c_code'):
+        self.init_c_code()    # self._c_code and self.nodenames
-            self.init_c_code()    # self._c_code and self.nodenames
        return (self._c_code == other._c_code)
    def __hash__(self):
-        if not hasattr(self, '_c_code'):
+        self.init_c_code()    # self._c_code and self.nodenames
-            self.init_c_code()    # self._c_code and self.nodenames
        rval = hash((type(self),
                    self.nin,
                    self.nout,

--- a/theano/tensor/basic.py
+++ b/theano/tensor/basic.py
@@ -2774,6 +2774,7 @@ class Alloc(gof.Op):
    are lifted, the first argument to fill can often be pruned from the graph.
    """
+    _f16_ok = True
    __props__ = ()
    def validate_shape(self, shape):

--- a/theano/tensor/nnet/sigm.py
+++ b/theano/tensor/nnet/sigm.py
@@ -352,7 +352,7 @@ class ScalarSoftplus(scalar.UnaryScalarOp):
        # float16 limits: -17.0, 6.0
        # We use the float32 limits for float16 for now as the
-        # computation will happend in float32 anyway.
+        # computation will happen in float32 anyway.
        if (node.inputs[0].type == scalar.float32 or
                node.inputs[0].type == scalar.float16):
            return """%(z)s = %(x)s < -103.0f ? 0.0 : %(x)s > 14.0f ? %(x)s : log1p(exp(%(x)s));""" % locals()

--- a/theano/tensor/opt.py
+++ b/theano/tensor/opt.py
@@ -2247,7 +2247,7 @@ class Assert(T.Op):
    >>> func = theano.function([x], assert_op(x, x.size<2))
    """
+    _f16_ok = True
    __props__ = ('msg',)
    view_map = {0: [0]}
@@ -6063,20 +6063,24 @@ def local_log1p(node):
                log_arg.owner.inputs, only_process_constants=True)
            # scalar_inputs are potentially dimshuffled and fill'd scalars
            if scalars and numpy.allclose(numpy.sum(scalars), 1):
-                if not nonconsts:
+                if nonconsts:
-                    pass  # leave for constant-merge
+                    if len(nonconsts) > 1:
-                if len(nonconsts) == 1:
+                        ninp = T.add(*nonconsts)
-                    return _fill_chain(T.log1p(nonconsts[0]), scalar_inputs)
+                    else:
-                else:
+                        ninp = nonconsts[0]
-                    return _fill_chain(T.log1p(T.add(*nonconsts)),
+                    if ninp.dtype != log_arg.type.dtype:
-                                       scalar_inputs)
+                        ninp = ninp.astype(node.outputs[0].dtype)
+                    return _fill_chain(T.log1p(ninp), scalar_inputs)
        elif log_arg.owner and log_arg.owner.op == T.sub:
            one = T.extract_constant(log_arg.owner.inputs[0],
                                     only_process_constants=True)
            if one != 1:
                return
-            return [T.log1p(T.neg(log_arg.owner.inputs[1]))]
+            other = log_arg.owner.inputs[1]
+            if other.dtype != log_arg.dtype:
+                other = other.astype(log_arg.dtype)
+            return [T.log1p(T.neg(other))]
 # TODO: in canonicalize, change log10 and log2 -> log