提交 1547ecc6 authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #2800 from abergeron/fp16_merge

Fp16 merge
...@@ -212,6 +212,21 @@ There are less methods to define for an Op than for a Type: ...@@ -212,6 +212,21 @@ There are less methods to define for an Op than for a Type:
Op *must* have a `context_type` property with the Type to use Op *must* have a `context_type` property with the Type to use
for the context variable. for the context variable.
.. attribute:: _f16_ok
(optional) If this attribute is absent or evaluates to `False`,
C code will be disabled for the op if any of its inputs or
outputs contains float16 data. This is added as a check to make
sure we don't compute wrong results since there is no hardware
float16 type so special care must be taken to make sure
operations are done correctly.
If you don't intend to deal with float16 data you can leave
this undefined.
This attribute is internal and may go away at any point during
developpment if a better solution is found.
The ``name`` argument is currently given an invalid value, so steer The ``name`` argument is currently given an invalid value, so steer
away from it. As was the case with Type, ``sub['fail']`` provides away from it. As was the case with Type, ``sub['fail']`` provides
failure code that you *must* use if you want to raise an exception, failure code that you *must* use if you want to raise an exception,
......
...@@ -215,6 +215,8 @@ class Shape(gof.Op): ...@@ -215,6 +215,8 @@ class Shape(gof.Op):
@note: Non-differentiable. @note: Non-differentiable.
""" """
_f16_ok = True
# Mapping from Type to C code (and version) to use. # Mapping from Type to C code (and version) to use.
# In the C code, the name of the input variable is %(iname)s, # In the C code, the name of the input variable is %(iname)s,
# the output variable is %(oname)s. # the output variable is %(oname)s.
...@@ -308,6 +310,8 @@ class Shape_i(gof.Op): ...@@ -308,6 +310,8 @@ class Shape_i(gof.Op):
@note: Non-differentiable. @note: Non-differentiable.
""" """
_f16_ok = True
# Mapping from Type to C code (and version) to use. # Mapping from Type to C code (and version) to use.
# In the C code, the name of the input variable is %(iname)s, # In the C code, the name of the input variable is %(iname)s,
# the output variable is %(oname)s. # the output variable is %(oname)s.
......
...@@ -18,12 +18,17 @@ def floatX_convert(s): ...@@ -18,12 +18,17 @@ def floatX_convert(s):
return "float32" return "float32"
elif s == "64": elif s == "64":
return "float64" return "float64"
elif s == "16":
return "float16"
else: else:
return s return s
AddConfigVar('floatX', AddConfigVar('floatX',
"Default floating-point precision for python casts", "Default floating-point precision for python casts.\n"
EnumStr('float64', 'float32', convert=floatX_convert,), "\n"
"Note: float16 support is experimental, use at your own risk.",
EnumStr('float64', 'float32', 'float16',
convert=floatX_convert,),
) )
AddConfigVar('warn_float64', AddConfigVar('warn_float64',
...@@ -39,7 +44,7 @@ AddConfigVar('cast_policy', ...@@ -39,7 +44,7 @@ AddConfigVar('cast_policy',
EnumStr('custom', 'numpy+floatX', EnumStr('custom', 'numpy+floatX',
# The 'numpy' policy was originally planned to provide a # The 'numpy' policy was originally planned to provide a
# smooth transition from numpy. It was meant to behave the # smooth transition from numpy. It was meant to behave the
# same asnumpy+floatX, but keeping float64 when numpy # same as numpy+floatX, but keeping float64 when numpy
# would. However the current implementation of some cast # would. However the current implementation of some cast
# mechanisms makes it a bit more complex to add than what # mechanisms makes it a bit more complex to add than what
# was expected, so it is currently not available. # was expected, so it is currently not available.
......
...@@ -726,9 +726,20 @@ class Op(utils.object2, PureOp, CLinkerOp): ...@@ -726,9 +726,20 @@ class Op(utils.object2, PureOp, CLinkerOp):
node_output_storage = [storage_map[r] for r in node.outputs] node_output_storage = [storage_map[r] for r in node.outputs]
node_input_compute = [compute_map[r] for r in node.inputs] node_input_compute = [compute_map[r] for r in node.inputs]
node_output_compute = [compute_map[r] for r in node.outputs] node_output_compute = [compute_map[r] for r in node.outputs]
#logger.debug('Compiling node %i of graph' % node_idx)
if self._op_use_c_code: if self._op_use_c_code:
try: try:
# float16 get special treatment since running
# unprepared C code will get bad results.
if not getattr(self, '_f16_ok', False):
def is_f16(t):
return getattr(t, 'dtype', '') == 'float16'
if (any(is_f16(i.type) for i in node.inputs) or
any(is_f16(o.type) for o in node.outputs)):
print ("Disabling C code for %s due to unsupported "
"float16" % (self,))
raise NotImplementedError("float16")
e = FunctionGraph(node.inputs, node.outputs) e = FunctionGraph(node.inputs, node.outputs)
e_no_recycling = [new_o e_no_recycling = [new_o
......
...@@ -20,6 +20,7 @@ except ImportError: ...@@ -20,6 +20,7 @@ except ImportError:
pass pass
from .type import GpuArrayType from .type import GpuArrayType
from .fp16_help import write_w
def as_gpuarray_variable(x): def as_gpuarray_variable(x):
...@@ -186,11 +187,8 @@ class GpuKernelBase(object): ...@@ -186,11 +187,8 @@ class GpuKernelBase(object):
class HostFromGpu(Op): class HostFromGpu(Op):
def __eq__(self, other): __props__ = ()
return type(self) == type(other) _f16_ok = True
def __hash__(self):
return hash(type(self))
def __str__(self): def __str__(self):
return 'HostFromGpu(gpuarray)' return 'HostFromGpu(gpuarray)'
...@@ -269,11 +267,8 @@ host_from_gpu = HostFromGpu() ...@@ -269,11 +267,8 @@ host_from_gpu = HostFromGpu()
class GpuFromHost(Op): class GpuFromHost(Op):
def __eq__(self, other): __props__ = ()
return type(self) == type(other) _f16_ok = True
def __hash__(self):
return hash(type(self))
def __str__(self): def __str__(self):
return 'GpuFromHost(gpuarray)' return 'GpuFromHost(gpuarray)'
...@@ -573,18 +568,15 @@ cuda_from_gpu = CudaFromGpu() ...@@ -573,18 +568,15 @@ cuda_from_gpu = CudaFromGpu()
class GpuAlloc(HideC, Alloc): class GpuAlloc(HideC, Alloc):
__props__ = ('memset_0',)
_f16_ok = True
def __init__(self, memset_0=False): def __init__(self, memset_0=False):
"""memset_0 is only an optimized version. True, it mean the """memset_0 is only an optimized version. True, it mean the
value is always 0, so the c code call memset as it is faster. value is always 0, so the c code call memset as it is faster.
""" """
self.memset_0 = memset_0 self.memset_0 = memset_0
def __eq__(self, other):
return type(self) == type(other) and self.memset_0 == other.memset_0
def __hash__(self):
return hash(type(self)) ^ hash(self.memset_0)
def __str__(self): def __str__(self):
# Hide the memset parameter when not used to prevent confusion. # Hide the memset parameter when not used to prevent confusion.
if self.memset_0: if self.memset_0:
...@@ -728,25 +720,17 @@ class GpuContiguous(Op): ...@@ -728,25 +720,17 @@ class GpuContiguous(Op):
Always return a c contiguous output. Copy the input only if it is Always return a c contiguous output. Copy the input only if it is
not already c contiguous. not already c contiguous.
""" """
__props__ = ()
view_map = {0: [0]} view_map = {0: [0]}
_f16_ok = True
def __eq__(self, other):
return type(self) == type(other)
def __hash__(self):
return hash(type(self))
def grad(self, inputs, dout): def grad(self, inputs, dout):
x, = inputs x, = inputs
dout, = dout dout, = dout
dout = as_gpuarray_variable(dout) dout = as_gpuarray_variable(dout)
return [dout] return [dout]
def __str__(self):
return self.__class__.__name__
def make_node(self, input): def make_node(self, input):
input = as_gpuarray_variable(input) input = as_gpuarray_variable(input)
return Apply(self, [input], [input.type()]) return Apply(self, [input], [input.type()])
...@@ -794,6 +778,8 @@ class GpuReshape(HideC, tensor.Reshape): ...@@ -794,6 +778,8 @@ class GpuReshape(HideC, tensor.Reshape):
""" """
Implement Reshape on the gpu. Implement Reshape on the gpu.
""" """
_f16_ok = True
# __hash__, __eq__, __str__ come from tensor.Reshape # __hash__, __eq__, __str__ come from tensor.Reshape
def make_node(self, x, shp): def make_node(self, x, shp):
x = as_gpuarray_variable(x) x = as_gpuarray_variable(x)
...@@ -831,6 +817,8 @@ class GpuReshape(HideC, tensor.Reshape): ...@@ -831,6 +817,8 @@ class GpuReshape(HideC, tensor.Reshape):
class GpuJoin(HideC, Join): class GpuJoin(HideC, Join):
_f16_ok = True
def make_node(self, axis, *tensors): def make_node(self, axis, *tensors):
node = Join.make_node(self, axis, *tensors) node = Join.make_node(self, axis, *tensors)
...@@ -888,6 +876,9 @@ class GpuSplit(HideC, Split): ...@@ -888,6 +876,9 @@ class GpuSplit(HideC, Split):
class GpuEye(GpuKernelBase, Op): class GpuEye(GpuKernelBase, Op):
__props__ = ('dtype',)
_f16_ok = True
def __init__(self, dtype=None): def __init__(self, dtype=None):
if dtype is None: if dtype is None:
dtype = config.floatX dtype = config.floatX
...@@ -915,20 +906,15 @@ class GpuEye(GpuKernelBase, Op): ...@@ -915,20 +906,15 @@ class GpuEye(GpuKernelBase, Op):
return [grad_undefined(self, i, inp[i]) return [grad_undefined(self, i, inp[i])
for i in xrange(3)] for i in xrange(3)]
def __eq__(self, other):
return type(self) == type(other) and self.dtype == other.dtype
def __hash__(self):
return hash(self.dtype) ^ hash(type(self))
def gpu_kernels(self, node, name): def gpu_kernels(self, node, name):
code = """ code = """
KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) { KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) {
ga_size nb = n < m ? n : m; ga_size nb = n < m ? n : m;
for (ga_size i = LID_0; i < nb; i += LDIM_0) { for (ga_size i = LID_0; i < nb; i += LDIM_0) {
a[i*m + i] = 1; a[i*m + i] = %(write_a)s(1);
} }
}""" % dict(ctype=pygpu.gpuarray.dtype_to_ctype(self.dtype), name=name) }""" % dict(ctype=pygpu.gpuarray.dtype_to_ctype(self.dtype),
name=name, write_a=write_w(self.dtype))
return [Kernel( return [Kernel(
code=code, name="k", code=code, name="k",
params=[gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SIZE], params=[gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SIZE],
......
def work_dtype(dtype):
if dtype == 'float16':
return 'float32'
else:
return dtype
def load_w(dtype):
if dtype == 'float16':
return '__half2float'
else:
return ''
def write_w(dtype):
if dtype == 'float16':
return '__float2half_rn'
else:
return ''
...@@ -16,6 +16,7 @@ from .type import GpuArrayType ...@@ -16,6 +16,7 @@ from .type import GpuArrayType
from .kernel_codegen import (nvcc_kernel, from .kernel_codegen import (nvcc_kernel,
inline_softmax, inline_softmax,
inline_softmax_fixed_shared) inline_softmax_fixed_shared)
from .fp16_help import work_dtype, load_w, write_w
class GpuCrossentropySoftmaxArgmax1HotWithBias(Op): class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
...@@ -24,15 +25,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op): ...@@ -24,15 +25,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
""" """
nin = 3 nin = 3
nout = 3 nout = 3
__props__ = ()
def __eq__(self, other): _f16_ok = True
return type(self) == type(other)
def __hash__(self):
return hash(type(self))
def __str__(self):
return self.__class__.__name__
def make_node(self, x, b, y_idx): def make_node(self, x, b, y_idx):
# N.B. won't work when we don't cast y_idx to float anymore # N.B. won't work when we don't cast y_idx to float anymore
...@@ -52,6 +46,12 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op): ...@@ -52,6 +46,12 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
dtype_x = node.inputs[0].dtype dtype_x = node.inputs[0].dtype
dtype_b = node.inputs[1].dtype dtype_b = node.inputs[1].dtype
dtype_y_idx = node.inputs[2].dtype dtype_y_idx = node.inputs[2].dtype
work_x = work_dtype(dtype_x)
work_b = work_dtype(dtype_b)
load_x = load_w(dtype_x)
load_b = load_w(dtype_b)
write_x = write_w(dtype_x)
write_b = write_w(dtype_b)
return """ return """
__global__ void k_xent_sm_1hot_bias_%(nodename)s(int M, int N, __global__ void k_xent_sm_1hot_bias_%(nodename)s(int M, int N,
const npy_%(dtype_x)s* x_data, int xs0, int xs1, const npy_%(dtype_x)s* x_data, int xs0, int xs1,
...@@ -67,12 +67,13 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op): ...@@ -67,12 +67,13 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
const npy_%(dtype_y_idx)s y_idx = y_idx_data[row * y_idxs0]; const npy_%(dtype_y_idx)s y_idx = y_idx_data[row * y_idxs0];
npy_%(dtype_x)s* sm = sm_data + sms0 * row; npy_%(dtype_x)s* sm = sm_data + sms0 * row;
npy_%(dtype_x)s sum = 0.0; npy_%(work_x)s sum = 0.0;
int row_max_j = 0; int row_max_j = 0;
npy_%(dtype_x)s row_max = x[0] + b[0]; npy_%(work_x)s row_max = %(load_x)s(x[0]) + %(load_b)s(b[0]);
for (int j = 1; j < N; ++j) for (int j = 1; j < N; ++j)
{ {
npy_%(dtype_x)s row_ij = x[j*xs1] + b[j*bs0]; npy_%(work_x)s row_ij = %(load_x)s(x[j*xs1]) +
%(load_b)s(b[j*bs0]);
//todo: store to shared memory //todo: store to shared memory
row_max_j = (row_ij > row_max) ? j : row_max_j; row_max_j = (row_ij > row_max) ? j : row_max_j;
row_max = (row_ij > row_max) ? row_ij : row_max; row_max = (row_ij > row_max) ? row_ij : row_max;
...@@ -80,27 +81,30 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op): ...@@ -80,27 +81,30 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
//compute the exp //compute the exp
for (int j = 0; j < N; ++j) for (int j = 0; j < N; ++j)
{ {
npy_%(dtype_x)s row_ij = x[j*xs1] + b[j*bs0]; npy_%(work_x)s row_ij = %(load_x)s(x[j*xs1]) +
npy_%(dtype_x)s sm_ij = exp(row_ij - row_max); %(load_b)s(b[j*bs0]);
npy_%(work_x)s sm_ij = exp(row_ij - row_max);
sum += sm_ij; sum += sm_ij;
sm[j * sms1] = sm_ij; sm[j * sms1] = %(write_x)s(sm_ij);
} }
npy_%(dtype_x)s sum_inv = 1.0 / sum; npy_%(work_x)s sum_inv = 1.0 / sum;
for (int j = 0; j < N; ++j) for (int j = 0; j < N; ++j)
{ {
sm[j * sms1] *= sum_inv; npy_%(work_x)s __tmp = %(load_x)s(sm[j * sms1]);
__tmp *= sum_inv;
sm[j * sms1] = %(write_x)s(__tmp);
} }
if ((y_idx >= N) || (y_idx < 0)) if ((y_idx >= N) || (y_idx < 0))
{ {
//TODO: set raise an error bit in a global var? //TODO: set raise an error bit in a global var?
nll_data[row*nlls0] = 0.0; // raise some suspicion at least... nll_data[row*nlls0] = %(write_x)s(0.0); // raise some suspicion at least...
} }
else else
{ {
nll_data[row*nlls0] = - x[y_idx*xs1] nll_data[row*nlls0] = %(write_x)s(- %(load_x)s(x[y_idx*xs1])
- b[y_idx*bs0] - %(load_b)s(b[y_idx*bs0])
+ row_max + row_max
+ log(sum); + log(sum));
} }
am_data[row*ams0] = row_max_j; am_data[row*ams0] = row_max_j;
} }
...@@ -259,8 +263,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op): ...@@ -259,8 +263,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
return sio.getvalue() return sio.getvalue()
def c_code_cache_version(self): def c_code_cache_version(self):
# return () return (6,)
return (5,)
def c_compiler(self): def c_compiler(self):
return NVCC_compiler return NVCC_compiler
...@@ -272,21 +275,13 @@ gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1Ho ...@@ -272,21 +275,13 @@ gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1Ho
class GpuCrossentropySoftmax1HotWithBiasDx(Op): class GpuCrossentropySoftmax1HotWithBiasDx(Op):
""" """
Implement CrossentropySoftmax1HotWithBiasDx on the gpu. Implement CrossentropySoftmax1HotWithBiasDx on the gpu.
Gradient wrt x of the CrossentropySoftmax1Hot Op
""" """
nin = 3 nin = 3
nout = 1 nout = 1
"""Gradient wrt x of the CrossentropySoftmax1Hot Op""" __props__ = ()
def __init__(self, **kwargs): _f16_ok = True
Op.__init__(self, **kwargs)
def __eq__(self, other):
return type(self) == type(other)
def __hash__(self):
return hash(type(self))
def __str__(self):
return self.__class__.__name__
def make_node(self, dnll, sm, y_idx): def make_node(self, dnll, sm, y_idx):
dnll = as_gpuarray_variable(dnll) dnll = as_gpuarray_variable(dnll)
...@@ -295,8 +290,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op): ...@@ -295,8 +290,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
return Apply(self, [dnll, sm, y_idx], [sm.type()]) return Apply(self, [dnll, sm, y_idx], [sm.type()])
def c_code_cache_version(self): def c_code_cache_version(self):
# return () return (9,)
return (8,)
def c_headers(self): def c_headers(self):
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>'] return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>']
...@@ -421,6 +415,10 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op): ...@@ -421,6 +415,10 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
dtype_sm = node.inputs[1].dtype dtype_sm = node.inputs[1].dtype
dtype_y_idx = node.inputs[2].dtype dtype_y_idx = node.inputs[2].dtype
dtype_dx = node.outputs[0].dtype dtype_dx = node.outputs[0].dtype
work_dnll = work_dtype(dtype_dnll)
load_dnll = load_w(dtype_dnll)
load_sm = load_w(dtype_sm)
write_dx = write_w(dtype_dx)
return """ return """
__global__ void kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s( __global__ void kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s(
int N, int K, int N, int K,
...@@ -431,7 +429,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op): ...@@ -431,7 +429,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
{ {
for (int i = blockIdx.x; i < N; i += gridDim.x) for (int i = blockIdx.x; i < N; i += gridDim.x)
{ {
npy_%(dtype_dnll)s dnll_i = dnll[i * dnll_s0]; npy_%(work_dnll)s dnll_i = %(load_dnll)s(dnll[i * dnll_s0]);
npy_%(dtype_y_idx)s y_i = y_idx[i * y_idx_s0]; npy_%(dtype_y_idx)s y_i = y_idx[i * y_idx_s0];
for (int j = threadIdx.x; j < K; j += blockDim.x) for (int j = threadIdx.x; j < K; j += blockDim.x)
...@@ -439,16 +437,15 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op): ...@@ -439,16 +437,15 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
if (y_i == j) if (y_i == j)
{ {
dx[i * dx_s0 + j * dx_s1] = dx[i * dx_s0 + j * dx_s1] =
dnll_i * (sm[i * sm_s0 + j * sm_s1]-1.0); %(write_dx)s(dnll_i *
(%(load_sm)s(sm[i * sm_s0 + j * sm_s1]) - 1.0));
} }
else else
{ {
dx[i * dx_s0 + j * dx_s1] = dx[i * dx_s0 + j * dx_s1] =
dnll_i * sm[i * sm_s0 + j * sm_s1]; %(write_dx)s(dnll_i *
%(load_sm)s(sm[i * sm_s0 + j * sm_s1]));
} }
//dx[i * dx_s0 + j * dx_s1] =
// dnll_i * sm[i * sm_s0 + j * sm_s1];
//dx[i*dx_s0+j*dx_s1] = 0;
} }
} }
} }
...@@ -466,14 +463,7 @@ class GpuSoftmax (Op): ...@@ -466,14 +463,7 @@ class GpuSoftmax (Op):
""" """
Implement Softmax on the gpu. Implement Softmax on the gpu.
""" """
def __eq__(self, other): __props__ = ()
return type(self) == type(other)
def __hash__(self):
return hash(type(self))
def __str__(self):
return self.__class__.__name__
def make_node(self, x): def make_node(self, x):
x = as_gpuarray_variable(x) x = as_gpuarray_variable(x)
...@@ -658,15 +648,7 @@ class GpuSoftmaxWithBias (Op): ...@@ -658,15 +648,7 @@ class GpuSoftmaxWithBias (Op):
""" """
nin = 2 nin = 2
nout = 1 nout = 1
__props__ = ()
def __eq__(self, other):
return type(self) == type(other)
def __hash__(self):
return hash(type(self))
def __str__(self):
return self.__class__.__name__
def make_node(self, x, b): def make_node(self, x, b):
x = as_gpuarray_variable(x) x = as_gpuarray_variable(x)
......
...@@ -10,9 +10,7 @@ except ImportError: ...@@ -10,9 +10,7 @@ except ImportError:
from theano import tensor, scalar, gof from theano import tensor, scalar, gof
from theano.compile import optdb from theano.compile import optdb
from theano.gof import (local_optimizer, EquilibriumDB, from theano.gof import (local_optimizer, EquilibriumDB,
SequenceDB, ProxyDB, SequenceDB, Optimizer, toolbox)
Optimizer, toolbox,
InconsistencyError, EquilibriumOptimizer)
from theano.scan_module import scan_utils, scan_op, scan_opt from theano.scan_module import scan_utils, scan_op, scan_opt
...@@ -28,8 +26,7 @@ from .conv import GpuConv ...@@ -28,8 +26,7 @@ from .conv import GpuConv
from .nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias, from .nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias,
GpuCrossentropySoftmax1HotWithBiasDx, GpuCrossentropySoftmax1HotWithBiasDx,
GpuSoftmaxWithBias, GpuSoftmax) GpuSoftmaxWithBias, GpuSoftmax)
from .elemwise import (GpuElemwise, _is_scalar, from .elemwise import (GpuElemwise, GpuDimShuffle, GpuCAReduceCuda,
GpuDimShuffle, GpuCAReduceCuda,
GpuCAReduceCPY) GpuCAReduceCPY)
from .subtensor import (GpuIncSubtensor, GpuSubtensor, from .subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1,
...@@ -134,7 +131,7 @@ class InputToGpuOptimizer(Optimizer): ...@@ -134,7 +131,7 @@ class InputToGpuOptimizer(Optimizer):
new_input = host_from_gpu(gpu_from_host(input)) new_input = host_from_gpu(gpu_from_host(input))
fgraph.replace_validate(input, new_input, fgraph.replace_validate(input, new_input,
"InputToGpuOptimizer") "InputToGpuOptimizer")
except TypeError as e: except TypeError:
# This could fail if the inputs are not TensorTypes # This could fail if the inputs are not TensorTypes
pass pass
...@@ -253,10 +250,11 @@ def local_gpuflatten(node): ...@@ -253,10 +250,11 @@ def local_gpuflatten(node):
@op_lifter([tensor.Elemwise]) @op_lifter([tensor.Elemwise])
def local_gpu_elemwise(node): def local_gpu_elemwise(node):
op = node.op op = node.op
scal_op = op.scalar_op
name = op.name name = op.name
if name: if name:
name = 'Gpu'+name name = 'Gpu'+name
res = GpuElemwise(op.scalar_op, name=name, res = GpuElemwise(scal_op, name=name,
inplace_pattern=copy.copy(op.inplace_pattern), inplace_pattern=copy.copy(op.inplace_pattern),
nfunc_spec=op.nfunc_spec) nfunc_spec=op.nfunc_spec)
return res return res
...@@ -391,7 +389,6 @@ def local_gpua_advanced_incsubtensor(node): ...@@ -391,7 +389,6 @@ def local_gpua_advanced_incsubtensor(node):
return None return None
x, y = node.inputs[0:2] x, y = node.inputs[0:2]
coords = node.inputs[2:]
set_instead_of_inc = node.op.set_instead_of_inc set_instead_of_inc = node.op.set_instead_of_inc
active_device_no = theano.sandbox.cuda.active_device_number() active_device_no = theano.sandbox.cuda.active_device_number()
device_properties = theano.sandbox.cuda.device_properties device_properties = theano.sandbox.cuda.device_properties
...@@ -640,8 +637,7 @@ def local_gpu_elemwise_careduce(node): ...@@ -640,8 +637,7 @@ def local_gpu_elemwise_careduce(node):
# automatically add more case, as some like trigonometic # automatically add more case, as some like trigonometic
# operation with some reduction pattern will probably result # operation with some reduction pattern will probably result
# to slow down. # to slow down.
isinstance(node.inputs[0].owner.op.scalar_op, scalar.basic.Sqr) isinstance(node.inputs[0].owner.op.scalar_op, scalar.basic.Sqr)):
):
op = node.op op = node.op
inp = node.inputs[0].owner.inputs[0] inp = node.inputs[0].owner.inputs[0]
return [GpuCAReduceCuda(scalar_op=op.scalar_op, return [GpuCAReduceCuda(scalar_op=op.scalar_op,
......
...@@ -21,6 +21,8 @@ from .comp import NVCC_compiler ...@@ -21,6 +21,8 @@ from .comp import NVCC_compiler
class GpuSubtensor(HideC, Subtensor): class GpuSubtensor(HideC, Subtensor):
_f16_ok = True
def make_node(self, x, *inputs): def make_node(self, x, *inputs):
rval = tensor.Subtensor.make_node(self, x, *inputs) rval = tensor.Subtensor.make_node(self, x, *inputs)
otype = GpuArrayType(dtype=rval.outputs[0].type.dtype, otype = GpuArrayType(dtype=rval.outputs[0].type.dtype,
......
...@@ -436,7 +436,7 @@ def test_gpueye(): ...@@ -436,7 +436,7 @@ def test_gpueye():
assert any([isinstance(node.op, GpuEye) assert any([isinstance(node.op, GpuEye)
for node in f.maker.fgraph.toposort()]) for node in f.maker.fgraph.toposort()])
for dtype in ['float32', 'int32']: for dtype in ['float32', 'int32', 'float16']:
yield check, dtype, 3 yield check, dtype, 3
# M != N, k = 0 # M != N, k = 0
yield check, dtype, 3, 5 yield check, dtype, 3, 5
......
...@@ -185,6 +185,7 @@ class GpuArrayType(Type): ...@@ -185,6 +185,7 @@ class GpuArrayType(Type):
# complex64, etc. # complex64, etc.
try: try:
return { return {
'float16': (float, 'npy_float16', 'NPY_FLOAT16'),
'float32': (float, 'npy_float32', 'NPY_FLOAT32'), 'float32': (float, 'npy_float32', 'NPY_FLOAT32'),
'float64': (float, 'npy_float64', 'NPY_FLOAT64'), 'float64': (float, 'npy_float64', 'NPY_FLOAT64'),
'uint8': (int, 'npy_uint8', 'NPY_UINT8'), 'uint8': (int, 'npy_uint8', 'NPY_UINT8'),
...@@ -309,7 +310,11 @@ class GpuArrayConstant(_operators, Constant): ...@@ -309,7 +310,11 @@ class GpuArrayConstant(_operators, Constant):
def __str__(self): def __str__(self):
if self.name is not None: if self.name is not None:
return self.name return self.name
return "GpuArrayConstant{%s}" % numpy.asarray(self.data) try:
np_data = numpy.asarray(self.data)
except gpuarray.GpuArrayException:
np_data = self.data
return "GpuArrayConstant{%s}" % np_data
GpuArrayType.Constant = GpuArrayConstant GpuArrayType.Constant = GpuArrayConstant
......
...@@ -613,18 +613,15 @@ def test_binomial(): ...@@ -613,18 +613,15 @@ def test_binomial():
# test empty size (scalar) # test empty size (scalar)
((), (), [], []), ((), (), [], []),
]: ]:
yield (t_binomial, mean, size, const_size, var_input, input,
steps, rtol)
# print ''
# print 'ON CPU with size=(%s) and mean(%d):' % (str(size), mean) def t_binomial(mean, size, const_size, var_input, input, steps, rtol):
R = MRG_RandomStreams(234, use_cuda=False) R = MRG_RandomStreams(234, use_cuda=False)
# Note: we specify `nstreams` to avoid a warning. u = R.binomial(size=size, p=mean)
u = R.binomial(size=size, p=mean,
nstreams=rng_mrg.guess_n_streams(size, warn=False))
f = theano.function(var_input, u, mode=mode) f = theano.function(var_input, u, mode=mode)
# theano.printing.debugprint(f)
out = f(*input) out = f(*input)
# print 'random?[:10]\n', out[0, 0:10]
# print 'random?[-1,-10:]\n', out[-1, -10:]
# Increase the number of steps if sizes implies only a few samples # Increase the number of steps if sizes implies only a few samples
if numpy.prod(const_size) < 10: if numpy.prod(const_size) < 10:
...@@ -636,30 +633,21 @@ def test_binomial(): ...@@ -636,30 +633,21 @@ def test_binomial():
target_avg=mean, mean_rtol=rtol) target_avg=mean, mean_rtol=rtol)
if mode != 'FAST_COMPILE' and cuda_available: if mode != 'FAST_COMPILE' and cuda_available:
# print ''
# print 'ON GPU with size=(%s) and mean(%d):' % (str(size), mean)
R = MRG_RandomStreams(234, use_cuda=True) R = MRG_RandomStreams(234, use_cuda=True)
u = R.binomial(size=size, p=mean, dtype='float32', u = R.binomial(size=size, p=mean, dtype='float32')
nstreams=rng_mrg.guess_n_streams(size,
warn=False))
# well, it's really that this test w GPU doesn't make sense otw # well, it's really that this test w GPU doesn't make sense otw
assert u.dtype == 'float32' assert u.dtype == 'float32'
f = theano.function(var_input, theano.Out( f = theano.function(var_input, theano.Out(
theano.sandbox.cuda.basic_ops.gpu_from_host(u), theano.sandbox.cuda.basic_ops.gpu_from_host(u),
borrow=True), mode=mode_with_gpu) borrow=True), mode=mode_with_gpu)
# theano.printing.debugprint(f)
gpu_out = numpy.asarray(f(*input)) gpu_out = numpy.asarray(f(*input))
# print 'random?[:10]\n', gpu_out[0, 0:10]
# print 'random?[-1,-10:]\n', gpu_out[-1, -10:]
basictest(f, steps_, const_size, prefix='mrg gpu', basictest(f, steps_, const_size, prefix='mrg gpu',
inputs=input, allow_01=True, inputs=input, allow_01=True,
target_avg=mean, mean_rtol=rtol) target_avg=mean, mean_rtol=rtol)
numpy.testing.assert_array_almost_equal(out, gpu_out, numpy.testing.assert_array_almost_equal(out, gpu_out,
decimal=6) decimal=6)
# print ''
# print 'ON CPU w NUMPY with size=(%s) and mean(%d):' % (str(size),
# mean)
RR = theano.tensor.shared_randomstreams.RandomStreams(234) RR = theano.tensor.shared_randomstreams.RandomStreams(234)
uu = RR.binomial(size=size, p=mean) uu = RR.binomial(size=size, p=mean)
......
...@@ -50,25 +50,33 @@ class IntegerDivisionError(Exception): ...@@ -50,25 +50,33 @@ class IntegerDivisionError(Exception):
def upcast(dtype, *dtypes): def upcast(dtype, *dtypes):
# Should we try to keep float32 instead of float64? This is used so that # This tries to keep data in floatX or lower precision, unless we
# for instance mixing int64 with float32 yields float32 instead of float64. # explicitely request a higher precision datatype.
# Note that we store this boolean as a one-element list so that it can be
# modified within `make_array`.
keep_float32 = [(config.cast_policy == 'numpy+floatX' and keep_float32 = [(config.cast_policy == 'numpy+floatX' and
config.floatX == 'float32')] config.floatX == 'float32')]
keep_float16 = [(config.cast_policy == 'numpy+floatX' and
config.floatX == 'float16')]
def make_array(dt): def make_array(dt):
if dt == 'float64': if dt == 'float64':
# There is an explicit float64 dtype: we cannot keep float32. # There is an explicit float64 dtype: we cannot keep float32.
keep_float32[0] = False keep_float32[0] = False
keep_float16[0] = False
if dt == 'float32':
keep_float16[0] = False
return numpy.zeros((), dtype=dt) return numpy.zeros((), dtype=dt)
z = make_array(dtype) z = make_array(dtype)
for dt in dtypes: for dt in dtypes:
z = z + make_array(dt=dt) z = z + make_array(dt=dt)
rval = str(z.dtype) rval = str(z.dtype)
if rval == 'float64' and keep_float32[0]: if rval == 'float64':
if keep_float16[0]:
return 'float16'
if keep_float32[0]:
return 'float32' return 'float32'
else: elif rval == 'float32':
if keep_float16[0]:
return 'float16'
return rval return rval
...@@ -232,6 +240,7 @@ class Scalar(Type): ...@@ -232,6 +240,7 @@ class Scalar(Type):
print(dtype, np.zeros(1, dtype=dtype).dtype.num) print(dtype, np.zeros(1, dtype=dtype).dtype.num)
""" """
return { # dtype: (py_type, c_type, cls_name) return { # dtype: (py_type, c_type, cls_name)
'float16': (numpy.float16, 'npy_float16', 'Float16'),
'float32': (numpy.float32, 'npy_float32', 'Float32'), 'float32': (numpy.float32, 'npy_float32', 'Float32'),
'float64': (numpy.float64, 'npy_float64', 'Float64'), 'float64': (numpy.float64, 'npy_float64', 'Float64'),
'complex128': (numpy.complex128, 'theano_complex128', 'complex128': (numpy.complex128, 'theano_complex128',
...@@ -501,6 +510,7 @@ uint8 = get_scalar_type('uint8') ...@@ -501,6 +510,7 @@ uint8 = get_scalar_type('uint8')
uint16 = get_scalar_type('uint16') uint16 = get_scalar_type('uint16')
uint32 = get_scalar_type('uint32') uint32 = get_scalar_type('uint32')
uint64 = get_scalar_type('uint64') uint64 = get_scalar_type('uint64')
float16 = get_scalar_type('float16')
float32 = get_scalar_type('float32') float32 = get_scalar_type('float32')
float64 = get_scalar_type('float64') float64 = get_scalar_type('float64')
complex64 = get_scalar_type('complex64') complex64 = get_scalar_type('complex64')
...@@ -508,7 +518,7 @@ complex128 = get_scalar_type('complex128') ...@@ -508,7 +518,7 @@ complex128 = get_scalar_type('complex128')
int_types = int8, int16, int32, int64 int_types = int8, int16, int32, int64
uint_types = uint8, uint16, uint32, uint64 uint_types = uint8, uint16, uint32, uint64
float_types = float32, float64 float_types = float16, float32, float64
complex_types = complex64, complex128 complex_types = complex64, complex128
discrete_types = int_types + uint_types discrete_types = int_types + uint_types
...@@ -1995,6 +2005,7 @@ convert_to_uint8 = Cast(uint8, name='convert_to_uint8') ...@@ -1995,6 +2005,7 @@ convert_to_uint8 = Cast(uint8, name='convert_to_uint8')
convert_to_uint16 = Cast(uint16, name='convert_to_uint16') convert_to_uint16 = Cast(uint16, name='convert_to_uint16')
convert_to_uint32 = Cast(uint32, name='convert_to_uint32') convert_to_uint32 = Cast(uint32, name='convert_to_uint32')
convert_to_uint64 = Cast(uint64, name='convert_to_uint64') convert_to_uint64 = Cast(uint64, name='convert_to_uint64')
convert_to_float16 = Cast(float16, name='convert_to_float16')
convert_to_float32 = Cast(float32, name='convert_to_float32') convert_to_float32 = Cast(float32, name='convert_to_float32')
convert_to_float64 = Cast(float64, name='convert_to_float64') convert_to_float64 = Cast(float64, name='convert_to_float64')
convert_to_complex64 = Cast(complex64, name='convert_to_complex64') convert_to_complex64 = Cast(complex64, name='convert_to_complex64')
...@@ -2009,6 +2020,7 @@ _cast_mapping = { ...@@ -2009,6 +2020,7 @@ _cast_mapping = {
'uint16': convert_to_uint16, 'uint16': convert_to_uint16,
'uint32': convert_to_uint32, 'uint32': convert_to_uint32,
'uint64': convert_to_uint64, 'uint64': convert_to_uint64,
'float16': convert_to_float16,
'float32': convert_to_float32, 'float32': convert_to_float32,
'float64': convert_to_float64, 'float64': convert_to_float64,
'complex64': convert_to_complex64, 'complex64': convert_to_complex64,
...@@ -3286,14 +3298,20 @@ class Composite(ScalarOp): ...@@ -3286,14 +3298,20 @@ class Composite(ScalarOp):
+ zip(self.fgraph.outputs, + zip(self.fgraph.outputs,
["%%(o%i)s" % i for i in xrange(len(self.fgraph.outputs))])) ["%%(o%i)s" % i for i in xrange(len(self.fgraph.outputs))]))
for orphan in self.fgraph.variables: # fgraph.orphans: for var in self.fgraph.variables:
if orphan.owner is None and orphan not in self.fgraph.inputs: if var.owner is None:
if isinstance(orphan, Constant): if var not in self.fgraph.inputs:
subd[orphan] = orphan.type.c_literal(orphan.data) # This is an orphan
if isinstance(var, Constant):
subd[var] = var.type.c_literal(var.data)
else: else:
raise ValueError( raise ValueError(
"All orphans in the fgraph to Composite must" "All orphans in the fgraph to Composite must"
" be Constant instances.") " be Constant instances.")
elif (any(i.dtype == 'float16' for i in var.owner.inputs) or
any(o.dtype == 'float16' for o in var.owner.outputs)):
# flag for elemwise ops to check.
self.inner_float16 = True
_c_code = "{\n" _c_code = "{\n"
self.nodenames = ["%(nodename)s_" + ('subnode%i' % j) self.nodenames = ["%(nodename)s_" + ('subnode%i' % j)
......
...@@ -2370,6 +2370,9 @@ class CastTester(utt.InferShapeTester): ...@@ -2370,6 +2370,9 @@ class CastTester(utt.InferShapeTester):
for format in sparse.sparse_formats: for format in sparse.sparse_formats:
for i_dtype in sparse.float_dtypes: for i_dtype in sparse.float_dtypes:
for o_dtype in tensor.float_dtypes: for o_dtype in tensor.float_dtypes:
if o_dtype == 'float16':
# Don't test float16 output.
continue
_, data = sparse_random_inputs( _, data = sparse_random_inputs(
format, format,
shape=(4, 7), shape=(4, 7),
......
...@@ -252,10 +252,10 @@ class NumpyAutocaster(object): ...@@ -252,10 +252,10 @@ class NumpyAutocaster(object):
return numpy.asarray(x) return numpy.asarray(x)
elif config.cast_policy == 'numpy+floatX': elif config.cast_policy == 'numpy+floatX':
rval = numpy.asarray(x) rval = numpy.asarray(x)
if ((rval.dtype == 'float64' and # numpy wants float64 if ((not hasattr(x, 'dtype') and
config.floatX == 'float32' and # but we prefer float32 rval.dtype in ('float64', 'float32') and
not hasattr(x, 'dtype'))): # and `x` was not typed rval.dtype != config.floatX)):
rval = theano._asarray(rval, dtype='float32') rval = theano._asarray(rval, dtype=config.floatX)
return rval return rval
# The following is the original code, corresponding to the 'custom' # The following is the original code, corresponding to the 'custom'
...@@ -278,11 +278,14 @@ class NumpyAutocaster(object): ...@@ -278,11 +278,14 @@ class NumpyAutocaster(object):
# recall: float is numpy.float # recall: float is numpy.float
if ((isinstance(x, float) and if ((isinstance(x, float) and
config.floatX in self.dtypes and config.floatX in self.dtypes and
config.floatX == 'float32')): config.floatX != 'float64')):
return theano._asarray(x, dtype=config.floatX)
return theano._asarray(x, dtype='float32') # Don't autocast to float16 unless config.floatX is float16
try_dtypes = [d for d in self.dtypes
if config.floatX == 'float16' or d != 'float16']
for dtype in self.dtypes: for dtype in try_dtypes:
x_ = theano._asarray(x, dtype=dtype) x_ = theano._asarray(x, dtype=dtype)
if numpy.all(x == x_): if numpy.all(x == x_):
break break
...@@ -290,7 +293,7 @@ class NumpyAutocaster(object): ...@@ -290,7 +293,7 @@ class NumpyAutocaster(object):
return x_ return x_
autocast_int = NumpyAutocaster(('int8', 'int16', 'int32', 'int64')) autocast_int = NumpyAutocaster(('int8', 'int16', 'int32', 'int64'))
autocast_float = NumpyAutocaster(('float32', 'float64')) autocast_float = NumpyAutocaster(('float16', 'float32', 'float64'))
# autocast_float dtypes might be manipulated in tensor.__init__ # autocast_float dtypes might be manipulated in tensor.__init__
...@@ -313,7 +316,7 @@ class autocast_float_as(object): ...@@ -313,7 +316,7 @@ class autocast_float_as(object):
If `config.cast_policy` is not 'custom', an exception is raised. If `config.cast_policy` is not 'custom', an exception is raised.
For example: For example:
>>> with autocast_float_as('float32') as _dummy: >>> with autocast_float_as('float32'):
... assert (fvector() + 1.1).dtype == 'float32' # temporary downcasting ... assert (fvector() + 1.1).dtype == 'float32' # temporary downcasting
>>> assert (fvector() + 1.1).dtype == 'float64' # back to default behaviour >>> assert (fvector() + 1.1).dtype == 'float64' # back to default behaviour
...@@ -1137,6 +1140,10 @@ _convert_to_uint64 = _conversion( ...@@ -1137,6 +1140,10 @@ _convert_to_uint64 = _conversion(
elemwise.Elemwise(scal.convert_to_uint64), 'uint64') elemwise.Elemwise(scal.convert_to_uint64), 'uint64')
"""Cast to unsigned 64-bit integer""" """Cast to unsigned 64-bit integer"""
_convert_to_float16 = _conversion(
elemwise.Elemwise(scal.convert_to_float16), 'float16')
"""Cast to half-precision floating point"""
_convert_to_float32 = _conversion( _convert_to_float32 = _conversion(
elemwise.Elemwise(scal.convert_to_float32), 'float32') elemwise.Elemwise(scal.convert_to_float32), 'float32')
"""Cast to single-precision floating point""" """Cast to single-precision floating point"""
...@@ -1162,6 +1169,7 @@ _cast_mapping = { ...@@ -1162,6 +1169,7 @@ _cast_mapping = {
'uint16': _convert_to_uint16, 'uint16': _convert_to_uint16,
'uint32': _convert_to_uint32, 'uint32': _convert_to_uint32,
'uint64': _convert_to_uint64, 'uint64': _convert_to_uint64,
'float16': _convert_to_float16,
'float32': _convert_to_float32, 'float32': _convert_to_float32,
'float64': _convert_to_float64, 'float64': _convert_to_float64,
'complex64': _convert_to_complex64, 'complex64': _convert_to_complex64,
...@@ -2757,9 +2765,13 @@ def mean(input, axis=None, dtype=None, op=False, keepdims=False, ...@@ -2757,9 +2765,13 @@ def mean(input, axis=None, dtype=None, op=False, keepdims=False,
# sum() will complain if it is not suitable. # sum() will complain if it is not suitable.
sum_dtype = dtype sum_dtype = dtype
else: else:
# Let sum() infer the appropriate dtype.
sum_dtype = None sum_dtype = None
# float16 overflows way too fast for sum
if ((sum_dtype == 'float16' or input.dtype == 'float16') and
acc_dtype != 'float16'):
            sum_dtype = 'float32'
s = sum(input, axis=axis, dtype=sum_dtype, keepdims=keepdims, s = sum(input, axis=axis, dtype=sum_dtype, keepdims=keepdims,
acc_dtype=acc_dtype) acc_dtype=acc_dtype)
shp = shape(input) shp = shape(input)
...@@ -2767,7 +2779,7 @@ def mean(input, axis=None, dtype=None, op=False, keepdims=False, ...@@ -2767,7 +2779,7 @@ def mean(input, axis=None, dtype=None, op=False, keepdims=False,
# Cast shp into a float type # Cast shp into a float type
# TODO Once we have a consistent casting policy, we could simply # TODO Once we have a consistent casting policy, we could simply
# use true_div. # use true_div.
if s.dtype in ('float32', 'complex64'): if s.dtype in ('float16', 'float32', 'complex64'):
shp = cast(shp, 'float32') shp = cast(shp, 'float32')
else: else:
shp = cast(shp, 'float64') shp = cast(shp, 'float64')
...@@ -2785,6 +2797,9 @@ def mean(input, axis=None, dtype=None, op=False, keepdims=False, ...@@ -2785,6 +2797,9 @@ def mean(input, axis=None, dtype=None, op=False, keepdims=False,
for i in axis: for i in axis:
s = true_div(s, shp[i]) s = true_div(s, shp[i])
if dtype == 'float16' or (dtype is None and input.dtype == 'float16'):
s = cast(s, 'float16')
return s return s
......
...@@ -417,7 +417,8 @@ class Gemv(Op): ...@@ -417,7 +417,8 @@ class Gemv(Op):
def perform(self, node, inputs, out_storage): def perform(self, node, inputs, out_storage):
y, alpha, A, x, beta = inputs y, alpha, A, x, beta = inputs
if have_fblas and y.shape[0] != 0 and x.shape[0] != 0: if (have_fblas and y.shape[0] != 0 and x.shape[0] != 0 and
y.dtype in _blas_gemv_fns):
gemv = _blas_gemv_fns[y.dtype] gemv = _blas_gemv_fns[y.dtype]
if (A.shape[0] != y.shape[0] or A.shape[1] != x.shape[0]): if (A.shape[0] != y.shape[0] or A.shape[1] != x.shape[0]):
...@@ -1727,7 +1728,7 @@ def local_dot_to_dot22(node): ...@@ -1727,7 +1728,7 @@ def local_dot_to_dot22(node):
x, y, x.type, y.type) x, y, x.type, y.type)
return return
if y.type.dtype.startswith('float') or y.type.dtype.startswith('complex'): if y.type.dtype in ['float32', 'float64', 'complex64', 'complex128']:
if x.ndim == 2 and y.ndim == 2: if x.ndim == 2 and y.ndim == 2:
# print "local_dot_to_dot22: MM" # print "local_dot_to_dot22: MM"
return [_dot22(*node.inputs)] return [_dot22(*node.inputs)]
......
...@@ -95,6 +95,7 @@ class DimShuffle(Op): ...@@ -95,6 +95,7 @@ class DimShuffle(Op):
transpose function. transpose function.
Adding, subtracting dimensions can be done with reshape. Adding, subtracting dimensions can be done with reshape.
""" """
_f16_ok = True
check_input = False check_input = False
...@@ -1171,6 +1172,12 @@ class Elemwise(OpenMPOp): ...@@ -1171,6 +1172,12 @@ class Elemwise(OpenMPOp):
return decl, checks, alloc, loop return decl, checks, alloc, loop
def c_code(self, node, nodename, inames, onames, sub): def c_code(self, node, nodename, inames, onames, sub):
if (any(i.dtype == 'float16' for i in node.inputs) or
any(o.dtype == 'float16' for o in node.outputs) or
# This is for Composite
getattr(self.scalar_op, 'inner_float16', False)):
# Disable C code for float16 vars
super(Elemwise, self).c_code(node, nodename, inames, onames, sub)
code = "\n".join(self._c_all(node, nodename, inames, onames, sub)) code = "\n".join(self._c_all(node, nodename, inames, onames, sub))
return code return code
...@@ -1186,7 +1193,7 @@ class Elemwise(OpenMPOp): ...@@ -1186,7 +1193,7 @@ class Elemwise(OpenMPOp):
return support_code return support_code
def c_code_cache_version_apply(self, node): def c_code_cache_version_apply(self, node):
version = [11] # the version corresponding to the c code in this Op version = [12] # the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend... # now we insert versions for the ops on which we depend...
scalar_node = Apply(self.scalar_op, scalar_node = Apply(self.scalar_op,
...@@ -1806,6 +1813,7 @@ class CAReduceDtype(CAReduce): ...@@ -1806,6 +1813,7 @@ class CAReduceDtype(CAReduce):
uint8='uint64', uint8='uint64',
uint16='uint64', uint16='uint64',
uint32='uint64', uint32='uint64',
float16='float32',
float32='float64', float32='float64',
complex64='complex128', complex64='complex128',
).get(idtype, idtype) ).get(idtype, idtype)
......
...@@ -1746,7 +1746,7 @@ def local_advanced_indexing_crossentropy_onehot_grad(node): ...@@ -1746,7 +1746,7 @@ def local_advanced_indexing_crossentropy_onehot_grad(node):
# Check z is zeros_like(log(sm)) # Check z is zeros_like(log(sm))
if not _is_const(z, 0): if not _is_const(z, 0):
return return
if z.type not in (dmatrix, fmatrix): if z.broadcastable != (False, False):
if not (vector_softmax and z.broadcastable == (True, False)): if not (vector_softmax and z.broadcastable == (True, False)):
return return
# here we know that we are incrementing a matrix of zeros # here we know that we are incrementing a matrix of zeros
...@@ -1758,14 +1758,15 @@ def local_advanced_indexing_crossentropy_onehot_grad(node): ...@@ -1758,14 +1758,15 @@ def local_advanced_indexing_crossentropy_onehot_grad(node):
if incr.ndim != 1 or incr.dtype not in tensor.float_dtypes: if incr.ndim != 1 or incr.dtype not in tensor.float_dtypes:
return return
# here we know that we are incrementing some part of matrix z by a vector # here we know that we are incrementing some part of
# matrix z by a vector
# unless the user has taken care to mark that the data and labels have the # unless the user has taken care to mark that the data and
# same number of rows, we cannot be sure here that # labels have the same number of rows, we cannot be sure
# len(y) == len(z) # here that len(y) == len(z) However, in the common case
# However, in the common case that these are predictions and labels it is true. # that these are predictions and labels it is true. We
# We leave it to the Op to crash (and the user to complain) if this assumption is # leave it to the Op to crash (and the user to complain)
# ever not true. # if this assumption is ever not true.
out_grad = -incr out_grad = -incr
......
...@@ -1564,6 +1564,7 @@ class Assert(T.Op): ...@@ -1564,6 +1564,7 @@ class Assert(T.Op):
used in the function computing the graph, but it doesn't have to be used in the function computing the graph, but it doesn't have to be
returned. returned.
""" """
__props__ = ('msg',)
view_map = {0: [0]} view_map = {0: [0]}
check_input = False check_input = False
...@@ -1583,24 +1584,18 @@ class Assert(T.Op): ...@@ -1583,24 +1584,18 @@ class Assert(T.Op):
assert numpy.all([c.type.ndim == 0 for c in cond]) assert numpy.all([c.type.ndim == 0 for c in cond])
return gof.Apply(self, [value] + cond, [value.type()]) return gof.Apply(self, [value] + cond, [value.type()])
def __str__(self):
return self.__class__.__name__
def perform(self, node, inputs, out_): def perform(self, node, inputs, out_):
out, = out_ out, = out_
v = inputs[0] v = inputs[0]
out[0] = v out[0] = v
assert numpy.all(inputs[1:]), self.msg assert numpy.all(inputs[1:]), self.msg
def __eq__(self, other):
return type(self) == type(other) and self.msg == other.msg
def __hash__(self):
return hash(type(self)) ^ hash(self.msg)
def grad(self, input, output_gradients): def grad(self, input, output_gradients):
return output_gradients + [DisconnectedType()()] * (len(input) - 1) return output_gradients + [DisconnectedType()()] * (len(input) - 1)
def connection_pattern(self, node):
return [[1]] + [[0]] * (len(node.inputs) - 1)
def c_code(self, node, name, inames, onames, sub): def c_code(self, node, name, inames, onames, sub):
value = inames[0] value = inames[0]
out = onames[0] out = onames[0]
......
...@@ -5820,40 +5820,24 @@ def _test_autocast_custom(): ...@@ -5820,40 +5820,24 @@ def _test_autocast_custom():
orig_autocast = autocast_float.dtypes orig_autocast = autocast_float.dtypes
# Test that autocast_float_as sets the autocast dtype correctly # Test that autocast_float_as sets the autocast dtype correctly
try: # ghetto 2.4 version of with with autocast_float_as('float32'):
ac = autocast_float_as('float32')
ac.__enter__()
assert autocast_float.dtypes == ('float32',) assert autocast_float.dtypes == ('float32',)
finally:
ac.__exit__()
assert autocast_float.dtypes == orig_autocast assert autocast_float.dtypes == orig_autocast
try: # ghetto 2.4 version of with
ac = autocast_float_as('float64') with autocast_float_as('float64'):
ac.__enter__()
assert autocast_float.dtypes == ('float64',) assert autocast_float.dtypes == ('float64',)
finally:
ac.__exit__()
assert autocast_float.dtypes == orig_autocast assert autocast_float.dtypes == orig_autocast
# Test that we can set it back to something, and nest it # Test that we can set it back to something, and nest it
try: # ghetto 2.4 version of with with autocast_float_as('float32'):
ac = autocast_float_as('float32')
ac.__enter__()
assert autocast_float.dtypes == ('float32',) assert autocast_float.dtypes == ('float32',)
try: # ghetto 2.4 version of with with autocast_float_as('float64'):
ac2 = autocast_float_as('float64')
ac2.__enter__()
assert autocast_float.dtypes == ('float64',) assert autocast_float.dtypes == ('float64',)
finally:
ac2.__exit__()
assert autocast_float.dtypes == ('float32',) assert autocast_float.dtypes == ('float32',)
finally:
ac.__exit__()
assert autocast_float.dtypes == orig_autocast assert autocast_float.dtypes == orig_autocast
# Test that the autocasting dtype is used correctly in expression-building # Test that the autocasting dtype is used correctly in expression-building
try: # ghetto 2.4 version of with with autocast_float_as('float32'):
ac = autocast_float_as('float32')
ac.__enter__()
assert (dvector() + 1.1).dtype == 'float64' assert (dvector() + 1.1).dtype == 'float64'
assert (fvector() + 1.1).dtype == 'float32' assert (fvector() + 1.1).dtype == 'float32'
assert (fvector() + theano._asarray(1.1, dtype='float64')).dtype == \ assert (fvector() + theano._asarray(1.1, dtype='float64')).dtype == \
...@@ -5863,13 +5847,9 @@ def _test_autocast_custom(): ...@@ -5863,13 +5847,9 @@ def _test_autocast_custom():
assert (dvector() + 1).dtype == 'float64' assert (dvector() + 1).dtype == 'float64'
assert (fvector() + 1).dtype == 'float32' assert (fvector() + 1).dtype == 'float32'
finally:
ac.__exit__()
# Test that the autocasting dtype is used correctly in expression-building # Test that the autocasting dtype is used correctly in expression-building
try: # ghetto 2.4 version of with with autocast_float_as('float64'):
ac = autocast_float_as('float64')
ac.__enter__()
assert (dvector() + 1.1).dtype == 'float64' assert (dvector() + 1.1).dtype == 'float64'
assert (fvector() + 1.1).dtype == 'float64' assert (fvector() + 1.1).dtype == 'float64'
assert (fvector() + 1.0).dtype == 'float64' assert (fvector() + 1.0).dtype == 'float64'
...@@ -5880,13 +5860,9 @@ def _test_autocast_custom(): ...@@ -5880,13 +5860,9 @@ def _test_autocast_custom():
assert (dvector() + 1).dtype == 'float64' assert (dvector() + 1).dtype == 'float64'
assert (fvector() + 1).dtype == 'float32' assert (fvector() + 1).dtype == 'float32'
finally:
ac.__exit__()
# Test that the autocasting dtype is used correctly in expression-building # Test that the autocasting dtype is used correctly in expression-building
try: # ghetto 2.4 version of with with autocast_float_as('float32', 'float64'):
ac = autocast_float_as('float32', 'float64')
ac.__enter__()
assert (dvector() + 1.1).dtype == 'float64' assert (dvector() + 1.1).dtype == 'float64'
assert (fvector() + 1.1).dtype == theano.config.floatX assert (fvector() + 1.1).dtype == theano.config.floatX
assert (fvector() + 1.0).dtype == 'float32' assert (fvector() + 1.0).dtype == 'float32'
...@@ -5903,14 +5879,8 @@ def _test_autocast_custom(): ...@@ -5903,14 +5879,8 @@ def _test_autocast_custom():
assert (ivector() + numpy.int8(1)).dtype == 'int32' assert (ivector() + numpy.int8(1)).dtype == 'int32'
assert (wvector() + numpy.int8(1)).dtype == 'int16' assert (wvector() + numpy.int8(1)).dtype == 'int16'
assert (bvector() + numpy.int8(1)).dtype == 'int8' assert (bvector() + numpy.int8(1)).dtype == 'int8'
try: # ghetto 2.4 version of with with autocast_float_as('float64'):
ac2 = autocast_float_as('float64')
ac2.__enter__()
assert (fvector() + 1.0).dtype == 'float64' assert (fvector() + 1.0).dtype == 'float64'
finally:
ac2.__exit__()
finally:
ac.__exit__()
def _test_autocast_numpy(): def _test_autocast_numpy():
...@@ -6036,17 +6006,8 @@ class test_arithmetic_cast(unittest.TestCase): ...@@ -6036,17 +6006,8 @@ class test_arithmetic_cast(unittest.TestCase):
config.int_division == 'raise') config.int_division == 'raise')
# This is the expected behavior. # This is the expected behavior.
continue continue
# For numpy we have a problem: numpy_dtype = op(numpy_args[0](a_type),
# http://projects.scipy.org/numpy/ticket/1827 numpy_args[1](b_type)).dtype
# As a result we only consider the highest data
# type that numpy may return.
numpy_dtypes = [
op(numpy_args[0](a_type),
numpy_args[1](b_type)).dtype,
op(numpy_args[1](b_type),
numpy_args[0](a_type)).dtype]
numpy_dtype = theano.scalar.upcast(
*map(str, numpy_dtypes))
if numpy_dtype == theano_dtype: if numpy_dtype == theano_dtype:
# Same data type found, all is good! # Same data type found, all is good!
continue continue
...@@ -6078,9 +6039,7 @@ class test_arithmetic_cast(unittest.TestCase): ...@@ -6078,9 +6039,7 @@ class test_arithmetic_cast(unittest.TestCase):
# Theano upcasted the result array. # Theano upcasted the result array.
theano_dtype == up_type and theano_dtype == up_type and
# But Numpy kept its original type. # But Numpy kept its original type.
# (not an equality because of numpy bug array_type == numpy_dtype):
# mentioned above).
array_type in numpy_dtypes):
# Then we accept this difference in # Then we accept this difference in
# behavior. # behavior.
continue continue
...@@ -6092,17 +6051,20 @@ class test_arithmetic_cast(unittest.TestCase): ...@@ -6092,17 +6051,20 @@ class test_arithmetic_cast(unittest.TestCase):
numpy.__version__.split('.')[:2]] numpy.__version__.split('.')[:2]]
if (cfg == 'numpy+floatX' and if (cfg == 'numpy+floatX' and
a_type == 'complex128' and a_type == 'complex128' and
b_type == 'float32' and (b_type == 'float32' or
b_type == 'float16') and
combo == ('scalar', 'array') and combo == ('scalar', 'array') and
bool(numpy_version >= [1, 6]) and bool(numpy_version >= [1, 6]) and
theano_dtype == 'complex128' and theano_dtype == 'complex128' and
numpy_dtypes == ['complex64', numpy_dtype == 'complex64'):
'complex64']): # In numpy 1.6.x adding a
# In numpy 1.6.x adding a complex128 with # complex128 with a float32 or
# a float32 may result in a complex64. This # float16 may result in a
# may be a bug (investigation is currently # complex64. This may be a bug
# in progress), so in the meantime we just # (investigation is currently in
# mark this test as a known failure. # progress), so in the meantime we
# just mark this test as a known
# failure.
raise KnownFailureTest('Known issue with ' raise KnownFailureTest('Known issue with '
'numpy >= 1.6.x see #761') 'numpy >= 1.6.x see #761')
......
...@@ -1024,6 +1024,7 @@ class T_prod_without_zeros_dtype(unittest.TestCase): ...@@ -1024,6 +1024,7 @@ class T_prod_without_zeros_dtype(unittest.TestCase):
uint8='uint64', uint8='uint64',
uint16='uint64', uint16='uint64',
uint32='uint64', uint32='uint64',
float16='float32',
float32='float64', float32='float64',
complex64='complex128' complex64='complex128'
).get(dtype, dtype) ).get(dtype, dtype)
......
...@@ -234,6 +234,7 @@ class TensorType(Type): ...@@ -234,6 +234,7 @@ class TensorType(Type):
# complex64, etc. # complex64, etc.
try: try:
return { return {
'float16': (float, 'npy_float16', 'NPY_FLOAT16'),
'float32': (float, 'npy_float32', 'NPY_FLOAT32'), 'float32': (float, 'npy_float32', 'NPY_FLOAT32'),
'float64': (float, 'npy_float64', 'NPY_FLOAT64'), 'float64': (float, 'npy_float64', 'NPY_FLOAT64'),
'uint8': (int, 'npy_uint8', 'NPY_UINT8'), 'uint8': (int, 'npy_uint8', 'NPY_UINT8'),
......
...@@ -6,6 +6,7 @@ from nose.plugins.skip import SkipTest ...@@ -6,6 +6,7 @@ from nose.plugins.skip import SkipTest
import os import os
from fnmatch import fnmatch from fnmatch import fnmatch
import theano import theano
from theano.compat import PY3
try: try:
import flake8.engine import flake8.engine
import flake8.main import flake8.main
...@@ -227,7 +228,6 @@ whitelist_flake8 = [ ...@@ -227,7 +228,6 @@ whitelist_flake8 = [
"sandbox/gpuarray/elemwise.py", "sandbox/gpuarray/elemwise.py",
"sandbox/gpuarray/type.py", "sandbox/gpuarray/type.py",
"sandbox/gpuarray/__init__.py", "sandbox/gpuarray/__init__.py",
"sandbox/gpuarray/opt.py",
"sandbox/gpuarray/blas.py", "sandbox/gpuarray/blas.py",
"sandbox/gpuarray/kernel_codegen.py", "sandbox/gpuarray/kernel_codegen.py",
"sandbox/gpuarray/conv.py", "sandbox/gpuarray/conv.py",
...@@ -347,6 +347,8 @@ def test_format_flake8(): ...@@ -347,6 +347,8 @@ def test_format_flake8():
""" """
if not flake8_available: if not flake8_available:
raise SkipTest("flake8 is not installed") raise SkipTest("flake8 is not installed")
if PY3:
raise SkipTest("not testing in python3 since 2to3 ran")
total_errors = 0 total_errors = 0
for path in list_files(): for path in list_files():
rel_path = os.path.relpath(path, theano.__path__[0]) rel_path = os.path.relpath(path, theano.__path__[0])
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论