提交 907d1868 authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #2901 from abergeron/f16_lstm

Work to make the DLT LSTM work with float16.
......@@ -1685,16 +1685,18 @@ class _Linker(gof.link.LocalLinker):
node_input_storage = [storage_map[r] for r in node.inputs]
node_output_storage = [storage_map[r] for r in node.outputs]
compute_map = {}
for k in node.inputs:
compute_map[k] = [True]
for k in node.outputs:
compute_map[k] = [False]
# Some Ops define a make_thunk with the expectation that
# it will be called before the C code is compiled, because
# the compilation of some dependency is triggered there.
thunk_other = None
if get_unbound_function(node.op.make_thunk) not in default_make_thunk:
compute_map = {}
for k in node.inputs:
compute_map[k] = [True]
for k in node.outputs:
compute_map[k] = [False]
thunk = node.op.make_thunk(node,
storage_map,
compute_map,
......@@ -1708,24 +1710,13 @@ class _Linker(gof.link.LocalLinker):
raise utils.MethodNotDefined()
# Ops that do not inherit from gof.op.Op don't have certain
# methods defined that the CLinker expects (Scan is an
# exmaple, ifelse is another of such classes that inherit
# example, ifelse is another of such classes that inherit
# directly from PureOp)
if not isinstance(node.op, gof.op.Op):
raise utils.MethodNotDefined()
e = FunctionGraph(node.inputs, node.outputs)
# The toposort isn't a stochastic order as it contains only one node.
e.toposort = lambda: list(e.apply_nodes)
# Specifically... e.nodes is a set, but of only 1 element
cl = CLinker().accept(e, [r for r, r2 in zip(e.outputs,
node.outputs)
if r2 in no_recycling])
thunk, node_input_filters, node_output_filters = cl.make_thunk(
input_storage=node_input_storage,
output_storage=node_output_storage)
thunk.inputs = node_input_storage
thunk.outputs = node_output_storage
thunk = node.op.make_c_thunk(node, storage_map, compute_map,
no_recycling)
thunks_c.append(thunk)
except (NotImplementedError, utils.MethodNotDefined):
thunks_c.append(None)
......@@ -1735,20 +1726,8 @@ class _Linker(gof.link.LocalLinker):
# consider that we don't have a python implementation
if ((self.maker.mode.check_py_code or thunks_c[-1] is None) and
node.op.perform.func_code != gof.op.PureOp.perform.func_code):
p = node.op.perform
ctx = node.run_context()
if ctx is graph.NoContext:
thunk = (lambda p=p, i=node_input_storage,
o=node_output_storage,
n=node: p(n, [x[0] for x in i], o))
else:
ctx_val = node.context_type.filter(ctx)
thunk = (lambda p=p, i=node_input_storage,
o=node_output_storage, ctx=ctx_val,
n=node: p(n, [x[0] for x in i], o, ctx))
thunk.inputs = node_input_storage
thunk.outputs = node_output_storage
thunk.perform = p
thunk = node.op.make_py_thunk(node, storage_map, compute_map,
no_recycling)
thunks_py.append(thunk)
else:
thunks_py.append(None)
......
......@@ -602,6 +602,7 @@ class Rebroadcast(gof.Op):
..note: works inplace and works for CudaNdarrayType
"""
view_map = {0: [0]}
_f16_ok = True
# Mapping from Type to C code (and version) to use.
# In the C code, the name of the input variable is %(iname)s,
# the output variable is %(oname)s.
......
......@@ -699,78 +699,55 @@ class Op(utils.object2, PureOp, CLinkerOp):
else:
return NotImplemented
def make_thunk(self, node, storage_map, compute_map, no_recycling):
def make_c_thunk(self, node, storage_map, compute_map, no_recycling):
"""
:param node: something previously returned by self.make_node
:param storage_map: dict variable -> one-element-list where a computed
value for this variable may be found.
:param compute_map: dict variable -> one-element-list where a boolean
value will be found. The boolean indicates whether the
variable's storage_map container contains a valid value (True)
or if it has not been computed yet (False).
:param no_recycling: list of variables for which it is forbidden to
reuse memory allocated by a previous call.
:note: If the thunk consults the storage_map on every call, it is safe
for it to ignore the no_recycling argument, because elements of the
no_recycling list will have a value of None in the storage map. If
the thunk can potentially cache return values (like CLinker does),
then it must not do so for variables in the no_recycling list.
Like make_thunk, but will only try to make a C thunk.
"""
logger = logging.getLogger('theano.gof.op.Op')
node_input_storage = [storage_map[r] for r in node.inputs]
node_output_storage = [storage_map[r] for r in node.outputs]
node_input_compute = [compute_map[r] for r in node.inputs]
node_output_compute = [compute_map[r] for r in node.outputs]
if self._op_use_c_code:
try:
# float16 get special treatment since running
# unprepared C code will get bad results.
if not getattr(self, '_f16_ok', False):
def is_f16(t):
return getattr(t, 'dtype', '') == 'float16'
if (any(is_f16(i.type) for i in node.inputs) or
any(is_f16(o.type) for o in node.outputs)):
print ("Disabling C code for %s due to unsupported "
"float16" % (self,))
raise NotImplementedError("float16")
e = FunctionGraph(node.inputs, node.outputs)
e_no_recycling = [new_o
for (new_o, old_o) in zip(e.outputs, node.outputs)
if old_o in no_recycling]
cl = theano.gof.cc.CLinker().accept(e,
no_recycling=e_no_recycling)
logger.debug('Trying CLinker.make_thunk')
outputs = cl.make_thunk(input_storage=node_input_storage,
output_storage=node_output_storage)
fill_storage, node_input_filters, node_output_filters = outputs
def rval():
fill_storage()
for o in node.outputs:
compute_map[o][0] = True
rval.cthunk = fill_storage.cthunk
rval.inputs = node_input_storage
rval.outputs = node_output_storage
rval.lazy = False
return rval
# the next line does nothing, but pyflakes is too
# stupid to realize the def rval below is not a
# redefinition unless I include this
del rval
except (NotImplementedError, utils.MethodNotDefined):
logger.debug('Falling back on perform')
# float16 gets special treatment since running
# unprepared C code will get bad results.
if not getattr(self, '_f16_ok', False):
def is_f16(t):
return getattr(t, 'dtype', '') == 'float16'
if (any(is_f16(i.type) for i in node.inputs) or
any(is_f16(o.type) for o in node.outputs)):
print ("Disabling C code for %s due to unsupported "
"float16" % (self,))
raise NotImplementedError("float16")
e = FunctionGraph(node.inputs, node.outputs)
e_no_recycling = [new_o
for (new_o, old_o) in zip(e.outputs, node.outputs)
if old_o in no_recycling]
cl = theano.gof.cc.CLinker().accept(e,
no_recycling=e_no_recycling)
logger.debug('Trying CLinker.make_thunk')
outputs = cl.make_thunk(input_storage=node_input_storage,
output_storage=node_output_storage)
fill_storage, node_input_filters, node_output_filters = outputs
def rval():
fill_storage()
for o in node.outputs:
compute_map[o][0] = True
rval.cthunk = fill_storage.cthunk
rval.inputs = node_input_storage
rval.outputs = node_output_storage
rval.lazy = False
return rval
# condition: either there was no c_code, or it failed
def make_py_thunk(self, node, storage_map, compute_map, no_recycling):
"""
Like make_thunk() but only makes python thunks.
"""
node_input_storage = [storage_map[r] for r in node.inputs]
node_output_storage = [storage_map[r] for r in node.outputs]
p = node.op.perform
......@@ -798,6 +775,39 @@ class Op(utils.object2, PureOp, CLinkerOp):
rval.lazy = False
return rval
def make_thunk(self, node, storage_map, compute_map, no_recycling):
"""
:param node: something previously returned by self.make_node
:param storage_map: dict variable -> one-element-list where a computed
value for this variable may be found.
:param compute_map: dict variable -> one-element-list where a boolean
value will be found. The boolean indicates whether the
variable's storage_map container contains a valid value (True)
or if it has not been computed yet (False).
:param no_recycling: list of variables for which it is forbidden to
reuse memory allocated by a previous call.
:note: If the thunk consults the storage_map on every call, it is safe
for it to ignore the no_recycling argument, because elements of the
no_recycling list will have a value of None in the storage map. If
the thunk can potentially cache return values (like CLinker does),
then it must not do so for variables in the no_recycling list.
"""
logger = logging.getLogger('theano.gof.op.Op')
if self._op_use_c_code:
try:
return self.make_c_thunk(node, storage_map, compute_map,
no_recycling)
except (NotImplementedError, utils.MethodNotDefined):
logger.debug('Falling back on perform')
# condition: either there was no c_code, or it failed
return self.make_py_thunk(node, storage_map, compute_map, no_recycling)
def get_test_value(v):
"""
......
......@@ -165,18 +165,22 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
]
@code_version((1,))
def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
@code_version((2,))
def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
manner_fn, manner_init,
b='', stride_b='', dtype='float32'):
b='', stride_b='', load_b='', dtype='float32'):
"""Return C++ code for a function that reduces a contiguous buffer.
:param N: length of the buffer
:param buf: buffer pointer of size warpSize * sizeof(dtype)
:param x: input data
:param stride_x: input data stride
:param load_x: wrapper to read from x
:param pos: index of executing thread
:param count: number of executing threads
:param b: Optional, pointer to the bias
:param stride_b: Optional, the stride of b if b is provided
:param load_b: Optional, wrapper to read from b if b is provided
:param dtype: Optional, the dtype of the output
:param manner_fn: a function that accepts strings of arguments a
......@@ -193,15 +197,15 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
"""
if b:
init = manner_init("%(x)s[%(pos)s * %(stride_x)s] +"
" %(b)s[%(pos)s * %(stride_b)s]" % locals())
init = manner_init("%(load_x)s(%(x)s[%(pos)s * %(stride_x)s]) +"
" %(load_b)s(%(b)s[%(pos)s * %(stride_b)s])" % locals())
loop_line = manner_fn("red",
manner_init("%(x)s[i * %(stride_x)s] + "
"%(b)s[i * %(stride_b)s]" %
manner_init("%(load_x)s(%(x)s[i * %(stride_x)s]) + "
"%(load_b)s(%(b)s[i * %(stride_b)s])" %
locals()))
else:
init = manner_init("%(x)s[%(pos)s * %(stride_x)s]" % locals())
loop_line = manner_fn("red", manner_init("%(x)s[i * %(stride_x)s]" %
init = manner_init("%(load_x)s(%(x)s[%(pos)s * %(stride_x)s])" % locals())
loop_line = manner_fn("red", manner_init("%(load_x)s(%(x)s[i * %(stride_x)s])" %
locals()))
loop_line2 = manner_fn("%s[%s]" % (buf, pos),
"%s[i]" % buf)
......@@ -248,32 +252,37 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
@code_version(inline_reduce_fixed_shared.code_version)
def inline_reduce_fixed_shared_max(N, buf, x, stride_x, pos, count,
b='', stride_b='', dtype='float32'):
return inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
def inline_reduce_fixed_shared_max(N, buf, x, stride_x, load_x, pos, count,
b='', stride_b='', load_b='',
dtype='float32'):
return inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
lambda a, b: "max(%s, %s)" % (a, b),
lambda a: a,
b, stride_b, dtype)
b, stride_b, load_b, dtype)
@code_version((1,) + inline_reduce_max.code_version +
@code_version((2,) + inline_reduce_max.code_version +
inline_reduce_sum.code_version)
def inline_softmax_fixed_shared(N, buf, x, stride_x,
sm, sm_stride,
def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x,
sm, sm_stride, write_sm,
threadPos, threadCount,
b='', stride_b='', dtype="float32"):
b='', stride_b='', load_b='',
dtype="float32"):
"""
:param N: length of the buffer, at least warpSize (32).
:param buf: a shared memory buffer of size warpSize * sizeof(dtype)
:param x: a ptr to the gpu memory where the row is stored
:param stride_x: the stride between each element in x
:param load_x: wrapper to read from x
:param sm: a ptr to the gpu memory to store the result
:param sm_stride: the stride between each sm element
:param write_sm: wrapper before writing to sm
:param threadPos: index of executing thread
:param threadCount: number of executing threads
:param b: Optional, pointer to the bias
:param stride_b: Optional, the stride of b if b is provided
:param load_b: Optional, wrapper to read from b if b is provided
:param dtype: Optional, the dtype of the softmax's output if not float32
:Precondition: buf is empty
......@@ -286,16 +295,18 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x,
"""
ret = [
# get max of buf (trashing all but buf[0])
inline_reduce_fixed_shared_max(N, buf, x, stride_x,
threadPos, threadCount, b, stride_b,
inline_reduce_fixed_shared_max(N, buf, x, stride_x, load_x,
threadPos, threadCount,
b, stride_b, load_b,
dtype),
'__syncthreads()',
('npy_%s row_max = ' + buf + '[0]') % dtype,
'__syncthreads()',
inline_reduce_fixed_shared(N, buf, x, stride_x, threadPos, threadCount,
inline_reduce_fixed_shared(N, buf, x, stride_x, load_x,
threadPos, threadCount,
lambda a, b: "%s + %s" % (a, b),
lambda a: "exp(%s - row_max)" % a,
b, stride_b, dtype),
b, stride_b, load_b, dtype),
'__syncthreads()',
('npy_%s row_sum = ' + buf + '[0]') % dtype,
'__syncthreads()',
......@@ -305,13 +316,14 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x,
if b:
ret += [
"%(sm)s[tx * %(sm_stride)s] = "
" exp(%(x)s[tx * %(stride_x)s] +"
" %(b)s[tx * %(stride_b)s] - row_max)"
" / row_sum" % locals()]
" %(write_sm)s(exp(%(load_x)s(%(x)s[tx * %(stride_x)s]) +"
" %(load_b)s(%(b)s[tx * %(stride_b)s]) - row_max)"
" / row_sum)" % locals()]
else:
ret += [
"%(sm)s[tx * %(sm_stride)s] = "
"exp(%(x)s[tx * %(stride_x)s] - row_max) / row_sum" % locals()]
"%(write_sm)s(exp(%(load_x)s(%(x)s[tx * %(stride_x)s]) - row_max)"
" / row_sum)" % locals()]
ret += [
"}",
'__syncthreads()',
......
......@@ -169,6 +169,10 @@ class GpuIncSubtensor(IncSubtensor):
The helper methods like do_type_checking, copy_of_x, etc. specialize
the c_code for this Op.
"""
@property
def _f16_ok(self):
return self.iadd_node.op._f16_ok
def c_headers(self):
return self.iadd_node.op.c_headers()
......@@ -325,7 +329,6 @@ class GpuIncSubtensor(IncSubtensor):
PyGpuArrayObject* src){
PyGpuArrayObject* ret = NULL;
""" % locals()
# def c_code(self, node, name, inputs, outputs, sub):
inputs = ["dst", "src"]
outputs = ["ret"]
sub = {"fail": "return NULL;"}
......@@ -337,7 +340,6 @@ class GpuIncSubtensor(IncSubtensor):
return ret
def add_to_zview(self, nodename, x, fail):
# TODO
return """
PyGpuArrayObject * add_result = inc_sub_iadd_%(nodename)s(zview, %(x)s);
......@@ -357,7 +359,7 @@ class GpuIncSubtensor(IncSubtensor):
elemwise_version = self.iadd_node.c_code_cache_version()
if not parent_version or not elemwise_version:
return
return parent_version + elemwise_version + (1,)
return parent_version + elemwise_version + (2,)
class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1):
......@@ -391,6 +393,9 @@ class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1):
return gof.Apply(self, [x_, y_, ilist_], [x_.type()])
def getInplElemwiseAdditionKernel(self, a, b):
if a.dtype == 'float16' or b.dtype == 'float16':
raise NotImplementedError('float16 is not supported by pygpu '
'elemwise')
a_arg = pygpu.tools.as_argument(a, 'a')
b_arg = pygpu.tools.as_argument(b, 'b')
args = [a_arg, b_arg]
......@@ -452,10 +457,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
"""Implement AdvancedIncSubtensor1 on the gpu, but use function
only avail on compute capability 2.0 and more recent.
"""
def __init__(self, inplace=False, set_instead_of_inc=False):
# The python implementation in the parent class is not applicable here
GpuAdvancedIncSubtensor1.__init__(self, inplace, set_instead_of_inc)
_f16_ok = True
def make_node(self, x, y, ilist):
"""It defer from GpuAdvancedIncSubtensor1 in that it make sure
......@@ -542,6 +544,30 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
itemsize_out = numpy.dtype(dtype_out).itemsize
return """
/*
* This is a version of atomicAdd that works for half-floats. It may
* read and write 2 bytes more than the size of the array if the array
* has an uneven number of elements. The actual value at that spot
* will not be modified.
*/
__device__ npy_float16 atomicAdd(npy_float16 *addr, npy_float16 val) {
npy_uint32 *base = (npy_uint32 *)((size_t)addr & ~2);
npy_uint32 old, assumed, sum, new_;
old = *base;
do {
assumed = old;
sum = __float2half_rn(
__half2float(val) +
__half2float((npy_float16)__byte_perm(old, 0,
((size_t)addr & 2) ? 0x4432 : 0x4410)));
new_ = __byte_perm(old, sum, ((size_t)addr & 2) ? 0x5410 : 0x3254);
old = atomicCAS(base, assumed, new_);
} while (assumed != old);
return (npy_float16)__byte_perm(old, 0,
((size_t)addr & 2) ? 0x4432 : 0x4410);
}
__global__ void k_vector_add_fast(int numRowsX,
int numColsX,
int stridesX0,
......
......@@ -136,6 +136,12 @@ class GpuArrayType(Type):
raise NotImplementedError(
"GpuArrayType.values_eq_approx() don't implemented the"
" allow_remove_inf and allow_remove_nan parameter")
if a.dtype == 'float16' or b.dtype == 'float16':
an = numpy.asarray(a)
bn = numpy.asarray(b)
return tensor.TensorType.values_eq_approx(
an, bn, allow_remove_inf=allow_remove_inf,
allow_remove_nan=allow_remove_nan, rtol=rtol, atol=atol)
narrow = 'float32', 'complex64'
if (str(a.dtype) in narrow) or (str(b.dtype) in narrow):
atol_ = theano.tensor.basic.float32_atol
......@@ -153,6 +159,13 @@ class GpuArrayType(Type):
locals())
return numpy.asarray(res).all()
@staticmethod
def may_share_memory(a, b):
if (not isinstance(a, gpuarray.GpuArray) or
not isinstance(b, gpuarray.GpuArray)):
return False
return pygpu.gpuarray.may_share_memory(a, b)
def value_zeros(self, shape):
return pygpu.gpuarray.zeros(shape, dtype=self.typecode)
......
......@@ -28,6 +28,7 @@ if cuda_available:
from theano.sandbox.gpuarray.basic_ops import GpuKernelBase, Kernel
from theano.sandbox.gpuarray.type import GpuArrayType
from theano.sandbox.gpuarray.fp16_help import write_w
def matVecModM(A, s, m):
......@@ -340,15 +341,6 @@ class mrg_uniform(mrg_uniform_base):
def perform(self, node, inp, out):
rstate, size = inp
o_rstate, o_sample = out
numpy_version = numpy.__version__.split('.')
if (not self.warned_numpy_version and
int(numpy_version[0]) <= 1 and
int(numpy_version[1]) < 3):
print("Warning: you must use numpy version 1.3.0 or higher with the python version of this op. Otherwise numpy leak memory. and numpy")
self.warned_numpy_version = True
n_elements = 1
rstate = numpy.asarray(rstate) # bring state from GPU if necessary
......@@ -377,6 +369,10 @@ class mrg_uniform(mrg_uniform_base):
def c_code(self, node, name, inp, out, sub):
rstate, size = inp
# If we try to use the C code here with something else than a
# TensorType, something is wrong (likely one of the GPU ops
# not defining C code correctly).
assert isinstance(node.inputs[0].type, TensorType)
o_rstate, o_sample = out
if self.inplace:
o_rstate_requirement = 'NPY_ARRAY_C_CONTIGUOUS|NPY_ARRAY_ALIGNED'
......@@ -777,6 +773,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
# GpuArray version
_f16_ok = True
@classmethod
def new(cls, rstate, ndim, dtype, size):
......@@ -790,14 +787,27 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
return super(GPUA_mrg_uniform, self).c_headers() + ['numpy_compat.h']
def gpu_kernels(self, node, name):
if self.output_type.dtype == 'float32':
write = write_w(self.output_type.dtype)
if self.output_type.dtype == 'float16':
otype = 'ga_half'
# limit the values of the state that we use.
mask = '& 0x7fff'
NORM = '3.0518e-05f' # numpy.float16(1.0/(2**15+8))
# this was determined by finding the biggest number such that
# numpy.float16(number * (M1 & 0x7fff)) < 1.0
elif self.output_type.dtype == 'float32':
otype = 'float'
mask = ''
NORM = '4.6566126e-10f' # numpy.float32(1.0/(2**31+65))
# this was determined by finding the biggest number such that
# numpy.float32(number * M1) < 1.0
else:
elif self.output_type.dtype == 'float64':
otype = 'double'
mask = ''
NORM = '4.656612873077392578125e-10'
else:
raise ValueError('Unsupported data type for output',
self.output_type.dtype)
code = """
KERNEL void mrg_uniform(
GLOBAL_MEM %(otype)s *sample_data,
......@@ -860,11 +870,11 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
x21 = y2;
if (x11 <= x21) {
sample_data[i] = (x11 - x21 + M1) * %(NORM)s;
sample_data[i] = %(write)s(((x11 - x21 + M1) %(mask)s) * %(NORM)s);
}
else
{
sample_data[i] = (x11 - x21) * %(NORM)s;
sample_data[i] = %(write)s(((x11 - x21) %(mask)s) * %(NORM)s);
}
}
......@@ -896,17 +906,9 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
o_type_num = numpy.asarray(0, dtype=self.output_type.dtype).dtype.num
fail = sub['fail']
kname = self.gpu_kernels(node, nodename)[0].objvar
if self.output_type.dtype == 'float32':
otype = 'float'
otypecode = 'GA_FLOAT'
else:
otype = 'double'
otypecode = 'GA_DOUBLE'
otypecode = str(self.output_type.typecode)
return """
//////// <code generated by mrg_uniform>
size_t odims[%(ndim)s];
unsigned int n_elements = 1;
unsigned int n_streams;
......@@ -1003,12 +1005,10 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
%(fail)s
}
}
//////// </ code generated by mrg_uniform>
""" % locals()
def c_code_cache_version(self):
return (3, self.GpuKernelBase_version)
return (6, self.GpuKernelBase_version)
def guess_n_streams(size, warn=False):
......
......@@ -3842,6 +3842,7 @@ class Reshape(Op):
The number of dimensions to reshape to (ndim) must be
known at graph build time.
view_map = {0: [0]} # output 0 is potentially aliased to inputs [0]
_f16_ok = True
check_input = False
......
......@@ -58,16 +58,21 @@ class ScalarSigmoid(scalar.UnaryScalarOp):
# We add boundary checks to prevent exp from generating inf or
# 0. The rest of the logic always generates 0 or 1 in those
# cases. This is a speed optimization.
# The constants were obtained by looking at the output of python commands like:
"""
import numpy, theano
dt='float32' # or float64
for i in xrange(750):
print i, repr(theano._asarray(1.0, dtype=dt) /
(theano._asarray(1.0, dtype=dt) +
numpy.exp(-theano._asarray([i,-i], dtype=dt))))
"""
if node.inputs[0].type == scalar.float32:
# The constants were obtained by looking at the output of
# python commands like:
#
# import numpy, theano
# dt='float32' # or float64
# for i in xrange(750):
# print i, repr(theano._asarray(1.0, dtype=dt) /
# (theano._asarray(1.0, dtype=dt) +
# numpy.exp(-theano._asarray([i,-i], dtype=dt))))
# float16 limits: -11.0, 7.0f
# We use the float32 limits for float16 for now as the
# computation will happen in float32 anyway.
if (node.inputs[0].type == scalar.float32 or
node.inputs[0].type == scalar.float16):
return """%(z)s = %(x)s < -88.0f ? 0.0 : %(x)s > 15.0f ? 1.0f : 1.0f /(1.0f + exp(-%(x)s));""" % locals()
elif node.inputs[0].type == scalar.float64:
return """%(z)s = %(x)s < -709.0 ? 0.0 : %(x)s > 19.0 ? 1.0 : 1.0 /(1.0+exp(-%(x)s));""" % locals()
......@@ -327,11 +332,17 @@ class ScalarSoftplus(scalar.UnaryScalarOp):
def c_code(self, node, name, inp, out, sub):
x, = inp
z, = out
if node.inputs[0].type == scalar.float32:
# These constants were obtained by looking at the output of python commands like:
# for i in xrange(750):
# print i, repr( numpy.log1p(numpy.exp(theano._asarray([i,-i], dtype=dt))))
# the boundary checks prevent us from generating inf
# These constants were obtained by looking at the output of
# python commands like:
# for i in xrange(750):
# print i, repr(numpy.log1p(numpy.exp(theano._asarray([i,-i], dtype=dt))))
# the boundary checks prevent us from generating inf
# float16 limits: -17.0, 6.0
# We use the float32 limits for float16 for now as the
# computation will happen in float32 anyway.
if (node.inputs[0].type == scalar.float32 or
node.inputs[0].type == scalar.float16):
return """%(z)s = %(x)s < -103.0f ? 0.0 : %(x)s > 14.0f ? %(x)s : log1p(exp(%(x)s));""" % locals()
elif node.inputs[0].type == scalar.float64:
return """%(z)s = %(x)s < -745.0 ? 0.0 : %(x)s > 16.0 ? %(x)s : log1p(exp(%(x)s));""" % locals()
......
......@@ -5151,7 +5151,8 @@ def local_log_erfc(node):
T.log(1 - 1 / (2 * x ** 2) + 3 / (4 * x ** 4)
- 15 / (8 * x ** 6)))
if node.outputs[0].dtype == 'float32':
if (node.outputs[0].dtype == 'float32' or
node.outputs[0].dtype == 'float16'):
threshold = 10.0541949
elif node.outputs[0].dtype == 'float64':
threshold = 26.641747557
......@@ -5298,7 +5299,7 @@ def local_grad_log_erfc_neg(node):
3 / (4 * (x ** 4)) - 15 / (8 * (x ** 6)), -1)
* T.cast(T.sqrt(numpy.pi), dtype=x.dtype))
if x.dtype == 'float32':
if x.dtype == 'float32' or x.dtype == 'float16':
threshold = 9.3
#threshold = 10.1
elif x.dtype == 'float64':
......
......@@ -291,6 +291,7 @@ class Subtensor(Op):
debug = 0
check_input = False
view_map = {0: [0]}
_f16_ok = True
@staticmethod
def collapse(idxs, cond):
......@@ -328,7 +329,7 @@ class Subtensor(Op):
TODO: WRITEME: This method also accepts "entry" already being a Type;
when would that happen?
"""
invalid_scal_types = [scal.float64, scal.float32]
invalid_scal_types = [scal.float64, scal.float32, scal.float16]
scal_types = [scal.int64, scal.int32, scal.int16, scal.int8]
tensor_types = [theano.tensor.lscalar, theano.tensor.iscalar,
theano.tensor.wscalar, theano.tensor.bscalar]
......@@ -1603,6 +1604,7 @@ class AdvancedSubtensor1(Op):
# sparse_grad doesn't go in here since it only affects the output
# of the grad() method.
__props__ = ()
_f16_ok = True
def __init__(self, sparse_grad=False):
self.sparse_grad = sparse_grad
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论