提交 4a8fed96 authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #5923 from nouiz/float16

Don't move ops that don't support float16 to the GPU
......@@ -50,6 +50,7 @@ class ViewOp(gof.Op):
# the output variable is %(oname)s.
c_code_and_version = {}
__props__ = ()
_f16_ok = True
def make_node(self, x):
return gof.Apply(self, [x], [x.type()])
......@@ -151,6 +152,7 @@ class DeepCopyOp(gof.Op):
check_input = False
__props__ = ()
_f16_ok = True
def __init__(self):
pass
......@@ -659,6 +661,7 @@ class Rebroadcast(gof.Op):
check_input = False
__props__ = ("axis",)
_f16_ok = True
def __init__(self, *axis):
# Sort them to make sure we merge all possible case.
......@@ -820,6 +823,7 @@ class SpecifyShape(gof.Op):
# the output variable is %(oname)s.
c_code_and_version = {}
__props__ = ()
_f16_ok = True
def make_node(self, x, shape):
if not isinstance(x, gof.Variable):
......
......@@ -14,189 +14,6 @@ if os.path.exists(os.path.join(config.compiledir, 'cutils_ext.so')):
os.remove(os.path.join(config.compiledir, 'cutils_ext.so'))
def compile_cutils_code():
    """Return the C source implementing the ``inplace_increment`` helper
    of ``cutils_ext``.

    For every numeric/complex numpy dtype that may exist on this platform
    (each kernel is guarded by ``#if defined(NPY_...)`` since the unusual
    sizes such as int128 or float80 are platform-dependent), the generated
    code defines an in-place add/set loop over a ``PyArrayMapIterObject``,
    builds dispatch tables mapping numpy type numbers to those kernels,
    and defines the ``inplace_increment`` entry point that parses
    ``(a, index, inc[, inc_or_set])`` and applies the matching kernel.
    The whole block is compiled only when
    ``NPY_API_VERSION >= 0x00000008`` (numpy >= 1.8), which provides
    ``PyArray_MapIterArray``.

    Returns
    -------
    str
        C source code to be appended to the ``cutils_ext`` module source.
    """
    # Candidate scalar dtypes; guarded by #if defined because only a
    # subset exists on any given platform.
    types = ['npy_' + t for t in ['int8', 'int16', 'int32', 'int64', 'int128',
                                  'int256', 'uint8', 'uint16', 'uint32',
                                  'uint64', 'uint128', 'uint256',
                                  'float16', 'float32', 'float64',
                                  'float80', 'float96', 'float128',
                                  'float256']]
    complex_types = ['npy_' + t for t in ['complex32', 'complex64',
                                          'complex128', 'complex160',
                                          'complex192', 'complex512']]

    # One kernel per dtype: walk the map iterator and either accumulate
    # into (inc_or_set != 0) or overwrite (inc_or_set == 0) each selected
    # element.
    inplace_map_template = """
    #if defined(%(typen)s)
    static void %(type)s_inplace_add(PyArrayMapIterObject *mit,
                                     PyArrayIterObject *it, int inc_or_set)
    {
        int index = mit->size;
        while (index--) {
            %(op)s
            PyArray_MapIterNext(mit);
            PyArray_ITER_NEXT(it);
        }
    }
    #endif
    """

    # Scalar add/set for real dtypes is a single statement ...
    floatadd = ("((%(type)s*)mit->dataptr)[0] = "
                "(inc_or_set ? ((%(type)s*)mit->dataptr)[0] : 0)"
                " + ((%(type)s*)it->dataptr)[0];")
    # ... while complex dtypes update the real and imaginary parts
    # separately.
    complexadd = """
    ((%(type)s*)mit->dataptr)[0].real =
        (inc_or_set ? ((%(type)s*)mit->dataptr)[0].real : 0)
        + ((%(type)s*)it->dataptr)[0].real;
    ((%(type)s*)mit->dataptr)[0].imag =
        (inc_or_set ? ((%(type)s*)mit->dataptr)[0].imag : 0)
        + ((%(type)s*)it->dataptr)[0].imag;
    """

    fns = ''.join([inplace_map_template % {'type': t, 'typen': t.upper(),
                                           'op': floatadd % {'type': t}}
                   for t in types] +
                  [inplace_map_template % {'type': t, 'typen': t.upper(),
                                           'op': complexadd % {'type': t}}
                   for t in complex_types])

    def gen_binop(type, typen):
        # One guarded entry in the kernel dispatch table.
        return """
    #if defined(%(typen)s)
    %(type)s_inplace_add,
    #endif
    """ % dict(type=type, typen=typen)

    fn_array = ("static inplace_map_binop addition_funcs[] = {" +
                ''.join([gen_binop(type=t, typen=t.upper())
                         for t in types + complex_types]) + "NULL};\n")

    def gen_num(typen):
        # One guarded entry in the type-number dispatch table.  The
        # template only uses %(typen)s, so only that key is supplied
        # (the previous version also passed the builtin `type` by
        # accident).
        return """
    #if defined(%(typen)s)
    %(typen)s,
    #endif
    """ % dict(typen=typen)

    type_number_array = ("static int type_numbers[] = {" +
                         ''.join([gen_num(typen=t.upper())
                                  for t in types + complex_types]) + "-1000};")

    code = ("""
    #if NPY_API_VERSION >= 0x00000008
    typedef void (*inplace_map_binop)(PyArrayMapIterObject *,
                                      PyArrayIterObject *, int inc_or_set);
    """ + fns + fn_array + type_number_array + """
    static int
    map_increment(PyArrayMapIterObject *mit, PyObject *op,
                  inplace_map_binop add_inplace, int inc_or_set)
    {
        PyArrayObject *arr = NULL;
        PyArrayIterObject *it;
        PyArray_Descr *descr;
        if (mit->ait == NULL) {
            return -1;
        }
        descr = PyArray_DESCR(mit->ait->ao);
        Py_INCREF(descr);
        arr = (PyArrayObject *)PyArray_FromAny(op, descr,
                    0, 0, NPY_ARRAY_FORCECAST, NULL);
        if (arr == NULL) {
            return -1;
        }
        if ((mit->subspace != NULL) && (mit->consec)) {
            PyArray_MapIterSwapAxes(mit, (PyArrayObject **)&arr, 0);
            if (arr == NULL) {
                return -1;
            }
        }
        it = (PyArrayIterObject*)
                PyArray_BroadcastToShape((PyObject*)arr, mit->dimensions, mit->nd);
        if (it == NULL) {
            Py_DECREF(arr);
            return -1;
        }
        (*add_inplace)(mit, it, inc_or_set);
        Py_DECREF(arr);
        Py_DECREF(it);
        return 0;
    }

    static PyObject *
    inplace_increment(PyObject *dummy, PyObject *args)
    {
        PyObject *arg_a = NULL, *index=NULL, *inc=NULL;
        int inc_or_set = 1;
        PyArrayObject *a;
        inplace_map_binop add_inplace = NULL;
        int type_number = -1;
        int i = 0;
        PyArrayMapIterObject * mit;

        if (!PyArg_ParseTuple(args, "OOO|i", &arg_a, &index,
                &inc, &inc_or_set)) {
            return NULL;
        }
        if (!PyArray_Check(arg_a)) {
            PyErr_SetString(PyExc_ValueError,
                            "needs an ndarray as first argument");
            return NULL;
        }
        a = (PyArrayObject *) arg_a;
        if (PyArray_FailUnlessWriteable(a, "input/output array") < 0) {
            return NULL;
        }
        if (PyArray_NDIM(a) == 0) {
            PyErr_SetString(PyExc_IndexError, "0-d arrays can't be indexed.");
            return NULL;
        }
        type_number = PyArray_TYPE(a);
        while (type_numbers[i] >= 0 && addition_funcs[i] != NULL){
            if (type_number == type_numbers[i]) {
                add_inplace = addition_funcs[i];
                break;
            }
            i++ ;
        }
        if (add_inplace == NULL) {
            PyErr_SetString(PyExc_TypeError, "unsupported type for a");
            return NULL;
        }
        mit = (PyArrayMapIterObject *) PyArray_MapIterArray(a, index);
        if (mit == NULL) {
            goto fail;
        }
        if (map_increment(mit, inc, add_inplace, inc_or_set) != 0) {
            goto fail;
        }
        Py_DECREF(mit);
        Py_INCREF(Py_None);
        return Py_None;
    fail:
        Py_XDECREF(mit);
        return NULL;
    }
    #endif
    """)
    return code
def compile_cutils():
"""
Do just the compilation of cutils_ext.
......@@ -204,7 +21,6 @@ def compile_cutils():
"""
code = ("""
#include <Python.h>
#include "numpy/arrayobject.h"
#include "theano_mod_helper.h"
extern "C"{
......@@ -226,18 +42,10 @@ def compile_cutils():
int failure = fn(it);
return Py_BuildValue("i", failure);
}""")
code += compile_cutils_code()
code += ("""static PyMethodDef CutilsExtMethods[] = {
}
static PyMethodDef CutilsExtMethods[] = {
{"run_cthunk", run_cthunk, METH_VARARGS|METH_KEYWORDS,
"Run a theano cthunk."},
#if NPY_API_VERSION >= 0x00000008
{"inplace_increment", inplace_increment,
METH_VARARGS,
"increments a numpy array inplace at the passed indexes."},
#endif
{NULL, NULL, 0, NULL} /* Sentinel */
};""")
if PY3:
......@@ -256,7 +64,6 @@ def compile_cutils():
PyMODINIT_FUNC
PyInit_cutils_ext(void) {
import_array();
return PyModule_Create(&moduledef);
}
}
......@@ -266,7 +73,6 @@ def compile_cutils():
PyMODINIT_FUNC
initcutils_ext(void)
{
import_array();
(void) Py_InitModule("cutils_ext", CutilsExtMethods);
}
} //extern C
......
......@@ -261,7 +261,7 @@ class GpuCholesky(Op):
raise RuntimeError('CUSOLVER is not available and '
'GpuCholesky Op can not be constructed.')
if skcuda.__version__ <= '0.5.1':
warnings.warn('The GpuSolve op requires scikit-cuda > 0.5.1 to work with CUDA 8')
warnings.warn('The GpuCholesky op requires scikit-cuda > 0.5.1 to work with CUDA 8')
if not pygpu_available:
raise RuntimeError('Missing pygpu or triu/tril functions.'
'Install or update libgpuarray.')
......@@ -382,6 +382,7 @@ class GpuMagmaSVD(COp):
A = as_gpuarray_variable(A, ctx_name)
if A.ndim != 2:
raise LinAlgError("Matrix rank error")
assert A.dtype == 'float32'
if self.compute_uv:
return theano.Apply(self, [A],
[A.type(),
......@@ -476,6 +477,7 @@ class GpuMagmaMatrixInverse(COp):
def make_node(self, x):
ctx_name = infer_context_name(x)
x = as_gpuarray_variable(x, ctx_name)
assert x.dtype == 'float32'
if x.ndim != 2:
raise LinAlgError("Matrix rank error")
return theano.Apply(self, [x], [x.type()])
......
......@@ -1181,6 +1181,14 @@ def local_gpua_careduce(op, context_name, inputs, outputs):
@op_lifter([tensor.blas.Gemv, tensor.blas_c.CGemv])
@register_opt2([tensor.blas.Gemv], 'fast_compile')
def local_gpua_gemv(op, context_name, inputs, outputs):
if inputs[0].dtype == 'float16':
# Use gemm implementation as cublas gemv don't support float16
return gpugemm_no_inplace(inputs[0][:, None],
inputs[1],
inputs[2],
inputs[3][:, None],
inputs[4]).dimshuffle(0)
if inputs[0].dtype not in ['float32', 'float64']:
return
if op.inplace:
......@@ -1351,6 +1359,8 @@ theano.tensor.nnet.conv2d()
@op_lifter([SparseBlockGemv])
@register_opt2([SparseBlockGemv], 'fast_compile')
def local_gpua_sparseblockgemv(op, context_name, inputs, outputs):
if inputs[0].dtype == 'float16':
return
if op.inplace:
return gpu_sparse_block_gemv_inplace
else:
......@@ -1361,6 +1371,8 @@ def local_gpua_sparseblockgemv(op, context_name, inputs, outputs):
@op_lifter([SparseBlockOuter])
@register_opt2([SparseBlockOuter], 'fast_compile')
def local_gpua_sparseblockouter(op, context_name, inputs, outputs):
if inputs[0].dtype == 'float16':
return
if op.inplace:
return gpu_sparse_block_outer_inplace
else:
......@@ -1998,7 +2010,13 @@ def _scan_type_infer(node):
@op_lifter([tensor.MaxAndArgmax])
@register_opt2([tensor.MaxAndArgmax], 'fast_compile')
def local_gpu_maxandargmax(op, context_name, inputs, outputs):
return GpuMaxAndArgmax(op.get_params(None))
op = GpuMaxAndArgmax(op.get_params(None))
if inputs[0].dtype == "float16":
# For now it is better to copy/cast on the GPU then transfer to the CPU
casted_inputs = inputs[0].astype('float32')
ret = op(casted_inputs)
return [ret[0].astype('float16'), ret[1]]
return op
# solve
......@@ -2008,9 +2026,15 @@ def local_gpu_maxandargmax(op, context_name, inputs, outputs):
def local_gpu_solve(op, context_name, inputs, outputs):
if not cusolver_available:
return
if inputs[0].dtype not in ['float16', 'float32']:
return
if op.A_structure not in MATRIX_STRUCTURES_SOLVE:
return
return GpuCusolverSolve(A_structure=op.A_structure)
op = GpuCusolverSolve(A_structure=op.A_structure)
if inputs[0].dtype == 'float16':
return op(inputs[0].astype('float32'),
inputs[1].astype('float32')).astype('float16')
return op
@register_inplace()
......@@ -2028,7 +2052,13 @@ def local_inplace_gpu_solve(node):
def local_gpu_cholesky(op, context_name, inputs, outputs):
if not cusolver_available:
return
return GpuCholesky(lower=op.lower, inplace=op.destructive)
if inputs[0].dtype not in ['float16', 'float32']:
return
op = GpuCholesky(lower=op.lower, inplace=op.destructive)
if inputs[0].dtype == 'float16':
return op(inputs[0].astype('float32')).astype('float16')
return op
@register_inplace()
......@@ -2044,7 +2074,12 @@ def local_inplace_cholesky(node):
def local_gpu_matrix_inverse(op, context_name, inputs, outputs):
if not config.magma.enabled:
return
return GpuMagmaMatrixInverse()
if inputs[0].dtype not in ['float16', 'float32']:
return
op = GpuMagmaMatrixInverse()
if inputs[0].dtype == 'float16':
return op(inputs[0].astype('float32')).astype('float16')
return op
@register_inplace()
......@@ -2061,9 +2096,13 @@ def local_inplace_matrix_inverse_inplace(node):
def local_gpu_svd(op, context_name, inputs, outputs):
if not config.magma.enabled:
return
return GpuMagmaSVD(full_matrices=op.full_matrices,
compute_uv=op.compute_uv)
if inputs[0].dtype not in ['float16', 'float32']:
return
op = GpuMagmaSVD(full_matrices=op.full_matrices,
compute_uv=op.compute_uv)
if inputs[0].dtype == 'float16':
return op(inputs[0].astype('float32')).astype('float16')
return op
# Do not register in fast_run or fast_compile.
# It will be added to fast_run if the GPU is enabled.
......
......@@ -232,7 +232,7 @@ class GpuIncSubtensor(IncSubtensor):
if not self.set_instead_of_inc:
# sub_x += y
iadd = get_iadd(node.inputs[0], node.inputs[1])
iadd(sub_x, y, broadcast=False)
iadd(sub_x, y)
else:
# sub_x[...] = y
x.__setitem__(cdata, y)
......@@ -403,6 +403,8 @@ class GpuAdvancedSubtensor1(HideC, tensor.AdvancedSubtensor1):
"""
AdvancedSubtensor1 on the GPU.
"""
_f16_ok = True
def make_node(self, x, ilist):
ctx_name = infer_context_name(x, ilist)
x_ = as_gpuarray_variable(x, ctx_name)
......@@ -807,7 +809,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC,
"""
ctx_name = infer_context_name(x, y, ilist)
x_ = as_gpuarray_variable(x, ctx_name)
y_ = as_gpuarray_variable(y, ctx_name)
y_ = as_gpuarray_variable(y.astype(x.dtype), ctx_name)
ilist_ = as_gpuarray_variable(ilist, ctx_name)
assert x_.type.ndim >= y_.type.ndim
......@@ -1088,6 +1090,7 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) {
class GpuExtractDiag(Op):
__props__ = ("offset", "axis1", "axis2", "view")
_f16_ok = True
def __init__(self, offset=0, axis1=0, axis2=1, view=False):
self.view = view
......
......@@ -8,7 +8,7 @@ import theano
from theano import config
from theano import tensor
from theano.tests import unittest_tools as utt
from theano.tensor.blas import gemv_inplace, gemm_inplace, _dot22, batched_dot
from theano.tensor.blas import gemv, gemv_inplace, gemm_inplace, _dot22, batched_dot
from theano.tensor.tests.test_blas import TestGer, BaseGemv
from .. import gpuarray_shared_constructor
......@@ -18,7 +18,7 @@ from ..blas import (gpugemv_inplace, gpugemv_no_inplace,
gpugemm_inplace, gpugemm_no_inplace,
gpugemmbatch_no_inplace,
gpuger_inplace, gpuger_no_inplace,
GpuGer, gpu_dot22)
GpuGer, GpuGemm, gpu_dot22)
GpuGemvTester = makeTester(
......@@ -42,6 +42,22 @@ GpuGemvTester = makeTester(
def test_float16():
# gemv (gemm called)
float16_data = [rand(3).astype('float16'),
np.asarray(1, dtype=np.float32),
rand(3, 3).astype('float16'),
rand(3).astype('float16'),
np.asarray(0.5, dtype=np.float32)]
float16_shared = [gpuarray_shared_constructor(val, target=test_ctx_name)
for val in float16_data]
o = gemv(*float16_shared)
f = theano.function([], o, mode=mode_with_gpu)
y, alpha, A, x, beta = float16_data
out = f()
utt.assert_allclose(np.asarray(out), alpha * np.dot(A, x) + beta * y)
topo = f.maker.fgraph.toposort()
assert any([isinstance(n.op, GpuGemm) for n in topo])
# gemm
float16_data = [rand(3, 3).astype('float16'),
np.asarray(1, dtype=np.float32),
......
......@@ -49,6 +49,33 @@ class G_subtensor(test_subtensor.T_subtensor):
assert self.sub == GpuSubtensor
class G_subtensorF16(test_subtensor.T_subtensor):
    """Run the generic subtensor test suite against the GPU ops with
    float16 data."""

    def shortDescription(self):
        # Return None so the test runner prints the test name rather
        # than a docstring.
        return None

    def __init__(self, name):
        def shared(x, **kwargs):
            # Shared constructor pinned to the test GPU context.
            return gpuarray_shared_constructor(x, target=test_ctx_name,
                                               **kwargs)
        test_subtensor.T_subtensor.__init__(
            self, name,
            shared=shared,
            sub=GpuSubtensor,
            inc_sub=GpuIncSubtensor,
            adv_sub1=GpuAdvancedSubtensor1,
            adv_incsub1=GpuAdvancedIncSubtensor1,
            dimshuffle=GpuDimShuffle,
            mode=mode_with_gpu,
            # avoid errors with limited devices
            dtype='float16',  # use floatX?
            ignore_topo=(HostFromGpu, GpuFromHost,
                         DeepCopyOp, GpuContiguous))
        # GPU opt can't run in fast_compile only.
        self.fast_compile = False
        assert self.sub == GpuSubtensor
def test_advinc_subtensor1():
# Test the second case in the opt local_gpu_advanced_incsubtensor1
for shp in [(3, 3), (3, 3, 3)]:
......@@ -73,7 +100,9 @@ def test_advinc_subtensor1():
def test_advinc_subtensor1_dtype():
# Test the mixed dtype case
shp = (3, 4)
for dtype1, dtype2 in [('float32', 'int8'), ('float32', 'float64')]:
for dtype1, dtype2 in [('float32', 'int8'), ('float32', 'float64'),
('float16', 'int8'), ('float16', 'float64'),
('float16', 'float16')]:
shared = gpuarray_shared_constructor
xval = np.arange(np.prod(shp), dtype=dtype1).reshape(shp) + 1
yval = np.empty((2,) + shp[1:], dtype=dtype2)
......@@ -95,7 +124,9 @@ def test_advinc_subtensor1_dtype():
def test_advinc_subtensor1_vector_scalar():
# Test the case where x is a vector and y a scalar
shp = (3,)
for dtype1, dtype2 in [('float32', 'int8'), ('float32', 'float64')]:
for dtype1, dtype2 in [('float32', 'int8'), ('float32', 'float64'),
('float16', 'int8'), ('float16', 'float64'),
('float16', 'float16')]:
shared = gpuarray_shared_constructor
xval = np.arange(np.prod(shp), dtype=dtype1).reshape(shp) + 1
yval = np.asarray(10, dtype=dtype2)
......@@ -105,7 +136,8 @@ def test_advinc_subtensor1_vector_scalar():
name='y')
expr = tensor.advanced_inc_subtensor1(x, y, [0, 2])
f = theano.function([y], expr, mode=mode_with_gpu)
assert sum([isinstance(node.op, GpuAdvancedIncSubtensor1_dev20)
assert sum([isinstance(node.op, (GpuAdvancedIncSubtensor1_dev20,
GpuAdvancedIncSubtensor1))
for node in f.maker.fgraph.toposort()]) == 1
rval = f(yval)
rep = xval.copy()
......@@ -169,7 +201,26 @@ class G_advancedsubtensor(test_subtensor.TestAdvancedSubtensor):
sub=GpuAdvancedSubtensor,
mode=mode_with_gpu,
# avoid errors with limited devices
dtype='float32',
dtype='float32', # floatX?
ignore_topo=(HostFromGpu, GpuFromHost,
DeepCopyOp))
# GPU opt can't run in fast_compile only.
self.fast_compile = False
assert self.sub == GpuAdvancedSubtensor
class G_advancedsubtensorF16(test_subtensor.TestAdvancedSubtensor):
def shortDescription(self):
return None
def __init__(self, name):
test_subtensor.TestAdvancedSubtensor.__init__(
self, name,
shared=gpuarray_shared_constructor,
sub=GpuAdvancedSubtensor,
mode=mode_with_gpu,
# avoid errors with limited devices
dtype='float16', # floatX?
ignore_topo=(HostFromGpu, GpuFromHost,
DeepCopyOp))
# GPU opt can't run in fast_compile only.
......@@ -218,6 +269,17 @@ class test_gpuextractdiag(unittest.TestCase):
GpuExtractDiag(offset, axis1, axis2)(x).eval({x: np_x}),
np_x.diagonal(offset, axis1, axis2))
def test_tensor_float16(self):
    # Check diagonal extraction on a float16 4-d tensor against numpy's
    # diagonal for a variety of offset/axis combinations (negative
    # offsets and swapped axis pairs included).
    x = tensor.tensor4()
    np_x = np.arange(30107).reshape(7, 11, 17, 23).astype('float16')
    for offset, axis1, axis2 in [
            (1, 0, 1), (-1, 0, 1), (0, 1, 0), (-2, 1, 0),
            (-3, 1, 0), (-2, 2, 0), (3, 3, 0), (-1, 3, 2),
            (2, 2, 3), (-1, 2, 1), (1, 3, 1), (-1, 1, 3)]:
        assert np.allclose(
            GpuExtractDiag(offset, axis1, axis2)(x).eval({x: np_x}),
            np_x.diagonal(offset, axis1, axis2))
class test_gpuallocdiag(unittest.TestCase):
def test_matrix(self):
......
......@@ -7,7 +7,7 @@ import numpy as np
import theano
from theano.compat import PY3
from theano import config
from theano.compile import DeepCopyOp
from theano.compile import DeepCopyOp, Rebroadcast, ViewOp
from theano.misc.pkl_utils import CompatUnpickler
# Disabled for now
......@@ -21,16 +21,45 @@ import pygpu
def test_deep_copy():
a = rand_gpuarray(20, dtype='float32')
g = GpuArrayType(dtype='float32', broadcastable=(False,))('g')
for dtype in ['float16', 'float32']:
a = rand_gpuarray(20, dtype=dtype)
g = GpuArrayType(dtype=dtype, broadcastable=(False,))('g')
f = theano.function([g], g)
assert isinstance(f.maker.fgraph.toposort()[0].op, DeepCopyOp)
res = f(a)
assert GpuArrayType.values_eq(res, a)
def test_view():
for dtype in ['float16', 'float32']:
a = rand_gpuarray(20, dtype=dtype)
g = GpuArrayType(dtype=dtype, broadcastable=(False,))('g')
f = theano.function([g], g)
f = theano.function([g], ViewOp()(g))
assert isinstance(f.maker.fgraph.toposort()[0].op, DeepCopyOp)
assert isinstance(f.maker.fgraph.toposort()[0].op, ViewOp)
res = f(a)
res = f(a)
assert GpuArrayType.values_eq(res, a)
assert GpuArrayType.values_eq(res, a)
def test_rebroadcast():
    # Rebroadcast must run on GPU data for float16 as well as float32.
    for dtype in ['float16', 'float32']:
        a = rand_gpuarray(1, dtype=dtype)
        g = GpuArrayType(dtype=dtype, broadcastable=(False,))('g')
        f = theano.function([g], Rebroadcast((0, True))(g))
        # The Rebroadcast node must remain in the compiled graph.
        assert isinstance(f.maker.fgraph.toposort()[0].op, Rebroadcast)
        res = f(a)
        assert GpuArrayType.values_eq(res, a)
def test_values_eq_approx():
......@@ -45,10 +74,11 @@ def test_values_eq_approx():
def test_specify_shape():
a = rand_gpuarray(20, dtype='float32')
g = GpuArrayType(dtype='float32', broadcastable=(False,))('g')
f = theano.function([g], theano.tensor.specify_shape(g, [20]))
f(a)
for dtype in ['float16', 'float32']:
a = rand_gpuarray(20, dtype=dtype)
g = GpuArrayType(dtype=dtype, broadcastable=(False,))('g')
f = theano.function([g], theano.tensor.specify_shape(g, [20]))
f(a)
def test_filter_float():
......
......@@ -1482,7 +1482,11 @@ class numeric_grad(object):
The tuple (abs_err, rel_err) is returned
"""
abs_err = abs(a - b)
rel_err = abs_err / np.maximum(abs(a) + abs(b), 1e-8)
# 1e-8 is to prevent division by zero.
# [] makes sure that if a and b are float16, 1e-8 doesn't get
# downcasted to float16, as that gives 0! That would reintroduce the
# division by zero.
rel_err = abs_err / np.maximum(abs(a) + abs(b), [1e-8])
# The numpy.asarray are needed as if a or b is a sparse matrix
# this would result in a numpy.matrix and not a numpy.ndarray
# and the behave differently causing problem later.
......
from __future__ import absolute_import, print_function, division
def inc_code():
    """Return the C support code for ``AdvancedIncSubtensor1``.

    For every numeric/complex numpy dtype in a fixed, portable list, the
    generated code defines an in-place add/set loop over a
    ``PyArrayMapIterObject`` (each kernel guarded by
    ``#if defined(NPY_...)``), builds dispatch tables mapping numpy type
    numbers to those kernels, and defines a C-level
    ``inplace_increment(a, index, inc, inc_or_set)`` function returning
    0 on success and -1 (with a Python exception set) on failure.

    Returns
    -------
    str
        C source code suitable for ``Op.c_support_code``.
    """
    # Dtypes supported by this op; guarded by #if defined for safety.
    types = ['npy_' + t for t in ['int8', 'int16', 'int32', 'int64',
                                  'uint8', 'uint16', 'uint32', 'uint64',
                                  'float16', 'float32', 'float64']]
    complex_types = ['npy_' + t for t in ['complex32', 'complex64',
                                          'complex128']]

    # One kernel per dtype: walk the map iterator and either accumulate
    # into (inc_or_set != 0) or overwrite (inc_or_set == 0) each selected
    # element.
    inplace_map_template = """
    #if defined(%(typen)s)
    static void %(type)s_inplace_add(PyArrayMapIterObject *mit,
                                     PyArrayIterObject *it, int inc_or_set)
    {
        int index = mit->size;
        while (index--) {
            %(op)s
            PyArray_MapIterNext(mit);
            PyArray_ITER_NEXT(it);
        }
    }
    #endif
    """

    # Scalar add/set for real dtypes is a single statement ...
    floatadd = ("((%(type)s*)mit->dataptr)[0] = "
                "(inc_or_set ? ((%(type)s*)mit->dataptr)[0] : 0)"
                " + ((%(type)s*)it->dataptr)[0];")
    # ... while complex dtypes update the real and imaginary parts
    # separately.
    complexadd = """
    ((%(type)s*)mit->dataptr)[0].real =
        (inc_or_set ? ((%(type)s*)mit->dataptr)[0].real : 0)
        + ((%(type)s*)it->dataptr)[0].real;
    ((%(type)s*)mit->dataptr)[0].imag =
        (inc_or_set ? ((%(type)s*)mit->dataptr)[0].imag : 0)
        + ((%(type)s*)it->dataptr)[0].imag;
    """

    fns = ''.join([inplace_map_template % {'type': t, 'typen': t.upper(),
                                           'op': floatadd % {'type': t}}
                   for t in types] +
                  [inplace_map_template % {'type': t, 'typen': t.upper(),
                                           'op': complexadd % {'type': t}}
                   for t in complex_types])

    def gen_binop(type, typen):
        # One guarded entry in the kernel dispatch table.
        return """
    #if defined(%(typen)s)
    %(type)s_inplace_add,
    #endif
    """ % dict(type=type, typen=typen)

    fn_array = ("static inplace_map_binop addition_funcs[] = {" +
                ''.join([gen_binop(type=t, typen=t.upper())
                         for t in types + complex_types]) + "NULL};\n")

    def gen_num(typen):
        # One guarded entry in the type-number dispatch table.  The
        # template only uses %(typen)s, so only that key is supplied
        # (the previous version also passed the builtin `type` by
        # accident).
        return """
    #if defined(%(typen)s)
    %(typen)s,
    #endif
    """ % dict(typen=typen)

    type_number_array = ("static int type_numbers[] = {" +
                         ''.join([gen_num(typen=t.upper())
                                  for t in types + complex_types]) + "-1000};")

    code = ("""
    typedef void (*inplace_map_binop)(PyArrayMapIterObject *,
                                      PyArrayIterObject *, int inc_or_set);
    """ + fns + fn_array + type_number_array + """
    static int
    map_increment(PyArrayMapIterObject *mit, PyArrayObject *op,
                  inplace_map_binop add_inplace, int inc_or_set)
    {
        PyArrayObject *arr = NULL;
        PyArrayIterObject *it;
        PyArray_Descr *descr;
        if (mit->ait == NULL) {
            return -1;
        }
        descr = PyArray_DESCR(mit->ait->ao);
        Py_INCREF(descr);
        arr = (PyArrayObject *)PyArray_FromAny((PyObject *)op, descr,
                    0, 0, NPY_ARRAY_FORCECAST, NULL);
        if (arr == NULL) {
            return -1;
        }
        if ((mit->subspace != NULL) && (mit->consec)) {
            PyArray_MapIterSwapAxes(mit, (PyArrayObject **)&arr, 0);
            if (arr == NULL) {
                return -1;
            }
        }
        it = (PyArrayIterObject*)
                PyArray_BroadcastToShape((PyObject*)arr, mit->dimensions, mit->nd);
        if (it == NULL) {
            Py_DECREF(arr);
            return -1;
        }
        (*add_inplace)(mit, it, inc_or_set);
        Py_DECREF(arr);
        Py_DECREF(it);
        return 0;
    }

    static int
    inplace_increment(PyArrayObject *a, PyObject *index, PyArrayObject *inc,
                      int inc_or_set)
    {
        inplace_map_binop add_inplace = NULL;
        int type_number = -1;
        int i = 0;
        PyArrayMapIterObject * mit;

        if (PyArray_FailUnlessWriteable(a, "input/output array") < 0) {
            return -1;
        }
        if (PyArray_NDIM(a) == 0) {
            PyErr_SetString(PyExc_IndexError, "0-d arrays can't be indexed.");
            return -1;
        }
        type_number = PyArray_TYPE(a);
        while (type_numbers[i] >= 0 && addition_funcs[i] != NULL){
            if (type_number == type_numbers[i]) {
                add_inplace = addition_funcs[i];
                break;
            }
            i++ ;
        }
        if (add_inplace == NULL) {
            PyErr_SetString(PyExc_TypeError, "unsupported type for a");
            return -1;
        }
        mit = (PyArrayMapIterObject *) PyArray_MapIterArray(a, index);
        if (mit == NULL) {
            goto fail;
        }
        if (map_increment(mit, inc, add_inplace, inc_or_set) != 0) {
            goto fail;
        }
        Py_DECREF(mit);
        Py_INCREF(Py_None);
        return 0;
    fail:
        Py_XDECREF(mit);
        return -1;
    }
    """)
    return code
......@@ -22,9 +22,7 @@ from theano.tensor.elemwise import DimShuffle
from theano.tensor.type_other import NoneConst, SliceType, NoneTypeT, make_slice
from theano import config
if config.cxx:
import theano.gof.cutils # needed to import cutils_ext
from cutils_ext.cutils_ext import inplace_increment
from .inc_code import inc_code
_logger = logging.getLogger("theano.tensor.subtensor")
......@@ -1943,8 +1941,7 @@ class AdvancedIncSubtensor1(Op):
NPY_ARRAY_ENSURECOPY, NULL)""" % locals()
def c_support_code(self):
from theano.gof.cutils import compile_cutils_code
return compile_cutils_code()
return inc_code()
def c_code(self, node, name, input_names, output_names, sub):
numpy_ver = [int(n) for n in np.__version__.split('.')[:2]]
......@@ -1976,17 +1973,14 @@ class AdvancedIncSubtensor1(Op):
Py_XDECREF(%(out)s);
%(out)s = %(copy_of_x)s;
}
PyObject *arglist = Py_BuildValue("OOOi",%(out)s, %(idx)s, %(y)s, %(inc_or_set)d);
rval = inplace_increment(NULL, arglist);
Py_XDECREF(arglist);
if (rval == NULL) {
if (inplace_increment(%(out)s, (PyObject *)%(idx)s, %(y)s, %(inc_or_set)d)) {
%(fail)s;
}
Py_XDECREF(rval);
""" % locals()
def c_code_cache_version(self):
return (3,)
return (4,)
def perform(self, node, inp, out_):
# TODO opt to make this inplace
......@@ -2001,36 +1995,10 @@ class AdvancedIncSubtensor1(Op):
if self.set_instead_of_inc:
x[idx] = y
else:
if config.cxx and node.inputs[0].dtype != 'float16':
increment = inplace_increment
else:
increment = self.inplace_increment1d_slow
increment(x, idx, y)
np.add.at(x, idx, y)
out[0] = x
def inplace_increment1d_slow(self, x, idx, y):
# If `y` has as many dimensions as `x`, then we want to iterate
# jointly on `x` and `y`. Otherwise, it means `y` should be
# broadcasted to fill all relevant rows of `x`.
assert y.ndim <= x.ndim # Should be guaranteed by `make_node`
if y.ndim == x.ndim:
if len(y) == 1:
# Allow broadcasting of y[0]
y_0 = y[0]
for i in idx:
x[i] += y_0
else:
assert len(y) == len(idx)
j = 0
for i in idx:
x[i] += y[j]
j += 1
else:
for i in idx:
x[i] += y
def infer_shape(self, node, ishapes):
x, y, ilist = ishapes
return [x]
......@@ -2246,14 +2214,8 @@ class AdvancedIncSubtensor(Op):
if self.set_instead_of_inc:
out[0][inputs[2:]] = inputs[1]
elif config.cxx:
inplace_increment(out[0], tuple(inputs[2:]), inputs[1])
else:
raise NotImplementedError(
'Could not import inplace_increment, so advanced '
'indexing is disabled. '
'Please make sure that you have a working C++ compiler '
'and that config.cxx is correctly set.')
np.add.at(out[0], tuple(inputs[2:]), inputs[1])
def infer_shape(self, node, ishapes):
return [ishapes[0]]
......
......@@ -2936,10 +2936,7 @@ def test_local_IncSubtensor_serialize():
i = T.vector('i', dtype='int64')
j = T.vector('j', dtype='int64')
t = T.scalar('t')
if theano.tensor.subtensor.inplace_increment:
y = (W[i] + W[j] + W[1] + W[i, j]).sum()
else:
y = (W[i] + W[j] + W[1]).sum()
y = (W[i] + W[j] + W[1] + W[i, j]).sum()
cost = T.sqr(t - y)
dW = theano.grad(cost, W)
mode = theano.compile.mode.get_default_mode().excluding('fusion')
......
......@@ -5,7 +5,6 @@ import sys
import unittest
import numpy as np
from nose.plugins.skip import SkipTest
from nose.tools import assert_equal
from numpy.testing import assert_array_equal
from six import StringIO
......@@ -524,10 +523,11 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
gn = theano.grad(t.sum(), n)
g = self.function([], gn, op=self.adv_incsub1)
utt.verify_grad(lambda m: m[[1, 3]],
[np.random.rand(5, 5).astype(self.dtype)])
[np.random.rand(5, 5).astype(self.dtype)],
mode=self.mode)
g()
utt.verify_grad(lambda m: m[idx],
[data])
[data], mode=self.mode)
def test_noncontiguous_idx(self):
data = rand(4, 2, 3)
......@@ -597,17 +597,20 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
self.assertTrue(np.allclose(g_00, 2))
utt.verify_grad(lambda m: m[[1, 3]],
[np.random.rand(5, 5).astype(self.dtype)])
[np.random.rand(5, 5).astype(self.dtype)],
mode=self.mode)
def fun(x, y):
return advanced_inc_subtensor1(x, y, [1, 3])
utt.verify_grad(fun, [np.random.rand(5, 5).astype(self.dtype),
np.random.rand(2, 5).astype(self.dtype)])
np.random.rand(2, 5).astype(self.dtype)],
mode=self.mode)
def fun(x, y):
return advanced_set_subtensor1(x, y, [1, 3])
utt.verify_grad(fun, [np.random.rand(5, 5).astype(self.dtype),
np.random.rand(2, 5).astype(self.dtype)])
np.random.rand(2, 5).astype(self.dtype)],
mode=self.mode)
# test set_subtensor broadcast
self.dtype = 'float32'
......@@ -872,12 +875,12 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
def fct(t):
return theano.tensor.sum(t[idx_])
utt.verify_grad(fct, [data])
utt.verify_grad(fct, [data], mode=self.mode)
# Test the grad of the grad (e.i. AdvancedIncSubtensor1.grad)
def fct2(t):
return theano.tensor.grad(theano.tensor.sum(t[idx_]), t)
utt.verify_grad(fct2, [data])
utt.verify_grad(fct2, [data], mode=self.mode)
# Test shape of AdvancedIncSubtensor1 and AdvancedSubtensor1
if not self.fast_compile:
......@@ -958,7 +961,8 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
# vector
utt.verify_grad(
inc_slice(slice(2, 4, None)),
(np.asarray([0, 1, 2, 3, 4, 5.]), np.asarray([9, 9.]),))
(np.asarray([0, 1, 2, 3, 4, 5.]), np.asarray([9, 9.]),),
mode=self.mode)
# matrix
utt.verify_grad(
......@@ -1498,9 +1502,6 @@ class TestAdvancedSubtensor(unittest.TestCase):
utt.assert_allclose(rval, aval)
def test_inc_adv_subtensor_w_2vec(self):
if not config.cxx:
raise SkipTest('config.cxx empty')
subt = self.m[self.ix1, self.ix12]
a = inc_subtensor(subt, subt)
......@@ -1519,9 +1520,6 @@ class TestAdvancedSubtensor(unittest.TestCase):
[.5, .3 * 2, .15]]), aval
def test_inc_adv_subtensor_with_broadcasting(self):
if not config.cxx:
raise SkipTest('config.cxx empty')
inc = dscalar()
a = inc_subtensor(self.m[self.ix1, self.ix12], inc)
g_inc = tensor.grad(a.sum(), inc)
......@@ -1542,9 +1540,6 @@ class TestAdvancedSubtensor(unittest.TestCase):
assert np.allclose(gval, 3.0), gval
def test_inc_adv_subtensor1_with_broadcasting(self):
if not config.cxx:
raise SkipTest('config.cxx empty')
inc = dscalar()
a = inc_subtensor(self.m[self.ix1], inc)
g_inc = tensor.grad(a.sum(), inc)
......@@ -1564,9 +1559,6 @@ class TestAdvancedSubtensor(unittest.TestCase):
assert np.allclose(gval, 9.0), gval
def test_inc_adv_subtensor_with_index_broadcasting(self):
if not config.cxx:
raise SkipTest('config.cxx empty')
a = inc_subtensor(self.m[self.ix1, self.ix2], 2.1)
assert a.type == self.m.type, (a.type, self.m.type)
......@@ -1640,17 +1632,20 @@ class TestAdvancedSubtensor(unittest.TestCase):
self.assertTrue(isinstance(t.owner.op, tensor.AdvancedSubtensor))
utt.verify_grad(lambda m: m[[1, 3], [2, 4]],
[np.random.rand(5, 5).astype(self.dtype)])
[np.random.rand(5, 5).astype(self.dtype)],
mode=self.mode)
def fun(x, y):
return advanced_inc_subtensor(x, y, [1, 3], [2, 4])
utt.verify_grad(fun, [np.random.rand(5, 5).astype(self.dtype),
np.random.rand(2).astype(self.dtype)])
np.random.rand(2).astype(self.dtype)],
mode=self.mode)
def fun(x, y):
return advanced_set_subtensor(x, y, [1, 3], [2, 4])
utt.verify_grad(fun, [np.random.rand(5, 5).astype(self.dtype),
np.random.rand(2).astype(self.dtype)])
np.random.rand(2).astype(self.dtype)],
mode=self.mode)
class TestInferShape(utt.InferShapeTester):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论