提交 59a5dfbb authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #4556 from abergeron/faster_incsub

Don't rebuild inplace add kernels all the time for GpuIncSubtensor.
...@@ -50,6 +50,15 @@ def init_dev(dev, name=None): ...@@ -50,6 +50,15 @@ def init_dev(dev, name=None):
if v[1] < 0: if v[1] < 0:
raise RuntimeError("Wrong minor API version for gpuarray:", v[1], raise RuntimeError("Wrong minor API version for gpuarray:", v[1],
"Please update libgpuarray/pygpu.") "Please update libgpuarray/pygpu.")
if len(v) < 3:
vpy = -1
else:
vpy = v[2]
vpye = 0
if vpy < vpye:
print("Wrong python API version for gpuarray:", vpy, "expected:", vpye,
"Some python ops may not work correctly and/or crash. "
"Consider updating pygpu.", file=sys.stderr)
global pygpu_activated global pygpu_activated
if dev not in init_dev.devmap: if dev not in init_dev.devmap:
ctx = pygpu.init(dev, ctx = pygpu.init(dev,
......
...@@ -56,3 +56,32 @@ def test_advinc_subtensor1(): ...@@ -56,3 +56,32 @@ def test_advinc_subtensor1():
rep = xval.copy() rep = xval.copy()
rep[[0, 2]] += yval rep[[0, 2]] += yval
assert numpy.allclose(rval, rep) assert numpy.allclose(rval, rep)
def test_incsub_f16():
    """Check that both inc_subtensor variants work on float16 data.

    Builds a float16 shared matrix, applies an advanced (fancy-indexed)
    increment and a basic slice increment, verifies the corresponding GPU
    op appears exactly once in each compiled graph, and compares results
    against the equivalent numpy in-place adds.
    """
    shape = (3, 3)
    shared = gpuarray_shared_constructor
    # Values 1..9 laid out as a 3x3 float16 matrix.
    x_data = numpy.arange(numpy.prod(shape), dtype='float16').reshape(shape) + 1
    # Two rows of constant 2s, matching x's trailing dimensions.
    y_data = numpy.full((2,) + shape[1:], 2, dtype='float16')
    x = shared(x_data, name='x')
    y = tensor.tensor(dtype='float16',
                      broadcastable=(False,) * len(shape),
                      name='y')

    def count_ops(fn, op_class):
        # Number of nodes of the given op class in fn's compiled graph.
        return sum(isinstance(node.op, op_class)
                   for node in fn.maker.fgraph.toposort())

    # Advanced increment: x[[0, 2]] += y.
    graph = tensor.advanced_inc_subtensor1(x, y, [0, 2])
    fn = theano.function([y], graph, mode=mode_with_gpu)
    assert count_ops(fn, GpuAdvancedIncSubtensor1) == 1
    result = fn(y_data)
    expected = x_data.copy()
    expected[[0, 2]] += y_data
    assert numpy.allclose(result, expected)

    # Basic slice increment: x[1:] += y.
    graph = tensor.inc_subtensor(x[1:], y)
    fn = theano.function([y], graph, mode=mode_with_gpu)
    assert count_ops(fn, GpuIncSubtensor) == 1
    result = fn(y_data)
    expected = x_data.copy()
    expected[1:] += y_data
    assert numpy.allclose(result, expected)
...@@ -301,20 +301,14 @@ class GpuArrayType(Type): ...@@ -301,20 +301,14 @@ class GpuArrayType(Type):
raise NotImplementedError( raise NotImplementedError(
"GpuArrayType.values_eq_approx() don't implemented the" "GpuArrayType.values_eq_approx() don't implemented the"
" allow_remove_inf and allow_remove_nan parameter") " allow_remove_inf and allow_remove_nan parameter")
if a.dtype == 'float16' or b.dtype == 'float16':
an = numpy.asarray(a)
bn = numpy.asarray(b)
return tensor.TensorType.values_eq_approx(
an, bn, allow_remove_inf=allow_remove_inf,
allow_remove_nan=allow_remove_nan, rtol=rtol, atol=atol)
atol_, rtol_ = theano.tensor.basic._get_atol_rtol(a, b) atol_, rtol_ = theano.tensor.basic._get_atol_rtol(a, b)
if rtol is not None: if rtol is not None:
rtol_ = rtol rtol_ = rtol
if atol is not None: if atol is not None:
atol_ = atol atol_ = atol
res = elemwise2(a, '', b, a, odtype=numpy.dtype('bool'), res = elemwise2(a, '', b, a, odtype=numpy.dtype('bool'),
op_tmpl="res[i] = (fabs(%%(a)s - %%(b)s) <" op_tmpl="res = (fabs(a - b) <"
"(%(atol_)s + %(rtol_)s * fabs(%%(b)s)))" % "(%(atol_)s + %(rtol_)s * fabs(b)))" %
locals()) locals())
ret = numpy.asarray(res).all() ret = numpy.asarray(res).all()
if ret: if ret:
......
Markdown 格式
0%
您即将添加 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论