提交 568af179 authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #5747 from abergeron/long_incsub

Add support for atomic{Exch,Add} on long longs.
...@@ -871,7 +871,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC, ...@@ -871,7 +871,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC,
return super(GpuAdvancedIncSubtensor1_dev20, self).perform(node, inp, out) return super(GpuAdvancedIncSubtensor1_dev20, self).perform(node, inp, out)
def c_code_cache_version(self): def c_code_cache_version(self):
return (13,) return (14,)
def c_headers(self): def c_headers(self):
return ['<numpy_compat.h>', '<gpuarray_helper.h>', return ['<numpy_compat.h>', '<gpuarray_helper.h>',
...@@ -924,9 +924,8 @@ if (GpuArray_vector_add_fast(%(out)s, %(y)s, %(ind)s, %(params)s->set_instead_of ...@@ -924,9 +924,8 @@ if (GpuArray_vector_add_fast(%(out)s, %(y)s, %(ind)s, %(params)s->set_instead_of
*/ */
#if __CUDA_ARCH__ < 600 #if __CUDA_ARCH__ < 600
__device__ ga_double atomicAdd(ga_double* address, ga_double val) { __device__ ga_double atomicAdd(ga_double* address, ga_double val) {
unsigned long long int* address_as_ull = ga_ulong *address_as_ull = (ga_ulong *)address;
(unsigned long long int*)address; ga_ulong old = *address_as_ull, assumed;
unsigned long long int old = *address_as_ull, assumed;
do { do {
assumed = old; assumed = old;
old = atomicCAS(address_as_ull, assumed, old = atomicCAS(address_as_ull, assumed,
...@@ -938,17 +937,35 @@ __device__ ga_double atomicAdd(ga_double* address, ga_double val) { ...@@ -938,17 +937,35 @@ __device__ ga_double atomicAdd(ga_double* address, ga_double val) {
#endif #endif
__device__ ga_double atomicExch(ga_double *address, ga_double val) { __device__ ga_double atomicExch(ga_double *address, ga_double val) {
return atomicExch((unsigned long long int *)address, return atomicExch((ga_ulong *)address,
__double_as_longlong(val)); __double_as_longlong(val));
} }
/* GA_LONG */
/*
 * 64-bit signed atomic add, emulated with a compare-and-swap loop on the
 * unsigned representation (hardware atomicCAS is only provided for
 * unsigned 64-bit words).  Returns the value held at *address before the
 * addition, matching the usual atomicAdd contract.  Signed wrap-around
 * follows two's-complement behaviour via the unsigned round-trip.
 */
__device__ ga_long atomicAdd(ga_long* address, ga_long val) {
  ga_ulong *target = (ga_ulong *)address;
  ga_ulong seen = *target;
  ga_ulong expected;
  do {
    expected = seen;
    /* Attempt to publish expected + val; atomicCAS hands back the word
       that was actually in memory, so on contention we retry with it. */
    seen = atomicCAS(target, expected,
                     (ga_ulong)(val + (ga_long)expected));
  } while (seen != expected);
  return (ga_long)seen;
}
/*
 * 64-bit signed atomic exchange: forwards to the hardware unsigned
 * long-long atomicExch through bit-preserving casts and returns the
 * previously stored value reinterpreted as signed.
 */
__device__ ga_long atomicExch(ga_long *address, ga_long val) {
  ga_ulong previous = atomicExch((ga_ulong *)address, (ga_ulong)val);
  return (ga_long)previous;
}
/* GA_HALF */
/* /*
* This is a version of atomicAdd that works for half-floats. It may * This may read and write 2 bytes more than the size of the array
* read and write 2 bytes more than the size of the array if the array * if the array has an uneven number of elements. The actual value
* has an uneven number of elements. The actual value at that spot * at that spot will not be modified.
* will not be modified.
*/ */
__device__ ga_half atomicAdd(ga_half *addr, ga_half val) { __device__ ga_half atomicAdd(ga_half *addr, ga_half val) {
ga_uint *base = (ga_uint *)((ga_size)addr & ~2); ga_uint *base = (ga_uint *)((ga_size)addr & ~2);
ga_uint old, assumed, sum, new_; ga_uint old, assumed, sum, new_;
......
...@@ -93,7 +93,7 @@ def test_advinc_subtensor1(): ...@@ -93,7 +93,7 @@ def test_advinc_subtensor1():
for node in f.maker.fgraph.toposort()]) == 1 for node in f.maker.fgraph.toposort()]) == 1
rval = f(yval) rval = f(yval)
rep = xval.copy() rep = xval.copy()
rep[[0, 2]] += yval np.add.at(rep, [0, 2], yval)
assert np.allclose(rval, rep) assert np.allclose(rval, rep)
...@@ -101,6 +101,7 @@ def test_advinc_subtensor1_dtype(): ...@@ -101,6 +101,7 @@ def test_advinc_subtensor1_dtype():
# Test the mixed dtype case # Test the mixed dtype case
shp = (3, 4) shp = (3, 4)
for dtype1, dtype2 in [('float32', 'int8'), ('float32', 'float64'), for dtype1, dtype2 in [('float32', 'int8'), ('float32', 'float64'),
('uint64', 'int8'), ('int64', 'uint8'),
('float16', 'int8'), ('float16', 'float64'), ('float16', 'int8'), ('float16', 'float64'),
('float16', 'float16')]: ('float16', 'float16')]:
shared = gpuarray_shared_constructor shared = gpuarray_shared_constructor
...@@ -117,7 +118,7 @@ def test_advinc_subtensor1_dtype(): ...@@ -117,7 +118,7 @@ def test_advinc_subtensor1_dtype():
for node in f.maker.fgraph.toposort()]) == 1 for node in f.maker.fgraph.toposort()]) == 1
rval = f(yval) rval = f(yval)
rep = xval.copy() rep = xval.copy()
rep[[0, 2]] += yval np.add.at(rep, [[0, 2]], yval)
assert np.allclose(rval, rep) assert np.allclose(rval, rep)
...@@ -139,7 +140,7 @@ def test_deterministic_flag(): ...@@ -139,7 +140,7 @@ def test_deterministic_flag():
for node in f.maker.fgraph.toposort()]) == 1 for node in f.maker.fgraph.toposort()]) == 1
rval = f(yval) rval = f(yval)
rep = xval.copy() rep = xval.copy()
rep[[0, 2]] += yval np.add.at(rep, [[0, 2]], yval)
assert np.allclose(rval, rep) assert np.allclose(rval, rep)
...@@ -183,7 +184,7 @@ def test_incsub_f16(): ...@@ -183,7 +184,7 @@ def test_incsub_f16():
for node in f.maker.fgraph.toposort()]) == 1 for node in f.maker.fgraph.toposort()]) == 1
rval = f(yval) rval = f(yval)
rep = xval.copy() rep = xval.copy()
rep[[0, 2]] += yval np.add.at(rep, [[0, 2]], yval)
assert np.allclose(rval, rep) assert np.allclose(rval, rep)
expr = tensor.inc_subtensor(x[1:], y) expr = tensor.inc_subtensor(x[1:], y)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论