提交 19540d4e authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #5632 from abergeron/no_warpsync

Remove warp-synchronous programming.
......@@ -1005,15 +1005,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
%(acc_type)s myresult = 0;
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
//This is caught in cuda/init.py when we init the gpu. I keep
//it here to ease finding code that rely on this.
if (warpSize != 32)
{
Z[0] = -666;
return;
}
""" % locals()
def _assign_init(self, first_item):
......@@ -1117,67 +1108,13 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype)
write_out = write_w(node.outputs[0].dtype)
# This code (the code in new_version) is currently ignored.
# Code produced later in this function is returned instead.
# The code here works with all nvidia driver
# But only for powers or multiples of 2!
new_version = """
__syncthreads(); // some kernel do multiple reduction.
buf[threadNum] = myresult;
__syncthreads();
if (threadNum >= ((threadCount >> 1) * 2))
{
int idx = threadNum - (threadCount >> 1) * 2;"""
new_version += self._assign_reduce(node, name, 'buf[idx]',
'buf[threadNum]', sub, False)
new_version += """
}
__syncthreads();
// Works for power of 2 only.
int nTotalThreads = threadCount; // Total number of active threads
while(nTotalThreads > 1)
{
int halfPoint = (nTotalThreads >> 1); // divide by two
// only the first half of the threads will be active.
if (threadNum < halfPoint)
{
// Get the shared value stored by another thread
%(acc_dtype)s temp = buf[threadNum + halfPoint];
"""
new_version += self._assign_reduce(node, name,
'buf[threadNum]', 'temp', sub, False)
new_version += """
}
__syncthreads();
nTotalThreads = (nTotalThreads >> 1); // divide by two.
}
__syncthreads();
if (threadNum == 0)
{
%(z_pos)s = %(write_out)s(buf[0]);
}
__syncthreads();"""
new_version = new_version % locals()
current_version = """
__syncthreads(); // some kernel do multiple reduction.
buf[threadNum] = myresult;
__syncthreads();
// rest of function is handled by one warp
if (threadNum < warpSize)
{
if (threadNum < warpSize) {
//round up all the partial sums into the first `warpSize` elements
for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
{
......@@ -1187,44 +1124,19 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
sub, False) + """
}
buf[threadNum] = myresult;
/*Comment this optimization as it don't work on Fermi GPU.
TODO: find why it don't work or put the GPU compute capability into the version
// no sync because only one warp is running
if(threadCount >32)
{"""
for num in [16, 8, 4, 2, 1]:
current_version += self._assign_reduce(node, name,
'buf[threadNum]',
'buf[threadNum+%d]' % num,
sub, False)
current_version += """
}
__syncthreads();
for (unsigned int _n = warpSize / 2; _n > 0; _n /= 2) {
if (threadNum < _n && threadNum + _n < threadCount)
"""
current_version += """
if (threadNum == 0)
{
%(z_pos)s = %(write_out)s(buf[0]);
}
current_version += self._assign_reduce(node, name, 'buf[threadNum]',
'buf[threadNum+_n]', sub, False)
}
else */
if (threadNum < 16)
{
//reduce so that threadNum 0 has the reduction of everything
"""
for num in [16, 8, 4, 2, 1]:
this_if = "if (threadNum + %d < threadCount) " % num + \
self._assign_reduce(node, name,
'buf[threadNum]', 'buf[threadNum+%d]' % num,
sub, False)
current_version += this_if
current_version += """
"""
current_version += """
if (threadNum == 0)
{
%(z_pos)s = %(write_out)s(buf[0]);
}
}
__syncthreads();
}
if (threadNum == 0) {
%(z_pos)s = %(write_out)s(buf[0]);
}
"""
......@@ -1900,7 +1812,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
""" % locals(), file=sio)
def c_code_cache_version_apply(self, node):
version = [18] # the version corresponding to the c code in this Op
version = [19] # the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend...
scalar_node = Apply(
......@@ -1953,11 +1865,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
{
%(reduce_fct)s
......@@ -1997,11 +1904,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
{
%(reduce_fct)s
......@@ -2042,11 +1944,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int i0 = threadIdx.y; i0 < d0; i0 += blockDim.y)
{
for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)
......@@ -2169,12 +2066,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x)
{
for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y)
......@@ -2221,11 +2112,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
X = (const %(in_type)s *)(((char *)X)+offset_X);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int a = blockIdx.x; a < A; a += gridDim.x)
{
for (int i2_D = blockIdx.y; i2_D < D; i2_D += gridDim.y)
......@@ -2279,12 +2165,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
print("""
%(decl)s
{
if(warpSize<blockDim.x){
//TODO: set error code
Z[0] = -666;
return;
}
%(init)s
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x)
{
......@@ -2332,13 +2212,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
//TODO: set error code
Z[blockIdx.x * sZ0] = %(write_out)s(-666);
return;
}
for (int i0 = threadIdx.y; i0 < d0; i0 += blockDim.y)
{
for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)
......@@ -2445,11 +2318,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x)
{
for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y)
......@@ -2601,11 +2469,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int i0 = threadIdx.z; i0 < d0; i0 += blockDim.z)
{
for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)
......
......@@ -111,7 +111,7 @@ class GpuCumOp(GpuKernelBase, Op):
for (unsigned int stride = 1; stride <= blockDim.x; stride *= 2) {
local_barrier();
unsigned int index = (threadIdx.x + 1) * (stride * 2) - 1;
if(index < blockDim.x*2) {
if (index < blockDim.x*2) {
partialCumOp[index] %(op)s= partialCumOp[index - stride];
}
}
......@@ -136,7 +136,7 @@ class GpuCumOp(GpuKernelBase, Op):
for (unsigned int stride = exp2(ceil(log2((float)blockDim.x))); stride > 0; stride /= 2) {
local_barrier();
unsigned int index = (threadIdx.x + 1) * (stride * 2) - 1;
if(index + stride < blockDim.x*2) {
if (index + stride < blockDim.x*2) {
partialCumOp[index + stride] %(op)s= partialCumOp[index];
}
}
......
......@@ -57,7 +57,7 @@ def code_version(version):
UNVERSIONED = ()
@code_version((1,))
@code_version((2,))
def inline_reduce(N, buf, pos, count, manner_fn):
"""
Return C++ code for a function that reduces a contiguous buffer.
......@@ -89,37 +89,25 @@ def inline_reduce(N, buf, pos, count, manner_fn):
"""
loop_line = manner_fn("%s[%s]" % (buf, pos), "%s[i]" % (buf))
r_16 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+16]" % (buf, pos))
r_8 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+8]" % (buf, pos))
r_4 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+4]" % (buf, pos))
r_2 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+2]" % (buf, pos))
r_1 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+1]" % (buf, pos))
r_n = manner_fn("%s[%s]" % (buf, pos), "%s[%s+_n]" % (buf, pos))
return """
{
// This function trashes buf[1..warpSize],
// leaving the reduction result in buf[0].
if (%(pos)s < warpSize)
{
if (%(pos)s < warpSize) {
for (int i = %(pos)s + warpSize; i < %(N)s; i += warpSize)
{
%(buf)s[%(pos)s] = %(loop_line)s;
}
if (%(pos)s < 16)
{
//reduce so that %(pos)s 0 has the sum of everything
if(%(pos)s + 16 < %(N)s)
%(buf)s[%(pos)s] = %(r_16)s;
if(%(pos)s + 8 < %(N)s)
%(buf)s[%(pos)s] = %(r_8)s;
if(%(pos)s + 4 < %(N)s)
%(buf)s[%(pos)s] = %(r_4)s;
if(%(pos)s + 2 < %(N)s)
%(buf)s[%(pos)s] = %(r_2)s;
if(%(pos)s + 1 < %(N)s)
%(buf)s[%(pos)s] = %(r_1)s;
}
}
__syncthreads();
//reduce so that %(pos)s 0 has the reduction of everything
for (unsigned int _n = warpSize / 2; _n > 0; _n /= 2) {
if (%(pos)s < _n && %(pos)s + _n < %(N)s)
%(buf)s[%(pos)s] = %(r_n)s;
__syncthreads();
}
}
""" % locals()
......@@ -205,7 +193,7 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
]
@code_version((2,))
@code_version((3,))
def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
manner_fn, manner_init,
b='', stride_b='', load_b='', dtype='float32'):
......@@ -231,14 +219,6 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
Index of executing thread.
count
Number of executing threads.
b
Optional, pointer to the bias.
stride_b
Optional, the stride of b if b is provided.
load_b
Optional, wrapper to read from b if b is provided.
dtype
Optional, the dtype of the output.
manner_fn
A function that accepts strings of arguments a and b, and
returns c code for their reduction.
......@@ -249,6 +229,14 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
manner_init
A function that accepts strings of arguments a and return c
code for its initialization.
b
Optional, pointer to the bias.
stride_b
Optional, the stride of b if b is provided.
load_b
Optional, wrapper to read from b if b is provided.
dtype
Optional, the dtype of the output.
Notes
-----
......@@ -268,11 +256,7 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
locals()))
loop_line2 = manner_fn("%s[%s]" % (buf, pos),
"%s[i]" % buf)
r_16 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+16]" % (buf, pos))
r_8 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+8]" % (buf, pos))
r_4 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+4]" % (buf, pos))
r_2 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+2]" % (buf, pos))
r_1 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+1]" % (buf, pos))
r_n = manner_fn("%s[%s]" % (buf, pos), "%s[%s+_n]" % (buf, pos))
ctype = gpuarray.dtype_to_ctype(dtype)
return """
......@@ -281,31 +265,22 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
// leaving the reduction result in buf[0].
%(ctype)s red = %(init)s;
#pragma unroll 16
for (int i = %(pos)s + %(count)s; i<%(N)s; i += %(count)s){
for (int i = %(pos)s + %(count)s; i<%(N)s; i += %(count)s) {
red = %(loop_line)s;
}
buf[%(pos)s] = red;
__syncthreads();
if (%(pos)s < warpSize)
{
for (int i = %(pos)s + warpSize; i < %(count)s; i += warpSize)
{
if (%(pos)s < warpSize) {
for (int i = %(pos)s + warpSize; i < %(count)s; i += warpSize) {
%(buf)s[%(pos)s] = %(loop_line2)s;
}
if (%(pos)s < 16)
{
//reduce so that %(pos)s 0 has the reduction of everything
if(%(pos)s + 16 < %(N)s)
%(buf)s[%(pos)s] = %(r_16)s;
if(%(pos)s + 8 < %(N)s)
%(buf)s[%(pos)s] = %(r_8)s;
if(%(pos)s + 4 < %(N)s)
%(buf)s[%(pos)s] = %(r_4)s;
if(%(pos)s + 2 < %(N)s)
%(buf)s[%(pos)s] = %(r_2)s;
if(%(pos)s + 1 < %(N)s)
%(buf)s[%(pos)s] = %(r_1)s;
}
}
__syncthreads();
//reduce so that %(pos)s 0 has the reduction of everything
for (unsigned int _n = warpSize / 2; _n > 0; _n /= 2) {
if (%(pos)s < _n && %(pos)s + _n < %(N)s)
%(buf)s[%(pos)s] = %(r_n)s;
__syncthreads();
}
}
""" % locals()
......
......@@ -1144,7 +1144,7 @@ class GpuDiagonal(Subtensor):
# This is also in consistence with the interface of numpy.diagonal.
if slice_axis < stride_axis:
stride_axis -= 1
new_dim_order = range(x[slicer].ndim)
new_dim_order = list(range(x[slicer].ndim))
new_dim_order = tuple(new_dim_order[:stride_axis] +
new_dim_order[stride_axis + 1:] +
[stride_axis, ])
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论