提交 19540d4e authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #5632 from abergeron/no_warpsync

Remove warp-synchronous programming.
......@@ -1005,15 +1005,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
%(acc_type)s myresult = 0;
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
//This is caught in cuda/init.py when we init the gpu. I keep
//it here to ease finding code that rely on this.
if (warpSize != 32)
{
Z[0] = -666;
return;
}
""" % locals()
def _assign_init(self, first_item):
......@@ -1117,67 +1108,13 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype)
write_out = write_w(node.outputs[0].dtype)
# This code (the code in new_version) is currently ignored.
# Code produced later in this function is returned instead.
# The code here works with all nvidia driver
# But only for powers or multiples of 2!
new_version = """
__syncthreads(); // some kernel do multiple reduction.
buf[threadNum] = myresult;
__syncthreads();
if (threadNum >= ((threadCount >> 1) * 2))
{
int idx = threadNum - (threadCount >> 1) * 2;"""
new_version += self._assign_reduce(node, name, 'buf[idx]',
'buf[threadNum]', sub, False)
new_version += """
}
__syncthreads();
// Works for power of 2 only.
int nTotalThreads = threadCount; // Total number of active threads
while(nTotalThreads > 1)
{
int halfPoint = (nTotalThreads >> 1); // divide by two
// only the first half of the threads will be active.
if (threadNum < halfPoint)
{
// Get the shared value stored by another thread
%(acc_dtype)s temp = buf[threadNum + halfPoint];
"""
new_version += self._assign_reduce(node, name,
'buf[threadNum]', 'temp', sub, False)
new_version += """
}
__syncthreads();
nTotalThreads = (nTotalThreads >> 1); // divide by two.
}
__syncthreads();
if (threadNum == 0)
{
%(z_pos)s = %(write_out)s(buf[0]);
}
__syncthreads();"""
new_version = new_version % locals()
current_version = """
__syncthreads(); // some kernel do multiple reduction.
buf[threadNum] = myresult;
__syncthreads();
// rest of function is handled by one warp
if (threadNum < warpSize)
{
if (threadNum < warpSize) {
//round up all the partial sums into the first `warpSize` elements
for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
{
......@@ -1187,44 +1124,19 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
sub, False) + """
}
buf[threadNum] = myresult;
/*Comment this optimization as it don't work on Fermi GPU.
TODO: find why it don't work or put the GPU compute capability into the version
// no sync because only one warp is running
if(threadCount >32)
{"""
for num in [16, 8, 4, 2, 1]:
current_version += self._assign_reduce(node, name,
'buf[threadNum]',
'buf[threadNum+%d]' % num,
sub, False)
current_version += """
}
__syncthreads();
for (unsigned int _n = warpSize / 2; _n > 0; _n /= 2) {
if (threadNum < _n && threadNum + _n < threadCount)
"""
current_version += """
if (threadNum == 0)
{
%(z_pos)s = %(write_out)s(buf[0]);
}
current_version += self._assign_reduce(node, name, 'buf[threadNum]',
'buf[threadNum+_n]', sub, False)
}
else */
if (threadNum < 16)
{
//reduce so that threadNum 0 has the reduction of everything
"""
for num in [16, 8, 4, 2, 1]:
this_if = "if (threadNum + %d < threadCount) " % num + \
self._assign_reduce(node, name,
'buf[threadNum]', 'buf[threadNum+%d]' % num,
sub, False)
current_version += this_if
current_version += """
"""
current_version += """
if (threadNum == 0)
{
%(z_pos)s = %(write_out)s(buf[0]);
}
}
__syncthreads();
}
if (threadNum == 0) {
%(z_pos)s = %(write_out)s(buf[0]);
}
"""
......@@ -1900,7 +1812,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
""" % locals(), file=sio)
def c_code_cache_version_apply(self, node):
version = [18] # the version corresponding to the c code in this Op
version = [19] # the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend...
scalar_node = Apply(
......@@ -1953,11 +1865,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
{
%(reduce_fct)s
......@@ -1997,11 +1904,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
{
%(reduce_fct)s
......@@ -2042,11 +1944,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int i0 = threadIdx.y; i0 < d0; i0 += blockDim.y)
{
for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)
......@@ -2169,12 +2066,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x)
{
for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y)
......@@ -2221,11 +2112,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
X = (const %(in_type)s *)(((char *)X)+offset_X);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int a = blockIdx.x; a < A; a += gridDim.x)
{
for (int i2_D = blockIdx.y; i2_D < D; i2_D += gridDim.y)
......@@ -2279,12 +2165,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
print("""
%(decl)s
{
if(warpSize<blockDim.x){
//TODO: set error code
Z[0] = -666;
return;
}
%(init)s
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x)
{
......@@ -2332,13 +2212,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
//TODO: set error code
Z[blockIdx.x * sZ0] = %(write_out)s(-666);
return;
}
for (int i0 = threadIdx.y; i0 < d0; i0 += blockDim.y)
{
for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)
......@@ -2445,11 +2318,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x)
{
for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y)
......@@ -2601,11 +2469,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int i0 = threadIdx.z; i0 < d0; i0 += blockDim.z)
{
for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)
......
......@@ -111,7 +111,7 @@ class GpuCumOp(GpuKernelBase, Op):
for (unsigned int stride = 1; stride <= blockDim.x; stride *= 2) {
local_barrier();
unsigned int index = (threadIdx.x + 1) * (stride * 2) - 1;
if(index < blockDim.x*2) {
if (index < blockDim.x*2) {
partialCumOp[index] %(op)s= partialCumOp[index - stride];
}
}
......@@ -136,7 +136,7 @@ class GpuCumOp(GpuKernelBase, Op):
for (unsigned int stride = exp2(ceil(log2((float)blockDim.x))); stride > 0; stride /= 2) {
local_barrier();
unsigned int index = (threadIdx.x + 1) * (stride * 2) - 1;
if(index + stride < blockDim.x*2) {
if (index + stride < blockDim.x*2) {
partialCumOp[index + stride] %(op)s= partialCumOp[index];
}
}
......
......@@ -57,7 +57,7 @@ def code_version(version):
UNVERSIONED = ()
@code_version((1,))
@code_version((2,))
def inline_reduce(N, buf, pos, count, manner_fn):
"""
Return C++ code for a function that reduces a contiguous buffer.
......@@ -89,37 +89,25 @@ def inline_reduce(N, buf, pos, count, manner_fn):
"""
loop_line = manner_fn("%s[%s]" % (buf, pos), "%s[i]" % (buf))
r_16 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+16]" % (buf, pos))
r_8 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+8]" % (buf, pos))
r_4 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+4]" % (buf, pos))
r_2 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+2]" % (buf, pos))
r_1 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+1]" % (buf, pos))
r_n = manner_fn("%s[%s]" % (buf, pos), "%s[%s+_n]" % (buf, pos))
return """
{
// This function trashes buf[1..warpSize],
// leaving the reduction result in buf[0].
if (%(pos)s < warpSize)
{
if (%(pos)s < warpSize) {
for (int i = %(pos)s + warpSize; i < %(N)s; i += warpSize)
{
%(buf)s[%(pos)s] = %(loop_line)s;
}
if (%(pos)s < 16)
{
//reduce so that %(pos)s 0 has the sum of everything
if(%(pos)s + 16 < %(N)s)
%(buf)s[%(pos)s] = %(r_16)s;
if(%(pos)s + 8 < %(N)s)
%(buf)s[%(pos)s] = %(r_8)s;
if(%(pos)s + 4 < %(N)s)
%(buf)s[%(pos)s] = %(r_4)s;
if(%(pos)s + 2 < %(N)s)
%(buf)s[%(pos)s] = %(r_2)s;
if(%(pos)s + 1 < %(N)s)
%(buf)s[%(pos)s] = %(r_1)s;
}
}
__syncthreads();
//reduce so that %(pos)s 0 has the reduction of everything
for (unsigned int _n = warpSize / 2; _n > 0; _n /= 2) {
if (%(pos)s < _n && %(pos)s + _n < %(N)s)
%(buf)s[%(pos)s] = %(r_n)s;
__syncthreads();
}
}
""" % locals()
......@@ -205,7 +193,7 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
]
@code_version((2,))
@code_version((3,))
def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
manner_fn, manner_init,
b='', stride_b='', load_b='', dtype='float32'):
......@@ -231,14 +219,6 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
Index of executing thread.
count
Number of executing threads.
b
Optional, pointer to the bias.
stride_b
Optional, the stride of b if b is provided.
load_b
Optional, wrapper to read from b if b is provided.
dtype
Optional, the dtype of the output.
manner_fn
A function that accepts strings of arguments a and b, and
returns c code for their reduction.
......@@ -249,6 +229,14 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
manner_init
A function that accepts strings of arguments a and return c
code for its initialization.
b
Optional, pointer to the bias.
stride_b
Optional, the stride of b if b is provided.
load_b
Optional, wrapper to read from b if b is provided.
dtype
Optional, the dtype of the output.
Notes
-----
......@@ -268,11 +256,7 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
locals()))
loop_line2 = manner_fn("%s[%s]" % (buf, pos),
"%s[i]" % buf)
r_16 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+16]" % (buf, pos))
r_8 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+8]" % (buf, pos))
r_4 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+4]" % (buf, pos))
r_2 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+2]" % (buf, pos))
r_1 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+1]" % (buf, pos))
r_n = manner_fn("%s[%s]" % (buf, pos), "%s[%s+_n]" % (buf, pos))
ctype = gpuarray.dtype_to_ctype(dtype)
return """
......@@ -281,31 +265,22 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
// leaving the reduction result in buf[0].
%(ctype)s red = %(init)s;
#pragma unroll 16
for (int i = %(pos)s + %(count)s; i<%(N)s; i += %(count)s){
for (int i = %(pos)s + %(count)s; i<%(N)s; i += %(count)s) {
red = %(loop_line)s;
}
buf[%(pos)s] = red;
__syncthreads();
if (%(pos)s < warpSize)
{
for (int i = %(pos)s + warpSize; i < %(count)s; i += warpSize)
{
if (%(pos)s < warpSize) {
for (int i = %(pos)s + warpSize; i < %(count)s; i += warpSize) {
%(buf)s[%(pos)s] = %(loop_line2)s;
}
if (%(pos)s < 16)
{
//reduce so that %(pos)s 0 has the reduction of everything
if(%(pos)s + 16 < %(N)s)
%(buf)s[%(pos)s] = %(r_16)s;
if(%(pos)s + 8 < %(N)s)
%(buf)s[%(pos)s] = %(r_8)s;
if(%(pos)s + 4 < %(N)s)
%(buf)s[%(pos)s] = %(r_4)s;
if(%(pos)s + 2 < %(N)s)
%(buf)s[%(pos)s] = %(r_2)s;
if(%(pos)s + 1 < %(N)s)
%(buf)s[%(pos)s] = %(r_1)s;
}
}
__syncthreads();
//reduce so that %(pos)s 0 has the reduction of everything
for (unsigned int _n = warpSize / 2; _n > 0; _n /= 2) {
if (%(pos)s < _n && %(pos)s + _n < %(N)s)
%(buf)s[%(pos)s] = %(r_n)s;
__syncthreads();
}
}
""" % locals()
......
......@@ -1144,7 +1144,7 @@ class GpuDiagonal(Subtensor):
# This is also in consistence with the interface of numpy.diagonal.
if slice_axis < stride_axis:
stride_axis -= 1
new_dim_order = range(x[slicer].ndim)
new_dim_order = list(range(x[slicer].ndim))
new_dim_order = tuple(new_dim_order[:stride_axis] +
new_dim_order[stride_axis + 1:] +
[stride_axis, ])
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论