Commit 19540d4e, authored by Frédéric Bastien, committed via GitHub

Merge pull request #5632 from abergeron/no_warpsync

Remove warp-synchronous programming.
...@@ -1005,15 +1005,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1005,15 +1005,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
%(acc_type)s myresult = 0; %(acc_type)s myresult = 0;
A = (const %(in_type)s *)(((char *)A)+offset_A); A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z); Z = (%(out_type)s *)(((char *)Z)+offset_Z);
//This is caught in cuda/init.py when we init the gpu. I keep
//it here to ease finding code that rely on this.
if (warpSize != 32)
{
Z[0] = -666;
return;
}
""" % locals() """ % locals()
def _assign_init(self, first_item): def _assign_init(self, first_item):
...@@ -1117,67 +1108,13 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1117,67 +1108,13 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype) acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype)
write_out = write_w(node.outputs[0].dtype) write_out = write_w(node.outputs[0].dtype)
# This code (the code in new_version) is currently ignored.
# Code produced later in this function is returned instead.
# The code here works with all nvidia driver
# But only for powers or multiples of 2!
new_version = """
__syncthreads(); // some kernel do multiple reduction.
buf[threadNum] = myresult;
__syncthreads();
if (threadNum >= ((threadCount >> 1) * 2))
{
int idx = threadNum - (threadCount >> 1) * 2;"""
new_version += self._assign_reduce(node, name, 'buf[idx]',
'buf[threadNum]', sub, False)
new_version += """
}
__syncthreads();
// Works for power of 2 only.
int nTotalThreads = threadCount; // Total number of active threads
while(nTotalThreads > 1)
{
int halfPoint = (nTotalThreads >> 1); // divide by two
// only the first half of the threads will be active.
if (threadNum < halfPoint)
{
// Get the shared value stored by another thread
%(acc_dtype)s temp = buf[threadNum + halfPoint];
"""
new_version += self._assign_reduce(node, name,
'buf[threadNum]', 'temp', sub, False)
new_version += """
}
__syncthreads();
nTotalThreads = (nTotalThreads >> 1); // divide by two.
}
__syncthreads();
if (threadNum == 0)
{
%(z_pos)s = %(write_out)s(buf[0]);
}
__syncthreads();"""
new_version = new_version % locals()
current_version = """ current_version = """
__syncthreads(); // some kernel do multiple reduction. __syncthreads(); // some kernel do multiple reduction.
buf[threadNum] = myresult; buf[threadNum] = myresult;
__syncthreads(); __syncthreads();
// rest of function is handled by one warp // rest of function is handled by one warp
if (threadNum < warpSize) if (threadNum < warpSize) {
{
//round up all the partial sums into the first `warpSize` elements //round up all the partial sums into the first `warpSize` elements
for (int i = threadNum + warpSize; i < threadCount; i += warpSize) for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
{ {
...@@ -1187,44 +1124,19 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1187,44 +1124,19 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
sub, False) + """ sub, False) + """
} }
buf[threadNum] = myresult; buf[threadNum] = myresult;
/*Comment this optimization as it don't work on Fermi GPU. }
TODO: find why it don't work or put the GPU compute capability into the version __syncthreads();
// no sync because only one warp is running for (unsigned int _n = warpSize / 2; _n > 0; _n /= 2) {
if(threadCount >32) if (threadNum < _n && threadNum + _n < threadCount)
{"""
for num in [16, 8, 4, 2, 1]:
current_version += self._assign_reduce(node, name,
'buf[threadNum]',
'buf[threadNum+%d]' % num,
sub, False)
current_version += """
""" """
current_version += """ current_version += self._assign_reduce(node, name, 'buf[threadNum]',
if (threadNum == 0) 'buf[threadNum+_n]', sub, False)
{
%(z_pos)s = %(write_out)s(buf[0]);
}
}
else */
if (threadNum < 16)
{
//reduce so that threadNum 0 has the reduction of everything
"""
for num in [16, 8, 4, 2, 1]:
this_if = "if (threadNum + %d < threadCount) " % num + \
self._assign_reduce(node, name,
'buf[threadNum]', 'buf[threadNum+%d]' % num,
sub, False)
current_version += this_if
current_version += """
"""
current_version += """ current_version += """
if (threadNum == 0) __syncthreads();
{ }
%(z_pos)s = %(write_out)s(buf[0]); if (threadNum == 0) {
} %(z_pos)s = %(write_out)s(buf[0]);
}
} }
""" """
...@@ -1900,7 +1812,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1900,7 +1812,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
""" % locals(), file=sio) """ % locals(), file=sio)
def c_code_cache_version_apply(self, node): def c_code_cache_version_apply(self, node):
version = [18] # the version corresponding to the c code in this Op version = [19] # the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend... # now we insert versions for the ops on which we depend...
scalar_node = Apply( scalar_node = Apply(
...@@ -1953,11 +1865,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1953,11 +1865,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
A = (const %(in_type)s *)(((char *)A)+offset_A); A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z); Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x) for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
{ {
%(reduce_fct)s %(reduce_fct)s
...@@ -1997,11 +1904,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1997,11 +1904,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
A = (const %(in_type)s *)(((char *)A)+offset_A); A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z); Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x) for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
{ {
%(reduce_fct)s %(reduce_fct)s
...@@ -2042,11 +1944,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2042,11 +1944,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
A = (const %(in_type)s *)(((char *)A)+offset_A); A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z); Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int i0 = threadIdx.y; i0 < d0; i0 += blockDim.y) for (int i0 = threadIdx.y; i0 < d0; i0 += blockDim.y)
{ {
for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x) for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)
...@@ -2169,12 +2066,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2169,12 +2066,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
A = (const %(in_type)s *)(((char *)A)+offset_A); A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z); Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x) for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x)
{ {
for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y) for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y)
...@@ -2221,11 +2112,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2221,11 +2112,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
X = (const %(in_type)s *)(((char *)X)+offset_X); X = (const %(in_type)s *)(((char *)X)+offset_X);
Z = (%(out_type)s *)(((char *)Z)+offset_Z); Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int a = blockIdx.x; a < A; a += gridDim.x) for (int a = blockIdx.x; a < A; a += gridDim.x)
{ {
for (int i2_D = blockIdx.y; i2_D < D; i2_D += gridDim.y) for (int i2_D = blockIdx.y; i2_D < D; i2_D += gridDim.y)
...@@ -2279,12 +2165,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2279,12 +2165,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
print(""" print("""
%(decl)s %(decl)s
{ {
if(warpSize<blockDim.x){
//TODO: set error code
Z[0] = -666;
return;
}
%(init)s %(init)s
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x) for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x)
{ {
...@@ -2332,13 +2212,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2332,13 +2212,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
A = (const %(in_type)s *)(((char *)A)+offset_A); A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z); Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
//TODO: set error code
Z[blockIdx.x * sZ0] = %(write_out)s(-666);
return;
}
for (int i0 = threadIdx.y; i0 < d0; i0 += blockDim.y) for (int i0 = threadIdx.y; i0 < d0; i0 += blockDim.y)
{ {
for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x) for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)
...@@ -2445,11 +2318,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2445,11 +2318,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
A = (const %(in_type)s *)(((char *)A)+offset_A); A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z); Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x) for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x)
{ {
for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y) for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y)
...@@ -2601,11 +2469,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2601,11 +2469,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
A = (const %(in_type)s *)(((char *)A)+offset_A); A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z); Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int i0 = threadIdx.z; i0 < d0; i0 += blockDim.z) for (int i0 = threadIdx.z; i0 < d0; i0 += blockDim.z)
{ {
for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y) for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)
......
...@@ -111,7 +111,7 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -111,7 +111,7 @@ class GpuCumOp(GpuKernelBase, Op):
for (unsigned int stride = 1; stride <= blockDim.x; stride *= 2) { for (unsigned int stride = 1; stride <= blockDim.x; stride *= 2) {
local_barrier(); local_barrier();
unsigned int index = (threadIdx.x + 1) * (stride * 2) - 1; unsigned int index = (threadIdx.x + 1) * (stride * 2) - 1;
if(index < blockDim.x*2) { if (index < blockDim.x*2) {
partialCumOp[index] %(op)s= partialCumOp[index - stride]; partialCumOp[index] %(op)s= partialCumOp[index - stride];
} }
} }
...@@ -136,7 +136,7 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -136,7 +136,7 @@ class GpuCumOp(GpuKernelBase, Op):
for (unsigned int stride = exp2(ceil(log2((float)blockDim.x))); stride > 0; stride /= 2) { for (unsigned int stride = exp2(ceil(log2((float)blockDim.x))); stride > 0; stride /= 2) {
local_barrier(); local_barrier();
unsigned int index = (threadIdx.x + 1) * (stride * 2) - 1; unsigned int index = (threadIdx.x + 1) * (stride * 2) - 1;
if(index + stride < blockDim.x*2) { if (index + stride < blockDim.x*2) {
partialCumOp[index + stride] %(op)s= partialCumOp[index]; partialCumOp[index + stride] %(op)s= partialCumOp[index];
} }
} }
......
...@@ -57,7 +57,7 @@ def code_version(version): ...@@ -57,7 +57,7 @@ def code_version(version):
UNVERSIONED = () UNVERSIONED = ()
@code_version((1,)) @code_version((2,))
def inline_reduce(N, buf, pos, count, manner_fn): def inline_reduce(N, buf, pos, count, manner_fn):
""" """
Return C++ code for a function that reduces a contiguous buffer. Return C++ code for a function that reduces a contiguous buffer.
...@@ -89,37 +89,25 @@ def inline_reduce(N, buf, pos, count, manner_fn): ...@@ -89,37 +89,25 @@ def inline_reduce(N, buf, pos, count, manner_fn):
""" """
loop_line = manner_fn("%s[%s]" % (buf, pos), "%s[i]" % (buf)) loop_line = manner_fn("%s[%s]" % (buf, pos), "%s[i]" % (buf))
r_16 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+16]" % (buf, pos)) r_n = manner_fn("%s[%s]" % (buf, pos), "%s[%s+_n]" % (buf, pos))
r_8 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+8]" % (buf, pos))
r_4 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+4]" % (buf, pos))
r_2 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+2]" % (buf, pos))
r_1 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+1]" % (buf, pos))
return """ return """
{ {
// This function trashes buf[1..warpSize], // This function trashes buf[1..warpSize],
// leaving the reduction result in buf[0]. // leaving the reduction result in buf[0].
if (%(pos)s < warpSize) if (%(pos)s < warpSize) {
{
for (int i = %(pos)s + warpSize; i < %(N)s; i += warpSize) for (int i = %(pos)s + warpSize; i < %(N)s; i += warpSize)
{ {
%(buf)s[%(pos)s] = %(loop_line)s; %(buf)s[%(pos)s] = %(loop_line)s;
} }
if (%(pos)s < 16) }
{ __syncthreads();
//reduce so that %(pos)s 0 has the sum of everything //reduce so that %(pos)s 0 has the reduction of everything
if(%(pos)s + 16 < %(N)s) for (unsigned int _n = warpSize / 2; _n > 0; _n /= 2) {
%(buf)s[%(pos)s] = %(r_16)s; if (%(pos)s < _n && %(pos)s + _n < %(N)s)
if(%(pos)s + 8 < %(N)s) %(buf)s[%(pos)s] = %(r_n)s;
%(buf)s[%(pos)s] = %(r_8)s; __syncthreads();
if(%(pos)s + 4 < %(N)s)
%(buf)s[%(pos)s] = %(r_4)s;
if(%(pos)s + 2 < %(N)s)
%(buf)s[%(pos)s] = %(r_2)s;
if(%(pos)s + 1 < %(N)s)
%(buf)s[%(pos)s] = %(r_1)s;
}
} }
} }
""" % locals() """ % locals()
...@@ -205,7 +193,7 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"): ...@@ -205,7 +193,7 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
] ]
@code_version((2,)) @code_version((3,))
def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count, def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
manner_fn, manner_init, manner_fn, manner_init,
b='', stride_b='', load_b='', dtype='float32'): b='', stride_b='', load_b='', dtype='float32'):
...@@ -231,14 +219,6 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count, ...@@ -231,14 +219,6 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
Index of executing thread. Index of executing thread.
count count
Number of executing threads. Number of executing threads.
b
Optional, pointer to the bias.
stride_b
Optional, the stride of b if b is provided.
load_b
Optional, wrapper to read from b if b is provided.
dtype
Optional, the dtype of the output.
manner_fn manner_fn
A function that accepts strings of arguments a and b, and A function that accepts strings of arguments a and b, and
returns c code for their reduction. returns c code for their reduction.
...@@ -249,6 +229,14 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count, ...@@ -249,6 +229,14 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
manner_init manner_init
A function that accepts strings of arguments a and return c A function that accepts strings of arguments a and return c
code for its initialization. code for its initialization.
b
Optional, pointer to the bias.
stride_b
Optional, the stride of b if b is provided.
load_b
Optional, wrapper to read from b if b is provided.
dtype
Optional, the dtype of the output.
Notes Notes
----- -----
...@@ -268,11 +256,7 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count, ...@@ -268,11 +256,7 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
locals())) locals()))
loop_line2 = manner_fn("%s[%s]" % (buf, pos), loop_line2 = manner_fn("%s[%s]" % (buf, pos),
"%s[i]" % buf) "%s[i]" % buf)
r_16 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+16]" % (buf, pos)) r_n = manner_fn("%s[%s]" % (buf, pos), "%s[%s+_n]" % (buf, pos))
r_8 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+8]" % (buf, pos))
r_4 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+4]" % (buf, pos))
r_2 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+2]" % (buf, pos))
r_1 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+1]" % (buf, pos))
ctype = gpuarray.dtype_to_ctype(dtype) ctype = gpuarray.dtype_to_ctype(dtype)
return """ return """
...@@ -281,31 +265,22 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count, ...@@ -281,31 +265,22 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
// leaving the reduction result in buf[0]. // leaving the reduction result in buf[0].
%(ctype)s red = %(init)s; %(ctype)s red = %(init)s;
#pragma unroll 16 #pragma unroll 16
for (int i = %(pos)s + %(count)s; i<%(N)s; i += %(count)s){ for (int i = %(pos)s + %(count)s; i<%(N)s; i += %(count)s) {
red = %(loop_line)s; red = %(loop_line)s;
} }
buf[%(pos)s] = red; buf[%(pos)s] = red;
__syncthreads(); __syncthreads();
if (%(pos)s < warpSize) if (%(pos)s < warpSize) {
{ for (int i = %(pos)s + warpSize; i < %(count)s; i += warpSize) {
for (int i = %(pos)s + warpSize; i < %(count)s; i += warpSize)
{
%(buf)s[%(pos)s] = %(loop_line2)s; %(buf)s[%(pos)s] = %(loop_line2)s;
} }
if (%(pos)s < 16) }
{ __syncthreads();
//reduce so that %(pos)s 0 has the reduction of everything //reduce so that %(pos)s 0 has the reduction of everything
if(%(pos)s + 16 < %(N)s) for (unsigned int _n = warpSize / 2; _n > 0; _n /= 2) {
%(buf)s[%(pos)s] = %(r_16)s; if (%(pos)s < _n && %(pos)s + _n < %(N)s)
if(%(pos)s + 8 < %(N)s) %(buf)s[%(pos)s] = %(r_n)s;
%(buf)s[%(pos)s] = %(r_8)s; __syncthreads();
if(%(pos)s + 4 < %(N)s)
%(buf)s[%(pos)s] = %(r_4)s;
if(%(pos)s + 2 < %(N)s)
%(buf)s[%(pos)s] = %(r_2)s;
if(%(pos)s + 1 < %(N)s)
%(buf)s[%(pos)s] = %(r_1)s;
}
} }
} }
""" % locals() """ % locals()
......
...@@ -1144,7 +1144,7 @@ class GpuDiagonal(Subtensor): ...@@ -1144,7 +1144,7 @@ class GpuDiagonal(Subtensor):
# This is also in consistence with the interface of numpy.diagonal. # This is also in consistence with the interface of numpy.diagonal.
if slice_axis < stride_axis: if slice_axis < stride_axis:
stride_axis -= 1 stride_axis -= 1
new_dim_order = range(x[slicer].ndim) new_dim_order = list(range(x[slicer].ndim))
new_dim_order = tuple(new_dim_order[:stride_axis] + new_dim_order = tuple(new_dim_order[:stride_axis] +
new_dim_order[stride_axis + 1:] + new_dim_order[stride_axis + 1:] +
[stride_axis, ]) [stride_axis, ])
......
Markdown format
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to comment