提交 f3fc1f0f authored 作者: Frederic's avatar Frederic

Make GpuSoftmax work with bigger row.

上级 c2c34032
......@@ -70,7 +70,7 @@ def inline_reduce(N, buf, pos, count, manner_fn):
return """
{
// This function trashes buf[1..N], leaving the reduction result in buf[0].
// This function trashes buf[1..warpSize], leaving the reduction result in buf[0].
if (%(pos)s < warpSize)
{
......@@ -158,3 +158,118 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount):
'}',
'__syncthreads()',
]
@code_version((1,))
def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
manner_fn, manner_init):
"""Return C++ code for a function that reduces a contiguous buffer.
:param N: length of the buffer
:param buf: buffer pointer of size warpSize * sizeof(float)
:param pos: index of executing thread
:param count: number of executing threads
:param manner_fn: a function that accepts strings of arguments a
and b, and returns c code for their reduction. (Example:
return "%(a)s + %(b)s" for a sum reduction).
:param manner_init: a function that accepts strings of arguments a
and return c code for its initialization
:postcondition:
This function leaves the answer in position 0 of the buffer. The
rest of the buffer is trashed by this function.
:note: buf should be in gpu shared memory, we access it many times.
"""
init = manner_init("%(x)s[tx * %(stride_x)s]" % locals())
loop_line = manner_fn("%s[%s]" % (buf, pos),
manner_init("%s[i * %s]" % (x, stride_x)))
r_16 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+16]" % (buf, pos))
r_8 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+8]" % (buf, pos))
r_4 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+4]" % (buf, pos))
r_2 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+2]" % (buf, pos))
r_1 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+1]" % (buf, pos))
return """
{
// This function trashes buf[1..warpSize], leaving the reduction result in buf[0].
for (int tx = %(pos)s; tx<warpSize; tx += %(count)s){
%(buf)s[tx] = %(init)s;
}
__syncthreads();
if (%(pos)s < warpSize)
{
for (int i = %(pos)s + warpSize; i < %(N)s; i += warpSize)
{
%(buf)s[%(pos)s] = %(loop_line)s;
}
if (%(pos)s < 16)
{
//reduce so that %(pos)s 0 has the sum of everything
if(%(pos)s + 16 < %(N)s)
%(buf)s[%(pos)s] = %(r_16)s;
if(%(pos)s + 8 < %(N)s)
%(buf)s[%(pos)s] = %(r_8)s;
if(%(pos)s + 4 < %(N)s)
%(buf)s[%(pos)s] = %(r_4)s;
if(%(pos)s + 2 < %(N)s)
%(buf)s[%(pos)s] = %(r_2)s;
if(%(pos)s + 1 < %(N)s)
%(buf)s[%(pos)s] = %(r_1)s;
}
}
}
""" % locals()
@code_version(inline_reduce_fixed_shared.code_version)
def inline_reduce_fixed_shared_max(N, buf, x, stride_x, pos, count):
return inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
lambda a, b: "max(%s, %s)" % (a, b),
lambda a: a)
@code_version((1,) + inline_reduce_max.code_version +
inline_reduce_sum.code_version)
def inline_softmax_fixed_shared(N, buf, x, stride_x,
sm, sm_stride,
threadPos, threadCount):
"""
:param N: length of the buffer, atleast waprSize(32).
:param buf: a shared memory buffer of size warpSize * sizeof(float)
:param x: a ptr to the gpu memory where the row is stored
:param stride_x: the stride between each element in x
:param sm: a ptr to the gpu memory to store the result
:param sm_stride: the stride between eash sm element
:param threadPos: index of executing thread
:param threadCount: number of executing threads
:Precondition: buf is empty
:Postcondition: buf[0] contains the softmax, buf2 contains un-normalized softmax
:note: buf and buf2 should be in gpu shared memory, we access it many times.
:note2: We use __i as an int variable in a loop
"""
return [
#get max of buf (trashing all but buf[0])
inline_reduce_fixed_shared_max(N, buf, x, stride_x, threadPos, threadCount),
'__syncthreads()',
'float row_max = '+buf+'[0]',
'__syncthreads()',
inline_reduce_fixed_shared(N, buf, x, stride_x, threadPos, threadCount,
lambda a, b: "%s + %s" % (a, b),
lambda a: "exp(%s - row_max)" % a),
'__syncthreads()',
'float row_sum = '+buf+'[0]',
'__syncthreads()',
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
# This set all value correctly
"%(sm)s[tx * %(sm_stride)s] = exp(%(x)s[tx * %(stride_x)s] - row_max) / row_sum" % locals(),
"}",
'__syncthreads()',
]
......@@ -7,7 +7,8 @@ from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda.kernel_codegen import (nvcc_kernel, inline_reduce_max,
inline_reduce_sum,
inline_softmax)
inline_softmax,
inline_softmax_fixed_shared)
class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
......@@ -350,8 +351,7 @@ class GpuSoftmax (GpuOp):
return shape
def c_code_cache_version(self):
#return ()
return (7,) + inline_softmax.code_version
return (8,) + inline_softmax.code_version
def c_code(self, node, nodename, inp, out, sub):
x, = inp
......@@ -386,6 +386,7 @@ class GpuSoftmax (GpuOp):
if (CudaNdarray_HOST_DIMS(%(x)s)[0] > 0)
{
if(n_shared_bytes < (32 * 1024 - 500)){
kSoftmax_%(nodename)s
<<<
n_blocks,
......@@ -403,24 +404,43 @@ class GpuSoftmax (GpuOp):
CudaNdarray_HOST_STRIDES(%(z)s)[0],
CudaNdarray_HOST_STRIDES(%(z)s)[1]
);
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s.\\n Used %%d blocks,"
" %%d threads %%d bytes of shared memory",
"kSoftmax_%(nodename)s", cudaGetErrorString(err),
n_blocks, n_threads, n_shared_bytes);
%(fail)s;
}
}else{
kSoftmax_fixed_shared%(nodename)s
<<<
n_blocks,
n_threads,
32 * sizeof(float)
>>>(
CudaNdarray_HOST_DIMS(%(x)s)[0],
CudaNdarray_HOST_DIMS(%(x)s)[1],
CudaNdarray_DEV_DATA(%(x)s),
CudaNdarray_HOST_STRIDES(%(x)s)[0],
CudaNdarray_HOST_STRIDES(%(x)s)[1],
CudaNdarray_DEV_DATA(%(z)s),
CudaNdarray_HOST_STRIDES(%(z)s)[0],
CudaNdarray_HOST_STRIDES(%(z)s)[1]
);
}
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s.\\n Used %%d blocks,"
" %%d threads %%d bytes of shared memory",
"kSoftmax[_fixed_shared]%(nodename)s", cudaGetErrorString(err),
n_blocks, n_threads, n_shared_bytes);
%(fail)s;
}
}
}
assert(%(z)s);
""" % locals()
def c_support_code_apply(self, node, nodename):
return nvcc_kernel("kSoftmax_%s" % nodename,
ret1 = nvcc_kernel("kSoftmax_%s" % nodename,
params=['int M', 'int N',
'const float * x', 'const int sx0', 'const int sx1',
'float * sm', 'const int sm_s0', 'const int sm_s1'],
......@@ -441,7 +461,23 @@ class GpuSoftmax (GpuOp):
"}",
"__syncthreads()",
"}",
])
ret2 = nvcc_kernel("kSoftmax_fixed_shared%s" % nodename,
params=['int M', 'int N',
'const float * x', 'const int sx0', 'const int sx1',
'float * sm', 'const int sm_s0', 'const int sm_s1'],
body=[
"extern __shared__ float buf[]",
"for (int blockIDX = blockIdx.x; blockIDX < M; blockIDX += gridDim.x){",
"const float *x_ptr = &x[blockIDX * sx0]",
"float *sm_ptr = &sm[blockIDX * sm_s0]",
inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1',
'sm_ptr', 'sm_s1',
'threadIdx.x', 'blockDim.x'),
"__syncthreads()",
"}",
])
return ret1 + "\n" + ret2
gpu_softmax = GpuSoftmax()
......
......@@ -219,9 +219,7 @@ def test_softmax():
This is basic test for GpuSoftmax
We check that we loop when their is too much block
TODO: check that we loop when their is too much thread.(THIS IS
NOT IMPLEMENTED)
We use slower code when there isn't enough shared memory
"""
x = T.fmatrix('x')
......@@ -232,25 +230,17 @@ def test_softmax():
assert isinstance(f_gpu.maker.fgraph.toposort()[-2].op,
cuda.nnet.GpuSoftmax)
def cmp(n, m, catch=False):
def cmp(n, m):
"""Some old card won't accept the configuration arguments of
this implementation. For those cases set catch=True to skip
those errors.
"""
try:
#print "test_softmax",n,m
data = numpy.arange(n * m, dtype='float32').reshape(n, m)
out = f(data)
gout = f_gpu(data)
assert numpy.allclose(out, gout), numpy.absolute(out - gout)
except RuntimeError, e:
if not catch:
raise
# Different CUDA driver have different error message
assert (e.args[0].startswith(
'Cuda error: kSoftmax_node_0: invalid configuration argument.\n') or
e.args[0].startswith('Cuda error: kSoftmax_node_0: invalid argument.\n'))
#print "test_softmax",n,m
data = numpy.arange(n * m, dtype='float32').reshape(n, m)
out = f(data)
gout = f_gpu(data)
assert numpy.allclose(out, gout), numpy.absolute(out - gout)
#we need to test n>32*1024 to check that we make the block loop.
cmp(2, 5)
......@@ -262,5 +252,9 @@ def test_softmax():
cmp(4, 1024)
cmp(4, 2000)
cmp(4, 2024)
#GTX285 don't have enough shared mem for this case.
cmp(4, 4074, True)
# The GTX285 don't have enough shared memory.
cmp(4, 4074)
# The GTX580, 680 and kepler don't have enough shared memory.
cmp(2, 10000)
cmp(128, 16 * 1024)
cmp(128, 64 * 1024)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论