提交 f3fc1f0f authored 作者: Frederic's avatar Frederic

Make GpuSoftmax work with bigger row.

上级 c2c34032
...@@ -70,7 +70,7 @@ def inline_reduce(N, buf, pos, count, manner_fn): ...@@ -70,7 +70,7 @@ def inline_reduce(N, buf, pos, count, manner_fn):
return """ return """
{ {
// This function trashes buf[1..N], leaving the reduction result in buf[0]. // This function trashes buf[1..warpSize], leaving the reduction result in buf[0].
if (%(pos)s < warpSize) if (%(pos)s < warpSize)
{ {
...@@ -158,3 +158,118 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount): ...@@ -158,3 +158,118 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount):
'}', '}',
'__syncthreads()', '__syncthreads()',
] ]
@code_version((1,))
def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
manner_fn, manner_init):
"""Return C++ code for a function that reduces a contiguous buffer.
:param N: length of the buffer
:param buf: buffer pointer of size warpSize * sizeof(float)
:param pos: index of executing thread
:param count: number of executing threads
:param manner_fn: a function that accepts strings of arguments a
and b, and returns c code for their reduction. (Example:
return "%(a)s + %(b)s" for a sum reduction).
:param manner_init: a function that accepts strings of arguments a
and return c code for its initialization
:postcondition:
This function leaves the answer in position 0 of the buffer. The
rest of the buffer is trashed by this function.
:note: buf should be in gpu shared memory, we access it many times.
"""
init = manner_init("%(x)s[tx * %(stride_x)s]" % locals())
loop_line = manner_fn("%s[%s]" % (buf, pos),
manner_init("%s[i * %s]" % (x, stride_x)))
r_16 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+16]" % (buf, pos))
r_8 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+8]" % (buf, pos))
r_4 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+4]" % (buf, pos))
r_2 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+2]" % (buf, pos))
r_1 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+1]" % (buf, pos))
return """
{
// This function trashes buf[1..warpSize], leaving the reduction result in buf[0].
for (int tx = %(pos)s; tx<warpSize; tx += %(count)s){
%(buf)s[tx] = %(init)s;
}
__syncthreads();
if (%(pos)s < warpSize)
{
for (int i = %(pos)s + warpSize; i < %(N)s; i += warpSize)
{
%(buf)s[%(pos)s] = %(loop_line)s;
}
if (%(pos)s < 16)
{
//reduce so that %(pos)s 0 has the sum of everything
if(%(pos)s + 16 < %(N)s)
%(buf)s[%(pos)s] = %(r_16)s;
if(%(pos)s + 8 < %(N)s)
%(buf)s[%(pos)s] = %(r_8)s;
if(%(pos)s + 4 < %(N)s)
%(buf)s[%(pos)s] = %(r_4)s;
if(%(pos)s + 2 < %(N)s)
%(buf)s[%(pos)s] = %(r_2)s;
if(%(pos)s + 1 < %(N)s)
%(buf)s[%(pos)s] = %(r_1)s;
}
}
}
""" % locals()
@code_version(inline_reduce_fixed_shared.code_version)
def inline_reduce_fixed_shared_max(N, buf, x, stride_x, pos, count):
return inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
lambda a, b: "max(%s, %s)" % (a, b),
lambda a: a)
@code_version((1,) + inline_reduce_max.code_version +
inline_reduce_sum.code_version)
def inline_softmax_fixed_shared(N, buf, x, stride_x,
sm, sm_stride,
threadPos, threadCount):
"""
:param N: length of the buffer, atleast waprSize(32).
:param buf: a shared memory buffer of size warpSize * sizeof(float)
:param x: a ptr to the gpu memory where the row is stored
:param stride_x: the stride between each element in x
:param sm: a ptr to the gpu memory to store the result
:param sm_stride: the stride between eash sm element
:param threadPos: index of executing thread
:param threadCount: number of executing threads
:Precondition: buf is empty
:Postcondition: buf[0] contains the softmax, buf2 contains un-normalized softmax
:note: buf and buf2 should be in gpu shared memory, we access it many times.
:note2: We use __i as an int variable in a loop
"""
return [
#get max of buf (trashing all but buf[0])
inline_reduce_fixed_shared_max(N, buf, x, stride_x, threadPos, threadCount),
'__syncthreads()',
'float row_max = '+buf+'[0]',
'__syncthreads()',
inline_reduce_fixed_shared(N, buf, x, stride_x, threadPos, threadCount,
lambda a, b: "%s + %s" % (a, b),
lambda a: "exp(%s - row_max)" % a),
'__syncthreads()',
'float row_sum = '+buf+'[0]',
'__syncthreads()',
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
# This set all value correctly
"%(sm)s[tx * %(sm_stride)s] = exp(%(x)s[tx * %(stride_x)s] - row_max) / row_sum" % locals(),
"}",
'__syncthreads()',
]
...@@ -7,7 +7,8 @@ from theano.sandbox.cuda import GpuOp ...@@ -7,7 +7,8 @@ from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda.kernel_codegen import (nvcc_kernel, inline_reduce_max, from theano.sandbox.cuda.kernel_codegen import (nvcc_kernel, inline_reduce_max,
inline_reduce_sum, inline_reduce_sum,
inline_softmax) inline_softmax,
inline_softmax_fixed_shared)
class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp): class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
...@@ -350,8 +351,7 @@ class GpuSoftmax (GpuOp): ...@@ -350,8 +351,7 @@ class GpuSoftmax (GpuOp):
return shape return shape
def c_code_cache_version(self): def c_code_cache_version(self):
#return () return (8,) + inline_softmax.code_version
return (7,) + inline_softmax.code_version
def c_code(self, node, nodename, inp, out, sub): def c_code(self, node, nodename, inp, out, sub):
x, = inp x, = inp
...@@ -386,6 +386,7 @@ class GpuSoftmax (GpuOp): ...@@ -386,6 +386,7 @@ class GpuSoftmax (GpuOp):
if (CudaNdarray_HOST_DIMS(%(x)s)[0] > 0) if (CudaNdarray_HOST_DIMS(%(x)s)[0] > 0)
{ {
if(n_shared_bytes < (32 * 1024 - 500)){
kSoftmax_%(nodename)s kSoftmax_%(nodename)s
<<< <<<
n_blocks, n_blocks,
...@@ -403,6 +404,25 @@ class GpuSoftmax (GpuOp): ...@@ -403,6 +404,25 @@ class GpuSoftmax (GpuOp):
CudaNdarray_HOST_STRIDES(%(z)s)[0], CudaNdarray_HOST_STRIDES(%(z)s)[0],
CudaNdarray_HOST_STRIDES(%(z)s)[1] CudaNdarray_HOST_STRIDES(%(z)s)[1]
); );
}else{
kSoftmax_fixed_shared%(nodename)s
<<<
n_blocks,
n_threads,
32 * sizeof(float)
>>>(
CudaNdarray_HOST_DIMS(%(x)s)[0],
CudaNdarray_HOST_DIMS(%(x)s)[1],
CudaNdarray_DEV_DATA(%(x)s),
CudaNdarray_HOST_STRIDES(%(x)s)[0],
CudaNdarray_HOST_STRIDES(%(x)s)[1],
CudaNdarray_DEV_DATA(%(z)s),
CudaNdarray_HOST_STRIDES(%(z)s)[0],
CudaNdarray_HOST_STRIDES(%(z)s)[1]
);
}
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
if( cudaSuccess != err) if( cudaSuccess != err)
...@@ -410,7 +430,7 @@ class GpuSoftmax (GpuOp): ...@@ -410,7 +430,7 @@ class GpuSoftmax (GpuOp):
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s.\\n Used %%d blocks," "Cuda error: %%s: %%s.\\n Used %%d blocks,"
" %%d threads %%d bytes of shared memory", " %%d threads %%d bytes of shared memory",
"kSoftmax_%(nodename)s", cudaGetErrorString(err), "kSoftmax[_fixed_shared]%(nodename)s", cudaGetErrorString(err),
n_blocks, n_threads, n_shared_bytes); n_blocks, n_threads, n_shared_bytes);
%(fail)s; %(fail)s;
} }
...@@ -420,7 +440,7 @@ class GpuSoftmax (GpuOp): ...@@ -420,7 +440,7 @@ class GpuSoftmax (GpuOp):
""" % locals() """ % locals()
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
return nvcc_kernel("kSoftmax_%s" % nodename, ret1 = nvcc_kernel("kSoftmax_%s" % nodename,
params=['int M', 'int N', params=['int M', 'int N',
'const float * x', 'const int sx0', 'const int sx1', 'const float * x', 'const int sx0', 'const int sx1',
'float * sm', 'const int sm_s0', 'const int sm_s1'], 'float * sm', 'const int sm_s0', 'const int sm_s1'],
...@@ -442,6 +462,22 @@ class GpuSoftmax (GpuOp): ...@@ -442,6 +462,22 @@ class GpuSoftmax (GpuOp):
"__syncthreads()", "__syncthreads()",
"}", "}",
]) ])
ret2 = nvcc_kernel("kSoftmax_fixed_shared%s" % nodename,
params=['int M', 'int N',
'const float * x', 'const int sx0', 'const int sx1',
'float * sm', 'const int sm_s0', 'const int sm_s1'],
body=[
"extern __shared__ float buf[]",
"for (int blockIDX = blockIdx.x; blockIDX < M; blockIDX += gridDim.x){",
"const float *x_ptr = &x[blockIDX * sx0]",
"float *sm_ptr = &sm[blockIDX * sm_s0]",
inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1',
'sm_ptr', 'sm_s1',
'threadIdx.x', 'blockDim.x'),
"__syncthreads()",
"}",
])
return ret1 + "\n" + ret2
gpu_softmax = GpuSoftmax() gpu_softmax = GpuSoftmax()
......
...@@ -219,9 +219,7 @@ def test_softmax(): ...@@ -219,9 +219,7 @@ def test_softmax():
This is basic test for GpuSoftmax This is basic test for GpuSoftmax
We check that we loop when their is too much block We check that we loop when their is too much block
We use slower code when there isn't enough shared memory
TODO: check that we loop when their is too much thread.(THIS IS
NOT IMPLEMENTED)
""" """
x = T.fmatrix('x') x = T.fmatrix('x')
...@@ -232,25 +230,17 @@ def test_softmax(): ...@@ -232,25 +230,17 @@ def test_softmax():
assert isinstance(f_gpu.maker.fgraph.toposort()[-2].op, assert isinstance(f_gpu.maker.fgraph.toposort()[-2].op,
cuda.nnet.GpuSoftmax) cuda.nnet.GpuSoftmax)
def cmp(n, m, catch=False): def cmp(n, m):
"""Some old card won't accept the configuration arguments of """Some old card won't accept the configuration arguments of
this implementation. For those cases set catch=True to skip this implementation. For those cases set catch=True to skip
those errors. those errors.
""" """
try:
#print "test_softmax",n,m #print "test_softmax",n,m
data = numpy.arange(n * m, dtype='float32').reshape(n, m) data = numpy.arange(n * m, dtype='float32').reshape(n, m)
out = f(data) out = f(data)
gout = f_gpu(data) gout = f_gpu(data)
assert numpy.allclose(out, gout), numpy.absolute(out - gout) assert numpy.allclose(out, gout), numpy.absolute(out - gout)
except RuntimeError, e:
if not catch:
raise
# Different CUDA driver have different error message
assert (e.args[0].startswith(
'Cuda error: kSoftmax_node_0: invalid configuration argument.\n') or
e.args[0].startswith('Cuda error: kSoftmax_node_0: invalid argument.\n'))
#we need to test n>32*1024 to check that we make the block loop. #we need to test n>32*1024 to check that we make the block loop.
cmp(2, 5) cmp(2, 5)
...@@ -262,5 +252,9 @@ def test_softmax(): ...@@ -262,5 +252,9 @@ def test_softmax():
cmp(4, 1024) cmp(4, 1024)
cmp(4, 2000) cmp(4, 2000)
cmp(4, 2024) cmp(4, 2024)
#GTX285 don't have enough shared mem for this case. # The GTX285 don't have enough shared memory.
cmp(4, 4074, True) cmp(4, 4074)
# The GTX580, 680 and kepler don't have enough shared memory.
cmp(2, 10000)
cmp(128, 16 * 1024)
cmp(128, 64 * 1024)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论