提交 9eb54e01 authored 作者: xiaoqie's avatar xiaoqie

Port Softmax to OpenCL

上级 608e9aef
......@@ -65,41 +65,47 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
type_y_idx = gpuarray.dtype_to_ctype(dtype_y_idx)
kname = "k_xent_sm_1hot_bias"
k_var = "k_xent_sm_1hot_bias_" + nodename
f = '' if dtype_x == 'float64' else 'f'
if node.inputs[0].type.context.kind != b'cuda':
f = ''
else:
f = '' if dtype_x == 'float64' else 'f'
params = [
gpuarray.SIZE, gpuarray.SIZE,
gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE,
gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE,
gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE,
gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE,
gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE,
gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE
]
sio = StringIO()
print("""
KERNEL void %(kname)s(const ga_size M, const ga_size N,
const %(type_x)s* x_data, const ga_size offset_x,
const ga_ssize xs0, const ga_ssize xs1,
const %(type_b)s* b, const ga_size offset_b,
const ga_ssize bs0,
const %(type_y_idx)s* y_idx_data, const ga_size offset_y_idx,
const ga_ssize y_idxs0,
%(type_x)s* nll_data, const ga_size offset_nll,
const ga_ssize nlls0,
%(type_x)s* sm_data, const ga_size offset_sm,
const ga_ssize sms0, const ga_ssize sms1,
%(type_y_idx)s* am_data, const ga_size offset_am,
const ga_ssize ams0)
GLOBAL_MEM const %(type_x)s* x_data, const ga_size offset_x, const ga_ssize xs0, const ga_ssize xs1,
GLOBAL_MEM const %(type_b)s* b, const ga_size offset_b, const ga_ssize bs0,
GLOBAL_MEM const %(type_y_idx)s* y_idx_data, const ga_size offset_y_idx, const ga_ssize y_idxs0,
GLOBAL_MEM %(type_x)s* nll_data, const ga_size offset_nll, const ga_ssize nlls0,
GLOBAL_MEM %(type_x)s* sm_data, const ga_size offset_sm, const ga_ssize sms0, const ga_ssize sms1,
GLOBAL_MEM %(type_y_idx)s* am_data, const ga_size offset_am, const ga_ssize ams0 GA_DECL_SHARED_PARAM(%(work_x)s, per_thread_values))
{
x_data = (const %(type_x)s *)(((char *)x_data)+offset_x);
b = (const %(type_b)s *)(((char *)b)+offset_b);
y_idx_data = (const %(type_y_idx)s *)(((char *)y_idx_data)+offset_y_idx);
nll_data = (%(type_x)s *)(((char *)nll_data)+offset_nll);
sm_data = (%(type_x)s *)(((char *)sm_data)+offset_sm);
am_data = (%(type_y_idx)s *)(((char *)am_data)+offset_am);
x_data = (GLOBAL_MEM const %(type_x)s *)(((GLOBAL_MEM char *)x_data)+offset_x);
b = (GLOBAL_MEM const %(type_b)s *)(((GLOBAL_MEM char *)b)+offset_b);
y_idx_data = (GLOBAL_MEM const %(type_y_idx)s *)(((GLOBAL_MEM char *)y_idx_data)+offset_y_idx);
nll_data = (GLOBAL_MEM %(type_x)s *)(((GLOBAL_MEM char *)nll_data)+offset_nll);
sm_data = (GLOBAL_MEM %(type_x)s *)(((GLOBAL_MEM char *)sm_data)+offset_sm);
am_data = (GLOBAL_MEM %(type_y_idx)s *)(((GLOBAL_MEM char *)am_data)+offset_am);
for (int row = blockIdx.x; row < M; row += gridDim.x){
for (ga_int row = GID_0; row < M; row += GDIM_0){
const %(type_x)s* x = x_data + xs0 * row;
%(type_x)s* sm = sm_data + sms0 * row;
GLOBAL_MEM const %(type_x)s* x = x_data + xs0 * row;
GLOBAL_MEM %(type_x)s* sm = sm_data + sms0 * row;
extern LOCAL_MEM %(work_x)s per_thread_values[];
GA_DECL_SHARED_BODY(%(work_x)s, per_thread_values);
LOCAL_MEM %(work_x)s row_max, sum, sum_inv;
LOCAL_MEM int row_max_threadIdx;
LOCAL_MEM ga_int row_max_threadIdx;
%(work_x)s per_thread_row_max, per_thread_sum;
int per_thread_row_max_j;
ga_int per_thread_row_max_j;
// COMPUTE ROW MAX AND ARGMAX
......@@ -107,20 +113,20 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
per_thread_row_max = NAN;
per_thread_row_max_j = 0;
for (int j = threadIdx.x; j < N; j += blockDim.x)
for (ga_int j = LID_0; j < N; j += LDIM_0)
{
%(work_x)s row_ij = %(load_x)s(x[j * xs1]) + %(load_b)s(b[j * bs0]);
per_thread_row_max_j = (row_ij > per_thread_row_max) ? j : per_thread_row_max_j;
per_thread_row_max = fmax%(f)s(row_ij, per_thread_row_max);
}
per_thread_values[threadIdx.x] = per_thread_row_max;
per_thread_values[LID_0] = per_thread_row_max;
local_barrier();
if (threadIdx.x == 0) {
if (LID_0 == 0) {
row_max = NAN;
row_max_threadIdx = 0;
for (int j = 0; j < blockDim.x; j++)
for (ga_int j = 0; j < LDIM_0; j++)
{
%(work_x)s per_thread_max = per_thread_values[j];
row_max_threadIdx = (per_thread_max > row_max) ? j : row_max_threadIdx;
......@@ -132,11 +138,11 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
// The thread with the higest max writes out which of its
// values was the winner.
if (threadIdx.x == row_max_threadIdx) am_data[row * ams0] = per_thread_row_max_j;
if (LID_0 == row_max_threadIdx) am_data[row * ams0] = per_thread_row_max_j;
// COMPUTE SOFTMAX
per_thread_sum = 0.0;
for (int j = threadIdx.x; j < N; j += blockDim.x)
for (ga_int j = LID_0; j < N; j += LDIM_0)
{
%(work_x)s row_ij = %(load_x)s(x[j * xs1]) + %(load_b)s(b[j * bs0]);
%(work_x)s sm_ij = exp%(f)s(row_ij - row_max);
......@@ -144,13 +150,13 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
sm[j * sms1] = %(write_x)s(sm_ij);
}
per_thread_values[threadIdx.x] = per_thread_sum;
per_thread_values[LID_0] = per_thread_sum;
local_barrier();
if (threadIdx.x == 0) {
if (LID_0 == 0) {
sum = 0.0;
for (int j = 0; j < blockDim.x; j++) {
for (ga_int j = 0; j < LDIM_0; j++) {
sum += per_thread_values[j];
}
sum_inv = 1.0 / sum;
......@@ -158,12 +164,12 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
local_barrier();
for (int j = threadIdx.x; j < N; j += blockDim.x) {
for (ga_int j = LID_0; j < N; j += LDIM_0) {
sm[j * sms1] = %(write_x)s(%(load_x)s(sm[j * sms1]) * sum_inv);
}
if (threadIdx.x == 0) {
const %(type_y_idx)s y_idx = (int)y_idx_data[row * y_idxs0];
if (LID_0 == 0) {
const %(type_y_idx)s y_idx = (ga_int)y_idx_data[row * y_idxs0];
if ((y_idx >= N || y_idx < 0)) {
// raise some suspicion.
nll_data[row * nlls0] = %(write_x)s(0.0);
......@@ -177,21 +183,11 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
}
}
""" % locals(), file=sio)
params = [
'uintp', 'uintp',
gpuarray.GpuArray, 'uintp', 'intp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp'
]
return [Kernel(code=sio.getvalue(), name=kname, params=params,
flags=flags, objvar=k_var)]
def c_code(self, node, nodename, inp, out, sub):
if node.inputs[0].type.context.kind != b'cuda':
raise NotImplementedError('cuda only')
itemsize_x = np.dtype(node.inputs[0].dtype).itemsize
worksize_x = np.dtype(work_dtype(node.inputs[0].dtype)).itemsize
itemsize_b = np.dtype(node.inputs[1].dtype).itemsize
......@@ -266,7 +262,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
return sio.getvalue()
def c_code_cache_version(self):
return (12,)
return (13,)
gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1HotWithBias()
......@@ -292,14 +288,12 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
return Apply(self, [dnll, sm, y_idx], [sm.type()])
def c_code_cache_version(self):
return (12,)
return (13,)
def c_headers(self):
return ['<numpy_compat.h>', '<gpuarray/types.h>']
def c_code(self, node, nodename, inp, out, sub):
if node.inputs[0].type.context.kind != b'cuda':
raise NotImplementedError("cuda only")
typecode_dx = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
itemsize_dnll = np.dtype(node.inputs[0].dtype).itemsize
itemsize_sm = np.dtype(node.inputs[1].dtype).itemsize
......@@ -429,30 +423,33 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
type_dx = gpuarray.dtype_to_ctype(dtype_dx)
kname = "kCrossEntropySoftmax1HotWithBiasDx"
k_var = "kCrossEntropySoftmax1HotWithBiasDx_" + nodename
params = [
gpuarray.SIZE, gpuarray.SIZE,
gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE,
gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE,
gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE,
gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE,
]
sio = StringIO()
print("""
KERNEL void %(kname)s(
const ga_size N, const ga_size K,
const %(type_dnll)s* dnll, const ga_size offset_dnll,
const ga_ssize dnll_s0,
const %(type_sm)s* sm, const ga_size offset_sm,
const ga_ssize sm_s0, const ga_ssize sm_s1,
const %(type_y_idx)s* y_idx, const ga_size offset_y_idx,
const ga_ssize y_idx_s0,
%(type_dx)s* dx, const ga_size offset_dx,
const ga_ssize dx_s0, const ga_ssize dx_s1)
GLOBAL_MEM const %(type_dnll)s* dnll, const ga_size offset_dnll, const ga_ssize dnll_s0,
GLOBAL_MEM const %(type_sm)s* sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1,
GLOBAL_MEM const %(type_y_idx)s* y_idx, const ga_size offset_y_idx, const ga_ssize y_idx_s0,
GLOBAL_MEM %(type_dx)s* dx, const ga_size offset_dx, const ga_ssize dx_s0, const ga_ssize dx_s1)
{
dnll = (const %(type_dnll)s *)(((char *)dnll)+offset_dnll);
sm = (const %(type_sm)s *)(((char *)sm)+offset_sm);
y_idx = (const %(type_y_idx)s *)(((char *)y_idx)+offset_y_idx);
dx = (%(type_dx)s *)(((char *)dx)+offset_dx);
dnll = (GLOBAL_MEM const %(type_dnll)s *)(((GLOBAL_MEM char *)dnll)+offset_dnll);
sm = (GLOBAL_MEM const %(type_sm)s *)(((GLOBAL_MEM char *)sm)+offset_sm);
y_idx = (GLOBAL_MEM const %(type_y_idx)s *)(((GLOBAL_MEM char *)y_idx)+offset_y_idx);
dx = (GLOBAL_MEM %(type_dx)s *)(((GLOBAL_MEM char *)dx)+offset_dx);
for (int i = blockIdx.x; i < N; i += gridDim.x)
for (ga_int i = GID_0; i < N; i += GDIM_0)
{
%(wtype_dnll)s dnll_i = %(load_dnll)s(dnll[i * dnll_s0]);
%(type_y_idx)s y_i = y_idx[i * y_idx_s0];
for (int j = threadIdx.x; j < K; j += blockDim.x)
for (ga_int j = LID_0; j < K; j += LDIM_0)
{
if (y_i == j)
{
......@@ -470,13 +467,6 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
}
}
""" % locals(), file=sio)
params = [
'uintp', 'uintp',
gpuarray.GpuArray, 'uintp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp', 'intp'
]
return [Kernel(code=sio.getvalue(), name=kname, params=params,
flags=flags, objvar=k_var)]
......@@ -499,14 +489,12 @@ class GpuSoftmax(GpuKernelBase, Op):
return shape
def c_code_cache_version(self):
return (15,) + inline_softmax.code_version
return (16,) + inline_softmax.code_version
def c_headers(self):
return ['<numpy_compat.h>', '<gpuarray/types.h>']
def c_code(self, node, nodename, inp, out, sub):
if node.inputs[0].type.context.kind != b'cuda':
raise NotImplementedError("cuda only")
dtype_x = node.inputs[0].dtype
work_x = work_dtype(dtype_x)
dtype_z = node.outputs[0].dtype
......@@ -607,67 +595,169 @@ class GpuSoftmax(GpuKernelBase, Op):
type_x = gpuarray.dtype_to_ctype(dtype_x)
type_sm = gpuarray.dtype_to_ctype(dtype_sm)
type_acc = gpuarray.dtype_to_ctype(work_sm)
ctype = gpuarray.dtype_to_ctype(dtype_sm)
params = [
'uintp', 'uintp',
gpuarray.GpuArray, 'uintp', 'intp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp', 'intp'
]
gpuarray.SIZE, gpuarray.SIZE, gpuarray.GpuArray, gpuarray.SIZE,
gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE
]
kernels = []
kname = "kSoftmax"
k_var = "kSoftmax_" + nodename
code = nvcc_kernel(
kname,
params=['const ga_size M', 'const ga_size N',
'const %s * x' % type_x, 'const ga_size offset_x',
'const ga_ssize sx0', 'const ga_ssize sx1',
'%s * sm' % type_sm, 'const ga_size offset_sm',
'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
body=["extern __shared__ %s buf[]" % type_acc,
"%s * buf2 = buf + N" % type_acc,
"x = (const %s *)(((char *)x)+offset_x)" % type_x,
"sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){",
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"buf[tx] = %s(x[blockIDX * sx0 + tx * sx1])" % load_x,
"buf2[tx] = buf[tx]",
"}",
"__syncthreads()",
inline_softmax('N', 'buf', 'buf2', 'threadIdx.x',
'blockDim.x', dtype=work_sm),
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
# This set all value correctly
"sm[blockIDX * sm_s0 + tx * sm_s1] = %s(buf[tx])" % write_sm,
"}",
"__syncthreads()",
"}",
])
code = """
KERNEL void %(kname)s (const ga_size M, const ga_size N, GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x,
const ga_ssize sx0, const ga_ssize sx1, GLOBAL_MEM %(type_sm)s * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1 GA_DECL_SHARED_PARAM(%(type_acc)s, buf))
{
GA_DECL_SHARED_BODY(%(type_acc)s, buf);
LOCAL_MEM %(type_acc)s * buf2 = buf + N;
x = (GLOBAL_MEM const %(type_x)s *)(((GLOBAL_MEM char *)x)+offset_x);
sm = (GLOBAL_MEM %(type_sm)s *)(((GLOBAL_MEM char *)sm)+offset_sm);
for (ga_int blockIDX = GID_0; blockIDX < M; blockIDX += GDIM_0) {
for (ga_int tx = LID_0; tx< N; tx += LDIM_0) {
buf[tx] = %(load_x)s(x[blockIDX * sx0 + tx * sx1]);
buf2[tx] = buf[tx];
}
local_barrier();
{
// This function trashes buf[1..GA_WARP_SIZE],
// leaving the reduction result in buf[0].
if (LID_0 < GA_WARP_SIZE) {
for (ga_int i = LID_0 + GA_WARP_SIZE; i < N; i += GA_WARP_SIZE)
{
buf[LID_0] = max(buf[LID_0], buf[i]);
}
}
local_barrier();
//reduce so that LID_0 0 has the reduction of everything
for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) {
if (LID_0 < _n && LID_0 + _n < N)
buf[LID_0] = max(buf[LID_0], buf[LID_0+_n]);
local_barrier();
}
}
local_barrier();
%(ctype)s row_max = buf[0];
local_barrier();
for(ga_int __i=LID_0; __i<N; __i+=LDIM_0){
buf[__i] = exp(buf2[__i] - row_max);
buf2[__i] = buf[__i];
}
local_barrier();
{
// This function trashes buf[1..GA_WARP_SIZE],
// leaving the reduction result in buf[0].
if (LID_0 < GA_WARP_SIZE) {
for (ga_int i = LID_0 + GA_WARP_SIZE; i < N; i += GA_WARP_SIZE)
{
buf[LID_0] = buf[LID_0] + buf[i];
}
}
local_barrier();
//reduce so that LID_0 0 has the reduction of everything
for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) {
if (LID_0 < _n && LID_0 + _n < N)
buf[LID_0] = buf[LID_0] + buf[LID_0+_n];
local_barrier();
}
}
local_barrier();
%(ctype)s row_sum = buf[0];
local_barrier();
for(ga_int __i=LID_0; __i<N; __i+=LDIM_0) {
buf[__i] = buf2[__i] / row_sum;
}
local_barrier();
for (ga_int tx = LID_0; tx< N; tx += LDIM_0) {
sm[blockIDX * sm_s0 + tx * sm_s1] = %(write_sm)s(buf[tx]);
}
local_barrier();
}
}
""" % locals()
kernels.append(Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var))
kname = "kSoftmax_fixed_shared"
k_var = "kSoftmax_fixed_shared" + nodename
code = nvcc_kernel(
kname,
params=['const ga_size M', 'const ga_size N',
'const %s * x' % type_x, 'const ga_size offset_x',
'const ga_ssize sx0', 'const ga_ssize sx1',
'%s * sm' % type_sm, 'const ga_size offset_sm',
'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
body=["extern __shared__ %s buf[]" % type_acc,
"x = (const %s *)(((char *)x)+offset_x)" % type_x,
"sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){",
"const %s *x_ptr = &x[blockIDX * sx0]" % type_x,
"%s *sm_ptr = &sm[blockIDX * sm_s0]" % type_sm,
inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1',
load_x,
'sm_ptr', 'sm_s1', write_sm,
'threadIdx.x', 'blockDim.x',
dtype=work_sm),
"__syncthreads()",
"}",
])
code = """
KERNEL void %(kname)s (const ga_size M, const ga_size N, GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
GLOBAL_MEM %(type_sm)s * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1 GA_DECL_SHARED_PARAM(%(type_acc)s, buf))
{
GA_DECL_SHARED_BODY(%(type_acc)s, buf);
x = (GLOBAL_MEM const %(type_x)s *)(((GLOBAL_MEM char *)x)+offset_x);
sm = (GLOBAL_MEM %(type_sm)s *)(((GLOBAL_MEM char *)sm)+offset_sm);
for (ga_int blockIDX = GID_0; blockIDX < M; blockIDX += GDIM_0){
GLOBAL_MEM const %(type_x)s *x_ptr = &x[blockIDX * sx0];
GLOBAL_MEM %(type_sm)s *sm_ptr = &sm[blockIDX * sm_s0];
{
// This function trashes buf[1..n_threads],
// leaving the reduction result in buf[0].
%(ctype)s red = %(load_x)s(x_ptr[LID_0 * sx1]);
#pragma unroll 16
for (ga_int i = LID_0 + LDIM_0; i<N; i += LDIM_0) {
red = max(red, %(load_x)s(x_ptr[i * sx1]));
}
buf[LID_0] = red;
local_barrier();
if (LID_0 < GA_WARP_SIZE) {
for (ga_int i = LID_0 + GA_WARP_SIZE; i < LDIM_0; i += GA_WARP_SIZE) {
buf[LID_0] = max(buf[LID_0], buf[i]);
}
}
local_barrier();
//reduce so that LID_0 0 has the reduction of everything
for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) {
if (LID_0 < _n && LID_0 + _n < N)
buf[LID_0] = max(buf[LID_0], buf[LID_0+_n]);
local_barrier();
}
}
local_barrier();
%(ctype)s row_max = buf[0];
local_barrier();
{
// This function trashes buf[1..n_threads],
// leaving the reduction result in buf[0].
%(ctype)s red = exp(%(load_x)s(x_ptr[LID_0 * sx1]) - row_max);
#pragma unroll 16
for (ga_int i = LID_0 + LDIM_0; i<N; i += LDIM_0) {
red = red + exp(%(load_x)s(x_ptr[i * sx1]) - row_max);
}
buf[LID_0] = red;
local_barrier();
if (LID_0 < GA_WARP_SIZE) {
for (ga_int i = LID_0 + GA_WARP_SIZE; i < LDIM_0; i += GA_WARP_SIZE) {
buf[LID_0] = buf[LID_0] + buf[i];
}
}
local_barrier();
//reduce so that LID_0 0 has the reduction of everything
for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) {
if (LID_0 < _n && LID_0 + _n < N)
buf[LID_0] = buf[LID_0] + buf[LID_0+_n];
local_barrier();
}
}
local_barrier();
%(ctype)s row_sum = buf[0];
local_barrier();
for (ga_int tx = LID_0; tx< N; tx += LDIM_0){
sm_ptr[tx * sm_s1] = %(write_sm)s(exp(%(load_x)s(x_ptr[tx * sx1]) - row_max) / row_sum);
}
local_barrier();
local_barrier();
}
}
""" % locals()
kernels.append(Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var))
return kernels
......@@ -695,14 +785,12 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op):
return [shape[0]]
def c_code_cache_version(self):
return (14,) + inline_softmax.code_version
return (15,) + inline_softmax.code_version
def c_headers(self):
return ['<numpy_compat.h>', '<gpuarray/types.h>']
def c_code(self, node, nodename, inp, out, sub):
if node.inputs[0].type.context.kind != b'cuda':
raise NotImplementedError('cuda only')
dtype_x = node.inputs[0].dtype
dtype_b = node.inputs[1].dtype
dtype_z = node.outputs[0].dtype
......@@ -821,74 +909,181 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op):
type_b = gpuarray.dtype_to_ctype(dtype_b)
type_sm = gpuarray.dtype_to_ctype(dtype_sm)
type_acc = gpuarray.dtype_to_ctype(work_sm)
ctype = gpuarray.dtype_to_ctype(dtype_sm)
params = [
'uintp', 'uintp',
gpuarray.GpuArray, 'uintp', 'intp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp', 'intp'
]
gpuarray.SIZE, gpuarray.SIZE,
gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE,
gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE,
gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE,
]
kernels = []
kname = "kSoftmaxWithBias"
k_var = "kSoftmaxWithBias_" + nodename
code = nvcc_kernel(
kname,
params=['const ga_size M', 'const ga_size N',
'const %s * x' % type_x, 'const ga_size offset_x',
'const ga_ssize sx0', 'const ga_ssize sx1',
'const %s * b' % type_b, 'const ga_size offset_b',
'const ga_ssize sb0',
'%s * sm' % type_sm, 'const ga_size offset_sm',
'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
body=["extern __shared__ %s buf[]" % type_acc,
"%s * buf2 = buf + N" % type_acc,
"x = (const %s *)(((char *)x)+offset_x)" % type_x,
"b = (const %s *)(((char *)b)+offset_b)" % type_b,
"sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){",
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"buf[tx] = %s(x[blockIDX * sx0 + tx * sx1])" % load_x,
"buf[tx] += %s(b[tx * sb0])" % load_b,
"buf2[tx] = buf[tx]",
"}",
"__syncthreads()",
inline_softmax('N', 'buf', 'buf2',
'threadIdx.x', 'blockDim.x', work_sm),
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"sm[blockIDX * sm_s0 + tx * sm_s1] = %s(buf[tx])" % write_sm,
"}",
"__syncthreads()",
"}",
])
code = """
KERNEL void %(kname)s (const ga_size M, const ga_size N,
GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
GLOBAL_MEM const %(type_b)s * b, const ga_size offset_b, const ga_ssize sb0,
GLOBAL_MEM %(type_sm)s * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1 GA_DECL_SHARED_PARAM(%(type_acc)s, buf))
{
GA_DECL_SHARED_BODY(%(type_acc)s, buf);
LOCAL_MEM %(type_acc)s * buf2 = buf + N;
x = (GLOBAL_MEM const %(type_x)s *)(((GLOBAL_MEM char *)x)+offset_x);
b = (GLOBAL_MEM const %(type_b)s *)(((GLOBAL_MEM char *)b)+offset_b);
sm = (GLOBAL_MEM %(type_sm)s *)(((GLOBAL_MEM char *)sm)+offset_sm);
for (ga_int blockIDX = GID_0; blockIDX < M; blockIDX += GDIM_0){
for (ga_int tx = LID_0; tx< N; tx += LDIM_0){
buf[tx] = %(load_x)s(x[blockIDX * sx0 + tx * sx1]);
buf[tx] += %(load_b)s(b[tx * sb0]);
buf2[tx] = buf[tx];
}
local_barrier();
{
// This function trashes buf[1..GA_WARP_SIZE],
// leaving the reduction result in buf[0].
if (LID_0 < GA_WARP_SIZE) {
for (ga_int i = LID_0 + GA_WARP_SIZE; i < N; i += GA_WARP_SIZE)
{
buf[LID_0] = max(buf[LID_0], buf[i]);
}
}
local_barrier();
//reduce so that LID_0 0 has the reduction of everything
for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) {
if (LID_0 < _n && LID_0 + _n < N)
buf[LID_0] = max(buf[LID_0], buf[LID_0+_n]);
local_barrier();
}
}
local_barrier();
%(ctype)s row_max = buf[0];
local_barrier();
for(ga_int __i=LID_0; __i<N; __i+=LDIM_0){;
buf[__i] = exp(buf2[__i] - row_max);
buf2[__i] = buf[__i];
}
local_barrier();
{
// This function trashes buf[1..GA_WARP_SIZE],
// leaving the reduction result in buf[0].
if (LID_0 < GA_WARP_SIZE) {
for (ga_int i = LID_0 + GA_WARP_SIZE; i < N; i += GA_WARP_SIZE)
{
buf[LID_0] = buf[LID_0] + buf[i];
}
}
local_barrier();
//reduce so that LID_0 0 has the reduction of everything
for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) {
if (LID_0 < _n && LID_0 + _n < N)
buf[LID_0] = buf[LID_0] + buf[LID_0+_n];
local_barrier();
}
}
local_barrier();
%(ctype)s row_sum = buf[0];
local_barrier();
for(ga_int __i=LID_0; __i<N; __i+=LDIM_0){
buf[__i] = buf2[__i] / row_sum;
}
local_barrier();
for (ga_int tx = LID_0; tx< N; tx += LDIM_0){
sm[blockIDX * sm_s0 + tx * sm_s1] = %(write_sm)s(buf[tx]);
}
local_barrier();
}
}
""" % locals()
kernels.append(Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var))
kname = "kSoftmaxWithBias_fixed_shared"
k_var = "kSoftmaxWithBias_fixed_shared" + nodename
code = nvcc_kernel(
kname,
params=['const ga_size M', 'const ga_size N',
'const %s * x' % type_x, 'const ga_size offset_x',
'const ga_ssize sx0', 'const ga_ssize sx1',
'const %s * b' % type_b, 'const ga_size offset_b',
'const ga_ssize sb0',
'%s * sm' % type_sm, 'const ga_size offset_sm',
'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
body=["extern __shared__ %s buf[]" % type_acc,
"x = (const %s *)(((char *)x)+offset_x)" % type_x,
"b = (const %s *)(((char *)b)+offset_b)" % type_b,
"sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){",
"const %s *x_ptr = &x[blockIDX * sx0]" % type_x,
"%s *sm_ptr = &sm[blockIDX * sm_s0]" % type_sm,
inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1',
load_x,
'sm_ptr', 'sm_s1', write_sm,
'threadIdx.x', 'blockDim.x',
'b', 'sb0', load_b, work_sm),
"__syncthreads()",
"}",
])
code = """
KERNEL void %(kname)s (const ga_size M, const ga_size N,
GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
GLOBAL_MEM const %(type_b)s * b, const ga_size offset_b, const ga_ssize sb0,
GLOBAL_MEM %(type_sm)s * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1 GA_DECL_SHARED_PARAM(%(type_acc)s, buf))
{
GA_DECL_SHARED_BODY(%(type_acc)s, buf);
x = (GLOBAL_MEM const %(type_x)s *)(((GLOBAL_MEM char *)x)+offset_x);
b = (GLOBAL_MEM const %(type_b)s *)(((GLOBAL_MEM char *)b)+offset_b);
sm = (GLOBAL_MEM %(type_sm)s *)(((GLOBAL_MEM char *)sm)+offset_sm);
for (ga_int blockIDX = GID_0; blockIDX < M; blockIDX += GDIM_0){
GLOBAL_MEM const %(type_x)s *x_ptr = &x[blockIDX * sx0];
GLOBAL_MEM %(type_sm)s *sm_ptr = &sm[blockIDX * sm_s0];
{
// This function trashes buf[1..n_threads],
// leaving the reduction result in buf[0].
%(ctype)s red = %(load_x)s(x_ptr[LID_0 * sx1]) + %(load_b)s(b[LID_0 * sb0]);
#pragma unroll 16
for (ga_int i = LID_0 + LDIM_0; i<N; i += LDIM_0) {
red = max(red, %(load_x)s(x_ptr[i * sx1]) + %(load_b)s(b[i * sb0]));
}
buf[LID_0] = red;
local_barrier();
if (LID_0 < GA_WARP_SIZE) {
for (ga_int i = LID_0 + GA_WARP_SIZE; i < LDIM_0; i += GA_WARP_SIZE) {
buf[LID_0] = max(buf[LID_0], buf[i]);
}
}
local_barrier();
//reduce so that LID_0 0 has the reduction of everything
for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) {
if (LID_0 < _n && LID_0 + _n < N)
buf[LID_0] = max(buf[LID_0], buf[LID_0+_n]);
local_barrier();
}
}
local_barrier();
%(ctype)s row_max = buf[0];
local_barrier();
{
// This function trashes buf[1..n_threads],
// leaving the reduction result in buf[0].
%(ctype)s red = exp(%(load_x)s(x_ptr[LID_0 * sx1]) + %(load_b)s(b[LID_0 * sb0]) - row_max);
#pragma unroll 16
for (ga_int i = LID_0 + LDIM_0; i<N; i += LDIM_0) {
red = red + exp(%(load_x)s(x_ptr[i * sx1]) + %(load_b)s(b[i * sb0]) - row_max);
}
buf[LID_0] = red;
local_barrier();
if (LID_0 < GA_WARP_SIZE) {
for (ga_int i = LID_0 + GA_WARP_SIZE; i < LDIM_0; i += GA_WARP_SIZE) {
buf[LID_0] = buf[LID_0] + buf[i];
}
}
local_barrier();
//reduce so that LID_0 0 has the reduction of everything
for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) {
if (LID_0 < _n && LID_0 + _n < N)
buf[LID_0] = buf[LID_0] + buf[LID_0+_n];
local_barrier();
}
}
local_barrier();
%(ctype)s row_sum = buf[0];
local_barrier();
for (ga_int tx = LID_0; tx< N; tx += LDIM_0){
sm_ptr[tx * sm_s1] = %(write_sm)s(exp(%(load_x)s(x_ptr[tx * sx1]) + %(load_b)s(b[tx * sb0]) - row_max) / row_sum);
}
local_barrier();
local_barrier();
}
}
""" % locals()
kernels.append(Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var))
return kernels
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论