Commit 9eb54e01 authored by xiaoqie

Port Softmax to OpenCL

Parent 608e9aef
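The port keeps the kernel logic intact but swaps every CUDA-only construct for libgpuarray's portable macros, so the generated source builds as either a CUDA or an OpenCL kernel. It also drops the `context.kind != b'cuda'` guards and declares kernel parameters with gpuarray.SIZE / gpuarray.SSIZE / gpuarray.GpuArray instead of the old 'uintp' / 'intp' strings. Roughly, as the hunks below show (an illustrative summary, not part of the patch):

    CUDA_TO_GPUARRAY = {                      # illustrative summary only
        "blockIdx.x": "GID_0",                # work-group (block) id
        "gridDim.x": "GDIM_0",                # number of work-groups
        "threadIdx.x": "LID_0",               # local (thread) id
        "blockDim.x": "LDIM_0",               # local (block) size
        "__syncthreads()": "local_barrier()",
        "extern __shared__ T buf[]": "GA_DECL_SHARED_PARAM / GA_DECL_SHARED_BODY",
        "int": "ga_int",                      # plus GLOBAL_MEM / LOCAL_MEM qualifiers on pointers
    }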
@@ -65,41 +65,47 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
        type_y_idx = gpuarray.dtype_to_ctype(dtype_y_idx)
        kname = "k_xent_sm_1hot_bias"
        k_var = "k_xent_sm_1hot_bias_" + nodename
-       if node.inputs[0].type.context.kind != b'cuda':
-           f = ''
-       else:
-           f = '' if dtype_x == 'float64' else 'f'
        f = '' if dtype_x == 'float64' else 'f'
        params = [
            gpuarray.SIZE, gpuarray.SIZE,
            gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE,
            gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE,
            gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE,
            gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE,
            gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE,
            gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE
            ]
        sio = StringIO()
        print("""
KERNEL void %(kname)s(const ga_size M, const ga_size N,
    GLOBAL_MEM const %(type_x)s* x_data, const ga_size offset_x, const ga_ssize xs0, const ga_ssize xs1,
    GLOBAL_MEM const %(type_b)s* b, const ga_size offset_b, const ga_ssize bs0,
    GLOBAL_MEM const %(type_y_idx)s* y_idx_data, const ga_size offset_y_idx, const ga_ssize y_idxs0,
    GLOBAL_MEM %(type_x)s* nll_data, const ga_size offset_nll, const ga_ssize nlls0,
    GLOBAL_MEM %(type_x)s* sm_data, const ga_size offset_sm, const ga_ssize sms0, const ga_ssize sms1,
    GLOBAL_MEM %(type_y_idx)s* am_data, const ga_size offset_am, const ga_ssize ams0 GA_DECL_SHARED_PARAM(%(work_x)s, per_thread_values))
{
  x_data = (GLOBAL_MEM const %(type_x)s *)(((GLOBAL_MEM char *)x_data)+offset_x);
  b = (GLOBAL_MEM const %(type_b)s *)(((GLOBAL_MEM char *)b)+offset_b);
  y_idx_data = (GLOBAL_MEM const %(type_y_idx)s *)(((GLOBAL_MEM char *)y_idx_data)+offset_y_idx);
  nll_data = (GLOBAL_MEM %(type_x)s *)(((GLOBAL_MEM char *)nll_data)+offset_nll);
  sm_data = (GLOBAL_MEM %(type_x)s *)(((GLOBAL_MEM char *)sm_data)+offset_sm);
  am_data = (GLOBAL_MEM %(type_y_idx)s *)(((GLOBAL_MEM char *)am_data)+offset_am);

-  for (int row = blockIdx.x; row < M; row += gridDim.x){
  for (ga_int row = GID_0; row < M; row += GDIM_0){

    GLOBAL_MEM const %(type_x)s* x = x_data + xs0 * row;
    GLOBAL_MEM %(type_x)s* sm = sm_data + sms0 * row;

-    extern LOCAL_MEM %(work_x)s per_thread_values[];
    GA_DECL_SHARED_BODY(%(work_x)s, per_thread_values);
    LOCAL_MEM %(work_x)s row_max, sum, sum_inv;
    LOCAL_MEM ga_int row_max_threadIdx;
    %(work_x)s per_thread_row_max, per_thread_sum;
    ga_int per_thread_row_max_j;
    // COMPUTE ROW MAX AND ARGMAX
@@ -107,20 +113,20 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
    per_thread_row_max = NAN;
    per_thread_row_max_j = 0;
-    for (int j = threadIdx.x; j < N; j += blockDim.x)
    for (ga_int j = LID_0; j < N; j += LDIM_0)
    {
      %(work_x)s row_ij = %(load_x)s(x[j * xs1]) + %(load_b)s(b[j * bs0]);
      per_thread_row_max_j = (row_ij > per_thread_row_max) ? j : per_thread_row_max_j;
      per_thread_row_max = fmax%(f)s(row_ij, per_thread_row_max);
    }
-    per_thread_values[threadIdx.x] = per_thread_row_max;
    per_thread_values[LID_0] = per_thread_row_max;
    local_barrier();
-    if (threadIdx.x == 0) {
    if (LID_0 == 0) {
      row_max = NAN;
      row_max_threadIdx = 0;
-      for (int j = 0; j < blockDim.x; j++)
      for (ga_int j = 0; j < LDIM_0; j++)
      {
        %(work_x)s per_thread_max = per_thread_values[j];
        row_max_threadIdx = (per_thread_max > row_max) ? j : row_max_threadIdx;
@@ -132,11 +138,11 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
    // The thread with the higest max writes out which of its
    // values was the winner.
-    if (threadIdx.x == row_max_threadIdx) am_data[row * ams0] = per_thread_row_max_j;
    if (LID_0 == row_max_threadIdx) am_data[row * ams0] = per_thread_row_max_j;

    // COMPUTE SOFTMAX
    per_thread_sum = 0.0;
-    for (int j = threadIdx.x; j < N; j += blockDim.x)
    for (ga_int j = LID_0; j < N; j += LDIM_0)
    {
      %(work_x)s row_ij = %(load_x)s(x[j * xs1]) + %(load_b)s(b[j * bs0]);
      %(work_x)s sm_ij = exp%(f)s(row_ij - row_max);
@@ -144,13 +150,13 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
      sm[j * sms1] = %(write_x)s(sm_ij);
    }
-    per_thread_values[threadIdx.x] = per_thread_sum;
    per_thread_values[LID_0] = per_thread_sum;
    local_barrier();
-    if (threadIdx.x == 0) {
    if (LID_0 == 0) {
      sum = 0.0;
-      for (int j = 0; j < blockDim.x; j++) {
      for (ga_int j = 0; j < LDIM_0; j++) {
        sum += per_thread_values[j];
      }
      sum_inv = 1.0 / sum;
@@ -158,12 +164,12 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
    local_barrier();
-    for (int j = threadIdx.x; j < N; j += blockDim.x) {
    for (ga_int j = LID_0; j < N; j += LDIM_0) {
      sm[j * sms1] = %(write_x)s(%(load_x)s(sm[j * sms1]) * sum_inv);
    }
-    if (threadIdx.x == 0) {
    if (LID_0 == 0) {
-      const %(type_y_idx)s y_idx = (int)y_idx_data[row * y_idxs0];
      const %(type_y_idx)s y_idx = (ga_int)y_idx_data[row * y_idxs0];
      if ((y_idx >= N || y_idx < 0)) {
        // raise some suspicion.
        nll_data[row * nlls0] = %(write_x)s(0.0);
@@ -177,21 +183,11 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
  }
}
        """ % locals(), file=sio)
-       params = [
-           'uintp', 'uintp',
-           gpuarray.GpuArray, 'uintp', 'intp', 'intp',
-           gpuarray.GpuArray, 'uintp', 'intp',
-           gpuarray.GpuArray, 'uintp', 'intp',
-           gpuarray.GpuArray, 'uintp', 'intp',
-           gpuarray.GpuArray, 'uintp', 'intp', 'intp',
-           gpuarray.GpuArray, 'uintp', 'intp'
-           ]
        return [Kernel(code=sio.getvalue(), name=kname, params=params,
                       flags=flags, objvar=k_var)]

    def c_code(self, node, nodename, inp, out, sub):
-       if node.inputs[0].type.context.kind != b'cuda':
-           raise NotImplementedError('cuda only')
        itemsize_x = np.dtype(node.inputs[0].dtype).itemsize
        worksize_x = np.dtype(work_dtype(node.inputs[0].dtype)).itemsize
        itemsize_b = np.dtype(node.inputs[1].dtype).itemsize
@@ -266,7 +262,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
        return sio.getvalue()

    def c_code_cache_version(self):
-       return (12,)
        return (13,)


gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1HotWithBias()
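For readers unfamiliar with this fused op: per row i, the kernel produces the softmax of x[i] + b, the argmax of that row, and the negative log-likelihood of the target index. A minimal NumPy sketch of that contract (illustrative only, not part of the patch; it does not reproduce the kernel's handling of out-of-range y_idx):

    import numpy as np

    def xent_sm_1hot_bias_reference(x, b, y_idx):
        """CPU reference for what k_xent_sm_1hot_bias computes per row."""
        row = x + b                                   # (M, N) + (N,)
        row_max = row.max(axis=1, keepdims=True)      # COMPUTE ROW MAX AND ARGMAX
        am = row.argmax(axis=1)
        e = np.exp(row - row_max)                     # COMPUTE SOFTMAX
        sm = e / e.sum(axis=1, keepdims=True)
        nll = -np.log(sm[np.arange(len(y_idx)), y_idx])
        return nll, sm, am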
@@ -292,14 +288,12 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
        return Apply(self, [dnll, sm, y_idx], [sm.type()])

    def c_code_cache_version(self):
-       return (12,)
        return (13,)

    def c_headers(self):
        return ['<numpy_compat.h>', '<gpuarray/types.h>']

    def c_code(self, node, nodename, inp, out, sub):
-       if node.inputs[0].type.context.kind != b'cuda':
-           raise NotImplementedError("cuda only")
        typecode_dx = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
        itemsize_dnll = np.dtype(node.inputs[0].dtype).itemsize
        itemsize_sm = np.dtype(node.inputs[1].dtype).itemsize
@@ -429,30 +423,33 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
        type_dx = gpuarray.dtype_to_ctype(dtype_dx)
        kname = "kCrossEntropySoftmax1HotWithBiasDx"
        k_var = "kCrossEntropySoftmax1HotWithBiasDx_" + nodename
        params = [
            gpuarray.SIZE, gpuarray.SIZE,
            gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE,
            gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE,
            gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE,
            gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE,
            ]
        sio = StringIO()
        print("""
KERNEL void %(kname)s(
    const ga_size N, const ga_size K,
    GLOBAL_MEM const %(type_dnll)s* dnll, const ga_size offset_dnll, const ga_ssize dnll_s0,
    GLOBAL_MEM const %(type_sm)s* sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1,
    GLOBAL_MEM const %(type_y_idx)s* y_idx, const ga_size offset_y_idx, const ga_ssize y_idx_s0,
    GLOBAL_MEM %(type_dx)s* dx, const ga_size offset_dx, const ga_ssize dx_s0, const ga_ssize dx_s1)
{
  dnll = (GLOBAL_MEM const %(type_dnll)s *)(((GLOBAL_MEM char *)dnll)+offset_dnll);
  sm = (GLOBAL_MEM const %(type_sm)s *)(((GLOBAL_MEM char *)sm)+offset_sm);
  y_idx = (GLOBAL_MEM const %(type_y_idx)s *)(((GLOBAL_MEM char *)y_idx)+offset_y_idx);
  dx = (GLOBAL_MEM %(type_dx)s *)(((GLOBAL_MEM char *)dx)+offset_dx);

-  for (int i = blockIdx.x; i < N; i += gridDim.x)
  for (ga_int i = GID_0; i < N; i += GDIM_0)
  {
    %(wtype_dnll)s dnll_i = %(load_dnll)s(dnll[i * dnll_s0]);
    %(type_y_idx)s y_i = y_idx[i * y_idx_s0];

-    for (int j = threadIdx.x; j < K; j += blockDim.x)
    for (ga_int j = LID_0; j < K; j += LDIM_0)
    {
      if (y_i == j)
      {
@@ -470,13 +467,6 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
  }
}
        """ % locals(), file=sio)
-       params = [
-           'uintp', 'uintp',
-           gpuarray.GpuArray, 'uintp', 'intp',
-           gpuarray.GpuArray, 'uintp', 'intp', 'intp',
-           gpuarray.GpuArray, 'uintp', 'intp',
-           gpuarray.GpuArray, 'uintp', 'intp', 'intp'
-           ]
        return [Kernel(code=sio.getvalue(), name=kname, params=params,
                       flags=flags, objvar=k_var)]
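The Dx kernel is the gradient of the fused op above: each output element is the softmax value scaled by dnll, with dnll subtracted once at the target column. A NumPy sketch under that reading (illustrative only, not part of the patch):

    import numpy as np

    def xent_sm_1hot_bias_dx_reference(dnll, sm, y_idx):
        """CPU reference: dx[i, j] = dnll[i] * (sm[i, j] - (j == y_idx[i]))."""
        dx = sm * dnll[:, None]
        dx[np.arange(len(y_idx)), y_idx] -= dnll
        return dx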
@@ -499,14 +489,12 @@ class GpuSoftmax(GpuKernelBase, Op):
        return shape

    def c_code_cache_version(self):
-       return (15,) + inline_softmax.code_version
        return (16,) + inline_softmax.code_version

    def c_headers(self):
        return ['<numpy_compat.h>', '<gpuarray/types.h>']

    def c_code(self, node, nodename, inp, out, sub):
-       if node.inputs[0].type.context.kind != b'cuda':
-           raise NotImplementedError("cuda only")
        dtype_x = node.inputs[0].dtype
        work_x = work_dtype(dtype_x)
        dtype_z = node.outputs[0].dtype
@@ -607,67 +595,169 @@ class GpuSoftmax(GpuKernelBase, Op):
        type_x = gpuarray.dtype_to_ctype(dtype_x)
        type_sm = gpuarray.dtype_to_ctype(dtype_sm)
        type_acc = gpuarray.dtype_to_ctype(work_sm)
        ctype = gpuarray.dtype_to_ctype(dtype_sm)
-       params = [
-           'uintp', 'uintp',
-           gpuarray.GpuArray, 'uintp', 'intp', 'intp',
-           gpuarray.GpuArray, 'uintp', 'intp', 'intp'
-           ]
        params = [
            gpuarray.SIZE, gpuarray.SIZE, gpuarray.GpuArray, gpuarray.SIZE,
            gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE
            ]
        kernels = []
        kname = "kSoftmax"
        k_var = "kSoftmax_" + nodename
-       code = nvcc_kernel(
-           kname,
-           params=['const ga_size M', 'const ga_size N',
-                   'const %s * x' % type_x, 'const ga_size offset_x',
-                   'const ga_ssize sx0', 'const ga_ssize sx1',
-                   '%s * sm' % type_sm, 'const ga_size offset_sm',
-                   'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
-           body=["extern __shared__ %s buf[]" % type_acc,
-                 "%s * buf2 = buf + N" % type_acc,
-                 "x = (const %s *)(((char *)x)+offset_x)" % type_x,
-                 "sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
-                 "for (int blockIDX = blockIdx.x; blockIDX < M;"
-                 " blockIDX += gridDim.x){",
-                 "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
-                 "buf[tx] = %s(x[blockIDX * sx0 + tx * sx1])" % load_x,
-                 "buf2[tx] = buf[tx]",
-                 "}",
-                 "__syncthreads()",
-                 inline_softmax('N', 'buf', 'buf2', 'threadIdx.x',
-                                'blockDim.x', dtype=work_sm),
-                 "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
-                 # This set all value correctly
-                 "sm[blockIDX * sm_s0 + tx * sm_s1] = %s(buf[tx])" % write_sm,
-                 "}",
-                 "__syncthreads()",
-                 "}",
-                 ])
        code = """
KERNEL void %(kname)s (const ga_size M, const ga_size N, GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x,
const ga_ssize sx0, const ga_ssize sx1, GLOBAL_MEM %(type_sm)s * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1 GA_DECL_SHARED_PARAM(%(type_acc)s, buf))
{
GA_DECL_SHARED_BODY(%(type_acc)s, buf);

LOCAL_MEM %(type_acc)s * buf2 = buf + N;
x = (GLOBAL_MEM const %(type_x)s *)(((GLOBAL_MEM char *)x)+offset_x);
sm = (GLOBAL_MEM %(type_sm)s *)(((GLOBAL_MEM char *)sm)+offset_sm);
for (ga_int blockIDX = GID_0; blockIDX < M; blockIDX += GDIM_0) {
for (ga_int tx = LID_0; tx< N; tx += LDIM_0) {
buf[tx] = %(load_x)s(x[blockIDX * sx0 + tx * sx1]);
buf2[tx] = buf[tx];
}
local_barrier();

{
// This function trashes buf[1..GA_WARP_SIZE],
// leaving the reduction result in buf[0].

if (LID_0 < GA_WARP_SIZE) {
for (ga_int i = LID_0 + GA_WARP_SIZE; i < N; i += GA_WARP_SIZE)
{
buf[LID_0] = max(buf[LID_0], buf[i]);
}
}
local_barrier();
//reduce so that LID_0 0 has the reduction of everything
for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) {
if (LID_0 < _n && LID_0 + _n < N)
buf[LID_0] = max(buf[LID_0], buf[LID_0+_n]);
local_barrier();
}
}
local_barrier();
%(ctype)s row_max = buf[0];
local_barrier();
for(ga_int __i=LID_0; __i<N; __i+=LDIM_0){
buf[__i] = exp(buf2[__i] - row_max);
buf2[__i] = buf[__i];
}
local_barrier();
{
// This function trashes buf[1..GA_WARP_SIZE],
// leaving the reduction result in buf[0].
if (LID_0 < GA_WARP_SIZE) {
for (ga_int i = LID_0 + GA_WARP_SIZE; i < N; i += GA_WARP_SIZE)
{
buf[LID_0] = buf[LID_0] + buf[i];
}
}
local_barrier();
//reduce so that LID_0 0 has the reduction of everything
for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) {
if (LID_0 < _n && LID_0 + _n < N)
buf[LID_0] = buf[LID_0] + buf[LID_0+_n];
local_barrier();
}
}
local_barrier();
%(ctype)s row_sum = buf[0];
local_barrier();
for(ga_int __i=LID_0; __i<N; __i+=LDIM_0) {
buf[__i] = buf2[__i] / row_sum;
}
local_barrier();
for (ga_int tx = LID_0; tx< N; tx += LDIM_0) {
sm[blockIDX * sm_s0 + tx * sm_s1] = %(write_sm)s(buf[tx]);
}
local_barrier();
}
}
""" % locals()
        kernels.append(Kernel(code=code, name=kname, params=params,
                              flags=flags, objvar=k_var))

        kname = "kSoftmax_fixed_shared"
        k_var = "kSoftmax_fixed_shared" + nodename
-       code = nvcc_kernel(
-           kname,
-           params=['const ga_size M', 'const ga_size N',
-                   'const %s * x' % type_x, 'const ga_size offset_x',
-                   'const ga_ssize sx0', 'const ga_ssize sx1',
-                   '%s * sm' % type_sm, 'const ga_size offset_sm',
-                   'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
-           body=["extern __shared__ %s buf[]" % type_acc,
-                 "x = (const %s *)(((char *)x)+offset_x)" % type_x,
-                 "sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
-                 "for (int blockIDX = blockIdx.x; blockIDX < M;"
-                 " blockIDX += gridDim.x){",
-                 "const %s *x_ptr = &x[blockIDX * sx0]" % type_x,
-                 "%s *sm_ptr = &sm[blockIDX * sm_s0]" % type_sm,
-                 inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1',
-                                             load_x,
-                                             'sm_ptr', 'sm_s1', write_sm,
-                                             'threadIdx.x', 'blockDim.x',
-                                             dtype=work_sm),
-                 "__syncthreads()",
-                 "}",
-                 ])
        code = """
KERNEL void %(kname)s (const ga_size M, const ga_size N, GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
GLOBAL_MEM %(type_sm)s * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1 GA_DECL_SHARED_PARAM(%(type_acc)s, buf))
{
GA_DECL_SHARED_BODY(%(type_acc)s, buf);

x = (GLOBAL_MEM const %(type_x)s *)(((GLOBAL_MEM char *)x)+offset_x);
sm = (GLOBAL_MEM %(type_sm)s *)(((GLOBAL_MEM char *)sm)+offset_sm);
for (ga_int blockIDX = GID_0; blockIDX < M; blockIDX += GDIM_0){
GLOBAL_MEM const %(type_x)s *x_ptr = &x[blockIDX * sx0];
GLOBAL_MEM %(type_sm)s *sm_ptr = &sm[blockIDX * sm_s0];

{
// This function trashes buf[1..n_threads],
// leaving the reduction result in buf[0].
%(ctype)s red = %(load_x)s(x_ptr[LID_0 * sx1]);
#pragma unroll 16
for (ga_int i = LID_0 + LDIM_0; i<N; i += LDIM_0) {
red = max(red, %(load_x)s(x_ptr[i * sx1]));
}
buf[LID_0] = red;
local_barrier();
if (LID_0 < GA_WARP_SIZE) {
for (ga_int i = LID_0 + GA_WARP_SIZE; i < LDIM_0; i += GA_WARP_SIZE) {
buf[LID_0] = max(buf[LID_0], buf[i]);
}
}
local_barrier();
//reduce so that LID_0 0 has the reduction of everything
for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) {
if (LID_0 < _n && LID_0 + _n < N)
buf[LID_0] = max(buf[LID_0], buf[LID_0+_n]);
local_barrier();
}
}
local_barrier();
%(ctype)s row_max = buf[0];
local_barrier();
{
// This function trashes buf[1..n_threads],
// leaving the reduction result in buf[0].
%(ctype)s red = exp(%(load_x)s(x_ptr[LID_0 * sx1]) - row_max);
#pragma unroll 16
for (ga_int i = LID_0 + LDIM_0; i<N; i += LDIM_0) {
red = red + exp(%(load_x)s(x_ptr[i * sx1]) - row_max);
}
buf[LID_0] = red;
local_barrier();
if (LID_0 < GA_WARP_SIZE) {
for (ga_int i = LID_0 + GA_WARP_SIZE; i < LDIM_0; i += GA_WARP_SIZE) {
buf[LID_0] = buf[LID_0] + buf[i];
}
}
local_barrier();
//reduce so that LID_0 0 has the reduction of everything
for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) {
if (LID_0 < _n && LID_0 + _n < N)
buf[LID_0] = buf[LID_0] + buf[LID_0+_n];
local_barrier();
}
}
local_barrier();
%(ctype)s row_sum = buf[0];
local_barrier();
for (ga_int tx = LID_0; tx< N; tx += LDIM_0){
sm_ptr[tx * sm_s1] = %(write_sm)s(exp(%(load_x)s(x_ptr[tx * sx1]) - row_max) / row_sum);
}
local_barrier();
local_barrier();
}
}
""" % locals()
        kernels.append(Kernel(code=code, name=kname, params=params,
                              flags=flags, objvar=k_var))
        return kernels
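Both kSoftmax variants compute the usual numerically stable softmax, one work-group per row: find the row max, exponentiate the shifted values, sum them, then divide. A NumPy sketch of the same three phases (illustrative only, not part of the patch):

    import numpy as np

    def softmax_rows_reference(x):
        """What kSoftmax produces for a 2-D input, row by row."""
        row_max = x.max(axis=1, keepdims=True)   # first block reduction (max)
        e = np.exp(x - row_max)                  # buf[i] = exp(buf2[i] - row_max)
        return e / e.sum(axis=1, keepdims=True)  # second reduction (sum), then scale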
@@ -695,14 +785,12 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op):
        return [shape[0]]

    def c_code_cache_version(self):
-       return (14,) + inline_softmax.code_version
        return (15,) + inline_softmax.code_version

    def c_headers(self):
        return ['<numpy_compat.h>', '<gpuarray/types.h>']

    def c_code(self, node, nodename, inp, out, sub):
-       if node.inputs[0].type.context.kind != b'cuda':
-           raise NotImplementedError('cuda only')
        dtype_x = node.inputs[0].dtype
        dtype_b = node.inputs[1].dtype
        dtype_z = node.outputs[0].dtype
@@ -821,74 +909,181 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op):
        type_b = gpuarray.dtype_to_ctype(dtype_b)
        type_sm = gpuarray.dtype_to_ctype(dtype_sm)
        type_acc = gpuarray.dtype_to_ctype(work_sm)
        ctype = gpuarray.dtype_to_ctype(dtype_sm)
-       params = [
-           'uintp', 'uintp',
-           gpuarray.GpuArray, 'uintp', 'intp', 'intp',
-           gpuarray.GpuArray, 'uintp', 'intp',
-           gpuarray.GpuArray, 'uintp', 'intp', 'intp'
-           ]
        params = [
            gpuarray.SIZE, gpuarray.SIZE,
            gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE,
            gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE,
            gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE,
            ]
        kernels = []
        kname = "kSoftmaxWithBias"
        k_var = "kSoftmaxWithBias_" + nodename
-       code = nvcc_kernel(
-           kname,
-           params=['const ga_size M', 'const ga_size N',
-                   'const %s * x' % type_x, 'const ga_size offset_x',
-                   'const ga_ssize sx0', 'const ga_ssize sx1',
-                   'const %s * b' % type_b, 'const ga_size offset_b',
-                   'const ga_ssize sb0',
-                   '%s * sm' % type_sm, 'const ga_size offset_sm',
-                   'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
-           body=["extern __shared__ %s buf[]" % type_acc,
-                 "%s * buf2 = buf + N" % type_acc,
-                 "x = (const %s *)(((char *)x)+offset_x)" % type_x,
-                 "b = (const %s *)(((char *)b)+offset_b)" % type_b,
-                 "sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
-                 "for (int blockIDX = blockIdx.x; blockIDX < M;"
-                 " blockIDX += gridDim.x){",
-                 "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
-                 "buf[tx] = %s(x[blockIDX * sx0 + tx * sx1])" % load_x,
-                 "buf[tx] += %s(b[tx * sb0])" % load_b,
-                 "buf2[tx] = buf[tx]",
-                 "}",
-                 "__syncthreads()",
-                 inline_softmax('N', 'buf', 'buf2',
-                                'threadIdx.x', 'blockDim.x', work_sm),
-                 "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
-                 "sm[blockIDX * sm_s0 + tx * sm_s1] = %s(buf[tx])" % write_sm,
-                 "}",
-                 "__syncthreads()",
-                 "}",
-                 ])
        code = """
KERNEL void %(kname)s (const ga_size M, const ga_size N,
GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
GLOBAL_MEM const %(type_b)s * b, const ga_size offset_b, const ga_ssize sb0,
GLOBAL_MEM %(type_sm)s * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1 GA_DECL_SHARED_PARAM(%(type_acc)s, buf))
{
GA_DECL_SHARED_BODY(%(type_acc)s, buf);

LOCAL_MEM %(type_acc)s * buf2 = buf + N;
x = (GLOBAL_MEM const %(type_x)s *)(((GLOBAL_MEM char *)x)+offset_x);
b = (GLOBAL_MEM const %(type_b)s *)(((GLOBAL_MEM char *)b)+offset_b);
sm = (GLOBAL_MEM %(type_sm)s *)(((GLOBAL_MEM char *)sm)+offset_sm);
for (ga_int blockIDX = GID_0; blockIDX < M; blockIDX += GDIM_0){
for (ga_int tx = LID_0; tx< N; tx += LDIM_0){
buf[tx] = %(load_x)s(x[blockIDX * sx0 + tx * sx1]);
buf[tx] += %(load_b)s(b[tx * sb0]);
buf2[tx] = buf[tx];
}
local_barrier();

{
// This function trashes buf[1..GA_WARP_SIZE],
// leaving the reduction result in buf[0].

if (LID_0 < GA_WARP_SIZE) {
for (ga_int i = LID_0 + GA_WARP_SIZE; i < N; i += GA_WARP_SIZE)
{
buf[LID_0] = max(buf[LID_0], buf[i]);
}
}
local_barrier();
//reduce so that LID_0 0 has the reduction of everything
for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) {
if (LID_0 < _n && LID_0 + _n < N)
buf[LID_0] = max(buf[LID_0], buf[LID_0+_n]);
local_barrier();
}
}
local_barrier();
%(ctype)s row_max = buf[0];
local_barrier();
for(ga_int __i=LID_0; __i<N; __i+=LDIM_0){;
buf[__i] = exp(buf2[__i] - row_max);
buf2[__i] = buf[__i];
}
local_barrier();
{
// This function trashes buf[1..GA_WARP_SIZE],
// leaving the reduction result in buf[0].
if (LID_0 < GA_WARP_SIZE) {
for (ga_int i = LID_0 + GA_WARP_SIZE; i < N; i += GA_WARP_SIZE)
{
buf[LID_0] = buf[LID_0] + buf[i];
}
}
local_barrier();
//reduce so that LID_0 0 has the reduction of everything
for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) {
if (LID_0 < _n && LID_0 + _n < N)
buf[LID_0] = buf[LID_0] + buf[LID_0+_n];
local_barrier();
}
}
local_barrier();
%(ctype)s row_sum = buf[0];
local_barrier();
for(ga_int __i=LID_0; __i<N; __i+=LDIM_0){
buf[__i] = buf2[__i] / row_sum;
}
local_barrier();
for (ga_int tx = LID_0; tx< N; tx += LDIM_0){
sm[blockIDX * sm_s0 + tx * sm_s1] = %(write_sm)s(buf[tx]);
}
local_barrier();
}
}
""" % locals()
        kernels.append(Kernel(code=code, name=kname, params=params,
                              flags=flags, objvar=k_var))

        kname = "kSoftmaxWithBias_fixed_shared"
        k_var = "kSoftmaxWithBias_fixed_shared" + nodename
-       code = nvcc_kernel(
-           kname,
-           params=['const ga_size M', 'const ga_size N',
-                   'const %s * x' % type_x, 'const ga_size offset_x',
-                   'const ga_ssize sx0', 'const ga_ssize sx1',
-                   'const %s * b' % type_b, 'const ga_size offset_b',
-                   'const ga_ssize sb0',
-                   '%s * sm' % type_sm, 'const ga_size offset_sm',
-                   'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
-           body=["extern __shared__ %s buf[]" % type_acc,
-                 "x = (const %s *)(((char *)x)+offset_x)" % type_x,
-                 "b = (const %s *)(((char *)b)+offset_b)" % type_b,
-                 "sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
-                 "for (int blockIDX = blockIdx.x; blockIDX < M;"
-                 " blockIDX += gridDim.x){",
-                 "const %s *x_ptr = &x[blockIDX * sx0]" % type_x,
-                 "%s *sm_ptr = &sm[blockIDX * sm_s0]" % type_sm,
-                 inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1',
-                                             load_x,
-                                             'sm_ptr', 'sm_s1', write_sm,
-                                             'threadIdx.x', 'blockDim.x',
-                                             'b', 'sb0', load_b, work_sm),
-                 "__syncthreads()",
-                 "}",
-                 ])
        code = """
KERNEL void %(kname)s (const ga_size M, const ga_size N,
GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
GLOBAL_MEM const %(type_b)s * b, const ga_size offset_b, const ga_ssize sb0,
GLOBAL_MEM %(type_sm)s * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1 GA_DECL_SHARED_PARAM(%(type_acc)s, buf))
{
GA_DECL_SHARED_BODY(%(type_acc)s, buf);

x = (GLOBAL_MEM const %(type_x)s *)(((GLOBAL_MEM char *)x)+offset_x);
b = (GLOBAL_MEM const %(type_b)s *)(((GLOBAL_MEM char *)b)+offset_b);
sm = (GLOBAL_MEM %(type_sm)s *)(((GLOBAL_MEM char *)sm)+offset_sm);
for (ga_int blockIDX = GID_0; blockIDX < M; blockIDX += GDIM_0){
GLOBAL_MEM const %(type_x)s *x_ptr = &x[blockIDX * sx0];
GLOBAL_MEM %(type_sm)s *sm_ptr = &sm[blockIDX * sm_s0];

{
// This function trashes buf[1..n_threads],
// leaving the reduction result in buf[0].
%(ctype)s red = %(load_x)s(x_ptr[LID_0 * sx1]) + %(load_b)s(b[LID_0 * sb0]);
#pragma unroll 16
for (ga_int i = LID_0 + LDIM_0; i<N; i += LDIM_0) {
red = max(red, %(load_x)s(x_ptr[i * sx1]) + %(load_b)s(b[i * sb0]));
}
buf[LID_0] = red;
local_barrier();
if (LID_0 < GA_WARP_SIZE) {
for (ga_int i = LID_0 + GA_WARP_SIZE; i < LDIM_0; i += GA_WARP_SIZE) {
buf[LID_0] = max(buf[LID_0], buf[i]);
}
}
local_barrier();
//reduce so that LID_0 0 has the reduction of everything
for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) {
if (LID_0 < _n && LID_0 + _n < N)
buf[LID_0] = max(buf[LID_0], buf[LID_0+_n]);
local_barrier();
}
}
local_barrier();
%(ctype)s row_max = buf[0];
local_barrier();
{
// This function trashes buf[1..n_threads],
// leaving the reduction result in buf[0].
%(ctype)s red = exp(%(load_x)s(x_ptr[LID_0 * sx1]) + %(load_b)s(b[LID_0 * sb0]) - row_max);
#pragma unroll 16
for (ga_int i = LID_0 + LDIM_0; i<N; i += LDIM_0) {
red = red + exp(%(load_x)s(x_ptr[i * sx1]) + %(load_b)s(b[i * sb0]) - row_max);
}
buf[LID_0] = red;
local_barrier();
if (LID_0 < GA_WARP_SIZE) {
for (ga_int i = LID_0 + GA_WARP_SIZE; i < LDIM_0; i += GA_WARP_SIZE) {
buf[LID_0] = buf[LID_0] + buf[i];
}
}
local_barrier();
//reduce so that LID_0 0 has the reduction of everything
for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) {
if (LID_0 < _n && LID_0 + _n < N)
buf[LID_0] = buf[LID_0] + buf[LID_0+_n];
local_barrier();
}
}
local_barrier();
%(ctype)s row_sum = buf[0];
local_barrier();
for (ga_int tx = LID_0; tx< N; tx += LDIM_0){
sm_ptr[tx * sm_s1] = %(write_sm)s(exp(%(load_x)s(x_ptr[tx * sx1]) + %(load_b)s(b[tx * sb0]) - row_max) / row_sum);
}
local_barrier();
local_barrier();
}
}
""" % locals()
        kernels.append(Kernel(code=code, name=kname, params=params,
                              flags=flags, objvar=k_var))
        return kernels
......
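A note on the reduction pattern that now replaces the inline_softmax / inline_softmax_fixed_shared helpers: the first GA_WARP_SIZE threads each fold a strided slice of the shared buffer, then a binary-tree pass leaves the result in buf[0]. A CPU model of that scheme (illustrative only; warp_size stands in for libgpuarray's GA_WARP_SIZE and op for max or addition):

    def block_reduce_reference(buf, warp_size=32, op=max):
        """Model of the shared-memory reduction the new kernels inline."""
        buf = list(buf)
        n = len(buf)
        # strided pass: lane i folds buf[i], buf[i + warp_size], buf[i + 2*warp_size], ...
        for lane in range(min(warp_size, n)):
            for i in range(lane + warp_size, n, warp_size):
                buf[lane] = op(buf[lane], buf[i])
        # tree pass: halve the active range until only buf[0] remains
        half = warp_size // 2
        while half > 0:
            for lane in range(half):
                if lane + half < n:
                    buf[lane] = op(buf[lane], buf[lane + half])
            half //= 2
        return buf[0]

    # e.g. block_reduce_reference(values, op=max) models the row-max phase,
    # and block_reduce_reference(values, op=lambda a, b: a + b) the row-sum phase.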