提交 0f9a53ba authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #3651 from abergeron/port_speedup

Port the speedup from #3163 to the new backend.
...@@ -58,11 +58,13 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op): ...@@ -58,11 +58,13 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
write_x = write_w(dtype_x) write_x = write_w(dtype_x)
write_b = write_w(dtype_b) write_b = write_w(dtype_b)
flags = Kernel.get_flags(dtype_x, dtype_b, dtype_y_idx) flags = Kernel.get_flags(dtype_x, dtype_b, dtype_y_idx)
type_x = gpuarray.dtype_to_ctype(work_x) type_x = gpuarray.dtype_to_ctype(dtype_x)
type_b = gpuarray.dtype_to_ctype(work_b) type_b = gpuarray.dtype_to_ctype(dtype_b)
work_x = gpuarray.dtype_to_ctype(work_x)
type_y_idx = gpuarray.dtype_to_ctype(dtype_y_idx) type_y_idx = gpuarray.dtype_to_ctype(dtype_y_idx)
kname = "k_xent_sm_1hot_bias" kname = "k_xent_sm_1hot_bias"
k_var = "k_xent_sm_1hot_bias_" + nodename k_var = "k_xent_sm_1hot_bias_" + nodename
f = '' if dtype_x == 'float64' else 'f'
sio = StringIO() sio = StringIO()
print(""" print("""
KERNEL void %(kname)s(const ga_size M, const ga_size N, KERNEL void %(kname)s(const ga_size M, const ga_size N,
...@@ -89,49 +91,88 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op): ...@@ -89,49 +91,88 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
for (int row = blockIdx.x; row < M; row += gridDim.x){ for (int row = blockIdx.x; row < M; row += gridDim.x){
const %(type_x)s* x = x_data + xs0 * row; const %(type_x)s* x = x_data + xs0 * row;
const %(type_y_idx)s y_idx = y_idx_data[row * y_idxs0];
%(type_x)s* sm = sm_data + sms0 * row; %(type_x)s* sm = sm_data + sms0 * row;
%(type_x)s sum = 0.0; extern LOCAL_MEM %(work_x)s per_thread_values[];
int row_max_j = 0; LOCAL_MEM %(work_x)s row_max, sum, sum_inv;
%(type_x)s row_max = %(load_x)s(x[0]) + %(load_b)s(b[0]); LOCAL_MEM int row_max_threadIdx;
for (int j = 1; j < N; ++j)
%(work_x)s per_thread_row_max, per_thread_sum;
int per_thread_row_max_j;
// COMPUTE ROW MAX AND ARGMAX
// compute separate per-thread maximums and argmaxes
per_thread_row_max = NAN;
per_thread_row_max_j = 0;
for (int j = threadIdx.x; j < N; j += blockDim.x)
{ {
%(type_x)s row_ij = %(load_x)s(x[j*xs1]) + float row_ij = %(load_x)s(x[j * xs1]) + %(load_b)s(b[j * bs0]);
%(load_b)s(b[j*bs0]); per_thread_row_max_j = (row_ij > per_thread_row_max) ? j : per_thread_row_max_j;
//todo: store to shared memory per_thread_row_max = fmax%(f)s(row_ij, per_thread_row_max);
row_max_j = (row_ij > row_max) ? j : row_max_j;
row_max = (row_ij > row_max) ? row_ij : row_max;
} }
//compute the exp per_thread_values[threadIdx.x] = per_thread_row_max;
for (int j = 0; j < N; ++j)
{ local_barrier();
%(type_x)s row_ij = %(load_x)s(x[j*xs1]) +
%(load_b)s(b[j*bs0]); if (threadIdx.x == 0) {
%(type_x)s sm_ij = exp(row_ij - row_max); row_max = NAN;
sum += sm_ij; row_max_threadIdx = 0;
sm[j * sms1] = %(write_x)s(sm_ij); for (int j = 0; j < blockDim.x; j++)
{
float per_thread_max = per_thread_values[j];
row_max_threadIdx = (per_thread_max > row_max) ? j : row_max_threadIdx;
row_max = fmax%(f)s(per_thread_max, row_max);
}
} }
%(type_x)s sum_inv = 1.0 / sum;
for (int j = 0; j < N; ++j) local_barrier();
// The thread with the higest max writes out which of its
// values was the winner.
if (threadIdx.x == row_max_threadIdx) am_data[row * ams0] = per_thread_row_max_j;
// COMPUTE SOFTMAX
per_thread_sum = 0.0;
for (int j = threadIdx.x; j < N; j += blockDim.x)
{ {
%(type_x)s __tmp = %(load_x)s(sm[j * sms1]); %(work_x)s row_ij = %(load_x)s(x[j * xs1]) + %(load_b)s(b[j * bs0]);
__tmp *= sum_inv; %(work_x)s sm_ij = exp%(f)s(row_ij - row_max);
sm[j * sms1] = %(write_x)s(__tmp); per_thread_sum += sm_ij;
sm[j * sms1] = %(write_x)s(sm_ij);
} }
if ((y_idx >= N) || (y_idx < 0))
{ per_thread_values[threadIdx.x] = per_thread_sum;
//TODO: set raise an error bit in a global var?
nll_data[row*nlls0] = %(write_x)s(0.0); // raise some suspicion at least... local_barrier();
if (threadIdx.x == 0) {
sum = 0.0;
for (int j = 0; j < blockDim.x; j++) {
sum += per_thread_values[j];
}
sum_inv = 1.0 / sum;
} }
else
{ local_barrier();
nll_data[row*nlls0] = %(write_x)s(- %(load_x)s(x[y_idx*xs1])
- %(load_b)s(b[y_idx*bs0]) for (int j = threadIdx.x; j < N; j += blockDim.x) {
+ row_max sm[j * sms1] = %(write_x)s(%(load_x)s(sm[j * sms1]) * sum_inv);
+ log(sum)); }
if (threadIdx.x == 0) {
const %(type_y_idx)s y_idx = (int)y_idx_data[row * y_idxs0];
if ((y_idx >= N || y_idx < 0)) {
// raise some suspicion.
nll_data[row * nlls0] = %(write_x)s(0.0);
} else {
nll_data[row * nlls0] = %(write_x)s(
- %(load_x)s(x[y_idx * xs1])
- %(load_b)s(b[y_idx * bs0])
+ row_max + log%(f)s(sum));
}
} }
am_data[row*ams0] = row_max_j;
} }
} }
""" % locals(), file=sio) """ % locals(), file=sio)
...@@ -154,6 +195,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op): ...@@ -154,6 +195,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
typecode_b = pygpu.gpuarray.dtype_to_typecode(node.inputs[1].dtype) typecode_b = pygpu.gpuarray.dtype_to_typecode(node.inputs[1].dtype)
typecode_y_idx = pygpu.gpuarray.dtype_to_typecode(node.inputs[2].dtype) typecode_y_idx = pygpu.gpuarray.dtype_to_typecode(node.inputs[2].dtype)
itemsize_x = numpy.dtype(node.inputs[0].dtype).itemsize itemsize_x = numpy.dtype(node.inputs[0].dtype).itemsize
worksize_x = numpy.dtype(work_dtype(node.inputs[0].dtype)).itemsize
itemsize_b = numpy.dtype(node.inputs[1].dtype).itemsize itemsize_b = numpy.dtype(node.inputs[1].dtype).itemsize
itemsize_y_idx = numpy.dtype(node.inputs[2].dtype).itemsize itemsize_y_idx = numpy.dtype(node.inputs[2].dtype).itemsize
itemsize_nll = numpy.dtype(node.outputs[0].dtype).itemsize itemsize_nll = numpy.dtype(node.outputs[0].dtype).itemsize
...@@ -263,8 +305,9 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op): ...@@ -263,8 +305,9 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
} }
} }
{ {
size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)256), 1, 1}; size_t n_blocks = std::min(PyGpuArray_DIM(%(x)s, 0), (size_t)4096);
size_t threads_per_block[3] = {1, 1, 1}; size_t n_threads = std::min(PyGpuArray_DIM(%(x)s, 1), (size_t)256);
size_t n_shared = n_threads * %(worksize_x)s;
ssize_t stride_X0 = PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s; ssize_t stride_X0 = PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s;
ssize_t stride_X1 = PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s; ssize_t stride_X1 = PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s;
ssize_t stride_B0 = PyGpuArray_STRIDES(%(b)s)[0] / %(itemsize_b)s; ssize_t stride_B0 = PyGpuArray_STRIDES(%(b)s)[0] / %(itemsize_b)s;
...@@ -289,7 +332,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op): ...@@ -289,7 +332,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
(void *)&stride_SM0, (void *)&stride_SM1, (void *)&stride_SM0, (void *)&stride_SM1,
(void *)%(am)s->ga.data, (void *)&%(am)s->ga.offset, (void *)%(am)s->ga.data, (void *)&%(am)s->ga.offset,
(void *)&stride_AM0}; (void *)&stride_AM0};
int err = GpuKernel_call(&%(k_var)s, 3, threads_per_block, n_blocks, 0, kernel_params); int err = GpuKernel_call(&%(k_var)s, 1, &n_threads, &n_blocks, n_shared, kernel_params);
%(err_check)s %(err_check)s
%(sync)s %(sync)s
} }
...@@ -297,7 +340,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op): ...@@ -297,7 +340,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
return sio.getvalue() return sio.getvalue()
def c_code_cache_version(self): def c_code_cache_version(self):
return (8,) return (9,)
gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1HotWithBias() gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1HotWithBias()
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论