提交 4ad3ed64 作者: Alexander Matyasko

Use scheduled calls for gpuarray pooling

上级 ad73ad3e
......@@ -209,11 +209,6 @@ KERNEL void ave_pool3d_kernel(const ga_size nthreads,
#section support_code
// CUDA: number of blocks needed so that blocks * nthreads >= nkernels
// (ceiling division).
//
// nkernels: total number of work items; expected to be non-negative.
// nthreads: work items handled per block; must be positive (zero would
//           divide by zero).
//
// The quotient-plus-remainder form replaces
// (nkernels + nthreads - 1) / nthreads, whose intermediate sum overflows
// `int` when nkernels is close to INT_MAX.
// `static` gives the helper internal linkage so a non-inlined call still
// resolves within this translation unit.
static inline int GET_BLOCKS(const int nkernels, const int nthreads) {
  return nkernels / nthreads + (nkernels % nthreads != 0 ? 1 : 0);
}
// output shape for a given input padded shape, window shape and stride
#define OUTPUT_DIMS(in_dim, ws, st) \
(IGNORE_BORDER ? (in_dim - ws)/st + 1 : \
......@@ -262,27 +257,16 @@ int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x,
}
{
// scope for running kernel
size_t max_threads_dim;
int err;
// get the max threads per blocks
err = gpucontext_property(ctx->ctx, GA_CTX_PROP_MAXLSIZE0, &max_threads_dim);
if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims");
return 1;
}
size_t threads_per_block = max_threads_dim;
if (ndims == 2) {
size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3];
size_t n_blocks = GET_BLOCKS(num_kernels, threads_per_block);
if (MAX_POOL) {
err = max_pool2d_kernel_call(1, &n_blocks, &threads_per_block, 0,
num_kernels,
z_dims[0], z_dims[1], z_dims[2], z_dims[3],
x_dims[2], x_dims[3],
x->ga.data, w[0], w[1], s[0], s[1], p[0], p[1],
(*z)->ga.data);
err = max_pool2d_kernel_scall(1, &num_kernels, 0, num_kernels,
z_dims[0], z_dims[1], z_dims[2], z_dims[3],
x_dims[2], x_dims[3],
x->ga.data, w[0], w[1], s[0], s[1], p[0], p[1],
(*z)->ga.data);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuPool: max_pool2d_kernel %s.",
......@@ -290,12 +274,11 @@ int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x,
return 1;
}
} else {
err = ave_pool2d_kernel_call(1, &n_blocks, &threads_per_block, 0,
num_kernels,
z_dims[0], z_dims[1], z_dims[2], z_dims[3],
x_dims[2], x_dims[3],
x->ga.data, w[0], w[1], s[0], s[1], p[0], p[1],
INC_PAD, SUM_MODE, (*z)->ga.data);
err = ave_pool2d_kernel_scall(1, &num_kernels, 0, num_kernels,
z_dims[0], z_dims[1], z_dims[2], z_dims[3],
x_dims[2], x_dims[3],
x->ga.data, w[0], w[1], s[0], s[1], p[0], p[1],
INC_PAD, SUM_MODE, (*z)->ga.data);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuPool: ave_pool2d_kernel %s.",
......@@ -306,14 +289,12 @@ int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x,
}
else if (ndims == 3) {
size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3] * z_dims[4];
size_t n_blocks = GET_BLOCKS(num_kernels, threads_per_block);
if (MAX_POOL) {
err = max_pool3d_kernel_call(1, &n_blocks, &threads_per_block, 0,
num_kernels,
z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4],
x_dims[2], x_dims[3], x_dims[4],
x->ga.data, w[0], w[1], w[2], s[0], s[1], s[2],
p[0], p[1], p[2], (*z)->ga.data);
err = max_pool3d_kernel_scall(1, &num_kernels, 0, num_kernels,
z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4],
x_dims[2], x_dims[3], x_dims[4],
x->ga.data, w[0], w[1], w[2], s[0], s[1], s[2],
p[0], p[1], p[2], (*z)->ga.data);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuPool: max_pool3d_kernel %s.",
......@@ -321,13 +302,12 @@ int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x,
return 1;
}
} else {
err = ave_pool3d_kernel_call(1, &n_blocks, &threads_per_block, 0,
num_kernels,
z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4],
x_dims[2], x_dims[3], x_dims[4],
x->ga.data, w[0], w[1], w[2], s[0], s[1], s[2],
p[0], p[1], p[2],
INC_PAD, SUM_MODE, (*z)->ga.data);
err = ave_pool3d_kernel_scall(1, &num_kernels, 0, num_kernels,
z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4],
x_dims[2], x_dims[3], x_dims[4],
x->ga.data, w[0], w[1], w[2], s[0], s[1], s[2],
p[0], p[1], p[2],
INC_PAD, SUM_MODE, (*z)->ga.data);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuPool: ave_pool3d_kernel %s.",
......
......@@ -101,16 +101,8 @@ KERNEL void ave_pool3d_grad_kernel(const ga_size nthreads,
}
}
#section support_code
// CUDA: number of blocks needed so that blocks * nthreads >= nkernels
// (ceiling division).
//
// nkernels: total number of work items; expected to be non-negative.
// nthreads: work items handled per block; must be positive (zero would
//           divide by zero).
//
// The quotient-plus-remainder form replaces
// (nkernels + nthreads - 1) / nthreads, whose intermediate sum overflows
// `int` when nkernels is close to INT_MAX.
// `static` gives the helper internal linkage so a non-inlined call still
// resolves within this translation unit.
static inline int GET_BLOCKS(const int nkernels, const int nthreads) {
  return nkernels / nthreads + (nkernels % nthreads != 0 ? 1 : 0);
}
#section support_code_struct
int APPLY_SPECIFIC(ave_pool_grad)(PyGpuArrayObject *x,
PyGpuArrayObject *gz,
PyArrayObject *ws,
......@@ -151,29 +143,18 @@ int APPLY_SPECIFIC(ave_pool_grad)(PyGpuArrayObject *x,
p[i] = *((npy_intp*)PyArray_GETPTR1(pad, i));
}
size_t max_threads_dim;
int err;
const size_t* z_dims = PyGpuArray_DIMS(gz);
const size_t* x_dims = PyGpuArray_DIMS(x);
// Get the max threads per blocks
err = gpucontext_property(ctx->ctx, GA_CTX_PROP_MAXLSIZE0, &max_threads_dim);
if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims");
return 1;
}
size_t threads_per_block = max_threads_dim;
if (ndims == 2) {
size_t num_kernels = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3];
size_t n_blocks = GET_BLOCKS(num_kernels, threads_per_block);
err = ave_pool2d_grad_kernel_call(1, &n_blocks, &threads_per_block, 0,
num_kernels,
x_dims[0], x_dims[1], x_dims[2], x_dims[3],
z_dims[2], z_dims[3],
x->ga.data, gz->ga.data,
w[0], w[1], s[0], s[1], p[0], p[1],
INC_PAD, SUM_MODE, (*gx)->ga.data);
err = ave_pool2d_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
x_dims[0], x_dims[1], x_dims[2], x_dims[3],
z_dims[2], z_dims[3],
x->ga.data, gz->ga.data,
w[0], w[1], s[0], s[1], p[0], p[1],
INC_PAD, SUM_MODE, (*gx)->ga.data);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuAveragePoolGrad: ave_pool2d_grad_kernel %s.",
......@@ -182,15 +163,13 @@ int APPLY_SPECIFIC(ave_pool_grad)(PyGpuArrayObject *x,
}
} else if (ndims == 3) {
size_t num_kernels = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3] * x_dims[4];
size_t n_blocks = GET_BLOCKS(num_kernels, threads_per_block);
err = ave_pool3d_grad_kernel_call(1, &n_blocks, &threads_per_block, 0,
num_kernels,
x_dims[0], x_dims[1], x_dims[2], x_dims[3], x_dims[4],
z_dims[2], z_dims[3], z_dims[4],
x->ga.data, gz->ga.data,
w[0], w[1], w[2], s[0], s[1], s[2],
p[0], p[1], p[2], INC_PAD, SUM_MODE,
(*gx)->ga.data);
err = ave_pool3d_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
x_dims[0], x_dims[1], x_dims[2], x_dims[3], x_dims[4],
z_dims[2], z_dims[3], z_dims[4],
x->ga.data, gz->ga.data,
w[0], w[1], w[2], s[0], s[1], s[2],
p[0], p[1], p[2], INC_PAD, SUM_MODE,
(*gx)->ga.data);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuAveragePoolGrad: ave_pool3d_grad_kernel %s.",
......
......@@ -137,29 +137,18 @@ int APPLY_SPECIFIC(pool_grad_grad)(PyGpuArrayObject *x,
p[i] = *((npy_intp*)PyArray_GETPTR1(pad, i));
}
size_t max_threads_dim;
int err;
const size_t* z_dims = PyGpuArray_DIMS(z);
const size_t* x_dims = PyGpuArray_DIMS(x);
// get the max threads per blocks
err = gpucontext_property(ctx->ctx, GA_CTX_PROP_MAXLSIZE0, &max_threads_dim);
if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims");
return 1;
}
size_t threads_per_block = max_threads_dim;
if (ndims == 2) {
size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3];
size_t n_blocks = (num_kernels + threads_per_block - 1) / threads_per_block;
err = max_pool2d_grad_grad_kernel_call(1, &n_blocks, &threads_per_block, 0,
num_kernels,
z_dims[0], z_dims[1], z_dims[2], z_dims[3],
x_dims[2], x_dims[3],
x->ga.data, z->ga.data, gx->ga.data,
w[0], w[1], s[0], s[1], p[0], p[1],
(*gz)->ga.data);
err = max_pool2d_grad_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
z_dims[0], z_dims[1], z_dims[2], z_dims[3],
x_dims[2], x_dims[3],
x->ga.data, z->ga.data, gx->ga.data,
w[0], w[1], s[0], s[1], p[0], p[1],
(*gz)->ga.data);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuPoolingGradGrad: max_pool2d_grad_grad_kernel %s.",
......@@ -169,14 +158,12 @@ int APPLY_SPECIFIC(pool_grad_grad)(PyGpuArrayObject *x,
}
else if (ndims == 3) {
size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3] * z_dims[4];
size_t n_blocks = (num_kernels + threads_per_block - 1) / threads_per_block;
err = max_pool3d_grad_grad_kernel_call(1, &n_blocks, &threads_per_block, 0,
num_kernels,
z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4],
x_dims[2], x_dims[3], x_dims[4],
x->ga.data, z->ga.data, gx->ga.data,
w[0], w[1], w[2], s[0], s[1], s[2], p[0], p[1], p[2],
(*gz)->ga.data);
err = max_pool3d_grad_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4],
x_dims[2], x_dims[3], x_dims[4],
x->ga.data, z->ga.data, gx->ga.data,
w[0], w[1], w[2], s[0], s[1], s[2], p[0], p[1], p[2],
(*gz)->ga.data);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuPoolingGradGrad: max_pool3d_grad_grad_kernel %s.",
......
......@@ -84,13 +84,6 @@ KERNEL void max_pool3d_grad_kernel(const ga_size nthreads,
}
}
#section support_code
// CUDA: number of blocks needed so that blocks * nthreads >= nkernels
// (ceiling division).
//
// nkernels: total number of work items; expected to be non-negative.
// nthreads: work items handled per block; must be positive (zero would
//           divide by zero).
//
// The quotient-plus-remainder form replaces
// (nkernels + nthreads - 1) / nthreads, whose intermediate sum overflows
// `int` when nkernels is close to INT_MAX.
// `static` gives the helper internal linkage so a non-inlined call still
// resolves within this translation unit.
static inline int GET_BLOCKS(const int nkernels, const int nthreads) {
  return nkernels / nthreads + (nkernels % nthreads != 0 ? 1 : 0);
}
#section support_code_struct
int APPLY_SPECIFIC(max_pool_grad)(PyGpuArrayObject *x,
......@@ -136,29 +129,18 @@ int APPLY_SPECIFIC(max_pool_grad)(PyGpuArrayObject *x,
p[i] = *((npy_intp*)PyArray_GETPTR1(pad, i));
}
size_t max_threads_dim;
int err;
const size_t* z_dims = PyGpuArray_DIMS(z);
const size_t* x_dims = PyGpuArray_DIMS(x);
// Get the max threads per blocks
err = gpucontext_property(ctx->ctx, GA_CTX_PROP_MAXLSIZE0, &max_threads_dim);
if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims");
return 1;
}
size_t threads_per_block = max_threads_dim;
if (ndims == 2) {
size_t num_kernels = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3];
size_t n_blocks = GET_BLOCKS(num_kernels, threads_per_block);
err = max_pool2d_grad_kernel_call(1, &n_blocks, &threads_per_block, 0,
num_kernels,
x_dims[0], x_dims[1], x_dims[2], x_dims[3],
z_dims[2], z_dims[3],
x->ga.data, z->ga.data, gz->ga.data,
w[0], w[1], s[0], s[1], p[0], p[1],
(*gx)->ga.data);
err = max_pool2d_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
x_dims[0], x_dims[1], x_dims[2], x_dims[3],
z_dims[2], z_dims[3],
x->ga.data, z->ga.data, gz->ga.data,
w[0], w[1], s[0], s[1], p[0], p[1],
(*gx)->ga.data);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuMaxPoolGrad: max_pool2d_grad_kernel %s.",
......@@ -167,14 +149,12 @@ int APPLY_SPECIFIC(max_pool_grad)(PyGpuArrayObject *x,
}
} else if (ndims == 3) {
size_t num_kernels = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3] * x_dims[4];
size_t n_blocks = GET_BLOCKS(num_kernels, threads_per_block);
err = max_pool3d_grad_kernel_call(1, &n_blocks, &threads_per_block, 0,
num_kernels,
x_dims[0], x_dims[1], x_dims[2], x_dims[3], x_dims[4],
z_dims[2], z_dims[3], z_dims[4],
x->ga.data, z->ga.data, gz->ga.data,
w[0], w[1], w[2], s[0], s[1], s[2],
p[0], p[1], p[2], (*gx)->ga.data);
err = max_pool3d_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
x_dims[0], x_dims[1], x_dims[2], x_dims[3], x_dims[4],
z_dims[2], z_dims[3], z_dims[4],
x->ga.data, z->ga.data, gz->ga.data,
w[0], w[1], w[2], s[0], s[1], s[2],
p[0], p[1], p[2], (*gx)->ga.data);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuMaxPoolGrad: max_pool3d_grad_kernel %s.",
......
Markdown 格式
0%
您将添加 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论