提交 4ad3ed64 作者: Alexander Matyasko

Use scheduled calls for gpuarray pooling

上级 ad73ad3e
...@@ -209,11 +209,6 @@ KERNEL void ave_pool3d_kernel(const ga_size nthreads, ...@@ -209,11 +209,6 @@ KERNEL void ave_pool3d_kernel(const ga_size nthreads,
#section support_code #section support_code
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int nkernels, const int nthreads) {
return (nkernels + nthreads - 1) / nthreads;
}
// output shape for a given input padded shape, window shape and stride // output shape for a given input padded shape, window shape and stride
#define OUTPUT_DIMS(in_dim, ws, st) \ #define OUTPUT_DIMS(in_dim, ws, st) \
(IGNORE_BORDER ? (in_dim - ws)/st + 1 : \ (IGNORE_BORDER ? (in_dim - ws)/st + 1 : \
...@@ -262,27 +257,16 @@ int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x, ...@@ -262,27 +257,16 @@ int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x,
} }
{ {
// scope for running kernel // scope for running kernel
size_t max_threads_dim;
int err; int err;
// get the max threads per blocks
err = gpucontext_property(ctx->ctx, GA_CTX_PROP_MAXLSIZE0, &max_threads_dim);
if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims");
return 1;
}
size_t threads_per_block = max_threads_dim;
if (ndims == 2) { if (ndims == 2) {
size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3]; size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3];
size_t n_blocks = GET_BLOCKS(num_kernels, threads_per_block);
if (MAX_POOL) { if (MAX_POOL) {
err = max_pool2d_kernel_call(1, &n_blocks, &threads_per_block, 0, err = max_pool2d_kernel_scall(1, &num_kernels, 0, num_kernels,
num_kernels, z_dims[0], z_dims[1], z_dims[2], z_dims[3],
z_dims[0], z_dims[1], z_dims[2], z_dims[3], x_dims[2], x_dims[3],
x_dims[2], x_dims[3], x->ga.data, w[0], w[1], s[0], s[1], p[0], p[1],
x->ga.data, w[0], w[1], s[0], s[1], p[0], p[1], (*z)->ga.data);
(*z)->ga.data);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuPool: max_pool2d_kernel %s.", "GpuPool: max_pool2d_kernel %s.",
...@@ -290,12 +274,11 @@ int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x, ...@@ -290,12 +274,11 @@ int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x,
return 1; return 1;
} }
} else { } else {
err = ave_pool2d_kernel_call(1, &n_blocks, &threads_per_block, 0, err = ave_pool2d_kernel_scall(1, &num_kernels, 0, num_kernels,
num_kernels, z_dims[0], z_dims[1], z_dims[2], z_dims[3],
z_dims[0], z_dims[1], z_dims[2], z_dims[3], x_dims[2], x_dims[3],
x_dims[2], x_dims[3], x->ga.data, w[0], w[1], s[0], s[1], p[0], p[1],
x->ga.data, w[0], w[1], s[0], s[1], p[0], p[1], INC_PAD, SUM_MODE, (*z)->ga.data);
INC_PAD, SUM_MODE, (*z)->ga.data);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuPool: ave_pool2d_kernel %s.", "GpuPool: ave_pool2d_kernel %s.",
...@@ -306,14 +289,12 @@ int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x, ...@@ -306,14 +289,12 @@ int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x,
} }
else if (ndims == 3) { else if (ndims == 3) {
size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3] * z_dims[4]; size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3] * z_dims[4];
size_t n_blocks = GET_BLOCKS(num_kernels, threads_per_block);
if (MAX_POOL) { if (MAX_POOL) {
err = max_pool3d_kernel_call(1, &n_blocks, &threads_per_block, 0, err = max_pool3d_kernel_scall(1, &num_kernels, 0, num_kernels,
num_kernels, z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4],
z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4], x_dims[2], x_dims[3], x_dims[4],
x_dims[2], x_dims[3], x_dims[4], x->ga.data, w[0], w[1], w[2], s[0], s[1], s[2],
x->ga.data, w[0], w[1], w[2], s[0], s[1], s[2], p[0], p[1], p[2], (*z)->ga.data);
p[0], p[1], p[2], (*z)->ga.data);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuPool: max_pool3d_kernel %s.", "GpuPool: max_pool3d_kernel %s.",
...@@ -321,13 +302,12 @@ int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x, ...@@ -321,13 +302,12 @@ int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x,
return 1; return 1;
} }
} else { } else {
err = ave_pool3d_kernel_call(1, &n_blocks, &threads_per_block, 0, err = ave_pool3d_kernel_scall(1, &num_kernels, 0, num_kernels,
num_kernels, z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4],
z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4], x_dims[2], x_dims[3], x_dims[4],
x_dims[2], x_dims[3], x_dims[4], x->ga.data, w[0], w[1], w[2], s[0], s[1], s[2],
x->ga.data, w[0], w[1], w[2], s[0], s[1], s[2], p[0], p[1], p[2],
p[0], p[1], p[2], INC_PAD, SUM_MODE, (*z)->ga.data);
INC_PAD, SUM_MODE, (*z)->ga.data);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuPool: ave_pool3d_kernel %s.", "GpuPool: ave_pool3d_kernel %s.",
......
...@@ -101,16 +101,8 @@ KERNEL void ave_pool3d_grad_kernel(const ga_size nthreads, ...@@ -101,16 +101,8 @@ KERNEL void ave_pool3d_grad_kernel(const ga_size nthreads,
} }
} }
#section support_code
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int nkernels, const int nthreads) {
return (nkernels + nthreads - 1) / nthreads;
}
#section support_code_struct #section support_code_struct
int APPLY_SPECIFIC(ave_pool_grad)(PyGpuArrayObject *x, int APPLY_SPECIFIC(ave_pool_grad)(PyGpuArrayObject *x,
PyGpuArrayObject *gz, PyGpuArrayObject *gz,
PyArrayObject *ws, PyArrayObject *ws,
...@@ -151,29 +143,18 @@ int APPLY_SPECIFIC(ave_pool_grad)(PyGpuArrayObject *x, ...@@ -151,29 +143,18 @@ int APPLY_SPECIFIC(ave_pool_grad)(PyGpuArrayObject *x,
p[i] = *((npy_intp*)PyArray_GETPTR1(pad, i)); p[i] = *((npy_intp*)PyArray_GETPTR1(pad, i));
} }
size_t max_threads_dim;
int err; int err;
const size_t* z_dims = PyGpuArray_DIMS(gz); const size_t* z_dims = PyGpuArray_DIMS(gz);
const size_t* x_dims = PyGpuArray_DIMS(x); const size_t* x_dims = PyGpuArray_DIMS(x);
// Get the max threads per blocks
err = gpucontext_property(ctx->ctx, GA_CTX_PROP_MAXLSIZE0, &max_threads_dim);
if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims");
return 1;
}
size_t threads_per_block = max_threads_dim;
if (ndims == 2) { if (ndims == 2) {
size_t num_kernels = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3]; size_t num_kernels = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3];
size_t n_blocks = GET_BLOCKS(num_kernels, threads_per_block); err = ave_pool2d_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
err = ave_pool2d_grad_kernel_call(1, &n_blocks, &threads_per_block, 0, x_dims[0], x_dims[1], x_dims[2], x_dims[3],
num_kernels, z_dims[2], z_dims[3],
x_dims[0], x_dims[1], x_dims[2], x_dims[3], x->ga.data, gz->ga.data,
z_dims[2], z_dims[3], w[0], w[1], s[0], s[1], p[0], p[1],
x->ga.data, gz->ga.data, INC_PAD, SUM_MODE, (*gx)->ga.data);
w[0], w[1], s[0], s[1], p[0], p[1],
INC_PAD, SUM_MODE, (*gx)->ga.data);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuAveragePoolGrad: ave_pool2d_grad_kernel %s.", "GpuAveragePoolGrad: ave_pool2d_grad_kernel %s.",
...@@ -182,15 +163,13 @@ int APPLY_SPECIFIC(ave_pool_grad)(PyGpuArrayObject *x, ...@@ -182,15 +163,13 @@ int APPLY_SPECIFIC(ave_pool_grad)(PyGpuArrayObject *x,
} }
} else if (ndims == 3) { } else if (ndims == 3) {
size_t num_kernels = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3] * x_dims[4]; size_t num_kernels = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3] * x_dims[4];
size_t n_blocks = GET_BLOCKS(num_kernels, threads_per_block); err = ave_pool3d_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
err = ave_pool3d_grad_kernel_call(1, &n_blocks, &threads_per_block, 0, x_dims[0], x_dims[1], x_dims[2], x_dims[3], x_dims[4],
num_kernels, z_dims[2], z_dims[3], z_dims[4],
x_dims[0], x_dims[1], x_dims[2], x_dims[3], x_dims[4], x->ga.data, gz->ga.data,
z_dims[2], z_dims[3], z_dims[4], w[0], w[1], w[2], s[0], s[1], s[2],
x->ga.data, gz->ga.data, p[0], p[1], p[2], INC_PAD, SUM_MODE,
w[0], w[1], w[2], s[0], s[1], s[2], (*gx)->ga.data);
p[0], p[1], p[2], INC_PAD, SUM_MODE,
(*gx)->ga.data);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuAveragePoolGrad: ave_pool3d_grad_kernel %s.", "GpuAveragePoolGrad: ave_pool3d_grad_kernel %s.",
......
...@@ -137,29 +137,18 @@ int APPLY_SPECIFIC(pool_grad_grad)(PyGpuArrayObject *x, ...@@ -137,29 +137,18 @@ int APPLY_SPECIFIC(pool_grad_grad)(PyGpuArrayObject *x,
p[i] = *((npy_intp*)PyArray_GETPTR1(pad, i)); p[i] = *((npy_intp*)PyArray_GETPTR1(pad, i));
} }
size_t max_threads_dim;
int err; int err;
const size_t* z_dims = PyGpuArray_DIMS(z); const size_t* z_dims = PyGpuArray_DIMS(z);
const size_t* x_dims = PyGpuArray_DIMS(x); const size_t* x_dims = PyGpuArray_DIMS(x);
// get the max threads per blocks
err = gpucontext_property(ctx->ctx, GA_CTX_PROP_MAXLSIZE0, &max_threads_dim);
if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims");
return 1;
}
size_t threads_per_block = max_threads_dim;
if (ndims == 2) { if (ndims == 2) {
size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3]; size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3];
size_t n_blocks = (num_kernels + threads_per_block - 1) / threads_per_block; err = max_pool2d_grad_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
err = max_pool2d_grad_grad_kernel_call(1, &n_blocks, &threads_per_block, 0, z_dims[0], z_dims[1], z_dims[2], z_dims[3],
num_kernels, x_dims[2], x_dims[3],
z_dims[0], z_dims[1], z_dims[2], z_dims[3], x->ga.data, z->ga.data, gx->ga.data,
x_dims[2], x_dims[3], w[0], w[1], s[0], s[1], p[0], p[1],
x->ga.data, z->ga.data, gx->ga.data, (*gz)->ga.data);
w[0], w[1], s[0], s[1], p[0], p[1],
(*gz)->ga.data);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuPoolingGradGrad: max_pool2d_grad_grad_kernel %s.", "GpuPoolingGradGrad: max_pool2d_grad_grad_kernel %s.",
...@@ -169,14 +158,12 @@ int APPLY_SPECIFIC(pool_grad_grad)(PyGpuArrayObject *x, ...@@ -169,14 +158,12 @@ int APPLY_SPECIFIC(pool_grad_grad)(PyGpuArrayObject *x,
} }
else if (ndims == 3) { else if (ndims == 3) {
size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3] * z_dims[4]; size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3] * z_dims[4];
size_t n_blocks = (num_kernels + threads_per_block - 1) / threads_per_block; err = max_pool3d_grad_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
err = max_pool3d_grad_grad_kernel_call(1, &n_blocks, &threads_per_block, 0, z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4],
num_kernels, x_dims[2], x_dims[3], x_dims[4],
z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4], x->ga.data, z->ga.data, gx->ga.data,
x_dims[2], x_dims[3], x_dims[4], w[0], w[1], w[2], s[0], s[1], s[2], p[0], p[1], p[2],
x->ga.data, z->ga.data, gx->ga.data, (*gz)->ga.data);
w[0], w[1], w[2], s[0], s[1], s[2], p[0], p[1], p[2],
(*gz)->ga.data);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuPoolingGradGrad: max_pool3d_grad_grad_kernel %s.", "GpuPoolingGradGrad: max_pool3d_grad_grad_kernel %s.",
......
...@@ -84,13 +84,6 @@ KERNEL void max_pool3d_grad_kernel(const ga_size nthreads, ...@@ -84,13 +84,6 @@ KERNEL void max_pool3d_grad_kernel(const ga_size nthreads,
} }
} }
#section support_code
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int nkernels, const int nthreads) {
return (nkernels + nthreads - 1) / nthreads;
}
#section support_code_struct #section support_code_struct
int APPLY_SPECIFIC(max_pool_grad)(PyGpuArrayObject *x, int APPLY_SPECIFIC(max_pool_grad)(PyGpuArrayObject *x,
...@@ -136,29 +129,18 @@ int APPLY_SPECIFIC(max_pool_grad)(PyGpuArrayObject *x, ...@@ -136,29 +129,18 @@ int APPLY_SPECIFIC(max_pool_grad)(PyGpuArrayObject *x,
p[i] = *((npy_intp*)PyArray_GETPTR1(pad, i)); p[i] = *((npy_intp*)PyArray_GETPTR1(pad, i));
} }
size_t max_threads_dim;
int err; int err;
const size_t* z_dims = PyGpuArray_DIMS(z); const size_t* z_dims = PyGpuArray_DIMS(z);
const size_t* x_dims = PyGpuArray_DIMS(x); const size_t* x_dims = PyGpuArray_DIMS(x);
// Get the max threads per blocks
err = gpucontext_property(ctx->ctx, GA_CTX_PROP_MAXLSIZE0, &max_threads_dim);
if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims");
return 1;
}
size_t threads_per_block = max_threads_dim;
if (ndims == 2) { if (ndims == 2) {
size_t num_kernels = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3]; size_t num_kernels = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3];
size_t n_blocks = GET_BLOCKS(num_kernels, threads_per_block); err = max_pool2d_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
err = max_pool2d_grad_kernel_call(1, &n_blocks, &threads_per_block, 0, x_dims[0], x_dims[1], x_dims[2], x_dims[3],
num_kernels, z_dims[2], z_dims[3],
x_dims[0], x_dims[1], x_dims[2], x_dims[3], x->ga.data, z->ga.data, gz->ga.data,
z_dims[2], z_dims[3], w[0], w[1], s[0], s[1], p[0], p[1],
x->ga.data, z->ga.data, gz->ga.data, (*gx)->ga.data);
w[0], w[1], s[0], s[1], p[0], p[1],
(*gx)->ga.data);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuMaxPoolGrad: max_pool2d_grad_kernel %s.", "GpuMaxPoolGrad: max_pool2d_grad_kernel %s.",
...@@ -167,14 +149,12 @@ int APPLY_SPECIFIC(max_pool_grad)(PyGpuArrayObject *x, ...@@ -167,14 +149,12 @@ int APPLY_SPECIFIC(max_pool_grad)(PyGpuArrayObject *x,
} }
} else if (ndims == 3) { } else if (ndims == 3) {
size_t num_kernels = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3] * x_dims[4]; size_t num_kernels = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3] * x_dims[4];
size_t n_blocks = GET_BLOCKS(num_kernels, threads_per_block); err = max_pool3d_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
err = max_pool3d_grad_kernel_call(1, &n_blocks, &threads_per_block, 0, x_dims[0], x_dims[1], x_dims[2], x_dims[3], x_dims[4],
num_kernels, z_dims[2], z_dims[3], z_dims[4],
x_dims[0], x_dims[1], x_dims[2], x_dims[3], x_dims[4], x->ga.data, z->ga.data, gz->ga.data,
z_dims[2], z_dims[3], z_dims[4], w[0], w[1], w[2], s[0], s[1], s[2],
x->ga.data, z->ga.data, gz->ga.data, p[0], p[1], p[2], (*gx)->ga.data);
w[0], w[1], w[2], s[0], s[1], s[2],
p[0], p[1], p[2], (*gx)->ga.data);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuMaxPoolGrad: max_pool3d_grad_kernel %s.", "GpuMaxPoolGrad: max_pool3d_grad_kernel %s.",
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论