提交 4ad3ed64,作者:Alexander Matyasko

Use scheduled calls for gpuarray pooling

上级 ad73ad3e
...@@ -209,11 +209,6 @@ KERNEL void ave_pool3d_kernel(const ga_size nthreads, ...@@ -209,11 +209,6 @@ KERNEL void ave_pool3d_kernel(const ga_size nthreads,
#section support_code #section support_code
// CUDA: number of blocks needed so that blocks of `nthreads` threads cover
// `nkernels` work items, i.e. (nkernels + nthreads - 1) / nthreads.
//
// nkernels: total number of work items.
// nthreads: threads per block; must be non-zero.
// Returns the rounded-up block count as an int.
//
// The sum is evaluated in `long long` so that it cannot overflow `int` when
// nkernels is close to INT_MAX; for every input where the original int
// expression did not overflow the result is unchanged.
// `static` gives the helper internal linkage, so a definition is always
// available even when this code is built by a plain C compiler.
static inline int GET_BLOCKS(const int nkernels, const int nthreads) {
  const long long padded = (long long)nkernels + nthreads - 1;
  return (int)(padded / nthreads);
}
// output shape for a given input padded shape, window shape and stride // output shape for a given input padded shape, window shape and stride
#define OUTPUT_DIMS(in_dim, ws, st) \ #define OUTPUT_DIMS(in_dim, ws, st) \
(IGNORE_BORDER ? (in_dim - ws)/st + 1 : \ (IGNORE_BORDER ? (in_dim - ws)/st + 1 : \
...@@ -262,23 +257,12 @@ int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x, ...@@ -262,23 +257,12 @@ int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x,
} }
{ {
// scope for running kernel // scope for running kernel
size_t max_threads_dim;
int err; int err;
// get the maximum number of threads per block
err = gpucontext_property(ctx->ctx, GA_CTX_PROP_MAXLSIZE0, &max_threads_dim);
if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims");
return 1;
}
size_t threads_per_block = max_threads_dim;
if (ndims == 2) { if (ndims == 2) {
size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3]; size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3];
size_t n_blocks = GET_BLOCKS(num_kernels, threads_per_block);
if (MAX_POOL) { if (MAX_POOL) {
err = max_pool2d_kernel_call(1, &n_blocks, &threads_per_block, 0, err = max_pool2d_kernel_scall(1, &num_kernels, 0, num_kernels,
num_kernels,
z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[0], z_dims[1], z_dims[2], z_dims[3],
x_dims[2], x_dims[3], x_dims[2], x_dims[3],
x->ga.data, w[0], w[1], s[0], s[1], p[0], p[1], x->ga.data, w[0], w[1], s[0], s[1], p[0], p[1],
...@@ -290,8 +274,7 @@ int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x, ...@@ -290,8 +274,7 @@ int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x,
return 1; return 1;
} }
} else { } else {
err = ave_pool2d_kernel_call(1, &n_blocks, &threads_per_block, 0, err = ave_pool2d_kernel_scall(1, &num_kernels, 0, num_kernels,
num_kernels,
z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[0], z_dims[1], z_dims[2], z_dims[3],
x_dims[2], x_dims[3], x_dims[2], x_dims[3],
x->ga.data, w[0], w[1], s[0], s[1], p[0], p[1], x->ga.data, w[0], w[1], s[0], s[1], p[0], p[1],
...@@ -306,10 +289,8 @@ int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x, ...@@ -306,10 +289,8 @@ int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x,
} }
else if (ndims == 3) { else if (ndims == 3) {
size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3] * z_dims[4]; size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3] * z_dims[4];
size_t n_blocks = GET_BLOCKS(num_kernels, threads_per_block);
if (MAX_POOL) { if (MAX_POOL) {
err = max_pool3d_kernel_call(1, &n_blocks, &threads_per_block, 0, err = max_pool3d_kernel_scall(1, &num_kernels, 0, num_kernels,
num_kernels,
z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4], z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4],
x_dims[2], x_dims[3], x_dims[4], x_dims[2], x_dims[3], x_dims[4],
x->ga.data, w[0], w[1], w[2], s[0], s[1], s[2], x->ga.data, w[0], w[1], w[2], s[0], s[1], s[2],
...@@ -321,8 +302,7 @@ int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x, ...@@ -321,8 +302,7 @@ int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x,
return 1; return 1;
} }
} else { } else {
err = ave_pool3d_kernel_call(1, &n_blocks, &threads_per_block, 0, err = ave_pool3d_kernel_scall(1, &num_kernels, 0, num_kernels,
num_kernels,
z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4], z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4],
x_dims[2], x_dims[3], x_dims[4], x_dims[2], x_dims[3], x_dims[4],
x->ga.data, w[0], w[1], w[2], s[0], s[1], s[2], x->ga.data, w[0], w[1], w[2], s[0], s[1], s[2],
......
...@@ -101,16 +101,8 @@ KERNEL void ave_pool3d_grad_kernel(const ga_size nthreads, ...@@ -101,16 +101,8 @@ KERNEL void ave_pool3d_grad_kernel(const ga_size nthreads,
} }
} }
#section support_code
// CUDA: number of blocks needed so that blocks of `nthreads` threads cover
// `nkernels` work items, i.e. (nkernels + nthreads - 1) / nthreads.
//
// nkernels: total number of work items.
// nthreads: threads per block; must be non-zero.
// Returns the rounded-up block count as an int.
//
// The sum is evaluated in `long long` so that it cannot overflow `int` when
// nkernels is close to INT_MAX; for every input where the original int
// expression did not overflow the result is unchanged.
// `static` gives the helper internal linkage, so a definition is always
// available even when this code is built by a plain C compiler.
static inline int GET_BLOCKS(const int nkernels, const int nthreads) {
  const long long padded = (long long)nkernels + nthreads - 1;
  return (int)(padded / nthreads);
}
#section support_code_struct #section support_code_struct
int APPLY_SPECIFIC(ave_pool_grad)(PyGpuArrayObject *x, int APPLY_SPECIFIC(ave_pool_grad)(PyGpuArrayObject *x,
PyGpuArrayObject *gz, PyGpuArrayObject *gz,
PyArrayObject *ws, PyArrayObject *ws,
...@@ -151,24 +143,13 @@ int APPLY_SPECIFIC(ave_pool_grad)(PyGpuArrayObject *x, ...@@ -151,24 +143,13 @@ int APPLY_SPECIFIC(ave_pool_grad)(PyGpuArrayObject *x,
p[i] = *((npy_intp*)PyArray_GETPTR1(pad, i)); p[i] = *((npy_intp*)PyArray_GETPTR1(pad, i));
} }
size_t max_threads_dim;
int err; int err;
const size_t* z_dims = PyGpuArray_DIMS(gz); const size_t* z_dims = PyGpuArray_DIMS(gz);
const size_t* x_dims = PyGpuArray_DIMS(x); const size_t* x_dims = PyGpuArray_DIMS(x);
// Get the maximum number of threads per block
err = gpucontext_property(ctx->ctx, GA_CTX_PROP_MAXLSIZE0, &max_threads_dim);
if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims");
return 1;
}
size_t threads_per_block = max_threads_dim;
if (ndims == 2) { if (ndims == 2) {
size_t num_kernels = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3]; size_t num_kernels = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3];
size_t n_blocks = GET_BLOCKS(num_kernels, threads_per_block); err = ave_pool2d_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
err = ave_pool2d_grad_kernel_call(1, &n_blocks, &threads_per_block, 0,
num_kernels,
x_dims[0], x_dims[1], x_dims[2], x_dims[3], x_dims[0], x_dims[1], x_dims[2], x_dims[3],
z_dims[2], z_dims[3], z_dims[2], z_dims[3],
x->ga.data, gz->ga.data, x->ga.data, gz->ga.data,
...@@ -182,9 +163,7 @@ int APPLY_SPECIFIC(ave_pool_grad)(PyGpuArrayObject *x, ...@@ -182,9 +163,7 @@ int APPLY_SPECIFIC(ave_pool_grad)(PyGpuArrayObject *x,
} }
} else if (ndims == 3) { } else if (ndims == 3) {
size_t num_kernels = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3] * x_dims[4]; size_t num_kernels = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3] * x_dims[4];
size_t n_blocks = GET_BLOCKS(num_kernels, threads_per_block); err = ave_pool3d_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
err = ave_pool3d_grad_kernel_call(1, &n_blocks, &threads_per_block, 0,
num_kernels,
x_dims[0], x_dims[1], x_dims[2], x_dims[3], x_dims[4], x_dims[0], x_dims[1], x_dims[2], x_dims[3], x_dims[4],
z_dims[2], z_dims[3], z_dims[4], z_dims[2], z_dims[3], z_dims[4],
x->ga.data, gz->ga.data, x->ga.data, gz->ga.data,
......
...@@ -137,24 +137,13 @@ int APPLY_SPECIFIC(pool_grad_grad)(PyGpuArrayObject *x, ...@@ -137,24 +137,13 @@ int APPLY_SPECIFIC(pool_grad_grad)(PyGpuArrayObject *x,
p[i] = *((npy_intp*)PyArray_GETPTR1(pad, i)); p[i] = *((npy_intp*)PyArray_GETPTR1(pad, i));
} }
size_t max_threads_dim;
int err; int err;
const size_t* z_dims = PyGpuArray_DIMS(z); const size_t* z_dims = PyGpuArray_DIMS(z);
const size_t* x_dims = PyGpuArray_DIMS(x); const size_t* x_dims = PyGpuArray_DIMS(x);
// get the maximum number of threads per block
err = gpucontext_property(ctx->ctx, GA_CTX_PROP_MAXLSIZE0, &max_threads_dim);
if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims");
return 1;
}
size_t threads_per_block = max_threads_dim;
if (ndims == 2) { if (ndims == 2) {
size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3]; size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3];
size_t n_blocks = (num_kernels + threads_per_block - 1) / threads_per_block; err = max_pool2d_grad_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
err = max_pool2d_grad_grad_kernel_call(1, &n_blocks, &threads_per_block, 0,
num_kernels,
z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[0], z_dims[1], z_dims[2], z_dims[3],
x_dims[2], x_dims[3], x_dims[2], x_dims[3],
x->ga.data, z->ga.data, gx->ga.data, x->ga.data, z->ga.data, gx->ga.data,
...@@ -169,9 +158,7 @@ int APPLY_SPECIFIC(pool_grad_grad)(PyGpuArrayObject *x, ...@@ -169,9 +158,7 @@ int APPLY_SPECIFIC(pool_grad_grad)(PyGpuArrayObject *x,
} }
else if (ndims == 3) { else if (ndims == 3) {
size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3] * z_dims[4]; size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3] * z_dims[4];
size_t n_blocks = (num_kernels + threads_per_block - 1) / threads_per_block; err = max_pool3d_grad_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
err = max_pool3d_grad_grad_kernel_call(1, &n_blocks, &threads_per_block, 0,
num_kernels,
z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4], z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4],
x_dims[2], x_dims[3], x_dims[4], x_dims[2], x_dims[3], x_dims[4],
x->ga.data, z->ga.data, gx->ga.data, x->ga.data, z->ga.data, gx->ga.data,
......
...@@ -84,13 +84,6 @@ KERNEL void max_pool3d_grad_kernel(const ga_size nthreads, ...@@ -84,13 +84,6 @@ KERNEL void max_pool3d_grad_kernel(const ga_size nthreads,
} }
} }
#section support_code
// CUDA: number of blocks needed so that blocks of `nthreads` threads cover
// `nkernels` work items, i.e. (nkernels + nthreads - 1) / nthreads.
//
// nkernels: total number of work items.
// nthreads: threads per block; must be non-zero.
// Returns the rounded-up block count as an int.
//
// The sum is evaluated in `long long` so that it cannot overflow `int` when
// nkernels is close to INT_MAX; for every input where the original int
// expression did not overflow the result is unchanged.
// `static` gives the helper internal linkage, so a definition is always
// available even when this code is built by a plain C compiler.
static inline int GET_BLOCKS(const int nkernels, const int nthreads) {
  const long long padded = (long long)nkernels + nthreads - 1;
  return (int)(padded / nthreads);
}
#section support_code_struct #section support_code_struct
int APPLY_SPECIFIC(max_pool_grad)(PyGpuArrayObject *x, int APPLY_SPECIFIC(max_pool_grad)(PyGpuArrayObject *x,
...@@ -136,24 +129,13 @@ int APPLY_SPECIFIC(max_pool_grad)(PyGpuArrayObject *x, ...@@ -136,24 +129,13 @@ int APPLY_SPECIFIC(max_pool_grad)(PyGpuArrayObject *x,
p[i] = *((npy_intp*)PyArray_GETPTR1(pad, i)); p[i] = *((npy_intp*)PyArray_GETPTR1(pad, i));
} }
size_t max_threads_dim;
int err; int err;
const size_t* z_dims = PyGpuArray_DIMS(z); const size_t* z_dims = PyGpuArray_DIMS(z);
const size_t* x_dims = PyGpuArray_DIMS(x); const size_t* x_dims = PyGpuArray_DIMS(x);
// Get the maximum number of threads per block
err = gpucontext_property(ctx->ctx, GA_CTX_PROP_MAXLSIZE0, &max_threads_dim);
if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims");
return 1;
}
size_t threads_per_block = max_threads_dim;
if (ndims == 2) { if (ndims == 2) {
size_t num_kernels = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3]; size_t num_kernels = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3];
size_t n_blocks = GET_BLOCKS(num_kernels, threads_per_block); err = max_pool2d_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
err = max_pool2d_grad_kernel_call(1, &n_blocks, &threads_per_block, 0,
num_kernels,
x_dims[0], x_dims[1], x_dims[2], x_dims[3], x_dims[0], x_dims[1], x_dims[2], x_dims[3],
z_dims[2], z_dims[3], z_dims[2], z_dims[3],
x->ga.data, z->ga.data, gz->ga.data, x->ga.data, z->ga.data, gz->ga.data,
...@@ -167,9 +149,7 @@ int APPLY_SPECIFIC(max_pool_grad)(PyGpuArrayObject *x, ...@@ -167,9 +149,7 @@ int APPLY_SPECIFIC(max_pool_grad)(PyGpuArrayObject *x,
} }
} else if (ndims == 3) { } else if (ndims == 3) {
size_t num_kernels = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3] * x_dims[4]; size_t num_kernels = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3] * x_dims[4];
size_t n_blocks = GET_BLOCKS(num_kernels, threads_per_block); err = max_pool3d_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
err = max_pool3d_grad_kernel_call(1, &n_blocks, &threads_per_block, 0,
num_kernels,
x_dims[0], x_dims[1], x_dims[2], x_dims[3], x_dims[4], x_dims[0], x_dims[1], x_dims[2], x_dims[3], x_dims[4],
z_dims[2], z_dims[3], z_dims[4], z_dims[2], z_dims[3], z_dims[4],
x->ga.data, z->ga.data, gz->ga.data, x->ga.data, z->ga.data, gz->ga.data,
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论