提交 8691b31a authored 作者: Arnaud Bergeron's avatar Arnaud Bergeron

Fix offsets for pooling.

上级 26890672
#section kernels #section kernels
#kernel max_pool2d_kernel : size, size, size, size, size, size, size, *, size, size, size, size, size, size, * : #kernel max_pool2d_kernel : size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, *, size :
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu) // (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void max_pool2d_kernel(const ga_size nthreads, KERNEL void max_pool2d_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size pooled_height, const ga_size num, const ga_size channels, const ga_size pooled_height,
const ga_size pooled_width, const ga_size height, const ga_size width, const ga_size pooled_width, const ga_size height, const ga_size width,
GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size kernel_h, const ga_size kernel_w, GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, const ga_size kernel_h, const ga_size kernel_w,
const ga_size stride_h, const ga_size stride_w, const ga_size pad_h, const ga_size pad_w, const ga_size stride_h, const ga_size stride_w, const ga_size pad_h, const ga_size pad_w,
GLOBAL_MEM DTYPE_OUTPUT_0 *z) GLOBAL_MEM DTYPE_OUTPUT_0 *z, const ga_size z_off)
{ {
x = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)x) + x_off);
z = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((char *)z) + z_off);
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < nthreads; index < nthreads;
...@@ -41,18 +43,20 @@ KERNEL void max_pool2d_kernel(const ga_size nthreads, ...@@ -41,18 +43,20 @@ KERNEL void max_pool2d_kernel(const ga_size nthreads,
} }
} }
#kernel max_pool3d_kernel : size, size, size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, size, size, * : #kernel max_pool3d_kernel : size, size, size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, size, size, size, *, size :
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu) // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void max_pool3d_kernel(const ga_size nthreads, KERNEL void max_pool3d_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size pooled_depth, const ga_size num, const ga_size channels, const ga_size pooled_depth,
const ga_size pooled_height, const ga_size pooled_width, const ga_size pooled_height, const ga_size pooled_width,
const ga_size depth, const ga_size height, const ga_size width, const ga_size depth, const ga_size height, const ga_size width,
GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size kernel_d, const ga_size kernel_h, GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, const ga_size kernel_d, const ga_size kernel_h,
const ga_size kernel_w, const ga_size stride_d, const ga_size stride_h, const ga_size kernel_w, const ga_size stride_d, const ga_size stride_h,
const ga_size stride_w, const ga_size pad_d, const ga_size pad_h, const ga_size pad_w, const ga_size stride_w, const ga_size pad_d, const ga_size pad_h, const ga_size pad_w,
GLOBAL_MEM DTYPE_OUTPUT_0 *z) GLOBAL_MEM DTYPE_OUTPUT_0 *z, const ga_size z_off)
{ {
x = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)x) + x_off);
z = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((char *)z) + z_off);
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < nthreads; index < nthreads;
...@@ -90,17 +94,19 @@ KERNEL void max_pool3d_kernel(const ga_size nthreads, ...@@ -90,17 +94,19 @@ KERNEL void max_pool3d_kernel(const ga_size nthreads,
} }
} }
#kernel ave_pool2d_kernel : size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, size, * : #kernel ave_pool2d_kernel : size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, size, size, *, size:
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu) // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void ave_pool2d_kernel(const ga_size nthreads, KERNEL void ave_pool2d_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size pooled_height, const ga_size num, const ga_size channels, const ga_size pooled_height,
const ga_size pooled_width, const ga_size height, const ga_size width, const ga_size pooled_width, const ga_size height, const ga_size width,
GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size kernel_h, const ga_size kernel_w, GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, const ga_size kernel_h, const ga_size kernel_w,
const ga_size stride_h, const ga_size stride_w, const ga_size pad_h, const ga_size pad_w, const ga_size stride_h, const ga_size stride_w, const ga_size pad_h, const ga_size pad_w,
const ga_bool inc_pad, const ga_bool sum_mode, const ga_bool inc_pad, const ga_bool sum_mode,
GLOBAL_MEM DTYPE_OUTPUT_0 *z) GLOBAL_MEM DTYPE_OUTPUT_0 *z, const ga_size z_off)
{ {
x = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)x) + x_off);
z = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((char *)z) + z_off);
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < nthreads; index < nthreads;
...@@ -143,20 +149,22 @@ KERNEL void ave_pool2d_kernel(const ga_size nthreads, ...@@ -143,20 +149,22 @@ KERNEL void ave_pool2d_kernel(const ga_size nthreads,
} }
} }
#kernel ave_pool3d_kernel : size, size, size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, size, size, size, size, * : #kernel ave_pool3d_kernel : size, size, size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu) // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void ave_pool3d_kernel(const ga_size nthreads, KERNEL void ave_pool3d_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size pooled_depth, const ga_size num, const ga_size channels, const ga_size pooled_depth,
const ga_size pooled_height, const ga_size pooled_width, const ga_size pooled_height, const ga_size pooled_width,
const ga_size depth, const ga_size height, const ga_size width, const ga_size depth, const ga_size height, const ga_size width,
GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size kernel_d, const ga_size kernel_h, GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, const ga_size kernel_d, const ga_size kernel_h,
const ga_size kernel_w, const ga_size stride_d, const ga_size stride_h, const ga_size kernel_w, const ga_size stride_d, const ga_size stride_h,
const ga_size stride_w, const ga_size pad_d, const ga_size pad_h, const ga_size pad_w, const ga_size stride_w, const ga_size pad_d, const ga_size pad_h, const ga_size pad_w,
const ga_bool inc_pad, const ga_bool sum_mode, const ga_bool inc_pad, const ga_bool sum_mode,
GLOBAL_MEM DTYPE_OUTPUT_0 *z) GLOBAL_MEM DTYPE_OUTPUT_0 *z, const ga_size z_off)
{ {
// grid stride looping // grid stride looping
x = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)x) + x_off);
z = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((char *)z) + z_off);
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < nthreads; index < nthreads;
index += LDIM_0 * GDIM_0) { index += LDIM_0 * GDIM_0) {
...@@ -273,8 +281,8 @@ int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x, ...@@ -273,8 +281,8 @@ int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x,
err = max_pool2d_kernel_scall(1, &num_kernels, 0, num_kernels, err = max_pool2d_kernel_scall(1, &num_kernels, 0, num_kernels,
z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[0], z_dims[1], z_dims[2], z_dims[3],
x_dims[2], x_dims[3], x_dims[2], x_dims[3],
x->ga.data, w[0], w[1], s[0], s[1], p[0], p[1], x->ga.data, x->ga.offset, w[0], w[1], s[0], s[1], p[0], p[1],
(*z)->ga.data); (*z)->ga.data, (*z)->ga.offset);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuPool: max_pool2d_kernel %s.", "GpuPool: max_pool2d_kernel %s.",
...@@ -285,8 +293,10 @@ int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x, ...@@ -285,8 +293,10 @@ int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x,
err = ave_pool2d_kernel_scall(1, &num_kernels, 0, num_kernels, err = ave_pool2d_kernel_scall(1, &num_kernels, 0, num_kernels,
z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[0], z_dims[1], z_dims[2], z_dims[3],
x_dims[2], x_dims[3], x_dims[2], x_dims[3],
x->ga.data, w[0], w[1], s[0], s[1], p[0], p[1], x->ga.data, x->ga.offset,
INC_PAD, SUM_MODE, (*z)->ga.data); w[0], w[1], s[0], s[1], p[0], p[1],
INC_PAD, SUM_MODE,
(*z)->ga.data, (*z)->ga.offset);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuPool: ave_pool2d_kernel %s.", "GpuPool: ave_pool2d_kernel %s.",
...@@ -301,8 +311,8 @@ int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x, ...@@ -301,8 +311,8 @@ int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x,
err = max_pool3d_kernel_scall(1, &num_kernels, 0, num_kernels, err = max_pool3d_kernel_scall(1, &num_kernels, 0, num_kernels,
z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4], z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4],
x_dims[2], x_dims[3], x_dims[4], x_dims[2], x_dims[3], x_dims[4],
x->ga.data, w[0], w[1], w[2], s[0], s[1], s[2], x->ga.data, x->ga.offset, w[0], w[1], w[2], s[0], s[1], s[2],
p[0], p[1], p[2], (*z)->ga.data); p[0], p[1], p[2], (*z)->ga.data, (*z)->ga.offset);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuPool: max_pool3d_kernel %s.", "GpuPool: max_pool3d_kernel %s.",
...@@ -313,9 +323,11 @@ int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x, ...@@ -313,9 +323,11 @@ int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x,
err = ave_pool3d_kernel_scall(1, &num_kernels, 0, num_kernels, err = ave_pool3d_kernel_scall(1, &num_kernels, 0, num_kernels,
z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4], z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4],
x_dims[2], x_dims[3], x_dims[4], x_dims[2], x_dims[3], x_dims[4],
x->ga.data, w[0], w[1], w[2], s[0], s[1], s[2], x->ga.data, x->ga.offset,
w[0], w[1], w[2], s[0], s[1], s[2],
p[0], p[1], p[2], p[0], p[1], p[2],
INC_PAD, SUM_MODE, (*z)->ga.data); INC_PAD, SUM_MODE,
(*z)->ga.data, (*z)->ga.offset);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuPool: ave_pool3d_kernel %s.", "GpuPool: ave_pool3d_kernel %s.",
......
#section kernels #section kernels
#kernel ave_pool2d_grad_kernel : size, size, size, size, size, size, size, *, *, size, size, size, size, size, size, size, size, * : #kernel ave_pool2d_grad_kernel : size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, size, size, *, size :
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu) // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void ave_pool2d_grad_kernel(const ga_size nthreads, KERNEL void ave_pool2d_grad_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size height, const ga_size num, const ga_size channels, const ga_size height,
const ga_size width, const ga_size pooled_height, const ga_size pooled_width, const ga_size width, const ga_size pooled_height, const ga_size pooled_width,
GLOBAL_MEM const DTYPE_INPUT_0 *x, GLOBAL_MEM const DTYPE_INPUT_1 *gz, GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, GLOBAL_MEM const DTYPE_INPUT_1 *gz, const ga_size gz_off,
const ga_size kernel_h, const ga_size kernel_w, const ga_size stride_h, const ga_size stride_w, const ga_size kernel_h, const ga_size kernel_w, const ga_size stride_h, const ga_size stride_w,
const ga_size pad_h, const ga_size pad_w, const ga_bool inc_pad, const ga_bool sum_mode, const ga_size pad_h, const ga_size pad_w, const ga_bool inc_pad, const ga_bool sum_mode,
GLOBAL_MEM DTYPE_OUTPUT_0 *gx) GLOBAL_MEM DTYPE_OUTPUT_0 *gx, const ga_size gx_off)
{ {
x = (GLOBAL_MEM const DTYPE_INPUT_0 *)(((char *)x) + x_off);
gz = (GLOBAL_MEM const DTYPE_INPUT_1 *)(((char *)gz) + gz_off);
gx = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((char *)gx) + gx_off);
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < nthreads; index += LDIM_0 * GDIM_0) { index < nthreads; index += LDIM_0 * GDIM_0) {
...@@ -46,19 +49,22 @@ KERNEL void ave_pool2d_grad_kernel(const ga_size nthreads, ...@@ -46,19 +49,22 @@ KERNEL void ave_pool2d_grad_kernel(const ga_size nthreads,
} }
} }
#kernel ave_pool3d_grad_kernel : size, size, size, size, size, size, size, size, size, *, *, size, size, size, size, size, size, size, size, size, size, size, * : #kernel ave_pool3d_grad_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu) // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void ave_pool3d_grad_kernel(const ga_size nthreads, KERNEL void ave_pool3d_grad_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size depth, const ga_size num, const ga_size channels, const ga_size depth,
const ga_size height, const ga_size width, const ga_size pooled_depth, const ga_size height, const ga_size width, const ga_size pooled_depth,
const ga_size pooled_height, const ga_size pooled_width, const ga_size pooled_height, const ga_size pooled_width,
GLOBAL_MEM const DTYPE_INPUT_0 *x, GLOBAL_MEM const DTYPE_INPUT_1 *gz, GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, GLOBAL_MEM const DTYPE_INPUT_1 *gz, const ga_size gz_off,
const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w,
const ga_size stride_d, const ga_size stride_h, const ga_size stride_w, const ga_size stride_d, const ga_size stride_h, const ga_size stride_w,
const ga_size pad_d, const ga_size pad_h, const ga_size pad_w, const ga_size pad_d, const ga_size pad_h, const ga_size pad_w,
const ga_bool inc_pad, const ga_bool sum_mode, GLOBAL_MEM DTYPE_OUTPUT_0 *gx) const ga_bool inc_pad, const ga_bool sum_mode, GLOBAL_MEM DTYPE_OUTPUT_0 *gx, const ga_size gx_off)
{ {
x = (GLOBAL_MEM const DTYPE_INPUT_0 *)(((char *)x) + x_off);
gz = (GLOBAL_MEM const DTYPE_INPUT_1 *)(((char *)gz) + gz_off);
gx = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((char *)gx) + gx_off);
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < nthreads; index += LDIM_0 * GDIM_0) { index < nthreads; index += LDIM_0 * GDIM_0) {
...@@ -152,9 +158,11 @@ int APPLY_SPECIFIC(ave_pool_grad)(PyGpuArrayObject *x, ...@@ -152,9 +158,11 @@ int APPLY_SPECIFIC(ave_pool_grad)(PyGpuArrayObject *x,
err = ave_pool2d_grad_kernel_scall(1, &num_kernels, 0, num_kernels, err = ave_pool2d_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
x_dims[0], x_dims[1], x_dims[2], x_dims[3], x_dims[0], x_dims[1], x_dims[2], x_dims[3],
z_dims[2], z_dims[3], z_dims[2], z_dims[3],
x->ga.data, gz->ga.data, x->ga.data, x->ga.offset,
gz->ga.data, gz->ga.offset,
w[0], w[1], s[0], s[1], p[0], p[1], w[0], w[1], s[0], s[1], p[0], p[1],
INC_PAD, SUM_MODE, (*gx)->ga.data); INC_PAD, SUM_MODE,
(*gx)->ga.data, (*gx)->ga.offset);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuAveragePoolGrad: ave_pool2d_grad_kernel %s.", "GpuAveragePoolGrad: ave_pool2d_grad_kernel %s.",
...@@ -166,10 +174,11 @@ int APPLY_SPECIFIC(ave_pool_grad)(PyGpuArrayObject *x, ...@@ -166,10 +174,11 @@ int APPLY_SPECIFIC(ave_pool_grad)(PyGpuArrayObject *x,
err = ave_pool3d_grad_kernel_scall(1, &num_kernels, 0, num_kernels, err = ave_pool3d_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
x_dims[0], x_dims[1], x_dims[2], x_dims[3], x_dims[4], x_dims[0], x_dims[1], x_dims[2], x_dims[3], x_dims[4],
z_dims[2], z_dims[3], z_dims[4], z_dims[2], z_dims[3], z_dims[4],
x->ga.data, gz->ga.data, x->ga.data, x->ga.offset,
gz->ga.data, gz->ga.offset,
w[0], w[1], w[2], s[0], s[1], s[2], w[0], w[1], w[2], s[0], s[1], s[2],
p[0], p[1], p[2], INC_PAD, SUM_MODE, p[0], p[1], p[2], INC_PAD, SUM_MODE,
(*gx)->ga.data); (*gx)->ga.data, (*gx)->ga.offset);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuAveragePoolGrad: ave_pool3d_grad_kernel %s.", "GpuAveragePoolGrad: ave_pool3d_grad_kernel %s.",
......
#section kernels #section kernels
#kernel max_pool2d_grad_grad_kernel : size, size, size, size, size, size, size, *, *, *, size, size, size, size, size, size, * : #kernel max_pool2d_grad_grad_kernel : size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, *, size :
KERNEL void max_pool2d_grad_grad_kernel(const ga_size nthreads, KERNEL void max_pool2d_grad_grad_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size pooled_height, const ga_size num, const ga_size channels, const ga_size pooled_height,
const ga_size pooled_width, const ga_size height, const ga_size width, const ga_size pooled_width, const ga_size height, const ga_size width,
GLOBAL_MEM const DTYPE_INPUT_0 *x, GLOBAL_MEM const DTYPE_INPUT_1 *z, GLOBAL_MEM const DTYPE_INPUT_2 *gx, GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, GLOBAL_MEM const DTYPE_INPUT_1 *z, const ga_size z_off, GLOBAL_MEM const DTYPE_INPUT_2 *gx, const ga_size gx_off,
const ga_size kernel_h, const ga_size kernel_w, const ga_size stride_h, const ga_size stride_w, const ga_size kernel_h, const ga_size kernel_w, const ga_size stride_h, const ga_size stride_w,
const ga_size pad_h, const ga_size pad_w, const ga_size pad_h, const ga_size pad_w,
GLOBAL_MEM DTYPE_OUTPUT_0 *gz) GLOBAL_MEM DTYPE_OUTPUT_0 *gz, const ga_size gz_off)
{ {
x = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)x) + x_off);
z = (GLOBAL_MEM DTYPE_INPUT_1 *)(((char *)z) + z_off);
gx = (GLOBAL_MEM DTYPE_INPUT_2 *)(((char *)gx) + gx_off);
gz = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((char *)gz) + gz_off);
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < nthreads; index += LDIM_0 * GDIM_0) { index < nthreads; index += LDIM_0 * GDIM_0) {
...@@ -42,18 +46,22 @@ KERNEL void max_pool2d_grad_grad_kernel(const ga_size nthreads, ...@@ -42,18 +46,22 @@ KERNEL void max_pool2d_grad_grad_kernel(const ga_size nthreads,
} }
} }
#kernel max_pool3d_grad_grad_kernel : size, size, size, size, size, size, size, size, size, *, *, *, size, size, size, size, size, size, size, size, size, * : #kernel max_pool3d_grad_grad_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, size, size, size, *, size :
KERNEL void max_pool3d_grad_grad_kernel(const ga_size nthreads, KERNEL void max_pool3d_grad_grad_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size pooled_depth, const ga_size num, const ga_size channels, const ga_size pooled_depth,
const ga_size pooled_height, const ga_size pooled_width, const ga_size pooled_height, const ga_size pooled_width,
const ga_size depth, const ga_size height, const ga_size width, const ga_size depth, const ga_size height, const ga_size width,
GLOBAL_MEM const DTYPE_INPUT_0 *x, GLOBAL_MEM const DTYPE_INPUT_1 *z, GLOBAL_MEM const DTYPE_INPUT_2 *gx, GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, GLOBAL_MEM const DTYPE_INPUT_1 *z, const ga_size z_off, GLOBAL_MEM const DTYPE_INPUT_2 *gx, const ga_size gx_off,
const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w,
const ga_size stride_d, const ga_size stride_h, const ga_size stride_w, const ga_size stride_d, const ga_size stride_h, const ga_size stride_w,
const ga_size pad_d, const ga_size pad_h, const ga_size pad_w, const ga_size pad_d, const ga_size pad_h, const ga_size pad_w,
GLOBAL_MEM DTYPE_OUTPUT_0 *gz) GLOBAL_MEM DTYPE_OUTPUT_0 *gz, const ga_size gz_off)
{ {
x = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)x) + x_off);
z = (GLOBAL_MEM DTYPE_INPUT_1 *)(((char *)z) + z_off);
gx = (GLOBAL_MEM DTYPE_INPUT_2 *)(((char *)gx) + gx_off);
gz = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((char *)gz) + gz_off);
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < nthreads; index += LDIM_0 * GDIM_0) { index < nthreads; index += LDIM_0 * GDIM_0) {
...@@ -146,9 +154,11 @@ int APPLY_SPECIFIC(pool_grad_grad)(PyGpuArrayObject *x, ...@@ -146,9 +154,11 @@ int APPLY_SPECIFIC(pool_grad_grad)(PyGpuArrayObject *x,
err = max_pool2d_grad_grad_kernel_scall(1, &num_kernels, 0, num_kernels, err = max_pool2d_grad_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[0], z_dims[1], z_dims[2], z_dims[3],
x_dims[2], x_dims[3], x_dims[2], x_dims[3],
x->ga.data, z->ga.data, gx->ga.data, x->ga.data, x->ga.offset,
z->ga.data, z->ga.offset,
gx->ga.data, gx->ga.offset,
w[0], w[1], s[0], s[1], p[0], p[1], w[0], w[1], s[0], s[1], p[0], p[1],
(*gz)->ga.data); (*gz)->ga.data, (*gz)->ga.offset);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuPoolingGradGrad: max_pool2d_grad_grad_kernel %s.", "GpuPoolingGradGrad: max_pool2d_grad_grad_kernel %s.",
...@@ -161,9 +171,11 @@ int APPLY_SPECIFIC(pool_grad_grad)(PyGpuArrayObject *x, ...@@ -161,9 +171,11 @@ int APPLY_SPECIFIC(pool_grad_grad)(PyGpuArrayObject *x,
err = max_pool3d_grad_grad_kernel_scall(1, &num_kernels, 0, num_kernels, err = max_pool3d_grad_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4], z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4],
x_dims[2], x_dims[3], x_dims[4], x_dims[2], x_dims[3], x_dims[4],
x->ga.data, z->ga.data, gx->ga.data, x->ga.data, x->ga.offset,
z->ga.data, z->ga.offset,
gx->ga.data, gx->ga.offset,
w[0], w[1], w[2], s[0], s[1], s[2], p[0], p[1], p[2], w[0], w[1], w[2], s[0], s[1], s[2], p[0], p[1], p[2],
(*gz)->ga.data); (*gz)->ga.data, (*gz)->ga.offset);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuPoolingGradGrad: max_pool3d_grad_grad_kernel %s.", "GpuPoolingGradGrad: max_pool3d_grad_grad_kernel %s.",
......
#section kernels #section kernels
#kernel max_pool2d_grad_kernel : size, size, size, size, size, size, size, *, *, *, size, size, size, size, size, size, * : #kernel max_pool2d_grad_kernel : size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, *, size :
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu) // (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void max_pool2d_grad_kernel(const ga_size nthreads, KERNEL void max_pool2d_grad_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size height, const ga_size num, const ga_size channels, const ga_size height,
const ga_size width, const ga_size pooled_height, const ga_size pooled_width, const ga_size width, const ga_size pooled_height, const ga_size pooled_width,
GLOBAL_MEM const DTYPE_INPUT_0 *x, GLOBAL_MEM const DTYPE_INPUT_1 *z, GLOBAL_MEM const DTYPE_INPUT_2 *gz, GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, GLOBAL_MEM const DTYPE_INPUT_1 *z, const ga_size z_off, GLOBAL_MEM const DTYPE_INPUT_2 *gz, const ga_size gz_off,
const ga_size kernel_h, const ga_size kernel_w, const ga_size stride_h, const ga_size stride_w, const ga_size kernel_h, const ga_size kernel_w, const ga_size stride_h, const ga_size stride_w,
const ga_size pad_h, const ga_size pad_w, GLOBAL_MEM DTYPE_OUTPUT_0 *gx) const ga_size pad_h, const ga_size pad_w, GLOBAL_MEM DTYPE_OUTPUT_0 *gx, const ga_size gx_off)
{ {
x = (GLOBAL_MEM const DTYPE_INPUT_0 *)(((char *)x) + x_off);
z = (GLOBAL_MEM const DTYPE_INPUT_1 *)(((char *)z) + z_off);
gz = (GLOBAL_MEM const DTYPE_INPUT_2 *)(((char *)gz) + gz_off);
gx = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((char *)gx) + gx_off);
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < nthreads; index += LDIM_0 * GDIM_0) { index < nthreads; index += LDIM_0 * GDIM_0) {
...@@ -38,19 +42,23 @@ KERNEL void max_pool2d_grad_kernel(const ga_size nthreads, ...@@ -38,19 +42,23 @@ KERNEL void max_pool2d_grad_kernel(const ga_size nthreads,
} }
} }
#kernel max_pool3d_grad_kernel : size, size, size, size, size, size, size, size, size, *, *, *, size, size, size, size, size, size, size, size, size, * : #kernel max_pool3d_grad_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, size, size, size, *, size :
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu) // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void max_pool3d_grad_kernel(const ga_size nthreads, KERNEL void max_pool3d_grad_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size depth, const ga_size num, const ga_size channels, const ga_size depth,
const ga_size height, const ga_size width, const ga_size pooled_depth, const ga_size height, const ga_size width, const ga_size pooled_depth,
const ga_size pooled_height, const ga_size pooled_width, const ga_size pooled_height, const ga_size pooled_width,
GLOBAL_MEM const DTYPE_INPUT_0 *x, GLOBAL_MEM const DTYPE_INPUT_1 *z, GLOBAL_MEM const DTYPE_INPUT_2 *gz, GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, GLOBAL_MEM const DTYPE_INPUT_1 *z, const ga_size z_off, GLOBAL_MEM const DTYPE_INPUT_2 *gz, const ga_size gz_off,
const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w,
const ga_size stride_d, const ga_size stride_h, const ga_size stride_w, const ga_size stride_d, const ga_size stride_h, const ga_size stride_w,
const ga_size pad_d, const ga_size pad_h, const ga_size pad_w, const ga_size pad_d, const ga_size pad_h, const ga_size pad_w,
GLOBAL_MEM DTYPE_OUTPUT_0 *gx) GLOBAL_MEM DTYPE_OUTPUT_0 *gx, const ga_size gx_off)
{ {
x = (GLOBAL_MEM const DTYPE_INPUT_0 *)(((char *)x) + x_off);
z = (GLOBAL_MEM const DTYPE_INPUT_1 *)(((char *)z) + z_off);
gz = (GLOBAL_MEM const DTYPE_INPUT_2 *)(((char *)gz) + gz_off);
gx = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((char *)gx) + gx_off);
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < nthreads; index += LDIM_0 * GDIM_0) { index < nthreads; index += LDIM_0 * GDIM_0) {
...@@ -138,9 +146,11 @@ int APPLY_SPECIFIC(max_pool_grad)(PyGpuArrayObject *x, ...@@ -138,9 +146,11 @@ int APPLY_SPECIFIC(max_pool_grad)(PyGpuArrayObject *x,
err = max_pool2d_grad_kernel_scall(1, &num_kernels, 0, num_kernels, err = max_pool2d_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
x_dims[0], x_dims[1], x_dims[2], x_dims[3], x_dims[0], x_dims[1], x_dims[2], x_dims[3],
z_dims[2], z_dims[3], z_dims[2], z_dims[3],
x->ga.data, z->ga.data, gz->ga.data, x->ga.data, x->ga.offset,
z->ga.data, z->ga.offset,
gz->ga.data, gz->ga.offset,
w[0], w[1], s[0], s[1], p[0], p[1], w[0], w[1], s[0], s[1], p[0], p[1],
(*gx)->ga.data); (*gx)->ga.data, (*gx)->ga.offset);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuMaxPoolGrad: max_pool2d_grad_kernel %s.", "GpuMaxPoolGrad: max_pool2d_grad_kernel %s.",
...@@ -152,9 +162,11 @@ int APPLY_SPECIFIC(max_pool_grad)(PyGpuArrayObject *x, ...@@ -152,9 +162,11 @@ int APPLY_SPECIFIC(max_pool_grad)(PyGpuArrayObject *x,
err = max_pool3d_grad_kernel_scall(1, &num_kernels, 0, num_kernels, err = max_pool3d_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
x_dims[0], x_dims[1], x_dims[2], x_dims[3], x_dims[4], x_dims[0], x_dims[1], x_dims[2], x_dims[3], x_dims[4],
z_dims[2], z_dims[3], z_dims[4], z_dims[2], z_dims[3], z_dims[4],
x->ga.data, z->ga.data, gz->ga.data, x->ga.data, x->ga.offset,
z->ga.data, z->ga.offset,
gz->ga.data, gz->ga.offset,
w[0], w[1], w[2], s[0], s[1], s[2], w[0], w[1], w[2], s[0], s[1], s[2],
p[0], p[1], p[2], (*gx)->ga.data); p[0], p[1], p[2], (*gx)->ga.data, (*gx)->ga.offset);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuMaxPoolGrad: max_pool3d_grad_kernel %s.", "GpuMaxPoolGrad: max_pool3d_grad_kernel %s.",
......
#section kernels #section kernels
#kernel max_pool2d_rop_kernel : size, size, size, size, size, size, size, *, *, size, size, size, size, size, size, * : #kernel max_pool2d_rop_kernel : size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, *, size :
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu) // (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void max_pool2d_rop_kernel(const ga_size nthreads, KERNEL void max_pool2d_rop_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size pooled_height, const ga_size num, const ga_size channels, const ga_size pooled_height,
const ga_size pooled_width, const ga_size height, const ga_size width, const ga_size pooled_width, const ga_size height, const ga_size width,
GLOBAL_MEM const DTYPE_INPUT_0 *x, GLOBAL_MEM const DTYPE_INPUT_1 *ex, GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, GLOBAL_MEM const DTYPE_INPUT_1 *ex, const ga_size ex_off,
const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_h, const ga_size kernel_w,
const ga_size stride_h, const ga_size stride_w, const ga_size stride_h, const ga_size stride_w,
const ga_size pad_h, const ga_size pad_w, const ga_size pad_h, const ga_size pad_w,
GLOBAL_MEM DTYPE_OUTPUT_0 *z) GLOBAL_MEM DTYPE_OUTPUT_0 *z, const ga_size z_off)
{ {
x = (GLOBAL_MEM DTYPE_INPUT_0 *x)(((char *)x) + x_off);
ex = (GLOBAL_MEM DTYPE_INPUT_1 *x)(((char *)ex) + ex_off);
z = (GLOBAL_MEM DTYPE_OUTPUT_0 *x)(((char *)z) + z_off);
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < nthreads; index < nthreads;
...@@ -46,19 +49,22 @@ KERNEL void max_pool2d_rop_kernel(const ga_size nthreads, ...@@ -46,19 +49,22 @@ KERNEL void max_pool2d_rop_kernel(const ga_size nthreads,
} }
} }
#kernel max_pool3d_rop_kernel : size, size, size, size, size, size, size, size, size, *, *, size, size, size, size, size, size, size, size, size, * : #kernel max_pool3d_rop_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, size, size, size, *, size :
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu) // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void max_pool3d_rop_kernel(const ga_size nthreads, KERNEL void max_pool3d_rop_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size pooled_depth, const ga_size num, const ga_size channels, const ga_size pooled_depth,
const ga_size pooled_height, const ga_size pooled_width, const ga_size pooled_height, const ga_size pooled_width,
const ga_size depth, const ga_size height, const ga_size width, const ga_size depth, const ga_size height, const ga_size width,
GLOBAL_MEM const DTYPE_INPUT_0 *x, GLOBAL_MEM const DTYPE_INPUT_1 *ex, GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, GLOBAL_MEM const DTYPE_INPUT_1 *ex, const ga_size ex_off,
const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w,
const ga_size stride_d, const ga_size stride_h, const ga_size stride_w, const ga_size stride_d, const ga_size stride_h, const ga_size stride_w,
const ga_size pad_d, const ga_size pad_h, const ga_size pad_w, const ga_size pad_d, const ga_size pad_h, const ga_size pad_w,
GLOBAL_MEM DTYPE_OUTPUT_0 *z) GLOBAL_MEM DTYPE_OUTPUT_0 *z, const ga_size z_off)
{ {
// Rebase each buffer pointer by its byte offset (hence the char* arithmetic)
// before any element indexing. The output offset parameter is named z_off so
// it does not collide with the input's x_off and matches its use below.
x = (GLOBAL_MEM const DTYPE_INPUT_0 *)(((char *)x) + x_off);
ex = (GLOBAL_MEM const DTYPE_INPUT_1 *)(((char *)ex) + ex_off);
z = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((char *)z) + z_off);
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < nthreads; index < nthreads;
...@@ -167,9 +173,10 @@ int APPLY_SPECIFIC(max_pool_rop)(PyGpuArrayObject *x, ...@@ -167,9 +173,10 @@ int APPLY_SPECIFIC(max_pool_rop)(PyGpuArrayObject *x,
err = max_pool2d_rop_kernel_scall(1, &num_kernels, 0, num_kernels, err = max_pool2d_rop_kernel_scall(1, &num_kernels, 0, num_kernels,
z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[0], z_dims[1], z_dims[2], z_dims[3],
x_dims[2], x_dims[3], x_dims[2], x_dims[3],
x->ga.data, ex->ga.data, x->ga.data, x->ga.offset,
ex->ga.data, ex->ga.offset,
w[0], w[1], s[0], s[1], p[0], p[1], w[0], w[1], s[0], s[1], p[0], p[1],
(*z)->ga.data); (*z)->ga.data, (*z)->ga.offset);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuMaxPoolRop: max_pool2d_rop_kernel %s.", "GpuMaxPoolRop: max_pool2d_rop_kernel %s.",
...@@ -182,9 +189,11 @@ int APPLY_SPECIFIC(max_pool_rop)(PyGpuArrayObject *x, ...@@ -182,9 +189,11 @@ int APPLY_SPECIFIC(max_pool_rop)(PyGpuArrayObject *x,
err = max_pool3d_rop_kernel_scall(1, &num_kernels, 0, num_kernels, err = max_pool3d_rop_kernel_scall(1, &num_kernels, 0, num_kernels,
z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4], z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4],
x_dims[2], x_dims[3], x_dims[4], x_dims[2], x_dims[3], x_dims[4],
x->ga.data, ex->ga.data, x->ga.data, x->ga.offset,
ex->ga.data, ex->ga.offset,
w[0], w[1], w[2], s[0], s[1], s[2], w[0], w[1], w[2], s[0], s[1], s[2],
p[0], p[1], p[2], (*z)->ga.data); p[0], p[1], p[2],
(*z)->ga.data, (*z)->ga.offset);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuMaxPoolRop: max_pool3d_rop_kernel %s.", "GpuMaxPoolRop: max_pool3d_rop_kernel %s.",
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论