Commit 72823c46 — author: Frédéric Bastien; committer: GitHub

Merge pull request #5802 from affanv14/macro

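Change macro names: rename the per-input dtype macros DTYPE_i<N> to DTYPE_INPUT_<N> and the per-output dtype macros DTYPE_o<N> to DTYPE_OUTPUT_<N>.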
...@@ -499,7 +499,7 @@ class CGpuKernelBase(COp, GpuKernelBase): ...@@ -499,7 +499,7 @@ class CGpuKernelBase(COp, GpuKernelBase):
undef_macros = [] undef_macros = []
for i, v in enumerate(node.inputs): for i, v in enumerate(node.inputs):
if isinstance(v.type, GpuArrayType): if isinstance(v.type, GpuArrayType):
macro_name = "DTYPE_i%d" % (i,) macro_name = "DTYPE_INPUT_%d" % (i,)
macro_value = pygpu.gpuarray.dtype_to_ctype(v.dtype) macro_value = pygpu.gpuarray.dtype_to_ctype(v.dtype)
define_macros.append( define_macros.append(
define_template % define_template %
...@@ -507,7 +507,7 @@ class CGpuKernelBase(COp, GpuKernelBase): ...@@ -507,7 +507,7 @@ class CGpuKernelBase(COp, GpuKernelBase):
undef_macros.append(undef_template % macro_name) undef_macros.append(undef_template % macro_name)
for i, v in enumerate(node.outputs): for i, v in enumerate(node.outputs):
if isinstance(v.type, GpuArrayType): if isinstance(v.type, GpuArrayType):
macro_name = "DTYPE_o%d" % (i,) macro_name = "DTYPE_OUTPUT_%d" % (i,)
macro_value = pygpu.gpuarray.dtype_to_ctype(v.dtype) macro_value = pygpu.gpuarray.dtype_to_ctype(v.dtype)
define_macros.append( define_macros.append(
define_template % define_template %
......
...@@ -34,7 +34,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ...@@ -34,7 +34,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Kernels for fast unfold + copy // Kernels for fast unfold + copy
// GPU kernel for the case of dilation // GPU kernel for the case of dilation
KERNEL void dilated_im3d2col_kernel(const ga_size n, KERNEL void dilated_im3d2col_kernel(const ga_size n,
GLOBAL_MEM const DTYPE_i0 * data_im, GLOBAL_MEM const DTYPE_INPUT_0 * data_im,
const ga_size data_im_offset, const ga_size data_im_offset,
const ga_size height, const ga_size width, const ga_size depth, const ga_size height, const ga_size width, const ga_size depth,
const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_d,
...@@ -42,7 +42,7 @@ KERNEL void dilated_im3d2col_kernel(const ga_size n, ...@@ -42,7 +42,7 @@ KERNEL void dilated_im3d2col_kernel(const ga_size n,
const ga_size pad_h, const ga_size pad_w, const ga_size pad_d, const ga_size pad_h, const ga_size pad_w, const ga_size pad_d,
const ga_size stride_h, const ga_size stride_w, const ga_size stride_d, const ga_size stride_h, const ga_size stride_w, const ga_size stride_d,
const ga_size height_col, const ga_size width_col, const ga_size depth_col, const ga_size height_col, const ga_size width_col, const ga_size depth_col,
GLOBAL_MEM DTYPE_i0 * data_col) { GLOBAL_MEM DTYPE_INPUT_0 * data_col) {
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < (n); index += LDIM_0 * GDIM_0) { index < (n); index += LDIM_0 * GDIM_0) {
...@@ -56,10 +56,10 @@ KERNEL void dilated_im3d2col_kernel(const ga_size n, ...@@ -56,10 +56,10 @@ KERNEL void dilated_im3d2col_kernel(const ga_size n,
const ga_size h_offset = h_col * stride_h - pad_h; const ga_size h_offset = h_col * stride_h - pad_h;
const ga_size w_offset = w_col * stride_w - pad_w; const ga_size w_offset = w_col * stride_w - pad_w;
const ga_size d_offset = d_col * stride_d - pad_d; const ga_size d_offset = d_col * stride_d - pad_d;
DTYPE_i0 * data_col_ptr = data_col; DTYPE_INPUT_0 * data_col_ptr = data_col;
data_col_ptr += c_col * (height_col * width_col * depth_col) + data_col_ptr += c_col * (height_col * width_col * depth_col) +
h_col * (width_col * depth_col) + w_col * depth_col + d_col; h_col * (width_col * depth_col) + w_col * depth_col + d_col;
const DTYPE_i0 * data_im_ptr = data_im + data_im_offset; const DTYPE_INPUT_0 * data_im_ptr = data_im + data_im_offset;
data_im_ptr += c_im * (height * width * depth) + data_im_ptr += c_im * (height * width * depth) +
h_offset * (width * depth) + w_offset * depth + d_offset; h_offset * (width * depth) + w_offset * depth + d_offset;
for (ga_size i = 0; i < kernel_h; ++i) { for (ga_size i = 0; i < kernel_h; ++i) {
...@@ -82,14 +82,14 @@ KERNEL void dilated_im3d2col_kernel(const ga_size n, ...@@ -82,14 +82,14 @@ KERNEL void dilated_im3d2col_kernel(const ga_size n,
#kernel im3d2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, * : #kernel im3d2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, * :
KERNEL void im3d2col_kernel(const ga_size n, KERNEL void im3d2col_kernel(const ga_size n,
GLOBAL_MEM const DTYPE_i0 * data_im, GLOBAL_MEM const DTYPE_INPUT_0 * data_im,
const ga_size data_im_offset, const ga_size data_im_offset,
const ga_size height, const ga_size width, const ga_size depth, const ga_size height, const ga_size width, const ga_size depth,
const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_d,
const ga_size pad_h, const ga_size pad_w, const ga_size pad_d, const ga_size pad_h, const ga_size pad_w, const ga_size pad_d,
const ga_size stride_h, const ga_size stride_w, const ga_size stride_d, const ga_size stride_h, const ga_size stride_w, const ga_size stride_d,
const ga_size height_col, const ga_size width_col, const ga_size depth_col, const ga_size height_col, const ga_size width_col, const ga_size depth_col,
GLOBAL_MEM DTYPE_i0 * data_col) { GLOBAL_MEM DTYPE_INPUT_0 * data_col) {
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < (n); index += LDIM_0 * GDIM_0) { index < (n); index += LDIM_0 * GDIM_0) {
...@@ -103,10 +103,10 @@ KERNEL void im3d2col_kernel(const ga_size n, ...@@ -103,10 +103,10 @@ KERNEL void im3d2col_kernel(const ga_size n,
const ga_size h_offset = h_col * stride_h - pad_h; const ga_size h_offset = h_col * stride_h - pad_h;
const ga_size w_offset = w_col * stride_w - pad_w; const ga_size w_offset = w_col * stride_w - pad_w;
const ga_size d_offset = d_col * stride_d - pad_d; const ga_size d_offset = d_col * stride_d - pad_d;
DTYPE_i0 * data_col_ptr = data_col; DTYPE_INPUT_0 * data_col_ptr = data_col;
data_col_ptr += c_col * (height_col * width_col * depth_col) + data_col_ptr += c_col * (height_col * width_col * depth_col) +
h_col * (width_col * depth_col) + w_col * depth_col + d_col; h_col * (width_col * depth_col) + w_col * depth_col + d_col;
const DTYPE_i0 * data_im_ptr = data_im + data_im_offset; const DTYPE_INPUT_0 * data_im_ptr = data_im + data_im_offset;
data_im_ptr += c_im * (height * width * depth) + data_im_ptr += c_im * (height * width * depth) +
h_offset * (width * depth) + w_offset * depth + d_offset; h_offset * (width * depth) + w_offset * depth + d_offset;
for (ga_size i = 0; i < kernel_h; ++i) { for (ga_size i = 0; i < kernel_h; ++i) {
...@@ -128,7 +128,7 @@ KERNEL void im3d2col_kernel(const ga_size n, ...@@ -128,7 +128,7 @@ KERNEL void im3d2col_kernel(const ga_size n,
// GPU kernel for the case of dilation // GPU kernel for the case of dilation
#kernel dilated_col2im3d_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size : #kernel dilated_col2im3d_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
KERNEL void dilated_col2im3d_kernel(const ga_size n, KERNEL void dilated_col2im3d_kernel(const ga_size n,
GLOBAL_MEM const DTYPE_i0 * data_col, GLOBAL_MEM const DTYPE_INPUT_0 * data_col,
const ga_size height, const ga_size width, const ga_size depth, const ga_size height, const ga_size width, const ga_size depth,
const ga_size channels, const ga_size channels,
const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_d,
...@@ -136,12 +136,12 @@ KERNEL void dilated_col2im3d_kernel(const ga_size n, ...@@ -136,12 +136,12 @@ KERNEL void dilated_col2im3d_kernel(const ga_size n,
const ga_size pad_h, const ga_size pad_w, const ga_size pad_d, const ga_size pad_h, const ga_size pad_w, const ga_size pad_d,
const ga_size stride_h, const ga_size stride_w, const ga_size stride_d, const ga_size stride_h, const ga_size stride_w, const ga_size stride_d,
const ga_size height_col, const ga_size width_col, const ga_size depth_col, const ga_size height_col, const ga_size width_col, const ga_size depth_col,
GLOBAL_MEM DTYPE_i0 * data_im, GLOBAL_MEM DTYPE_INPUT_0 * data_im,
const ga_size data_im_offset) { const ga_size data_im_offset) {
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < (n); index += LDIM_0 * GDIM_0) { index < (n); index += LDIM_0 * GDIM_0) {
DTYPE_i0 val = 0; DTYPE_INPUT_0 val = 0;
const ga_size d_im = index % depth + pad_d; const ga_size d_im = index % depth + pad_d;
const ga_size w_index = index / depth; const ga_size w_index = index / depth;
const ga_size w_im = w_index % width + pad_w; const ga_size w_im = w_index % width + pad_w;
...@@ -190,19 +190,19 @@ KERNEL void dilated_col2im3d_kernel(const ga_size n, ...@@ -190,19 +190,19 @@ KERNEL void dilated_col2im3d_kernel(const ga_size n,
#kernel col2im3d_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size : #kernel col2im3d_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
KERNEL void col2im3d_kernel(const ga_size n, KERNEL void col2im3d_kernel(const ga_size n,
GLOBAL_MEM const DTYPE_i0 * data_col, GLOBAL_MEM const DTYPE_INPUT_0 * data_col,
const ga_size height, const ga_size width, const ga_size depth, const ga_size height, const ga_size width, const ga_size depth,
const ga_size channels, const ga_size channels,
const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_d,
const ga_size pad_h, const ga_size pad_w, const ga_size pad_d, const ga_size pad_h, const ga_size pad_w, const ga_size pad_d,
const ga_size stride_h, const ga_size stride_w, const ga_size stride_d, const ga_size stride_h, const ga_size stride_w, const ga_size stride_d,
const ga_size height_col, const ga_size width_col, const ga_size depth_col, const ga_size height_col, const ga_size width_col, const ga_size depth_col,
GLOBAL_MEM DTYPE_i0 * data_im, GLOBAL_MEM DTYPE_INPUT_0 * data_im,
const ga_size data_im_offset) { const ga_size data_im_offset) {
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < (n); index += LDIM_0 * GDIM_0) { index < (n); index += LDIM_0 * GDIM_0) {
DTYPE_i0 val = 0; DTYPE_INPUT_0 val = 0;
const ga_size d_im = index % depth + pad_d; const ga_size d_im = index % depth + pad_d;
const ga_size w_index = index / depth; const ga_size w_index = index / depth;
const ga_size w_im = w_index % width + pad_w; const ga_size w_im = w_index % width + pad_w;
......
...@@ -34,7 +34,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ...@@ -34,7 +34,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Kernels for fast unfold + copy // Kernels for fast unfold + copy
// GPU kernel for the case of dilation // GPU kernel for the case of dilation
KERNEL void dilated_im2col_kernel(const ga_size n, KERNEL void dilated_im2col_kernel(const ga_size n,
GLOBAL_MEM const DTYPE_i0 * data_im, GLOBAL_MEM const DTYPE_INPUT_0 * data_im,
const ga_size data_im_offset, const ga_size data_im_offset,
const ga_size height, const ga_size width, const ga_size height, const ga_size width,
const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_h, const ga_size kernel_w,
...@@ -42,7 +42,7 @@ KERNEL void dilated_im2col_kernel(const ga_size n, ...@@ -42,7 +42,7 @@ KERNEL void dilated_im2col_kernel(const ga_size n,
const ga_size pad_h, const ga_size pad_w, const ga_size pad_h, const ga_size pad_w,
const ga_size stride_h, const ga_size stride_w, const ga_size stride_h, const ga_size stride_w,
const ga_size height_col, const ga_size width_col, const ga_size height_col, const ga_size width_col,
GLOBAL_MEM DTYPE_i0 * data_col) { GLOBAL_MEM DTYPE_INPUT_0 * data_col) {
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < (n); index += LDIM_0 * GDIM_0) { index < (n); index += LDIM_0 * GDIM_0) {
...@@ -53,9 +53,9 @@ KERNEL void dilated_im2col_kernel(const ga_size n, ...@@ -53,9 +53,9 @@ KERNEL void dilated_im2col_kernel(const ga_size n,
const ga_size c_col = c_im * kernel_h * kernel_w; const ga_size c_col = c_im * kernel_h * kernel_w;
const ga_size h_offset = h_col * stride_h - pad_h; const ga_size h_offset = h_col * stride_h - pad_h;
const ga_size w_offset = w_col * stride_w - pad_w; const ga_size w_offset = w_col * stride_w - pad_w;
DTYPE_i0 * data_col_ptr = data_col; DTYPE_INPUT_0 * data_col_ptr = data_col;
data_col_ptr += (c_col * height_col + h_col) * width_col + w_col; data_col_ptr += (c_col * height_col + h_col) * width_col + w_col;
const DTYPE_i0 * data_im_ptr = data_im + data_im_offset; const DTYPE_INPUT_0 * data_im_ptr = data_im + data_im_offset;
data_im_ptr += (c_im * height + h_offset) * width + w_offset; data_im_ptr += (c_im * height + h_offset) * width + w_offset;
for (ga_size i = 0; i < kernel_h; ++i) { for (ga_size i = 0; i < kernel_h; ++i) {
for (ga_size j = 0; j < kernel_w; ++j) { for (ga_size j = 0; j < kernel_w; ++j) {
...@@ -72,14 +72,14 @@ KERNEL void dilated_im2col_kernel(const ga_size n, ...@@ -72,14 +72,14 @@ KERNEL void dilated_im2col_kernel(const ga_size n,
#kernel im2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, * : #kernel im2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, * :
KERNEL void im2col_kernel(const ga_size n, KERNEL void im2col_kernel(const ga_size n,
GLOBAL_MEM const DTYPE_i0 * data_im, GLOBAL_MEM const DTYPE_INPUT_0 * data_im,
const ga_size data_im_offset, const ga_size data_im_offset,
const ga_size height, const ga_size width, const ga_size height, const ga_size width,
const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_h, const ga_size kernel_w,
const ga_size pad_h, const ga_size pad_w, const ga_size pad_h, const ga_size pad_w,
const ga_size stride_h, const ga_size stride_w, const ga_size stride_h, const ga_size stride_w,
const ga_size height_col, const ga_size width_col, const ga_size height_col, const ga_size width_col,
GLOBAL_MEM DTYPE_i0 * data_col) { GLOBAL_MEM DTYPE_INPUT_0 * data_col) {
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < (n); index += LDIM_0 * GDIM_0) { index < (n); index += LDIM_0 * GDIM_0) {
...@@ -90,9 +90,9 @@ KERNEL void im2col_kernel(const ga_size n, ...@@ -90,9 +90,9 @@ KERNEL void im2col_kernel(const ga_size n,
const ga_size c_col = c_im * kernel_h * kernel_w; const ga_size c_col = c_im * kernel_h * kernel_w;
const ga_size h_offset = h_col * stride_h - pad_h; const ga_size h_offset = h_col * stride_h - pad_h;
const ga_size w_offset = w_col * stride_w - pad_w; const ga_size w_offset = w_col * stride_w - pad_w;
DTYPE_i0 * data_col_ptr = data_col; DTYPE_INPUT_0 * data_col_ptr = data_col;
data_col_ptr += (c_col * height_col + h_col) * width_col + w_col; data_col_ptr += (c_col * height_col + h_col) * width_col + w_col;
const DTYPE_i0 * data_im_ptr = data_im + data_im_offset; const DTYPE_INPUT_0 * data_im_ptr = data_im + data_im_offset;
data_im_ptr += (c_im * height + h_offset) * width + w_offset; data_im_ptr += (c_im * height + h_offset) * width + w_offset;
for (ga_size i = 0; i < kernel_h; ++i) { for (ga_size i = 0; i < kernel_h; ++i) {
for (ga_size j = 0; j < kernel_w; ++j) { for (ga_size j = 0; j < kernel_w; ++j) {
...@@ -110,19 +110,19 @@ KERNEL void im2col_kernel(const ga_size n, ...@@ -110,19 +110,19 @@ KERNEL void im2col_kernel(const ga_size n,
// GPU kernel for the case of dilation // GPU kernel for the case of dilation
#kernel dilated_col2im_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size : #kernel dilated_col2im_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
KERNEL void dilated_col2im_kernel(const ga_size n, KERNEL void dilated_col2im_kernel(const ga_size n,
GLOBAL_MEM const DTYPE_i0 * data_col, GLOBAL_MEM const DTYPE_INPUT_0 * data_col,
const ga_size height, const ga_size width, const ga_size channels, const ga_size height, const ga_size width, const ga_size channels,
const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_h, const ga_size kernel_w,
const ga_size dilation_h, const ga_size dilation_w, const ga_size dilation_h, const ga_size dilation_w,
const ga_size pad_h, const ga_size pad_w, const ga_size pad_h, const ga_size pad_w,
const ga_size stride_h, const ga_size stride_w, const ga_size stride_h, const ga_size stride_w,
const ga_size height_col, const ga_size width_col, const ga_size height_col, const ga_size width_col,
GLOBAL_MEM DTYPE_i0 * data_im, GLOBAL_MEM DTYPE_INPUT_0 * data_im,
const ga_size data_im_offset) { const ga_size data_im_offset) {
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < (n); index += LDIM_0 * GDIM_0) { index < (n); index += LDIM_0 * GDIM_0) {
DTYPE_i0 val = 0; DTYPE_INPUT_0 val = 0;
const ga_size w_im = index % width + pad_w; const ga_size w_im = index % width + pad_w;
const ga_size h_im = (index / width) % height + pad_h; const ga_size h_im = (index / width) % height + pad_h;
const ga_size c_im = index / (width * height); const ga_size c_im = index / (width * height);
...@@ -155,18 +155,18 @@ KERNEL void dilated_col2im_kernel(const ga_size n, ...@@ -155,18 +155,18 @@ KERNEL void dilated_col2im_kernel(const ga_size n,
#kernel col2im_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, *, size : #kernel col2im_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, *, size :
KERNEL void col2im_kernel(const ga_size n, KERNEL void col2im_kernel(const ga_size n,
GLOBAL_MEM const DTYPE_i0 * data_col, GLOBAL_MEM const DTYPE_INPUT_0 * data_col,
const ga_size height, const ga_size width, const ga_size channels, const ga_size height, const ga_size width, const ga_size channels,
const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_h, const ga_size kernel_w,
const ga_size pad_h, const ga_size pad_w, const ga_size pad_h, const ga_size pad_w,
const ga_size stride_h, const ga_size stride_w, const ga_size stride_h, const ga_size stride_w,
const ga_size height_col, const ga_size width_col, const ga_size height_col, const ga_size width_col,
GLOBAL_MEM DTYPE_i0 * data_im, GLOBAL_MEM DTYPE_INPUT_0 * data_im,
const ga_size data_im_offset) { const ga_size data_im_offset) {
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < (n); index += LDIM_0 * GDIM_0) { index < (n); index += LDIM_0 * GDIM_0) {
DTYPE_i0 val = 0; DTYPE_INPUT_0 val = 0;
const ga_size w_im = index % width + pad_w; const ga_size w_im = index % width + pad_w;
const ga_size h_im = (index / width) % height + pad_h; const ga_size h_im = (index / width) % height + pad_h;
const ga_size c_im = index / (width * height); const ga_size c_im = index / (width * height);
......
...@@ -6,9 +6,9 @@ ...@@ -6,9 +6,9 @@
KERNEL void max_pool2d_kernel(const ga_size nthreads, KERNEL void max_pool2d_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size pooled_height, const ga_size num, const ga_size channels, const ga_size pooled_height,
const ga_size pooled_width, const ga_size height, const ga_size width, const ga_size pooled_width, const ga_size height, const ga_size width,
GLOBAL_MEM const DTYPE_i0 *x, const ga_size kernel_h, const ga_size kernel_w, GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size kernel_h, const ga_size kernel_w,
const ga_size stride_h, const ga_size stride_w, const ga_size pad_h, const ga_size pad_w, const ga_size stride_h, const ga_size stride_w, const ga_size pad_h, const ga_size pad_w,
GLOBAL_MEM DTYPE_o0 *z) GLOBAL_MEM DTYPE_OUTPUT_0 *z)
{ {
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
...@@ -26,8 +26,8 @@ KERNEL void max_pool2d_kernel(const ga_size nthreads, ...@@ -26,8 +26,8 @@ KERNEL void max_pool2d_kernel(const ga_size nthreads,
wstart = max(wstart, 0); wstart = max(wstart, 0);
const ga_size offset = (n*channels + c) * height * width; const ga_size offset = (n*channels + c) * height * width;
const DTYPE_i0* x_slice = x + offset; const DTYPE_INPUT_0* x_slice = x + offset;
DTYPE_o0 maxval = x_slice[hstart*width + wstart]; DTYPE_OUTPUT_0 maxval = x_slice[hstart*width + wstart];
for (ga_size h=hstart; h < hend; ++h) { for (ga_size h=hstart; h < hend; ++h) {
for (ga_size w=wstart; w < wend; ++w) { for (ga_size w=wstart; w < wend; ++w) {
...@@ -48,10 +48,10 @@ KERNEL void max_pool3d_kernel(const ga_size nthreads, ...@@ -48,10 +48,10 @@ KERNEL void max_pool3d_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size pooled_depth, const ga_size num, const ga_size channels, const ga_size pooled_depth,
const ga_size pooled_height, const ga_size pooled_width, const ga_size pooled_height, const ga_size pooled_width,
const ga_size depth, const ga_size height, const ga_size width, const ga_size depth, const ga_size height, const ga_size width,
GLOBAL_MEM const DTYPE_i0 *x, const ga_size kernel_d, const ga_size kernel_h, GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size kernel_d, const ga_size kernel_h,
const ga_size kernel_w, const ga_size stride_d, const ga_size stride_h, const ga_size kernel_w, const ga_size stride_d, const ga_size stride_h,
const ga_size stride_w, const ga_size pad_d, const ga_size pad_h, const ga_size pad_w, const ga_size stride_w, const ga_size pad_d, const ga_size pad_h, const ga_size pad_w,
GLOBAL_MEM DTYPE_o0 *z) GLOBAL_MEM DTYPE_OUTPUT_0 *z)
{ {
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
...@@ -73,8 +73,8 @@ KERNEL void max_pool3d_kernel(const ga_size nthreads, ...@@ -73,8 +73,8 @@ KERNEL void max_pool3d_kernel(const ga_size nthreads,
wstart = max(wstart, 0); wstart = max(wstart, 0);
const ga_size offset = (n*channels + c) * depth * height * width; const ga_size offset = (n*channels + c) * depth * height * width;
const DTYPE_i0* x_slice = x + offset; const DTYPE_INPUT_0* x_slice = x + offset;
DTYPE_o0 maxval = x_slice[(dstart*height + hstart)*width + wstart]; DTYPE_OUTPUT_0 maxval = x_slice[(dstart*height + hstart)*width + wstart];
for (ga_size d=dstart; d < dend; ++d) { for (ga_size d=dstart; d < dend; ++d) {
for (ga_size h=hstart; h < hend; ++h) { for (ga_size h=hstart; h < hend; ++h) {
...@@ -96,10 +96,10 @@ KERNEL void max_pool3d_kernel(const ga_size nthreads, ...@@ -96,10 +96,10 @@ KERNEL void max_pool3d_kernel(const ga_size nthreads,
KERNEL void ave_pool2d_kernel(const ga_size nthreads, KERNEL void ave_pool2d_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size pooled_height, const ga_size num, const ga_size channels, const ga_size pooled_height,
const ga_size pooled_width, const ga_size height, const ga_size width, const ga_size pooled_width, const ga_size height, const ga_size width,
GLOBAL_MEM const DTYPE_i0 *x, const ga_size kernel_h, const ga_size kernel_w, GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size kernel_h, const ga_size kernel_w,
const ga_size stride_h, const ga_size stride_w, const ga_size pad_h, const ga_size pad_w, const ga_size stride_h, const ga_size stride_w, const ga_size pad_h, const ga_size pad_w,
const ga_bool inc_pad, const ga_bool sum_mode, const ga_bool inc_pad, const ga_bool sum_mode,
GLOBAL_MEM DTYPE_o0 *z) GLOBAL_MEM DTYPE_OUTPUT_0 *z)
{ {
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
...@@ -126,8 +126,8 @@ KERNEL void ave_pool2d_kernel(const ga_size nthreads, ...@@ -126,8 +126,8 @@ KERNEL void ave_pool2d_kernel(const ga_size nthreads,
} }
const ga_size offset = (n*channels + c) * height * width; const ga_size offset = (n*channels + c) * height * width;
const DTYPE_i0* x_slice = x + offset; const DTYPE_INPUT_0* x_slice = x + offset;
DTYPE_o0 collector = 0; DTYPE_OUTPUT_0 collector = 0;
for (ga_size h=hstart; h < hend; ++h) { for (ga_size h=hstart; h < hend; ++h) {
for (ga_size w=wstart; w < wend; ++w) { for (ga_size w=wstart; w < wend; ++w) {
...@@ -150,11 +150,11 @@ KERNEL void ave_pool3d_kernel(const ga_size nthreads, ...@@ -150,11 +150,11 @@ KERNEL void ave_pool3d_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size pooled_depth, const ga_size num, const ga_size channels, const ga_size pooled_depth,
const ga_size pooled_height, const ga_size pooled_width, const ga_size pooled_height, const ga_size pooled_width,
const ga_size depth, const ga_size height, const ga_size width, const ga_size depth, const ga_size height, const ga_size width,
GLOBAL_MEM const DTYPE_i0 *x, const ga_size kernel_d, const ga_size kernel_h, GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size kernel_d, const ga_size kernel_h,
const ga_size kernel_w, const ga_size stride_d, const ga_size stride_h, const ga_size kernel_w, const ga_size stride_d, const ga_size stride_h,
const ga_size stride_w, const ga_size pad_d, const ga_size pad_h, const ga_size pad_w, const ga_size stride_w, const ga_size pad_d, const ga_size pad_h, const ga_size pad_w,
const ga_bool inc_pad, const ga_bool sum_mode, const ga_bool inc_pad, const ga_bool sum_mode,
GLOBAL_MEM DTYPE_o0 *z) GLOBAL_MEM DTYPE_OUTPUT_0 *z)
{ {
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
...@@ -186,8 +186,8 @@ KERNEL void ave_pool3d_kernel(const ga_size nthreads, ...@@ -186,8 +186,8 @@ KERNEL void ave_pool3d_kernel(const ga_size nthreads,
} }
const ga_size offset = (n*channels + c) * depth * height * width; const ga_size offset = (n*channels + c) * depth * height * width;
const DTYPE_i0* x_slice = x + offset; const DTYPE_INPUT_0* x_slice = x + offset;
DTYPE_o0 collector = 0; DTYPE_OUTPUT_0 collector = 0;
for (ga_size d=dstart; d < dend; ++d) { for (ga_size d=dstart; d < dend; ++d) {
for (ga_size h=hstart; h < hend; ++h) { for (ga_size h=hstart; h < hend; ++h) {
......
...@@ -6,10 +6,10 @@ ...@@ -6,10 +6,10 @@
KERNEL void ave_pool2d_grad_kernel(const ga_size nthreads, KERNEL void ave_pool2d_grad_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size height, const ga_size num, const ga_size channels, const ga_size height,
const ga_size width, const ga_size pooled_height, const ga_size pooled_width, const ga_size width, const ga_size pooled_height, const ga_size pooled_width,
GLOBAL_MEM const DTYPE_i0 *x, GLOBAL_MEM const DTYPE_i1 *gz, GLOBAL_MEM const DTYPE_INPUT_0 *x, GLOBAL_MEM const DTYPE_INPUT_1 *gz,
const ga_size kernel_h, const ga_size kernel_w, const ga_size stride_h, const ga_size stride_w, const ga_size kernel_h, const ga_size kernel_w, const ga_size stride_h, const ga_size stride_w,
const ga_size pad_h, const ga_size pad_w, const ga_bool inc_pad, const ga_bool sum_mode, const ga_size pad_h, const ga_size pad_w, const ga_bool inc_pad, const ga_bool sum_mode,
GLOBAL_MEM DTYPE_o0 *gx) GLOBAL_MEM DTYPE_OUTPUT_0 *gx)
{ {
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
...@@ -24,8 +24,8 @@ KERNEL void ave_pool2d_grad_kernel(const ga_size nthreads, ...@@ -24,8 +24,8 @@ KERNEL void ave_pool2d_grad_kernel(const ga_size nthreads,
const ga_size pwend = min((w + pad_w) / stride_w + 1, pooled_width); const ga_size pwend = min((w + pad_w) / stride_w + 1, pooled_width);
const ga_size offset = (n*channels + c) * pooled_height * pooled_width; const ga_size offset = (n*channels + c) * pooled_height * pooled_width;
const DTYPE_i1* gz_slice = gz + offset; const DTYPE_INPUT_1* gz_slice = gz + offset;
DTYPE_o0 collector = 0; DTYPE_OUTPUT_0 collector = 0;
for (ga_size ph=phstart; ph < phend; ++ph) { for (ga_size ph=phstart; ph < phend; ++ph) {
for (ga_size pw=pwstart; pw < pwend; ++pw) { for (ga_size pw=pwstart; pw < pwend; ++pw) {
...@@ -53,11 +53,11 @@ KERNEL void ave_pool3d_grad_kernel(const ga_size nthreads, ...@@ -53,11 +53,11 @@ KERNEL void ave_pool3d_grad_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size depth, const ga_size num, const ga_size channels, const ga_size depth,
const ga_size height, const ga_size width, const ga_size pooled_depth, const ga_size height, const ga_size width, const ga_size pooled_depth,
const ga_size pooled_height, const ga_size pooled_width, const ga_size pooled_height, const ga_size pooled_width,
GLOBAL_MEM const DTYPE_i0 *x, GLOBAL_MEM const DTYPE_i1 *gz, GLOBAL_MEM const DTYPE_INPUT_0 *x, GLOBAL_MEM const DTYPE_INPUT_1 *gz,
const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w,
const ga_size stride_d, const ga_size stride_h, const ga_size stride_w, const ga_size stride_d, const ga_size stride_h, const ga_size stride_w,
const ga_size pad_d, const ga_size pad_h, const ga_size pad_w, const ga_size pad_d, const ga_size pad_h, const ga_size pad_w,
const ga_bool inc_pad, const ga_bool sum_mode, GLOBAL_MEM DTYPE_o0 *gx) const ga_bool inc_pad, const ga_bool sum_mode, GLOBAL_MEM DTYPE_OUTPUT_0 *gx)
{ {
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
...@@ -75,8 +75,8 @@ KERNEL void ave_pool3d_grad_kernel(const ga_size nthreads, ...@@ -75,8 +75,8 @@ KERNEL void ave_pool3d_grad_kernel(const ga_size nthreads,
const ga_size pwend = min((w + pad_w) / stride_w + 1, pooled_width); const ga_size pwend = min((w + pad_w) / stride_w + 1, pooled_width);
const ga_size offset = (n*channels + c) * pooled_depth * pooled_height * pooled_width; const ga_size offset = (n*channels + c) * pooled_depth * pooled_height * pooled_width;
const DTYPE_i1* gz_slice = gz + offset; const DTYPE_INPUT_1* gz_slice = gz + offset;
DTYPE_o0 collector = 0; DTYPE_OUTPUT_0 collector = 0;
for (ga_size pd=pdstart; pd < pdend; ++pd) { for (ga_size pd=pdstart; pd < pdend; ++pd) {
for (ga_size ph=phstart; ph < phend; ++ph) { for (ga_size ph=phstart; ph < phend; ++ph) {
......
...@@ -5,10 +5,10 @@ ...@@ -5,10 +5,10 @@
KERNEL void max_pool2d_grad_grad_kernel(const ga_size nthreads, KERNEL void max_pool2d_grad_grad_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size pooled_height, const ga_size num, const ga_size channels, const ga_size pooled_height,
const ga_size pooled_width, const ga_size height, const ga_size width, const ga_size pooled_width, const ga_size height, const ga_size width,
GLOBAL_MEM const DTYPE_i0 *x, GLOBAL_MEM const DTYPE_i1 *z, GLOBAL_MEM const DTYPE_i2 *gx, GLOBAL_MEM const DTYPE_INPUT_0 *x, GLOBAL_MEM const DTYPE_INPUT_1 *z, GLOBAL_MEM const DTYPE_INPUT_2 *gx,
const ga_size kernel_h, const ga_size kernel_w, const ga_size stride_h, const ga_size stride_w, const ga_size kernel_h, const ga_size kernel_w, const ga_size stride_h, const ga_size stride_w,
const ga_size pad_h, const ga_size pad_w, const ga_size pad_h, const ga_size pad_w,
GLOBAL_MEM DTYPE_o0 *gz) GLOBAL_MEM DTYPE_OUTPUT_0 *gz)
{ {
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
...@@ -26,9 +26,9 @@ KERNEL void max_pool2d_grad_grad_kernel(const ga_size nthreads, ...@@ -26,9 +26,9 @@ KERNEL void max_pool2d_grad_grad_kernel(const ga_size nthreads,
const ga_size offset = (n*channels + c) * height * width; const ga_size offset = (n*channels + c) * height * width;
const DTYPE_i0* x_slice = x + offset; const DTYPE_INPUT_0* x_slice = x + offset;
const DTYPE_i2* gx_slice = gx + offset; const DTYPE_INPUT_2* gx_slice = gx + offset;
DTYPE_o0 gradient = 0; DTYPE_OUTPUT_0 gradient = 0;
for (ga_size h=hstart; h < hend; ++h) { for (ga_size h=hstart; h < hend; ++h) {
for (ga_size w=wstart; w < wend; ++w) { for (ga_size w=wstart; w < wend; ++w) {
...@@ -48,11 +48,11 @@ KERNEL void max_pool3d_grad_grad_kernel(const ga_size nthreads, ...@@ -48,11 +48,11 @@ KERNEL void max_pool3d_grad_grad_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size pooled_depth, const ga_size num, const ga_size channels, const ga_size pooled_depth,
const ga_size pooled_height, const ga_size pooled_width, const ga_size pooled_height, const ga_size pooled_width,
const ga_size depth, const ga_size height, const ga_size width, const ga_size depth, const ga_size height, const ga_size width,
GLOBAL_MEM const DTYPE_i0 *x, GLOBAL_MEM const DTYPE_i1 *z, GLOBAL_MEM const DTYPE_i2 *gx, GLOBAL_MEM const DTYPE_INPUT_0 *x, GLOBAL_MEM const DTYPE_INPUT_1 *z, GLOBAL_MEM const DTYPE_INPUT_2 *gx,
const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w,
const ga_size stride_d, const ga_size stride_h, const ga_size stride_w, const ga_size stride_d, const ga_size stride_h, const ga_size stride_w,
const ga_size pad_d, const ga_size pad_h, const ga_size pad_w, const ga_size pad_d, const ga_size pad_h, const ga_size pad_w,
GLOBAL_MEM DTYPE_o0 *gz) GLOBAL_MEM DTYPE_OUTPUT_0 *gz)
{ {
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
...@@ -74,9 +74,9 @@ KERNEL void max_pool3d_grad_grad_kernel(const ga_size nthreads, ...@@ -74,9 +74,9 @@ KERNEL void max_pool3d_grad_grad_kernel(const ga_size nthreads,
const ga_size offset = (n*channels + c) * depth * height * width; const ga_size offset = (n*channels + c) * depth * height * width;
const DTYPE_i0* x_slice = x + offset; const DTYPE_INPUT_0* x_slice = x + offset;
const DTYPE_i2* gx_slice = gx + offset; const DTYPE_INPUT_2* gx_slice = gx + offset;
DTYPE_o0 gradient = 0; DTYPE_OUTPUT_0 gradient = 0;
for (ga_size d=dstart; d < dend; ++d) { for (ga_size d=dstart; d < dend; ++d) {
for (ga_size h=hstart; h < hend; ++h) { for (ga_size h=hstart; h < hend; ++h) {
......
...@@ -6,9 +6,9 @@ ...@@ -6,9 +6,9 @@
KERNEL void max_pool2d_grad_kernel(const ga_size nthreads, KERNEL void max_pool2d_grad_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size height, const ga_size num, const ga_size channels, const ga_size height,
const ga_size width, const ga_size pooled_height, const ga_size pooled_width, const ga_size width, const ga_size pooled_height, const ga_size pooled_width,
GLOBAL_MEM const DTYPE_i0 *x, GLOBAL_MEM const DTYPE_i1 *z, GLOBAL_MEM const DTYPE_i2 *gz, GLOBAL_MEM const DTYPE_INPUT_0 *x, GLOBAL_MEM const DTYPE_INPUT_1 *z, GLOBAL_MEM const DTYPE_INPUT_2 *gz,
const ga_size kernel_h, const ga_size kernel_w, const ga_size stride_h, const ga_size stride_w, const ga_size kernel_h, const ga_size kernel_w, const ga_size stride_h, const ga_size stride_w,
const ga_size pad_h, const ga_size pad_w, GLOBAL_MEM DTYPE_o0 *gx) const ga_size pad_h, const ga_size pad_w, GLOBAL_MEM DTYPE_OUTPUT_0 *gx)
{ {
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
...@@ -23,9 +23,9 @@ KERNEL void max_pool2d_grad_kernel(const ga_size nthreads, ...@@ -23,9 +23,9 @@ KERNEL void max_pool2d_grad_kernel(const ga_size nthreads,
const ga_size pwend = min((w + pad_w) / stride_w + 1, pooled_width); const ga_size pwend = min((w + pad_w) / stride_w + 1, pooled_width);
const ga_size offset = (n*channels + c) * pooled_height * pooled_width; const ga_size offset = (n*channels + c) * pooled_height * pooled_width;
const DTYPE_i1* z_slice = z + offset; const DTYPE_INPUT_1* z_slice = z + offset;
const DTYPE_i2* gz_slice = gz + offset; const DTYPE_INPUT_2* gz_slice = gz + offset;
DTYPE_o0 gradient = 0; DTYPE_OUTPUT_0 gradient = 0;
for (ga_size ph=phstart; ph < phend; ++ph) { for (ga_size ph=phstart; ph < phend; ++ph) {
for (ga_size pw=pwstart; pw < pwend; ++pw) { for (ga_size pw=pwstart; pw < pwend; ++pw) {
...@@ -45,11 +45,11 @@ KERNEL void max_pool3d_grad_kernel(const ga_size nthreads, ...@@ -45,11 +45,11 @@ KERNEL void max_pool3d_grad_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size depth, const ga_size num, const ga_size channels, const ga_size depth,
const ga_size height, const ga_size width, const ga_size pooled_depth, const ga_size height, const ga_size width, const ga_size pooled_depth,
const ga_size pooled_height, const ga_size pooled_width, const ga_size pooled_height, const ga_size pooled_width,
GLOBAL_MEM const DTYPE_i0 *x, GLOBAL_MEM const DTYPE_i1 *z, GLOBAL_MEM const DTYPE_i2 *gz, GLOBAL_MEM const DTYPE_INPUT_0 *x, GLOBAL_MEM const DTYPE_INPUT_1 *z, GLOBAL_MEM const DTYPE_INPUT_2 *gz,
const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w,
const ga_size stride_d, const ga_size stride_h, const ga_size stride_w, const ga_size stride_d, const ga_size stride_h, const ga_size stride_w,
const ga_size pad_d, const ga_size pad_h, const ga_size pad_w, const ga_size pad_d, const ga_size pad_h, const ga_size pad_w,
GLOBAL_MEM DTYPE_o0 *gx) GLOBAL_MEM DTYPE_OUTPUT_0 *gx)
{ {
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
...@@ -67,9 +67,9 @@ KERNEL void max_pool3d_grad_kernel(const ga_size nthreads, ...@@ -67,9 +67,9 @@ KERNEL void max_pool3d_grad_kernel(const ga_size nthreads,
const ga_size pwend = min((w + pad_w) / stride_w + 1, pooled_width); const ga_size pwend = min((w + pad_w) / stride_w + 1, pooled_width);
const ga_size offset = (n*channels + c) * pooled_depth * pooled_height * pooled_width; const ga_size offset = (n*channels + c) * pooled_depth * pooled_height * pooled_width;
const DTYPE_i1* z_slice = z + offset; const DTYPE_INPUT_1* z_slice = z + offset;
const DTYPE_i2* gz_slice = gz + offset; const DTYPE_INPUT_2* gz_slice = gz + offset;
DTYPE_o0 gradient = 0; DTYPE_OUTPUT_0 gradient = 0;
for (ga_size pd=pdstart; pd < pdend; ++pd) { for (ga_size pd=pdstart; pd < pdend; ++pd) {
for (ga_size ph=phstart; ph < phend; ++ph) { for (ga_size ph=phstart; ph < phend; ++ph) {
......
...@@ -6,11 +6,11 @@ ...@@ -6,11 +6,11 @@
KERNEL void max_pool2d_rop_kernel(const ga_size nthreads, KERNEL void max_pool2d_rop_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size pooled_height, const ga_size num, const ga_size channels, const ga_size pooled_height,
const ga_size pooled_width, const ga_size height, const ga_size width, const ga_size pooled_width, const ga_size height, const ga_size width,
GLOBAL_MEM const DTYPE_i0 *x, GLOBAL_MEM const DTYPE_i1 *ex, GLOBAL_MEM const DTYPE_INPUT_0 *x, GLOBAL_MEM const DTYPE_INPUT_1 *ex,
const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_h, const ga_size kernel_w,
const ga_size stride_h, const ga_size stride_w, const ga_size stride_h, const ga_size stride_w,
const ga_size pad_h, const ga_size pad_w, const ga_size pad_h, const ga_size pad_w,
GLOBAL_MEM DTYPE_o0 *z) GLOBAL_MEM DTYPE_OUTPUT_0 *z)
{ {
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
...@@ -28,10 +28,10 @@ KERNEL void max_pool2d_rop_kernel(const ga_size nthreads, ...@@ -28,10 +28,10 @@ KERNEL void max_pool2d_rop_kernel(const ga_size nthreads,
wstart = max(wstart, 0); wstart = max(wstart, 0);
const ga_size offset = (n*channels + c) * height * width; const ga_size offset = (n*channels + c) * height * width;
const DTYPE_i0* x_slice = x + offset; const DTYPE_INPUT_0* x_slice = x + offset;
const DTYPE_i1* ex_slice = ex + offset; const DTYPE_INPUT_1* ex_slice = ex + offset;
DTYPE_o0 maxval = x_slice[hstart*width + wstart]; DTYPE_OUTPUT_0 maxval = x_slice[hstart*width + wstart];
DTYPE_o0 collector = ex_slice[hstart*width + wstart]; DTYPE_OUTPUT_0 collector = ex_slice[hstart*width + wstart];
for (ga_size h=hstart; h < hend; ++h) { for (ga_size h=hstart; h < hend; ++h) {
for (ga_size w=wstart; w < wend; ++w) { for (ga_size w=wstart; w < wend; ++w) {
...@@ -53,11 +53,11 @@ KERNEL void max_pool3d_rop_kernel(const ga_size nthreads, ...@@ -53,11 +53,11 @@ KERNEL void max_pool3d_rop_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size pooled_depth, const ga_size num, const ga_size channels, const ga_size pooled_depth,
const ga_size pooled_height, const ga_size pooled_width, const ga_size pooled_height, const ga_size pooled_width,
const ga_size depth, const ga_size height, const ga_size width, const ga_size depth, const ga_size height, const ga_size width,
GLOBAL_MEM const DTYPE_i0 *x, GLOBAL_MEM const DTYPE_i1 *ex, GLOBAL_MEM const DTYPE_INPUT_0 *x, GLOBAL_MEM const DTYPE_INPUT_1 *ex,
const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w,
const ga_size stride_d, const ga_size stride_h, const ga_size stride_w, const ga_size stride_d, const ga_size stride_h, const ga_size stride_w,
const ga_size pad_d, const ga_size pad_h, const ga_size pad_w, const ga_size pad_d, const ga_size pad_h, const ga_size pad_w,
GLOBAL_MEM DTYPE_o0 *z) GLOBAL_MEM DTYPE_OUTPUT_0 *z)
{ {
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
...@@ -79,10 +79,10 @@ KERNEL void max_pool3d_rop_kernel(const ga_size nthreads, ...@@ -79,10 +79,10 @@ KERNEL void max_pool3d_rop_kernel(const ga_size nthreads,
wstart = max(wstart, 0); wstart = max(wstart, 0);
const ga_size offset = (n*channels + c) * depth * height * width; const ga_size offset = (n*channels + c) * depth * height * width;
const DTYPE_i0* x_slice = x + offset; const DTYPE_INPUT_0* x_slice = x + offset;
const DTYPE_i1* ex_slice = ex + offset; const DTYPE_INPUT_1* ex_slice = ex + offset;
DTYPE_o0 maxval = x_slice[(dstart*height + hstart)*width + wstart]; DTYPE_OUTPUT_0 maxval = x_slice[(dstart*height + hstart)*width + wstart];
DTYPE_o0 collector = ex_slice[(dstart*height + hstart)*width + wstart]; DTYPE_OUTPUT_0 collector = ex_slice[(dstart*height + hstart)*width + wstart];
for (ga_size d=dstart; d < dend; ++d) { for (ga_size d=dstart; d < dend; ++d) {
for (ga_size h=hstart; h < hend; ++h) { for (ga_size h=hstart; h < hend; ++h) {
......
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
has to match the kernel function name below. has to match the kernel function name below.
*/ */
KERNEL void eye(GLOBAL_MEM DTYPE_o0 *a, ga_size n, ga_size m) { KERNEL void eye(GLOBAL_MEM DTYPE_OUTPUT_0 *a, ga_size n, ga_size m) {
ga_size nb = n < m ? n : m; ga_size nb = n < m ? n : m;
for (ga_size i = LID_0; i < nb; i += LDIM_0) { for (ga_size i = LID_0; i < nb; i += LDIM_0) {
a[i*m + i] = 1; a[i*m + i] = 1;
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论