提交 da147c3c 作者: fo40225

Add the GLOBAL_MEM qualifier to local pointer declarations in the kernels

Fix: static_cast<> cannot be used in OpenCL kernel code; replace it with C-style casts
上级 8e292493
......@@ -56,10 +56,10 @@ KERNEL void dilated_im3d2col_kernel(const ga_size n,
const ga_size h_offset = h_col * stride_h - pad_h;
const ga_size w_offset = w_col * stride_w - pad_w;
const ga_size d_offset = d_col * stride_d - pad_d;
DTYPE_INPUT_0 * data_col_ptr = data_col;
GLOBAL_MEM DTYPE_INPUT_0 * data_col_ptr = data_col;
data_col_ptr += c_col * (height_col * width_col * depth_col) +
h_col * (width_col * depth_col) + w_col * depth_col + d_col;
const DTYPE_INPUT_0 * data_im_ptr = data_im + data_im_offset;
GLOBAL_MEM const DTYPE_INPUT_0 * data_im_ptr = data_im + data_im_offset;
data_im_ptr += c_im * (height * width * depth) +
h_offset * (width * depth) + w_offset * depth + d_offset;
for (ga_size i = 0; i < kernel_h; ++i) {
......
......@@ -53,9 +53,9 @@ KERNEL void dilated_im2col_kernel(const ga_size n,
const ga_size c_col = c_im * kernel_h * kernel_w;
const ga_size h_offset = h_col * stride_h - pad_h;
const ga_size w_offset = w_col * stride_w - pad_w;
DTYPE_INPUT_0 * data_col_ptr = data_col;
GLOBAL_MEM DTYPE_INPUT_0 * data_col_ptr = data_col;
data_col_ptr += (c_col * height_col + h_col) * width_col + w_col;
const DTYPE_INPUT_0 * data_im_ptr = data_im + data_im_offset;
GLOBAL_MEM const DTYPE_INPUT_0 * data_im_ptr = data_im + data_im_offset;
data_im_ptr += (c_im * height + h_offset) * width + w_offset;
for (ga_size i = 0; i < kernel_h; ++i) {
for (ga_size j = 0; j < kernel_w; ++j) {
......@@ -90,9 +90,9 @@ KERNEL void im2col_kernel(const ga_size n,
const ga_size c_col = c_im * kernel_h * kernel_w;
const ga_size h_offset = h_col * stride_h - pad_h;
const ga_size w_offset = w_col * stride_w - pad_w;
DTYPE_INPUT_0 * data_col_ptr = data_col;
GLOBAL_MEM DTYPE_INPUT_0 * data_col_ptr = data_col;
data_col_ptr += (c_col * height_col + h_col) * width_col + w_col;
const DTYPE_INPUT_0 * data_im_ptr = data_im + data_im_offset;
GLOBAL_MEM const DTYPE_INPUT_0 * data_im_ptr = data_im + data_im_offset;
data_im_ptr += (c_im * height + h_offset) * width + w_offset;
for (ga_size i = 0; i < kernel_h; ++i) {
for (ga_size j = 0; j < kernel_w; ++j) {
......
......@@ -18,15 +18,15 @@ KERNEL void max_pool2d_kernel(const ga_size nthreads,
const ga_size ph = (index / pooled_width) % pooled_height;
const ga_size c = (index / pooled_width / pooled_height) % channels;
const ga_size n = (index / pooled_width / pooled_height / channels);
ga_int hstart = static_cast<ga_int>(ph*stride_h) - static_cast<ga_int>(pad_h);
ga_int hstart = (ga_int)(ph*stride_h) - (ga_int)(pad_h);
const ga_size hend = min(hstart + kernel_h, height);
ga_int wstart = static_cast<ga_int>(pw*stride_w) - static_cast<ga_int>(pad_w);
ga_int wstart = (ga_int)(pw*stride_w) - (ga_int)(pad_w);
const ga_size wend = min(wstart + kernel_w, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
const ga_size offset = (n*channels + c) * height * width;
const DTYPE_INPUT_0* x_slice = x + offset;
GLOBAL_MEM const DTYPE_INPUT_0* x_slice = x + offset;
DTYPE_OUTPUT_0 maxval = x_slice[hstart*width + wstart];
for (ga_size h=hstart; h < hend; ++h) {
......@@ -62,18 +62,18 @@ KERNEL void max_pool3d_kernel(const ga_size nthreads,
const ga_size pd = (index / pooled_width / pooled_height) % pooled_depth;
const ga_size c = (index / pooled_width / pooled_height / pooled_depth) % channels;
const ga_size n = (index / pooled_width / pooled_height / pooled_depth / channels);
ga_int dstart = static_cast<ga_int>(pd*stride_d) - static_cast<ga_int>(pad_d);
ga_int dstart = (ga_int)(pd*stride_d) - (ga_int)(pad_d);
const ga_size dend = min(dstart + kernel_d, depth);
ga_int hstart = static_cast<ga_int>(ph*stride_h) - static_cast<ga_int>(pad_h);
ga_int hstart = (ga_int)(ph*stride_h) - (ga_int)(pad_h);
const ga_size hend = min(hstart + kernel_h, height);
ga_int wstart = static_cast<ga_int>(pw*stride_w) - static_cast<ga_int>(pad_w);
ga_int wstart = (ga_int)(pw*stride_w) - (ga_int)(pad_w);
const ga_size wend = min(wstart + kernel_w, width);
dstart = max(dstart, 0);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
const ga_size offset = (n*channels + c) * depth * height * width;
const DTYPE_INPUT_0* x_slice = x + offset;
GLOBAL_MEM const DTYPE_INPUT_0* x_slice = x + offset;
DTYPE_OUTPUT_0 maxval = x_slice[(dstart*height + hstart)*width + wstart];
for (ga_size d=dstart; d < dend; ++d) {
......@@ -109,9 +109,9 @@ KERNEL void ave_pool2d_kernel(const ga_size nthreads,
const ga_size ph = (index / pooled_width) % pooled_height;
const ga_size c = (index / pooled_width / pooled_height) % channels;
const ga_size n = (index / pooled_width / pooled_height / channels);
ga_int hstart = static_cast<ga_int>(ph*stride_h) - static_cast<ga_int>(pad_h);
ga_int hstart = (ga_int)(ph*stride_h) - (ga_int)(pad_h);
ga_size hend = min(hstart + kernel_h, height + pad_h);
ga_int wstart = static_cast<ga_int>(pw*stride_w) - static_cast<ga_int>(pad_w);
ga_int wstart = (ga_int)(pw*stride_w) - (ga_int)(pad_w);
ga_size wend = min(wstart + kernel_w, width + pad_w);
ga_size pool_size;
if (inc_pad) {
......@@ -126,7 +126,7 @@ KERNEL void ave_pool2d_kernel(const ga_size nthreads,
}
const ga_size offset = (n*channels + c) * height * width;
const DTYPE_INPUT_0* x_slice = x + offset;
GLOBAL_MEM const DTYPE_INPUT_0* x_slice = x + offset;
DTYPE_OUTPUT_0 collector = 0;
for (ga_size h=hstart; h < hend; ++h) {
......@@ -165,11 +165,11 @@ KERNEL void ave_pool3d_kernel(const ga_size nthreads,
const ga_size pd = (index / pooled_width / pooled_height) % pooled_depth;
const ga_size c = (index / pooled_width / pooled_height / pooled_depth) % channels;
const ga_size n = (index / pooled_width / pooled_height / pooled_depth / channels);
ga_int dstart = static_cast<ga_int>(pd*stride_d) - static_cast<ga_int>(pad_d);
ga_int dstart = (ga_int)(pd*stride_d) - (ga_int)(pad_d);
ga_size dend = min(dstart + kernel_d, depth + pad_d);
ga_int hstart = static_cast<ga_int>(ph*stride_h) - static_cast<ga_int>(pad_h);
ga_int hstart = (ga_int)(ph*stride_h) - (ga_int)(pad_h);
ga_size hend = min(hstart + kernel_h, height + pad_h);
ga_int wstart = static_cast<ga_int>(pw*stride_w) - static_cast<ga_int>(pad_w);
ga_int wstart = (ga_int)(pw*stride_w) - (ga_int)(pad_w);
ga_size wend = min(wstart + kernel_w, width + pad_w);
ga_size pool_size;
if (inc_pad) {
......@@ -186,7 +186,7 @@ KERNEL void ave_pool3d_kernel(const ga_size nthreads,
}
const ga_size offset = (n*channels + c) * depth * height * width;
const DTYPE_INPUT_0* x_slice = x + offset;
GLOBAL_MEM const DTYPE_INPUT_0* x_slice = x + offset;
DTYPE_OUTPUT_0 collector = 0;
for (ga_size d=dstart; d < dend; ++d) {
......
......@@ -24,7 +24,7 @@ KERNEL void ave_pool2d_grad_kernel(const ga_size nthreads,
const ga_size pwend = min((w + pad_w) / stride_w + 1, pooled_width);
const ga_size offset = (n*channels + c) * pooled_height * pooled_width;
const DTYPE_INPUT_1* gz_slice = gz + offset;
GLOBAL_MEM const DTYPE_INPUT_1* gz_slice = gz + offset;
DTYPE_OUTPUT_0 collector = 0;
for (ga_size ph=phstart; ph < phend; ++ph) {
......@@ -75,7 +75,7 @@ KERNEL void ave_pool3d_grad_kernel(const ga_size nthreads,
const ga_size pwend = min((w + pad_w) / stride_w + 1, pooled_width);
const ga_size offset = (n*channels + c) * pooled_depth * pooled_height * pooled_width;
const DTYPE_INPUT_1* gz_slice = gz + offset;
GLOBAL_MEM const DTYPE_INPUT_1* gz_slice = gz + offset;
DTYPE_OUTPUT_0 collector = 0;
for (ga_size pd=pdstart; pd < pdend; ++pd) {
......
......@@ -26,8 +26,8 @@ KERNEL void max_pool2d_grad_grad_kernel(const ga_size nthreads,
const ga_size offset = (n*channels + c) * height * width;
const DTYPE_INPUT_0* x_slice = x + offset;
const DTYPE_INPUT_2* gx_slice = gx + offset;
GLOBAL_MEM const DTYPE_INPUT_0* x_slice = x + offset;
GLOBAL_MEM const DTYPE_INPUT_2* gx_slice = gx + offset;
DTYPE_OUTPUT_0 gradient = 0;
for (ga_size h=hstart; h < hend; ++h) {
......@@ -74,8 +74,8 @@ KERNEL void max_pool3d_grad_grad_kernel(const ga_size nthreads,
const ga_size offset = (n*channels + c) * depth * height * width;
const DTYPE_INPUT_0* x_slice = x + offset;
const DTYPE_INPUT_2* gx_slice = gx + offset;
GLOBAL_MEM const DTYPE_INPUT_0* x_slice = x + offset;
GLOBAL_MEM const DTYPE_INPUT_2* gx_slice = gx + offset;
DTYPE_OUTPUT_0 gradient = 0;
for (ga_size d=dstart; d < dend; ++d) {
......
......@@ -23,8 +23,8 @@ KERNEL void max_pool2d_grad_kernel(const ga_size nthreads,
const ga_size pwend = min((w + pad_w) / stride_w + 1, pooled_width);
const ga_size offset = (n*channels + c) * pooled_height * pooled_width;
const DTYPE_INPUT_1* z_slice = z + offset;
const DTYPE_INPUT_2* gz_slice = gz + offset;
GLOBAL_MEM const DTYPE_INPUT_1* z_slice = z + offset;
GLOBAL_MEM const DTYPE_INPUT_2* gz_slice = gz + offset;
DTYPE_OUTPUT_0 gradient = 0;
for (ga_size ph=phstart; ph < phend; ++ph) {
......@@ -67,8 +67,8 @@ KERNEL void max_pool3d_grad_kernel(const ga_size nthreads,
const ga_size pwend = min((w + pad_w) / stride_w + 1, pooled_width);
const ga_size offset = (n*channels + c) * pooled_depth * pooled_height * pooled_width;
const DTYPE_INPUT_1* z_slice = z + offset;
const DTYPE_INPUT_2* gz_slice = gz + offset;
GLOBAL_MEM const DTYPE_INPUT_1* z_slice = z + offset;
GLOBAL_MEM const DTYPE_INPUT_2* gz_slice = gz + offset;
DTYPE_OUTPUT_0 gradient = 0;
for (ga_size pd=pdstart; pd < pdend; ++pd) {
......
......@@ -28,8 +28,8 @@ KERNEL void max_pool2d_rop_kernel(const ga_size nthreads,
wstart = max(wstart, 0);
const ga_size offset = (n*channels + c) * height * width;
const DTYPE_INPUT_0* x_slice = x + offset;
const DTYPE_INPUT_1* ex_slice = ex + offset;
GLOBAL_MEM const DTYPE_INPUT_0* x_slice = x + offset;
GLOBAL_MEM const DTYPE_INPUT_1* ex_slice = ex + offset;
DTYPE_OUTPUT_0 maxval = x_slice[hstart*width + wstart];
DTYPE_OUTPUT_0 collector = ex_slice[hstart*width + wstart];
......@@ -79,8 +79,8 @@ KERNEL void max_pool3d_rop_kernel(const ga_size nthreads,
wstart = max(wstart, 0);
const ga_size offset = (n*channels + c) * depth * height * width;
const DTYPE_INPUT_0* x_slice = x + offset;
const DTYPE_INPUT_1* ex_slice = ex + offset;
GLOBAL_MEM const DTYPE_INPUT_0* x_slice = x + offset;
GLOBAL_MEM const DTYPE_INPUT_1* ex_slice = ex + offset;
DTYPE_OUTPUT_0 maxval = x_slice[(dstart*height + hstart)*width + wstart];
DTYPE_OUTPUT_0 collector = ex_slice[(dstart*height + hstart)*width + wstart];
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论