提交 da147c3c，作者：fo40225

add GLOBAL_MEM

fix: static_cast<> cannot be used in OpenCL kernel code
上级 8e292493
...@@ -56,10 +56,10 @@ KERNEL void dilated_im3d2col_kernel(const ga_size n, ...@@ -56,10 +56,10 @@ KERNEL void dilated_im3d2col_kernel(const ga_size n,
const ga_size h_offset = h_col * stride_h - pad_h; const ga_size h_offset = h_col * stride_h - pad_h;
const ga_size w_offset = w_col * stride_w - pad_w; const ga_size w_offset = w_col * stride_w - pad_w;
const ga_size d_offset = d_col * stride_d - pad_d; const ga_size d_offset = d_col * stride_d - pad_d;
DTYPE_INPUT_0 * data_col_ptr = data_col; GLOBAL_MEM DTYPE_INPUT_0 * data_col_ptr = data_col;
data_col_ptr += c_col * (height_col * width_col * depth_col) + data_col_ptr += c_col * (height_col * width_col * depth_col) +
h_col * (width_col * depth_col) + w_col * depth_col + d_col; h_col * (width_col * depth_col) + w_col * depth_col + d_col;
const DTYPE_INPUT_0 * data_im_ptr = data_im + data_im_offset; GLOBAL_MEM const DTYPE_INPUT_0 * data_im_ptr = data_im + data_im_offset;
data_im_ptr += c_im * (height * width * depth) + data_im_ptr += c_im * (height * width * depth) +
h_offset * (width * depth) + w_offset * depth + d_offset; h_offset * (width * depth) + w_offset * depth + d_offset;
for (ga_size i = 0; i < kernel_h; ++i) { for (ga_size i = 0; i < kernel_h; ++i) {
......
...@@ -53,9 +53,9 @@ KERNEL void dilated_im2col_kernel(const ga_size n, ...@@ -53,9 +53,9 @@ KERNEL void dilated_im2col_kernel(const ga_size n,
const ga_size c_col = c_im * kernel_h * kernel_w; const ga_size c_col = c_im * kernel_h * kernel_w;
const ga_size h_offset = h_col * stride_h - pad_h; const ga_size h_offset = h_col * stride_h - pad_h;
const ga_size w_offset = w_col * stride_w - pad_w; const ga_size w_offset = w_col * stride_w - pad_w;
DTYPE_INPUT_0 * data_col_ptr = data_col; GLOBAL_MEM DTYPE_INPUT_0 * data_col_ptr = data_col;
data_col_ptr += (c_col * height_col + h_col) * width_col + w_col; data_col_ptr += (c_col * height_col + h_col) * width_col + w_col;
const DTYPE_INPUT_0 * data_im_ptr = data_im + data_im_offset; GLOBAL_MEM const DTYPE_INPUT_0 * data_im_ptr = data_im + data_im_offset;
data_im_ptr += (c_im * height + h_offset) * width + w_offset; data_im_ptr += (c_im * height + h_offset) * width + w_offset;
for (ga_size i = 0; i < kernel_h; ++i) { for (ga_size i = 0; i < kernel_h; ++i) {
for (ga_size j = 0; j < kernel_w; ++j) { for (ga_size j = 0; j < kernel_w; ++j) {
...@@ -90,9 +90,9 @@ KERNEL void im2col_kernel(const ga_size n, ...@@ -90,9 +90,9 @@ KERNEL void im2col_kernel(const ga_size n,
const ga_size c_col = c_im * kernel_h * kernel_w; const ga_size c_col = c_im * kernel_h * kernel_w;
const ga_size h_offset = h_col * stride_h - pad_h; const ga_size h_offset = h_col * stride_h - pad_h;
const ga_size w_offset = w_col * stride_w - pad_w; const ga_size w_offset = w_col * stride_w - pad_w;
DTYPE_INPUT_0 * data_col_ptr = data_col; GLOBAL_MEM DTYPE_INPUT_0 * data_col_ptr = data_col;
data_col_ptr += (c_col * height_col + h_col) * width_col + w_col; data_col_ptr += (c_col * height_col + h_col) * width_col + w_col;
const DTYPE_INPUT_0 * data_im_ptr = data_im + data_im_offset; GLOBAL_MEM const DTYPE_INPUT_0 * data_im_ptr = data_im + data_im_offset;
data_im_ptr += (c_im * height + h_offset) * width + w_offset; data_im_ptr += (c_im * height + h_offset) * width + w_offset;
for (ga_size i = 0; i < kernel_h; ++i) { for (ga_size i = 0; i < kernel_h; ++i) {
for (ga_size j = 0; j < kernel_w; ++j) { for (ga_size j = 0; j < kernel_w; ++j) {
......
...@@ -18,15 +18,15 @@ KERNEL void max_pool2d_kernel(const ga_size nthreads, ...@@ -18,15 +18,15 @@ KERNEL void max_pool2d_kernel(const ga_size nthreads,
const ga_size ph = (index / pooled_width) % pooled_height; const ga_size ph = (index / pooled_width) % pooled_height;
const ga_size c = (index / pooled_width / pooled_height) % channels; const ga_size c = (index / pooled_width / pooled_height) % channels;
const ga_size n = (index / pooled_width / pooled_height / channels); const ga_size n = (index / pooled_width / pooled_height / channels);
ga_int hstart = static_cast<ga_int>(ph*stride_h) - static_cast<ga_int>(pad_h); ga_int hstart = (ga_int)(ph*stride_h) - (ga_int)(pad_h);
const ga_size hend = min(hstart + kernel_h, height); const ga_size hend = min(hstart + kernel_h, height);
ga_int wstart = static_cast<ga_int>(pw*stride_w) - static_cast<ga_int>(pad_w); ga_int wstart = (ga_int)(pw*stride_w) - (ga_int)(pad_w);
const ga_size wend = min(wstart + kernel_w, width); const ga_size wend = min(wstart + kernel_w, width);
hstart = max(hstart, 0); hstart = max(hstart, 0);
wstart = max(wstart, 0); wstart = max(wstart, 0);
const ga_size offset = (n*channels + c) * height * width; const ga_size offset = (n*channels + c) * height * width;
const DTYPE_INPUT_0* x_slice = x + offset; GLOBAL_MEM const DTYPE_INPUT_0* x_slice = x + offset;
DTYPE_OUTPUT_0 maxval = x_slice[hstart*width + wstart]; DTYPE_OUTPUT_0 maxval = x_slice[hstart*width + wstart];
for (ga_size h=hstart; h < hend; ++h) { for (ga_size h=hstart; h < hend; ++h) {
...@@ -62,18 +62,18 @@ KERNEL void max_pool3d_kernel(const ga_size nthreads, ...@@ -62,18 +62,18 @@ KERNEL void max_pool3d_kernel(const ga_size nthreads,
const ga_size pd = (index / pooled_width / pooled_height) % pooled_depth; const ga_size pd = (index / pooled_width / pooled_height) % pooled_depth;
const ga_size c = (index / pooled_width / pooled_height / pooled_depth) % channels; const ga_size c = (index / pooled_width / pooled_height / pooled_depth) % channels;
const ga_size n = (index / pooled_width / pooled_height / pooled_depth / channels); const ga_size n = (index / pooled_width / pooled_height / pooled_depth / channels);
ga_int dstart = static_cast<ga_int>(pd*stride_d) - static_cast<ga_int>(pad_d); ga_int dstart = (ga_int)(pd*stride_d) - (ga_int)(pad_d);
const ga_size dend = min(dstart + kernel_d, depth); const ga_size dend = min(dstart + kernel_d, depth);
ga_int hstart = static_cast<ga_int>(ph*stride_h) - static_cast<ga_int>(pad_h); ga_int hstart = (ga_int)(ph*stride_h) - (ga_int)(pad_h);
const ga_size hend = min(hstart + kernel_h, height); const ga_size hend = min(hstart + kernel_h, height);
ga_int wstart = static_cast<ga_int>(pw*stride_w) - static_cast<ga_int>(pad_w); ga_int wstart = (ga_int)(pw*stride_w) - (ga_int)(pad_w);
const ga_size wend = min(wstart + kernel_w, width); const ga_size wend = min(wstart + kernel_w, width);
dstart = max(dstart, 0); dstart = max(dstart, 0);
hstart = max(hstart, 0); hstart = max(hstart, 0);
wstart = max(wstart, 0); wstart = max(wstart, 0);
const ga_size offset = (n*channels + c) * depth * height * width; const ga_size offset = (n*channels + c) * depth * height * width;
const DTYPE_INPUT_0* x_slice = x + offset; GLOBAL_MEM const DTYPE_INPUT_0* x_slice = x + offset;
DTYPE_OUTPUT_0 maxval = x_slice[(dstart*height + hstart)*width + wstart]; DTYPE_OUTPUT_0 maxval = x_slice[(dstart*height + hstart)*width + wstart];
for (ga_size d=dstart; d < dend; ++d) { for (ga_size d=dstart; d < dend; ++d) {
...@@ -109,9 +109,9 @@ KERNEL void ave_pool2d_kernel(const ga_size nthreads, ...@@ -109,9 +109,9 @@ KERNEL void ave_pool2d_kernel(const ga_size nthreads,
const ga_size ph = (index / pooled_width) % pooled_height; const ga_size ph = (index / pooled_width) % pooled_height;
const ga_size c = (index / pooled_width / pooled_height) % channels; const ga_size c = (index / pooled_width / pooled_height) % channels;
const ga_size n = (index / pooled_width / pooled_height / channels); const ga_size n = (index / pooled_width / pooled_height / channels);
ga_int hstart = static_cast<ga_int>(ph*stride_h) - static_cast<ga_int>(pad_h); ga_int hstart = (ga_int)(ph*stride_h) - (ga_int)(pad_h);
ga_size hend = min(hstart + kernel_h, height + pad_h); ga_size hend = min(hstart + kernel_h, height + pad_h);
ga_int wstart = static_cast<ga_int>(pw*stride_w) - static_cast<ga_int>(pad_w); ga_int wstart = (ga_int)(pw*stride_w) - (ga_int)(pad_w);
ga_size wend = min(wstart + kernel_w, width + pad_w); ga_size wend = min(wstart + kernel_w, width + pad_w);
ga_size pool_size; ga_size pool_size;
if (inc_pad) { if (inc_pad) {
...@@ -126,7 +126,7 @@ KERNEL void ave_pool2d_kernel(const ga_size nthreads, ...@@ -126,7 +126,7 @@ KERNEL void ave_pool2d_kernel(const ga_size nthreads,
} }
const ga_size offset = (n*channels + c) * height * width; const ga_size offset = (n*channels + c) * height * width;
const DTYPE_INPUT_0* x_slice = x + offset; GLOBAL_MEM const DTYPE_INPUT_0* x_slice = x + offset;
DTYPE_OUTPUT_0 collector = 0; DTYPE_OUTPUT_0 collector = 0;
for (ga_size h=hstart; h < hend; ++h) { for (ga_size h=hstart; h < hend; ++h) {
...@@ -165,11 +165,11 @@ KERNEL void ave_pool3d_kernel(const ga_size nthreads, ...@@ -165,11 +165,11 @@ KERNEL void ave_pool3d_kernel(const ga_size nthreads,
const ga_size pd = (index / pooled_width / pooled_height) % pooled_depth; const ga_size pd = (index / pooled_width / pooled_height) % pooled_depth;
const ga_size c = (index / pooled_width / pooled_height / pooled_depth) % channels; const ga_size c = (index / pooled_width / pooled_height / pooled_depth) % channels;
const ga_size n = (index / pooled_width / pooled_height / pooled_depth / channels); const ga_size n = (index / pooled_width / pooled_height / pooled_depth / channels);
ga_int dstart = static_cast<ga_int>(pd*stride_d) - static_cast<ga_int>(pad_d); ga_int dstart = (ga_int)(pd*stride_d) - (ga_int)(pad_d);
ga_size dend = min(dstart + kernel_d, depth + pad_d); ga_size dend = min(dstart + kernel_d, depth + pad_d);
ga_int hstart = static_cast<ga_int>(ph*stride_h) - static_cast<ga_int>(pad_h); ga_int hstart = (ga_int)(ph*stride_h) - (ga_int)(pad_h);
ga_size hend = min(hstart + kernel_h, height + pad_h); ga_size hend = min(hstart + kernel_h, height + pad_h);
ga_int wstart = static_cast<ga_int>(pw*stride_w) - static_cast<ga_int>(pad_w); ga_int wstart = (ga_int)(pw*stride_w) - (ga_int)(pad_w);
ga_size wend = min(wstart + kernel_w, width + pad_w); ga_size wend = min(wstart + kernel_w, width + pad_w);
ga_size pool_size; ga_size pool_size;
if (inc_pad) { if (inc_pad) {
...@@ -186,7 +186,7 @@ KERNEL void ave_pool3d_kernel(const ga_size nthreads, ...@@ -186,7 +186,7 @@ KERNEL void ave_pool3d_kernel(const ga_size nthreads,
} }
const ga_size offset = (n*channels + c) * depth * height * width; const ga_size offset = (n*channels + c) * depth * height * width;
const DTYPE_INPUT_0* x_slice = x + offset; GLOBAL_MEM const DTYPE_INPUT_0* x_slice = x + offset;
DTYPE_OUTPUT_0 collector = 0; DTYPE_OUTPUT_0 collector = 0;
for (ga_size d=dstart; d < dend; ++d) { for (ga_size d=dstart; d < dend; ++d) {
......
...@@ -24,7 +24,7 @@ KERNEL void ave_pool2d_grad_kernel(const ga_size nthreads, ...@@ -24,7 +24,7 @@ KERNEL void ave_pool2d_grad_kernel(const ga_size nthreads,
const ga_size pwend = min((w + pad_w) / stride_w + 1, pooled_width); const ga_size pwend = min((w + pad_w) / stride_w + 1, pooled_width);
const ga_size offset = (n*channels + c) * pooled_height * pooled_width; const ga_size offset = (n*channels + c) * pooled_height * pooled_width;
const DTYPE_INPUT_1* gz_slice = gz + offset; GLOBAL_MEM const DTYPE_INPUT_1* gz_slice = gz + offset;
DTYPE_OUTPUT_0 collector = 0; DTYPE_OUTPUT_0 collector = 0;
for (ga_size ph=phstart; ph < phend; ++ph) { for (ga_size ph=phstart; ph < phend; ++ph) {
...@@ -75,7 +75,7 @@ KERNEL void ave_pool3d_grad_kernel(const ga_size nthreads, ...@@ -75,7 +75,7 @@ KERNEL void ave_pool3d_grad_kernel(const ga_size nthreads,
const ga_size pwend = min((w + pad_w) / stride_w + 1, pooled_width); const ga_size pwend = min((w + pad_w) / stride_w + 1, pooled_width);
const ga_size offset = (n*channels + c) * pooled_depth * pooled_height * pooled_width; const ga_size offset = (n*channels + c) * pooled_depth * pooled_height * pooled_width;
const DTYPE_INPUT_1* gz_slice = gz + offset; GLOBAL_MEM const DTYPE_INPUT_1* gz_slice = gz + offset;
DTYPE_OUTPUT_0 collector = 0; DTYPE_OUTPUT_0 collector = 0;
for (ga_size pd=pdstart; pd < pdend; ++pd) { for (ga_size pd=pdstart; pd < pdend; ++pd) {
......
...@@ -26,8 +26,8 @@ KERNEL void max_pool2d_grad_grad_kernel(const ga_size nthreads, ...@@ -26,8 +26,8 @@ KERNEL void max_pool2d_grad_grad_kernel(const ga_size nthreads,
const ga_size offset = (n*channels + c) * height * width; const ga_size offset = (n*channels + c) * height * width;
const DTYPE_INPUT_0* x_slice = x + offset; GLOBAL_MEM const DTYPE_INPUT_0* x_slice = x + offset;
const DTYPE_INPUT_2* gx_slice = gx + offset; GLOBAL_MEM const DTYPE_INPUT_2* gx_slice = gx + offset;
DTYPE_OUTPUT_0 gradient = 0; DTYPE_OUTPUT_0 gradient = 0;
for (ga_size h=hstart; h < hend; ++h) { for (ga_size h=hstart; h < hend; ++h) {
...@@ -74,8 +74,8 @@ KERNEL void max_pool3d_grad_grad_kernel(const ga_size nthreads, ...@@ -74,8 +74,8 @@ KERNEL void max_pool3d_grad_grad_kernel(const ga_size nthreads,
const ga_size offset = (n*channels + c) * depth * height * width; const ga_size offset = (n*channels + c) * depth * height * width;
const DTYPE_INPUT_0* x_slice = x + offset; GLOBAL_MEM const DTYPE_INPUT_0* x_slice = x + offset;
const DTYPE_INPUT_2* gx_slice = gx + offset; GLOBAL_MEM const DTYPE_INPUT_2* gx_slice = gx + offset;
DTYPE_OUTPUT_0 gradient = 0; DTYPE_OUTPUT_0 gradient = 0;
for (ga_size d=dstart; d < dend; ++d) { for (ga_size d=dstart; d < dend; ++d) {
......
...@@ -23,8 +23,8 @@ KERNEL void max_pool2d_grad_kernel(const ga_size nthreads, ...@@ -23,8 +23,8 @@ KERNEL void max_pool2d_grad_kernel(const ga_size nthreads,
const ga_size pwend = min((w + pad_w) / stride_w + 1, pooled_width); const ga_size pwend = min((w + pad_w) / stride_w + 1, pooled_width);
const ga_size offset = (n*channels + c) * pooled_height * pooled_width; const ga_size offset = (n*channels + c) * pooled_height * pooled_width;
const DTYPE_INPUT_1* z_slice = z + offset; GLOBAL_MEM const DTYPE_INPUT_1* z_slice = z + offset;
const DTYPE_INPUT_2* gz_slice = gz + offset; GLOBAL_MEM const DTYPE_INPUT_2* gz_slice = gz + offset;
DTYPE_OUTPUT_0 gradient = 0; DTYPE_OUTPUT_0 gradient = 0;
for (ga_size ph=phstart; ph < phend; ++ph) { for (ga_size ph=phstart; ph < phend; ++ph) {
...@@ -67,8 +67,8 @@ KERNEL void max_pool3d_grad_kernel(const ga_size nthreads, ...@@ -67,8 +67,8 @@ KERNEL void max_pool3d_grad_kernel(const ga_size nthreads,
const ga_size pwend = min((w + pad_w) / stride_w + 1, pooled_width); const ga_size pwend = min((w + pad_w) / stride_w + 1, pooled_width);
const ga_size offset = (n*channels + c) * pooled_depth * pooled_height * pooled_width; const ga_size offset = (n*channels + c) * pooled_depth * pooled_height * pooled_width;
const DTYPE_INPUT_1* z_slice = z + offset; GLOBAL_MEM const DTYPE_INPUT_1* z_slice = z + offset;
const DTYPE_INPUT_2* gz_slice = gz + offset; GLOBAL_MEM const DTYPE_INPUT_2* gz_slice = gz + offset;
DTYPE_OUTPUT_0 gradient = 0; DTYPE_OUTPUT_0 gradient = 0;
for (ga_size pd=pdstart; pd < pdend; ++pd) { for (ga_size pd=pdstart; pd < pdend; ++pd) {
......
...@@ -28,8 +28,8 @@ KERNEL void max_pool2d_rop_kernel(const ga_size nthreads, ...@@ -28,8 +28,8 @@ KERNEL void max_pool2d_rop_kernel(const ga_size nthreads,
wstart = max(wstart, 0); wstart = max(wstart, 0);
const ga_size offset = (n*channels + c) * height * width; const ga_size offset = (n*channels + c) * height * width;
const DTYPE_INPUT_0* x_slice = x + offset; GLOBAL_MEM const DTYPE_INPUT_0* x_slice = x + offset;
const DTYPE_INPUT_1* ex_slice = ex + offset; GLOBAL_MEM const DTYPE_INPUT_1* ex_slice = ex + offset;
DTYPE_OUTPUT_0 maxval = x_slice[hstart*width + wstart]; DTYPE_OUTPUT_0 maxval = x_slice[hstart*width + wstart];
DTYPE_OUTPUT_0 collector = ex_slice[hstart*width + wstart]; DTYPE_OUTPUT_0 collector = ex_slice[hstart*width + wstart];
...@@ -79,8 +79,8 @@ KERNEL void max_pool3d_rop_kernel(const ga_size nthreads, ...@@ -79,8 +79,8 @@ KERNEL void max_pool3d_rop_kernel(const ga_size nthreads,
wstart = max(wstart, 0); wstart = max(wstart, 0);
const ga_size offset = (n*channels + c) * depth * height * width; const ga_size offset = (n*channels + c) * depth * height * width;
const DTYPE_INPUT_0* x_slice = x + offset; GLOBAL_MEM const DTYPE_INPUT_0* x_slice = x + offset;
const DTYPE_INPUT_1* ex_slice = ex + offset; GLOBAL_MEM const DTYPE_INPUT_1* ex_slice = ex + offset;
DTYPE_OUTPUT_0 maxval = x_slice[(dstart*height + hstart)*width + wstart]; DTYPE_OUTPUT_0 maxval = x_slice[(dstart*height + hstart)*width + wstart];
DTYPE_OUTPUT_0 collector = ex_slice[(dstart*height + hstart)*width + wstart]; DTYPE_OUTPUT_0 collector = ex_slice[(dstart*height + hstart)*width + wstart];
......
Markdown 格式
0%
您即将添加 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论