Add GLOBAL_MEM qualifier to pointer declarations in GPU kernels

Fix: static_cast<> cannot be used in OpenCL kernel code; use C-style casts instead

Add GLOBAL_MEM qualifier to pointer declarations in GPU kernels
da147c3c · fo40225 · 8e292493 · da147c3c · da147c3c · da147c3c
--- a/theano/gpuarray/corr3d_gemm.c
+++ b/theano/gpuarray/corr3d_gemm.c
@@ -56,10 +56,10 @@ KERNEL void dilated_im3d2col_kernel(const ga_size n,
    const ga_size h_offset = h_col * stride_h - pad_h;
    const ga_size w_offset = w_col * stride_w - pad_w;
    const ga_size d_offset = d_col * stride_d - pad_d;
-    DTYPE_INPUT_0 * data_col_ptr = data_col;
+    GLOBAL_MEM DTYPE_INPUT_0 * data_col_ptr = data_col;
    data_col_ptr += c_col * (height_col * width_col * depth_col) +
      h_col * (width_col * depth_col) + w_col * depth_col + d_col;
-    const DTYPE_INPUT_0 * data_im_ptr = data_im + data_im_offset;
+    GLOBAL_MEM const DTYPE_INPUT_0 * data_im_ptr = data_im + data_im_offset;
    data_im_ptr += c_im * (height * width * depth) +
      h_offset * (width * depth) + w_offset * depth + d_offset;
    for (ga_size i = 0; i < kernel_h; ++i) {

--- a/theano/gpuarray/corr_gemm.c
+++ b/theano/gpuarray/corr_gemm.c
@@ -53,9 +53,9 @@ KERNEL void dilated_im2col_kernel(const ga_size n,
    const ga_size c_col = c_im * kernel_h * kernel_w;
    const ga_size h_offset = h_col * stride_h - pad_h;
    const ga_size w_offset = w_col * stride_w - pad_w;
-    DTYPE_INPUT_0 * data_col_ptr = data_col;
+    GLOBAL_MEM DTYPE_INPUT_0 * data_col_ptr = data_col;
    data_col_ptr += (c_col * height_col + h_col) * width_col + w_col;
-    const DTYPE_INPUT_0 * data_im_ptr = data_im + data_im_offset;
+    GLOBAL_MEM const DTYPE_INPUT_0 * data_im_ptr = data_im + data_im_offset;
    data_im_ptr += (c_im * height + h_offset) * width + w_offset;
    for (ga_size i = 0; i < kernel_h; ++i) {
      for (ga_size j = 0; j < kernel_w; ++j) {
@@ -90,9 +90,9 @@ KERNEL void im2col_kernel(const ga_size n,
    const ga_size c_col = c_im * kernel_h * kernel_w;
    const ga_size h_offset = h_col * stride_h - pad_h;
    const ga_size w_offset = w_col * stride_w - pad_w;
-    DTYPE_INPUT_0 * data_col_ptr = data_col;
+    GLOBAL_MEM DTYPE_INPUT_0 * data_col_ptr = data_col;
    data_col_ptr += (c_col * height_col + h_col) * width_col + w_col;
-    const DTYPE_INPUT_0 * data_im_ptr = data_im + data_im_offset;
+    GLOBAL_MEM const DTYPE_INPUT_0 * data_im_ptr = data_im + data_im_offset;
    data_im_ptr += (c_im * height + h_offset) * width + w_offset;
    for (ga_size i = 0; i < kernel_h; ++i) {
      for (ga_size j = 0; j < kernel_w; ++j) {

--- a/theano/gpuarray/pool.c
+++ b/theano/gpuarray/pool.c
@@ -18,15 +18,15 @@ KERNEL void max_pool2d_kernel(const ga_size nthreads,
    const ga_size ph = (index / pooled_width) % pooled_height;
    const ga_size c = (index / pooled_width / pooled_height) % channels;
    const ga_size n = (index / pooled_width / pooled_height / channels);
-    ga_int hstart = static_cast<ga_int>(ph*stride_h) - static_cast<ga_int>(pad_h);
+    ga_int hstart = (ga_int)(ph*stride_h) - (ga_int)(pad_h);
    const ga_size hend = min(hstart + kernel_h, height);
-    ga_int wstart = static_cast<ga_int>(pw*stride_w) - static_cast<ga_int>(pad_w);
+    ga_int wstart = (ga_int)(pw*stride_w) - (ga_int)(pad_w);
    const ga_size wend = min(wstart + kernel_w, width);
    hstart = max(hstart, 0);
    wstart = max(wstart, 0);

    const ga_size offset = (n*channels + c) * height * width;
-    const DTYPE_INPUT_0* x_slice = x + offset;
+    GLOBAL_MEM const DTYPE_INPUT_0* x_slice = x + offset;
    DTYPE_OUTPUT_0 maxval = x_slice[hstart*width + wstart];

    for (ga_size h=hstart; h < hend; ++h) {
@@ -62,18 +62,18 @@ KERNEL void max_pool3d_kernel(const ga_size nthreads,
    const ga_size pd = (index / pooled_width / pooled_height) % pooled_depth;
    const ga_size c = (index / pooled_width / pooled_height / pooled_depth) % channels;
    const ga_size n = (index / pooled_width / pooled_height / pooled_depth / channels);
-    ga_int dstart = static_cast<ga_int>(pd*stride_d) - static_cast<ga_int>(pad_d);
+    ga_int dstart = (ga_int)(pd*stride_d) - (ga_int)(pad_d);
    const ga_size dend = min(dstart + kernel_d, depth);
-    ga_int hstart = static_cast<ga_int>(ph*stride_h) - static_cast<ga_int>(pad_h);
+    ga_int hstart = (ga_int)(ph*stride_h) - (ga_int)(pad_h);
    const ga_size hend = min(hstart + kernel_h, height);
-    ga_int wstart = static_cast<ga_int>(pw*stride_w) - static_cast<ga_int>(pad_w);
+    ga_int wstart = (ga_int)(pw*stride_w) - (ga_int)(pad_w);
    const ga_size wend = min(wstart + kernel_w, width);
    dstart = max(dstart, 0);
    hstart = max(hstart, 0);
    wstart = max(wstart, 0);

    const ga_size offset = (n*channels + c) * depth * height * width;
-    const DTYPE_INPUT_0* x_slice = x + offset;
+    GLOBAL_MEM const DTYPE_INPUT_0* x_slice = x + offset;
    DTYPE_OUTPUT_0 maxval = x_slice[(dstart*height + hstart)*width + wstart];

    for (ga_size d=dstart; d < dend; ++d) {
@@ -109,9 +109,9 @@ KERNEL void ave_pool2d_kernel(const ga_size nthreads,
    const ga_size ph = (index / pooled_width) % pooled_height;
    const ga_size c = (index / pooled_width / pooled_height) % channels;
    const ga_size n = (index / pooled_width / pooled_height / channels);
-    ga_int hstart = static_cast<ga_int>(ph*stride_h) - static_cast<ga_int>(pad_h);
+    ga_int hstart = (ga_int)(ph*stride_h) - (ga_int)(pad_h);
    ga_size hend = min(hstart + kernel_h, height + pad_h);
-    ga_int wstart = static_cast<ga_int>(pw*stride_w) - static_cast<ga_int>(pad_w);
+    ga_int wstart = (ga_int)(pw*stride_w) - (ga_int)(pad_w);
    ga_size wend = min(wstart + kernel_w, width + pad_w);
    ga_size pool_size;
    if (inc_pad) {
@@ -126,7 +126,7 @@ KERNEL void ave_pool2d_kernel(const ga_size nthreads,
    }

    const ga_size offset = (n*channels + c) * height * width;
-    const DTYPE_INPUT_0* x_slice = x + offset;
+    GLOBAL_MEM const DTYPE_INPUT_0* x_slice = x + offset;
    DTYPE_OUTPUT_0 collector = 0;

    for (ga_size h=hstart; h < hend; ++h) {
@@ -165,11 +165,11 @@ KERNEL void ave_pool3d_kernel(const ga_size nthreads,
    const ga_size pd = (index / pooled_width / pooled_height) % pooled_depth;
    const ga_size c = (index / pooled_width / pooled_height / pooled_depth) % channels;
    const ga_size n = (index / pooled_width / pooled_height / pooled_depth / channels);
-    ga_int dstart = static_cast<ga_int>(pd*stride_d) - static_cast<ga_int>(pad_d);
+    ga_int dstart = (ga_int)(pd*stride_d) - (ga_int)(pad_d);
    ga_size dend = min(dstart + kernel_d, depth + pad_d);
-    ga_int hstart = static_cast<ga_int>(ph*stride_h) - static_cast<ga_int>(pad_h);
+    ga_int hstart = (ga_int)(ph*stride_h) - (ga_int)(pad_h);
    ga_size hend = min(hstart + kernel_h, height + pad_h);
-    ga_int wstart = static_cast<ga_int>(pw*stride_w) - static_cast<ga_int>(pad_w);
+    ga_int wstart = (ga_int)(pw*stride_w) - (ga_int)(pad_w);
    ga_size wend = min(wstart + kernel_w, width + pad_w);
    ga_size pool_size;
    if (inc_pad) {
@@ -186,7 +186,7 @@ KERNEL void ave_pool3d_kernel(const ga_size nthreads,
    }

    const ga_size offset = (n*channels + c) * depth * height * width;
-    const DTYPE_INPUT_0* x_slice = x + offset;
+    GLOBAL_MEM const DTYPE_INPUT_0* x_slice = x + offset;
    DTYPE_OUTPUT_0 collector = 0;

    for (ga_size d=dstart; d < dend; ++d) {

--- a/theano/gpuarray/pool_ave_grad.c
+++ b/theano/gpuarray/pool_ave_grad.c
@@ -24,7 +24,7 @@ KERNEL void ave_pool2d_grad_kernel(const ga_size nthreads,
    const ga_size pwend = min((w + pad_w) / stride_w + 1, pooled_width);

    const ga_size offset = (n*channels + c) * pooled_height * pooled_width;
-    const DTYPE_INPUT_1* gz_slice = gz + offset;
+    GLOBAL_MEM const DTYPE_INPUT_1* gz_slice = gz + offset;
    DTYPE_OUTPUT_0 collector = 0;

    for (ga_size ph=phstart; ph < phend; ++ph) {
@@ -75,7 +75,7 @@ KERNEL void ave_pool3d_grad_kernel(const ga_size nthreads,
    const ga_size pwend = min((w + pad_w) / stride_w + 1, pooled_width);

    const ga_size offset = (n*channels + c) * pooled_depth * pooled_height * pooled_width;
-    const DTYPE_INPUT_1* gz_slice = gz + offset;
+    GLOBAL_MEM const DTYPE_INPUT_1* gz_slice = gz + offset;
    DTYPE_OUTPUT_0 collector = 0;

    for (ga_size pd=pdstart; pd < pdend; ++pd) {

--- a/theano/gpuarray/pool_grad_grad.c
+++ b/theano/gpuarray/pool_grad_grad.c
@@ -26,8 +26,8 @@ KERNEL void max_pool2d_grad_grad_kernel(const ga_size nthreads,

    const ga_size offset = (n*channels + c) * height * width;

-    const DTYPE_INPUT_0* x_slice = x + offset;
-    const DTYPE_INPUT_2* gx_slice = gx + offset;
+    GLOBAL_MEM const DTYPE_INPUT_0* x_slice = x + offset;
+    GLOBAL_MEM const DTYPE_INPUT_2* gx_slice = gx + offset;
    DTYPE_OUTPUT_0 gradient = 0;

    for (ga_size h=hstart; h < hend; ++h) {
@@ -74,8 +74,8 @@ KERNEL void max_pool3d_grad_grad_kernel(const ga_size nthreads,

    const ga_size offset = (n*channels + c) * depth * height * width;

-    const DTYPE_INPUT_0* x_slice = x + offset;
-    const DTYPE_INPUT_2* gx_slice = gx + offset;
+    GLOBAL_MEM const DTYPE_INPUT_0* x_slice = x + offset;
+    GLOBAL_MEM const DTYPE_INPUT_2* gx_slice = gx + offset;
    DTYPE_OUTPUT_0 gradient = 0;

    for (ga_size d=dstart; d < dend; ++d) {

--- a/theano/gpuarray/pool_max_grad.c
+++ b/theano/gpuarray/pool_max_grad.c
@@ -23,8 +23,8 @@ KERNEL void max_pool2d_grad_kernel(const ga_size nthreads,
    const ga_size pwend = min((w + pad_w) / stride_w + 1, pooled_width);

    const ga_size offset = (n*channels + c) * pooled_height * pooled_width;
-    const DTYPE_INPUT_1* z_slice = z + offset;
-    const DTYPE_INPUT_2* gz_slice = gz + offset;
+    GLOBAL_MEM const DTYPE_INPUT_1* z_slice = z + offset;
+    GLOBAL_MEM const DTYPE_INPUT_2* gz_slice = gz + offset;
    DTYPE_OUTPUT_0 gradient = 0;

    for (ga_size ph=phstart; ph < phend; ++ph) {
@@ -67,8 +67,8 @@ KERNEL void max_pool3d_grad_kernel(const ga_size nthreads,
    const ga_size pwend = min((w + pad_w) / stride_w + 1, pooled_width);

    const ga_size offset = (n*channels + c) * pooled_depth * pooled_height * pooled_width;
-    const DTYPE_INPUT_1* z_slice = z + offset;
-    const DTYPE_INPUT_2* gz_slice = gz + offset;
+    GLOBAL_MEM const DTYPE_INPUT_1* z_slice = z + offset;
+    GLOBAL_MEM const DTYPE_INPUT_2* gz_slice = gz + offset;
    DTYPE_OUTPUT_0 gradient = 0;

    for (ga_size pd=pdstart; pd < pdend; ++pd) {

--- a/theano/gpuarray/pool_max_rop.c
+++ b/theano/gpuarray/pool_max_rop.c
@@ -28,8 +28,8 @@ KERNEL void max_pool2d_rop_kernel(const ga_size nthreads,
    wstart = max(wstart, 0);

    const ga_size offset = (n*channels + c) * height * width;
-    const DTYPE_INPUT_0* x_slice = x + offset;
-    const DTYPE_INPUT_1* ex_slice = ex + offset;
+    GLOBAL_MEM const DTYPE_INPUT_0* x_slice = x + offset;
+    GLOBAL_MEM const DTYPE_INPUT_1* ex_slice = ex + offset;
    DTYPE_OUTPUT_0 maxval = x_slice[hstart*width + wstart];
    DTYPE_OUTPUT_0 collector = ex_slice[hstart*width + wstart];

@@ -79,8 +79,8 @@ KERNEL void max_pool3d_rop_kernel(const ga_size nthreads,
    wstart = max(wstart, 0);

    const ga_size offset = (n*channels + c) * depth * height * width;
-    const DTYPE_INPUT_0* x_slice = x + offset;
-    const DTYPE_INPUT_1* ex_slice = ex + offset;
+    GLOBAL_MEM const DTYPE_INPUT_0* x_slice = x + offset;
+    GLOBAL_MEM const DTYPE_INPUT_1* ex_slice = ex + offset;
    DTYPE_OUTPUT_0 maxval = x_slice[(dstart*height + hstart)*width + wstart];
    DTYPE_OUTPUT_0 collector = ex_slice[(dstart*height + hstart)*width + wstart];