Commit dc49660f authored by Frédéric Bastien, committed by GitHub

Merge pull request #4737 from huanzhang12/add-opencl-Images2Neibs

OpenCL support for Images2Neibs kernels
@@ -68,48 +68,48 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
 code = """
 // a version that uses less registers but doesn't work in all cases.
 KERNEL void %(kname)s(
-const int nb_batch,
-const int nb_stack,
-const int height,
-const int width,
-const int c,
-const int d,
-const int step_x,
-const int step_y,
-const int grid_c,
-const int grid_d,
-const size_t stride0, const size_t stride1,
-const size_t stride2, const size_t stride3,
-const %(type_ten4)s * global_ten4, const size_t offset_ten4,
-const size_t out_s0, const size_t out_s1,
-%(type_z)s * global_out, const size_t offset_out
+const ga_int nb_batch,
+const ga_int nb_stack,
+const ga_int height,
+const ga_int width,
+const ga_int c,
+const ga_int d,
+const ga_int step_x,
+const ga_int step_y,
+const ga_int grid_c,
+const ga_int grid_d,
+const ga_size stride0, const ga_size stride1,
+const ga_size stride2, const ga_size stride3,
+GLOBAL_MEM const %(type_ten4)s * global_ten4, const ga_size offset_ten4,
+const ga_size out_s0, const ga_size out_s1,
+GLOBAL_MEM %(type_z)s * global_out, const ga_size offset_out
 )
 {
-const int wrap_centered_idx_shift_x = c/2;
-const int wrap_centered_idx_shift_y = d/2;
-global_ten4 = (const %(type_ten4)s *)(((char *)global_ten4)+offset_ten4);
-global_out = (%(type_z)s *)(((char *)global_out)+offset_out);
+const ga_int wrap_centered_idx_shift_x = c/2;
+const ga_int wrap_centered_idx_shift_y = d/2;
+global_ten4 = (GLOBAL_MEM const %(type_ten4)s *)(((GLOBAL_MEM char *)global_ten4)+offset_ten4);
+global_out = (GLOBAL_MEM %(type_z)s *)(((GLOBAL_MEM char *)global_out)+offset_out);
-for(int tblock = blockIdx.x*blockDim.z+threadIdx.z;
+for(ga_int tblock = GID_0*LDIM_2+LID_2;
 tblock<nb_batch*nb_stack*grid_c*grid_d;
-tblock+=gridDim.x*blockDim.z){
-const int b = tblock%%grid_d;
-int left = tblock/grid_d;
-const int a = left%%grid_c;
+tblock+=GDIM_0*LDIM_2){
+const ga_int b = tblock%%grid_d;
+ga_int left = tblock/grid_d;
+const ga_int a = left%%grid_c;
 left = left/grid_c;
-const int s = left%%nb_stack;
+const ga_int s = left%%nb_stack;
 left = left/nb_stack;
-const int n = left;
+const ga_int n = left;
 if(n>nb_batch)continue;
 if(s>nb_stack)continue;
 if(a>grid_c)continue;
 if(b>grid_d)continue;
-int z_row = b + grid_d*(a + grid_c*
+ga_int z_row = b + grid_d*(a + grid_c*
 (s + nb_stack*n));
-int i = threadIdx.y; // loop over c
+ga_int i = LID_1; // loop over c
 {
-int ten4_2 = i + a * step_x;
+ga_int ten4_2 = i + a * step_x;
 if("%(mode)s"=="wrap_centered"){
 ten4_2 -= wrap_centered_idx_shift_x;
 if ( ten4_2 < 0 )
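The whole port is visible in this first hunk: CUDA builtins (`threadIdx`, `blockIdx`, `blockDim`, `gridDim`) and host types (`int`, `size_t`) are swapped for libgpuarray's backend-neutral names, and buffer pointers gain the `GLOBAL_MEM` address-space qualifier that OpenCL requires. Roughly, the mapping looks like this (a sketch of the idea only; the real definitions live in libgpuarray's cluda headers):

```c
/* Sketch only; the kernel above compiles unchanged on both backends
 * because each portable name resolves to the native construct. */
#ifdef __CUDACC__                 /* CUDA */
  #define GLOBAL_MEM              /* global address space is the default */
  #define GID_0   blockIdx.x      /* group (block) index, dim 0 */
  #define LID_0   threadIdx.x     /* local (thread) index, dims 0..2 */
  #define LID_1   threadIdx.y
  #define LID_2   threadIdx.z
  #define LDIM_0  blockDim.x      /* local (block) size, dims 0..2 */
  #define LDIM_1  blockDim.y
  #define LDIM_2  blockDim.z
  #define GDIM_0  gridDim.x       /* number of groups, dim 0 */
  typedef int    ga_int;          /* portable scalar aliases */
  typedef size_t ga_size;
#else                             /* OpenCL C */
  #define GLOBAL_MEM __global     /* global pointers must be qualified */
  #define GID_0   get_group_id(0)
  #define LID_0   get_local_id(0)
  #define LID_1   get_local_id(1)
  #define LID_2   get_local_id(2)
  #define LDIM_0  get_local_size(0)
  #define LDIM_1  get_local_size(1)
  #define LDIM_2  get_local_size(2)
  #define GDIM_0  get_num_groups(0)
  typedef int   ga_int;
  typedef ulong ga_size;
#endif
```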
@@ -117,9 +117,9 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
 else if (ten4_2 >= height)
 ten4_2 -= height;
 }
-int j = threadIdx.x; // loop over d
+ga_int j = LID_0; // loop over d
 {
-int ten4_3 = j + b * step_y;
+ga_int ten4_3 = j + b * step_y;
 if("%(mode)s"=="wrap_centered"){
 ten4_3 -= wrap_centered_idx_shift_y;
 if ( ten4_3 < 0 )
@@ -128,12 +128,12 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
 ten4_3 -= width;
 }
-int ten4_idx = stride3*ten4_3 +
+ga_int ten4_idx = stride3*ten4_3 +
 stride2*ten4_2 +
 stride1*s + stride0*n;
-int z_col = j + d * i;
-int z_idx = z_col * out_s1 +
+ga_int z_col = j + d * i;
+ga_int z_idx = z_col * out_s1 +
 z_row * out_s0;
 global_out[z_idx] = global_ten4[ten4_idx];
 }
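Both kernels hand one slot of the launch to each output patch and recover its coordinates by mixed-radix decoding of the flat counter `tblock`; `z_row` then re-encodes the same digits, so every patch writes a distinct output row. A small host-side check of that round-trip (illustrative, not part of the patch):

```c
/* Decompose a flat block counter into (n, s, a, b) by repeated div/mod,
 * exactly as the kernel does, and verify z_row reproduces it. */
#include <assert.h>

int main(void) {
    const int nb_batch = 2, nb_stack = 3, grid_c = 4, grid_d = 5;
    for (int tblock = 0; tblock < nb_batch*nb_stack*grid_c*grid_d; ++tblock) {
        const int b = tblock % grid_d;   /* patch column in the grid */
        int left = tblock / grid_d;
        const int a = left % grid_c;     /* patch row in the grid */
        left = left / grid_c;
        const int s = left % nb_stack;   /* stack (channel) index */
        const int n = left / nb_stack;   /* batch index */
        const int z_row = b + grid_d*(a + grid_c*(s + nb_stack*n));
        assert(z_row == tblock);         /* round-trip must be exact */
    }
    return 0;
}
```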
@@ -155,49 +155,49 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
 k_var = "k_multi_warp_" + nodename
 code = """
 KERNEL void %(kname)s(
-const int nb_batch,
-const int nb_stack,
-const int height,
-const int width,
-const int c,
-const int d,
-const int step_x,
-const int step_y,
-const int grid_c,
-const int grid_d,
-const size_t stride0, const size_t stride1,
-const size_t stride2, const size_t stride3,
-const %(type_ten4)s * global_ten4, const size_t offset_ten4,
-const size_t out_s0, const size_t out_s1,
-%(type_z)s * global_out, const size_t offset_out
+const ga_int nb_batch,
+const ga_int nb_stack,
+const ga_int height,
+const ga_int width,
+const ga_int c,
+const ga_int d,
+const ga_int step_x,
+const ga_int step_y,
+const ga_int grid_c,
+const ga_int grid_d,
+const ga_size stride0, const ga_size stride1,
+const ga_size stride2, const ga_size stride3,
+GLOBAL_MEM const %(type_ten4)s * global_ten4, const ga_size offset_ten4,
+const ga_size out_s0, const ga_size out_s1,
+GLOBAL_MEM %(type_z)s * global_out, const ga_size offset_out
 )
 {
-const int wrap_centered_idx_shift_x = c/2;
-const int wrap_centered_idx_shift_y = d/2;
-global_ten4 = (const %(type_ten4)s *)(((char *)global_ten4)+offset_ten4);
-global_out = (%(type_z)s *)(((char *)global_out)+offset_out);
+const ga_int wrap_centered_idx_shift_x = c/2;
+const ga_int wrap_centered_idx_shift_y = d/2;
+global_ten4 = (GLOBAL_MEM const %(type_ten4)s *)(((GLOBAL_MEM char *)global_ten4)+offset_ten4);
+global_out = (GLOBAL_MEM %(type_z)s *)(((GLOBAL_MEM char *)global_out)+offset_out);
-for(int tblock = blockIdx.x*blockDim.z+threadIdx.z;
+for(ga_int tblock = GID_0*LDIM_2+LID_2;
 tblock<nb_batch*nb_stack*grid_c*grid_d;
-tblock+=gridDim.x*blockDim.z){
-const int b = tblock%%grid_d;
-int left = tblock/grid_d;
-const int a = left%%grid_c;
+tblock+=GDIM_0*LDIM_2){
+const ga_int b = tblock%%grid_d;
+ga_int left = tblock/grid_d;
+const ga_int a = left%%grid_c;
 left = left/grid_c;
-const int s = left%%nb_stack;
+const ga_int s = left%%nb_stack;
 left = left/nb_stack;
-const int n = left;
+const ga_int n = left;
 if(n>nb_batch)continue;
 if(s>nb_stack)continue;
 if(a>grid_c)continue;
 if(b>grid_d)continue;
-int z_row = b + grid_d*(a + grid_c*
+ga_int z_row = b + grid_d*(a + grid_c*
 (s + nb_stack*n));
 // loop over c
-for (int i = threadIdx.y; i < c; i+=blockDim.y)
+for (ga_int i = LID_1; i < c; i+=LDIM_1)
 {
-int ten4_2 = i + a * step_x;
+ga_int ten4_2 = i + a * step_x;
 if("%(mode)s"=="wrap_centered"){
 ten4_2 -= wrap_centered_idx_shift_x;
 if ( ten4_2 < 0 )
@@ -206,9 +206,9 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
 ten4_2 -= height;
 }
 // loop over d
-for (int j = threadIdx.x; j < d; j+=blockDim.x)
+for (ga_int j = LID_0; j < d; j+=LDIM_0)
 {
-int ten4_3 = j + b * step_y;
+ga_int ten4_3 = j + b * step_y;
 if("%(mode)s"=="wrap_centered"){
 ten4_3 -= wrap_centered_idx_shift_y;
 if ( ten4_3 < 0 )
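This second kernel differs from the first in one respect: instead of assuming the local size covers a whole c x d patch (one thread per element), `k_multi_warp` strides `i` and `j` by `LDIM_1`/`LDIM_0`, so it stays correct for any local size the host picks. Reduced to plain C, the coverage pattern is (our stand-ins for the `LID_*`/`LDIM_*` macros; illustrative only):

```c
/* A thread at local id (lid0, lid1) within local size (ldim0, ldim1)
 * visits every patch element congruent to its id, so the four "threads"
 * below jointly cover all 15 elements of a 3 x 5 patch exactly once. */
#include <stdio.h>

static void visit(int c, int d, int lid0, int lid1, int ldim0, int ldim1) {
    for (int i = lid1; i < c; i += ldim1)       /* loop over c (rows) */
        for (int j = lid0; j < d; j += ldim0)   /* loop over d (cols) */
            printf("thread(%d,%d) -> element(%d,%d)\n", lid0, lid1, i, j);
}

int main(void) {
    const int c = 3, d = 5, ldim0 = 2, ldim1 = 2;
    for (int lid1 = 0; lid1 < ldim1; ++lid1)
        for (int lid0 = 0; lid0 < ldim0; ++lid0)
            visit(c, d, lid0, lid1, ldim0, ldim1);
    return 0;
}
```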
@@ -217,12 +217,12 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
 ten4_3 -= width;
 }
-int ten4_idx = stride3*ten4_3 +
+ga_int ten4_idx = stride3*ten4_3 +
 stride2*ten4_2 +
 stride1*s + stride0*n;
-int z_col = j + d * i;
-int z_idx = z_col * out_s1 +
+ga_int z_col = j + d * i;
+ga_int z_idx = z_col * out_s1 +
 z_row * out_s0;
 global_out[z_idx] = global_ten4[ten4_idx];
 }
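A detail that is easy to miss when reading `ten4_idx` and `z_idx`: the stride arguments are in elements, not bytes, since the kernel indexes typed pointers; the host code further down divides `PyGpuArray_STRIDES` by the item size before the launch (the `stride_A0 = ... / %(itemsize_ten4)s` lines). In isolation (our own helper, for illustration only):

```c
#include <stddef.h>

/* Convert byte strides to element strides, then index exactly as the
 * kernel's ten4_idx expression does. */
size_t element_offset(const size_t byte_strides[4], size_t itemsize,
                      size_t n, size_t s, size_t row, size_t col) {
    const size_t st0 = byte_strides[0] / itemsize;
    const size_t st1 = byte_strides[1] / itemsize;
    const size_t st2 = byte_strides[2] / itemsize;
    const size_t st3 = byte_strides[3] / itemsize;
    return st3*col + st2*row + st1*s + st0*n;
}
```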
@@ -243,8 +243,6 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
 return kernels
 
 def c_code(self, node, name, inp, out, sub):
-if node.inputs[0].type.context.kind != b'cuda':
-raise NotImplementedError("cuda only")
 dtype_ten4 = node.inputs[0].dtype
 dtype_neib_shape = node.inputs[1].dtype
 dtype_neib_step = node.inputs[2].dtype
@@ -410,9 +408,15 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
 PyArray_GETPTR1(%(neib_step)s, 1);
 size_t threads_per_block[3] = {d, c, 1};
-//Their is a max of 512 threads per blocks
-while(threads_per_block[0]*threads_per_block[1]>512 && threads_per_block[1]>1)threads_per_block[1]--;
-while(threads_per_block[0]*threads_per_block[1]>512 && threads_per_block[0]>1)threads_per_block[0]--;
+//get the max threads per blocks
+size_t max_threads_dim;
+int err = gpucontext_property(%(ctx)s->ctx, GA_CTX_PROP_MAXLSIZE, &max_threads_dim);
+if (err != GA_NO_ERROR){
+PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims");
+%(fail)s;
+}
+while(threads_per_block[0]*threads_per_block[1]>max_threads_dim && threads_per_block[1]>1)threads_per_block[1]--;
+while(threads_per_block[0]*threads_per_block[1]>max_threads_dim && threads_per_block[0]>1)threads_per_block[0]--;
 //Make bigger block to have better memory access pattern and
 //a higher core utilisation. for smaller patch size
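This is the key host-side fix: the hard-coded 512 was a CUDA-era constant, whereas OpenCL devices each report their own work-group limit, so the patch now queries it through libgpuarray. The query-then-shrink pattern on its own (`gpucontext_property` and `GA_CTX_PROP_MAXLSIZE` are the real API used above; the helper around them is ours):

```c
#include <gpuarray/buffer.h>  /* gpucontext_property, GA_CTX_PROP_MAXLSIZE */

/* Ask the device for its maximum local (work-group) size, then shrink the
 * tentative 2-D block until it fits. Returns 0 on success. */
static int clamp_local_size(gpucontext *ctx, size_t ls[2]) {
    size_t max_ls;
    if (gpucontext_property(ctx, GA_CTX_PROP_MAXLSIZE, &max_ls) != GA_NO_ERROR)
        return -1;  /* the patch raises a Python RuntimeError instead */
    while (ls[0]*ls[1] > max_ls && ls[1] > 1) ls[1]--;  /* trim dim 1 first */
    while (ls[0]*ls[1] > max_ls && ls[0] > 1) ls[0]--;  /* then dim 0 */
    return 0;
}
```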
@@ -434,7 +438,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
 }else{
 fptr = &k_multi_warp_%(name)s;
 }
+// printf("%%zu %%zu %%zu %%zu %%zu %%zu %%zu\\n", max_threads_dim, threads_per_block[0], threads_per_block[1], threads_per_block[2], n_blocks[0], n_blocks[1], n_blocks[2]);
 size_t stride_A0 = PyGpuArray_STRIDES(%(ten4)s)[0] / %(itemsize_ten4)s;
 size_t stride_A1 = PyGpuArray_STRIDES(%(ten4)s)[1] / %(itemsize_ten4)s;
 size_t stride_A2 = PyGpuArray_STRIDES(%(ten4)s)[2] / %(itemsize_ten4)s;
@@ -457,7 +461,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
 (void *)&stride_Z1,
 (void *)%(z)s->ga.data,
 (void *)&%(z)s->ga.offset};
-int err = GpuKernel_call(fptr, 3, threads_per_block, n_blocks, 0, kernel_params);
+err = GpuKernel_call(fptr, 3, threads_per_block, n_blocks, 0, kernel_params);
 %(err_check)s
 %(sync)s
 } // END NESTED SCOPE
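The last hunk looks cosmetic but follows from the earlier one: `err` is already declared by the `max_threads_dim` query, so the launch site now assigns to the existing variable instead of declaring another one. Condensed (both calls copied from the diff; the ellipsis stands for the block-size and argument-marshalling code in between):

```c
/* err is declared once, at the property query, and only assigned at the
 * kernel launch; %(ctx)s etc. are Theano's C-code template placeholders. */
int err = gpucontext_property(%(ctx)s->ctx, GA_CTX_PROP_MAXLSIZE, &max_threads_dim);
/* ... choose threads_per_block / n_blocks, fill kernel_params ... */
err = GpuKernel_call(fptr, 3, threads_per_block, n_blocks, 0, kernel_params);
```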