提交 0641c6e3 作者: Arnaud Bergeron

Handle array offsets in GpuCorrMM.

上级 c35ef4d8
#section kernels
#kernel dilated_im2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, * :
#kernel dilated_im2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
// TODO check kernel flags
// This uses a lot of code from Caffe (http://caffe.berkeleyvision.org/);
// sources are clearly marked. Below we reproduce the original license of
......@@ -35,6 +35,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// GPU kernel for the case of dilation
KERNEL void dilated_im2col_kernel(const ga_size n,
GLOBAL_MEM const DTYPE_INPUT_0 * data_im,
const ga_size offset_im,
const ga_size data_im_offset,
const ga_size height, const ga_size width,
const ga_size kernel_h, const ga_size kernel_w,
......@@ -42,7 +43,10 @@ KERNEL void dilated_im2col_kernel(const ga_size n,
const ga_size pad_h, const ga_size pad_w,
const ga_size stride_h, const ga_size stride_w,
const ga_size height_col, const ga_size width_col,
GLOBAL_MEM DTYPE_INPUT_0 * data_col) {
GLOBAL_MEM DTYPE_INPUT_0 * data_col,
const ga_size offset_col) {
data_im = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)data_im) + offset_im);
data_col = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)data_col) + offset_col);
// grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < (n); index += LDIM_0 * GDIM_0) {
......@@ -70,16 +74,20 @@ KERNEL void dilated_im2col_kernel(const ga_size n,
}
}
#kernel im2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, * :
#kernel im2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
KERNEL void im2col_kernel(const ga_size n,
GLOBAL_MEM const DTYPE_INPUT_0 * data_im,
const ga_size offset_im,
const ga_size data_im_offset,
const ga_size height, const ga_size width,
const ga_size kernel_h, const ga_size kernel_w,
const ga_size pad_h, const ga_size pad_w,
const ga_size stride_h, const ga_size stride_w,
const ga_size height_col, const ga_size width_col,
GLOBAL_MEM DTYPE_INPUT_0 * data_col) {
GLOBAL_MEM DTYPE_INPUT_0 * data_col,
const ga_size offset_col) {
data_im = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)data_im) + offset_im);
data_col = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)data_col) + offset_col);
// grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < (n); index += LDIM_0 * GDIM_0) {
......@@ -108,9 +116,9 @@ KERNEL void im2col_kernel(const ga_size n,
}
// GPU kernel for the case of dilation
#kernel dilated_col2im_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
#kernel dilated_col2im_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size :
KERNEL void dilated_col2im_kernel(const ga_size n,
GLOBAL_MEM const DTYPE_INPUT_0 * data_col,
GLOBAL_MEM const DTYPE_INPUT_0 * data_col, const ga_size offset_col,
const ga_size height, const ga_size width, const ga_size channels,
const ga_size kernel_h, const ga_size kernel_w,
const ga_size dilation_h, const ga_size dilation_w,
......@@ -118,7 +126,10 @@ KERNEL void dilated_col2im_kernel(const ga_size n,
const ga_size stride_h, const ga_size stride_w,
const ga_size height_col, const ga_size width_col,
GLOBAL_MEM DTYPE_INPUT_0 * data_im,
const ga_size offset_im,
const ga_size data_im_offset) {
data_col = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)data_col) + offset_col);
data_im = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)data_im) + offset_im);
// grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < (n); index += LDIM_0 * GDIM_0) {
......@@ -153,16 +164,19 @@ KERNEL void dilated_col2im_kernel(const ga_size n,
}
}
#kernel col2im_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, *, size :
#kernel col2im_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size :
KERNEL void col2im_kernel(const ga_size n,
GLOBAL_MEM const DTYPE_INPUT_0 * data_col,
GLOBAL_MEM const DTYPE_INPUT_0 * data_col, const ga_size offset_col,
const ga_size height, const ga_size width, const ga_size channels,
const ga_size kernel_h, const ga_size kernel_w,
const ga_size pad_h, const ga_size pad_w,
const ga_size stride_h, const ga_size stride_w,
const ga_size height_col, const ga_size width_col,
GLOBAL_MEM DTYPE_INPUT_0 * data_im,
const ga_size offset_im,
const ga_size data_im_offset) {
data_col = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)data_col) + offset_col);
data_im = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)data_im) + offset_im);
// grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < (n); index += LDIM_0 * GDIM_0) {
......@@ -195,12 +209,12 @@ KERNEL void col2im_kernel(const ga_size n,
#section support_code_struct
int im2col(gpudata *data_im, const size_t data_im_offset, const size_t channels,
int im2col(GpuArray *data_im, const size_t data_im_offset, const size_t channels,
const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w,
const size_t dilation_h, const size_t dilation_w,
const size_t pad_h, const size_t pad_w,
const size_t stride_h, const size_t stride_w,
gpudata * data_col) {
GpuArray *data_col) {
// We are going to launch channels * height_col * width_col kernels, each
// kernel responsible for copying a single-channel grid.
size_t dil_kernel_h = (kernel_h - 1) * dilation_h + 1;
......@@ -212,9 +226,10 @@ int im2col(gpudata *data_im, const size_t data_im_offset, const size_t channels,
if (dilation_h != 1 || dilation_w != 1) {
err = dilated_im2col_kernel_scall(
1, &num_kernels, 0,
num_kernels, data_im, data_im_offset, height, width, kernel_h, kernel_w,
num_kernels, data_im->data, data_im->offset, data_im_offset,
height, width, kernel_h, kernel_w,
dilation_h, dilation_w, pad_h, pad_w, stride_h, stride_w, height_col,
width_col, data_col);
width_col, data_col->data, data_col->offset);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: dilated_im2col_kernel: %s.",
......@@ -223,9 +238,10 @@ int im2col(gpudata *data_im, const size_t data_im_offset, const size_t channels,
} else {
err = im2col_kernel_scall(
1, &num_kernels, 0,
num_kernels, data_im, data_im_offset, height, width, kernel_h, kernel_w,
num_kernels, data_im->data, data_im->offset, data_im_offset,
height, width, kernel_h, kernel_w,
pad_h, pad_w, stride_h, stride_w, height_col,
width_col, data_col);
width_col, data_col->data, data_col->offset);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: im2col_kernel: %s.",
......@@ -235,11 +251,11 @@ int im2col(gpudata *data_im, const size_t data_im_offset, const size_t channels,
return err;
}
int col2im(gpudata * data_col, const size_t channels,
int col2im(GpuArray *data_col, const size_t channels,
const size_t height, const size_t width, const size_t patch_h, const size_t patch_w,
const size_t dilation_h, const size_t dilation_w,
const size_t pad_h, const size_t pad_w, const size_t stride_h,
const size_t stride_w, gpudata * data_im, const size_t data_im_offset) {
const size_t stride_w, GpuArray *data_im, const size_t data_im_offset) {
size_t dil_patch_h = (patch_h - 1) * dilation_h + 1;
size_t dil_patch_w = (patch_w - 1) * dilation_w + 1;
size_t height_col = (height + 2 * pad_h - dil_patch_h) / stride_h + 1;
......@@ -251,9 +267,10 @@ int col2im(gpudata * data_col, const size_t channels,
if (dilation_h != 1 || dilation_w != 1) {
err = dilated_col2im_kernel_scall(
1, &num_kernels, 0,
num_kernels, data_col, height, width, channels, patch_h, patch_w,
num_kernels, data_col->data, data_col->offset,
height, width, channels, patch_h, patch_w,
dilation_h, dilation_w, pad_h, pad_w, stride_h, stride_w,
height_col, width_col, data_im, data_im_offset);
height_col, width_col, data_im->data, data_im->offset, data_im_offset);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: dilated_col2im_kernel: %s.",
......@@ -262,9 +279,10 @@ int col2im(gpudata * data_col, const size_t channels,
} else {
err = col2im_kernel_scall(
1, &num_kernels, 0,
num_kernels, data_col, height, width, channels, patch_h, patch_w,
num_kernels, data_col->data, data_col->offset,
height, width, channels, patch_h, patch_w,
pad_h, pad_w, stride_h, stride_w,
height_col, width_col, data_im, data_im_offset);
height_col, width_col, data_im->data, data_im->offset, data_im_offset);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: col2im_kernel: %s.",
......@@ -433,10 +451,10 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
// Iterate over batch
for (size_t n = 0; n < batchSize; n++) {
// First, im2col
err = im2col(bottom->ga.data, n * bottom_stride,
nChannels, bottomHeight,
bottomWidth, kH, kW, dilH, dilW,
padH, padW, dH, dW, col->ga.data);
err = im2col(&bottom->ga, n * bottom_stride,
nChannels, bottomHeight,
bottomWidth, kH, kW, dilH, dilW,
padH, padW, dH, dW, &col->ga);
if (err != GA_NO_ERROR) {
Py_DECREF(col);
return NULL;
......@@ -495,10 +513,10 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
// Iterate over batch
for (size_t n = 0; n < batchSize; n++) {
// First, im2col
err = im2col(bottom->ga.data, n * bottom_stride,
err = im2col(&bottom->ga, n * bottom_stride,
nChannels, bottomHeight,
bottomWidth, kH, kW, dilH, dilW,
padH, padW, dH, dW, col->ga.data);
padH, padW, dH, dW, &col->ga);
if (err != GA_NO_ERROR) {
Py_DECREF(col);
return NULL;
......@@ -595,9 +613,9 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
return NULL;
}
// col2im back to the data
err = col2im(col->ga.data, nChannels, bottomHeight, bottomWidth,
err = col2im(&col->ga, nChannels, bottomHeight, bottomWidth,
kH, kW, dilH, dilW, padH, padW,
dH, dW, bottom->ga.data, n * bottom_stride);
dH, dW, &bottom->ga, n * bottom_stride);
if (err != GA_NO_ERROR) {
Py_DECREF(col);
return NULL;
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论