提交 e5ba1b08 authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #6012 from abergeron/fix_offset

Fix offset problems in the new backend.
...@@ -1630,7 +1630,9 @@ class GpuEye(GpuKernelBase, Op): ...@@ -1630,7 +1630,9 @@ class GpuEye(GpuKernelBase, Op):
def gpu_kernels(self, node, name): def gpu_kernels(self, node, name):
code = """ code = """
KERNEL void eye(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m, ga_ssize k) { KERNEL void eye(GLOBAL_MEM %(ctype)s *a, ga_size a_off,
ga_size n, ga_size m, ga_ssize k) {
a = (GLOBAL_MEM %(ctype)s *)(((char *)a) + a_off);
ga_ssize coff = max(k, (ga_ssize) 0); ga_ssize coff = max(k, (ga_ssize) 0);
ga_ssize roff = -min(k, (ga_ssize) 0); ga_ssize roff = -min(k, (ga_ssize) 0);
ga_size nb = (ga_size) min(n - roff, m - coff); ga_size nb = (ga_size) min(n - roff, m - coff);
...@@ -1641,7 +1643,8 @@ KERNEL void eye(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m, ga_ssize k) { ...@@ -1641,7 +1643,8 @@ KERNEL void eye(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m, ga_ssize k) {
name=name, write_a=write_w(self.dtype)) name=name, write_a=write_w(self.dtype))
return [Kernel( return [Kernel(
code=code, name="eye", code=code, name="eye",
params=[gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SIZE, gpuarray.SSIZE], params=[gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SIZE,
gpuarray.SIZE, gpuarray.SSIZE],
flags=Kernel.get_flags(self.dtype), flags=Kernel.get_flags(self.dtype),
objvar='k_eye_' + name)] objvar='k_eye_' + name)]
...@@ -1685,7 +1688,8 @@ KERNEL void eye(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m, ga_ssize k) { ...@@ -1685,7 +1688,8 @@ KERNEL void eye(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m, ga_ssize k) {
col_off = (size_t) (k > 0?k:0); col_off = (size_t) (k > 0?k:0);
row_off = (size_t) (k < 0?-k:0); row_off = (size_t) (k < 0?-k:0);
if (row_off < dims[0] && col_off < dims[1]) { if (row_off < dims[0] && col_off < dims[1]) {
err = eye_call(1, &gs, &ls, 0, %(z)s->ga.data, dims[0], dims[1], k); err = eye_call(1, &gs, &ls, 0, %(z)s->ga.data, %(z)s->ga.offset,
dims[0], dims[1], k);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"gpuarray error: kEye: %%s. n%%lu, m=%%lu.", "gpuarray error: kEye: %%s. n%%lu, m=%%lu.",
...@@ -1702,4 +1706,4 @@ KERNEL void eye(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m, ga_ssize k) { ...@@ -1702,4 +1706,4 @@ KERNEL void eye(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m, ga_ssize k) {
return s return s
def c_code_cache_version(self): def c_code_cache_version(self):
return (7,) return (8,)
...@@ -552,8 +552,8 @@ class BaseGpuCorrMM(CGpuKernelBase): ...@@ -552,8 +552,8 @@ class BaseGpuCorrMM(CGpuKernelBase):
return [os.path.dirname(__file__)] return [os.path.dirname(__file__)]
def c_code_cache_version(self): def c_code_cache_version(self):
# Raise this whenever modifying the code below. # Raise this whenever modifying the C code (including the file).
return (7,) return (8,)
def c_code_helper(self, bottom, weights, top, direction, sub, height=None, width=None): def c_code_helper(self, bottom, weights, top, direction, sub, height=None, width=None):
""" """
......
#section kernels #section kernels
#kernel dilated_im3d2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, * : #kernel dilated_im3d2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
// TODO check kernel flags // TODO check kernel flags
// This uses a lot of code from Caffe (http://caffe.berkeleyvision.org/); // This uses a lot of code from Caffe (http://caffe.berkeleyvision.org/);
// sources are clearly marked. Below we reproduce the original license of // sources are clearly marked. Below we reproduce the original license of
...@@ -35,14 +35,20 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ...@@ -35,14 +35,20 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// GPU kernel for the case of dilation // GPU kernel for the case of dilation
KERNEL void dilated_im3d2col_kernel(const ga_size n, KERNEL void dilated_im3d2col_kernel(const ga_size n,
GLOBAL_MEM const DTYPE_INPUT_0 * data_im, GLOBAL_MEM const DTYPE_INPUT_0 * data_im,
const ga_size offset_im,
const ga_size data_im_offset, const ga_size data_im_offset,
// offset_im is the pointer offset for data_im.
// data_im_offset is an offset of elements in the array
const ga_size height, const ga_size width, const ga_size depth, const ga_size height, const ga_size width, const ga_size depth,
const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_d,
const ga_size dilation_h, const ga_size dilation_w, const ga_size dilation_d, const ga_size dilation_h, const ga_size dilation_w, const ga_size dilation_d,
const ga_size pad_h, const ga_size pad_w, const ga_size pad_d, const ga_size pad_h, const ga_size pad_w, const ga_size pad_d,
const ga_size stride_h, const ga_size stride_w, const ga_size stride_d, const ga_size stride_h, const ga_size stride_w, const ga_size stride_d,
const ga_size height_col, const ga_size width_col, const ga_size depth_col, const ga_size height_col, const ga_size width_col, const ga_size depth_col,
GLOBAL_MEM DTYPE_INPUT_0 * data_col) { GLOBAL_MEM DTYPE_INPUT_0 * data_col,
const ga_size offset_col) {
data_im = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)data_im) + offset_im);
data_col = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)data_col) + offset_col);
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < (n); index += LDIM_0 * GDIM_0) { index < (n); index += LDIM_0 * GDIM_0) {
...@@ -80,16 +86,22 @@ KERNEL void dilated_im3d2col_kernel(const ga_size n, ...@@ -80,16 +86,22 @@ KERNEL void dilated_im3d2col_kernel(const ga_size n,
} }
} }
#kernel im3d2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, * : #kernel im3d2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
KERNEL void im3d2col_kernel(const ga_size n, KERNEL void im3d2col_kernel(const ga_size n,
GLOBAL_MEM const DTYPE_INPUT_0 * data_im, GLOBAL_MEM const DTYPE_INPUT_0 * data_im,
const ga_size offset_im,
const ga_size data_im_offset, const ga_size data_im_offset,
// offset_im is the pointer offset for data_im.
// data_im_offset is an offset of elements in the array
const ga_size height, const ga_size width, const ga_size depth, const ga_size height, const ga_size width, const ga_size depth,
const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_d,
const ga_size pad_h, const ga_size pad_w, const ga_size pad_d, const ga_size pad_h, const ga_size pad_w, const ga_size pad_d,
const ga_size stride_h, const ga_size stride_w, const ga_size stride_d, const ga_size stride_h, const ga_size stride_w, const ga_size stride_d,
const ga_size height_col, const ga_size width_col, const ga_size depth_col, const ga_size height_col, const ga_size width_col, const ga_size depth_col,
GLOBAL_MEM DTYPE_INPUT_0 * data_col) { GLOBAL_MEM DTYPE_INPUT_0 * data_col,
const ga_size offset_col) {
data_im = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)data_im) + offset_im);
data_col = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)data_col) + offset_col);
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < (n); index += LDIM_0 * GDIM_0) { index < (n); index += LDIM_0 * GDIM_0) {
...@@ -126,9 +138,10 @@ KERNEL void im3d2col_kernel(const ga_size n, ...@@ -126,9 +138,10 @@ KERNEL void im3d2col_kernel(const ga_size n,
} }
// GPU kernel for the case of dilation // GPU kernel for the case of dilation
#kernel dilated_col2im3d_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size : #kernel dilated_col2im3d_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size :
KERNEL void dilated_col2im3d_kernel(const ga_size n, KERNEL void dilated_col2im3d_kernel(const ga_size n,
GLOBAL_MEM const DTYPE_INPUT_0 * data_col, GLOBAL_MEM const DTYPE_INPUT_0 * data_col,
const ga_size offset_col,
const ga_size height, const ga_size width, const ga_size depth, const ga_size height, const ga_size width, const ga_size depth,
const ga_size channels, const ga_size channels,
const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_d,
...@@ -137,7 +150,12 @@ KERNEL void dilated_col2im3d_kernel(const ga_size n, ...@@ -137,7 +150,12 @@ KERNEL void dilated_col2im3d_kernel(const ga_size n,
const ga_size stride_h, const ga_size stride_w, const ga_size stride_d, const ga_size stride_h, const ga_size stride_w, const ga_size stride_d,
const ga_size height_col, const ga_size width_col, const ga_size depth_col, const ga_size height_col, const ga_size width_col, const ga_size depth_col,
GLOBAL_MEM DTYPE_INPUT_0 * data_im, GLOBAL_MEM DTYPE_INPUT_0 * data_im,
const ga_size offset_im,
const ga_size data_im_offset) { const ga_size data_im_offset) {
// offset_im is the pointer offset for data_im.
// data_im_offset is an offset of elements in the array
data_im = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)data_im) + offset_im);
data_col = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)data_col) + offset_col);
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < (n); index += LDIM_0 * GDIM_0) { index < (n); index += LDIM_0 * GDIM_0) {
...@@ -188,9 +206,11 @@ KERNEL void dilated_col2im3d_kernel(const ga_size n, ...@@ -188,9 +206,11 @@ KERNEL void dilated_col2im3d_kernel(const ga_size n,
} }
} }
#kernel col2im3d_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size : #kernel col2im3d_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size :
KERNEL void col2im3d_kernel(const ga_size n, KERNEL void col2im3d_kernel(const ga_size n,
GLOBAL_MEM const DTYPE_INPUT_0 * data_col, GLOBAL_MEM const DTYPE_INPUT_0 * data_col,
const ga_size offset_col,
const ga_size height, const ga_size width, const ga_size depth, const ga_size height, const ga_size width, const ga_size depth,
const ga_size channels, const ga_size channels,
const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_d,
...@@ -198,7 +218,12 @@ KERNEL void col2im3d_kernel(const ga_size n, ...@@ -198,7 +218,12 @@ KERNEL void col2im3d_kernel(const ga_size n,
const ga_size stride_h, const ga_size stride_w, const ga_size stride_d, const ga_size stride_h, const ga_size stride_w, const ga_size stride_d,
const ga_size height_col, const ga_size width_col, const ga_size depth_col, const ga_size height_col, const ga_size width_col, const ga_size depth_col,
GLOBAL_MEM DTYPE_INPUT_0 * data_im, GLOBAL_MEM DTYPE_INPUT_0 * data_im,
const ga_size offset_im,
const ga_size data_im_offset) { const ga_size data_im_offset) {
// offset_im is the pointer offset for data_im.
// data_im_offset is an offset of elements in the array
data_im = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)data_im) + offset_im);
data_col = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)data_col) + offset_col);
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < (n); index += LDIM_0 * GDIM_0) { index < (n); index += LDIM_0 * GDIM_0) {
...@@ -236,16 +261,50 @@ KERNEL void col2im3d_kernel(const ga_size n, ...@@ -236,16 +261,50 @@ KERNEL void col2im3d_kernel(const ga_size n,
} }
} }
#section support_code
int rgemm(cb_order o, cb_transpose tA, cb_transpose tB,
size_t M, size_t N, size_t K, double alpha,
GpuArray *A, size_t offA, size_t lda,
GpuArray *B, size_t offB, size_t ldb,
double beta, GpuArray *C, size_t offC, size_t ldc) {
switch (A->typecode) {
case GA_FLOAT:
return gpublas_sgemm(o, tA, tB,
M, N, K, alpha,
A->data, (A->offset / 4) + offA, lda,
B->data, (B->offset / 4) + offB, ldb,
beta,
C->data, (C->offset / 4) + offC, ldc);
case GA_DOUBLE:
return gpublas_dgemm(o, tA, tB,
M, N, K, alpha,
A->data, (A->offset / 8) + offA, lda,
B->data, (B->offset / 8) + offB, ldb,
beta,
C->data, (C->offset / 8) + offC, ldc);
case GA_HALF:
return gpublas_hgemm(o, tA, tB,
M, N, K, alpha,
A->data, (A->offset / 2) + offA, lda,
B->data, (B->offset / 2) + offB, ldb,
beta,
C->data, (C->offset / 2) + offC, ldc);
default:
return GA_UNSUPPORTED_ERROR;
}
}
#section support_code_struct #section support_code_struct
int im3d2col( int im3d2col(
gpudata * data_im, const size_t data_im_offset, const size_t channels, GpuArray *data_im, const size_t data_im_offset, const size_t channels,
const size_t height, const size_t width, const size_t depth, const size_t height, const size_t width, const size_t depth,
const size_t kernel_h, const size_t kernel_w, const size_t kernel_d, const size_t kernel_h, const size_t kernel_w, const size_t kernel_d,
const size_t dilation_h, const size_t dilation_w, const size_t dilation_d, const size_t dilation_h, const size_t dilation_w, const size_t dilation_d,
const size_t pad_h, const size_t pad_w, const size_t pad_d, const size_t pad_h, const size_t pad_w, const size_t pad_d,
const size_t stride_h, const size_t stride_w, const size_t stride_d, const size_t stride_h, const size_t stride_w, const size_t stride_d,
gpudata * data_col) { GpuArray *data_col) {
// We are going to launch channels * height_col * width_col * depth_col // We are going to launch channels * height_col * width_col * depth_col
// kernels, each kernel responsible for copying a single-channel grid. // kernels, each kernel responsible for copying a single-channel grid.
size_t dil_kernel_h = (kernel_h - 1) * dilation_h + 1; size_t dil_kernel_h = (kernel_h - 1) * dilation_h + 1;
...@@ -259,10 +318,11 @@ int im3d2col( ...@@ -259,10 +318,11 @@ int im3d2col(
if (dilation_h != 1 || dilation_w != 1 || dilation_d != 1) { if (dilation_h != 1 || dilation_w != 1 || dilation_d != 1) {
err = dilated_im3d2col_kernel_scall( err = dilated_im3d2col_kernel_scall(
1, &num_kernels, 0, 1, &num_kernels, 0,
num_kernels, data_im, data_im_offset, height, width, depth, num_kernels, data_im->data, data_im->offset,
data_im_offset, height, width, depth,
kernel_h, kernel_w, kernel_d, dilation_h, dilation_w, dilation_d, kernel_h, kernel_w, kernel_d, dilation_h, dilation_w, dilation_d,
pad_h, pad_w, pad_d, stride_h, stride_w, stride_d, height_col, pad_h, pad_w, pad_d, stride_h, stride_w, stride_d, height_col,
width_col, depth_col, data_col); width_col, depth_col, data_col->data, data_col->offset);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"gpuarray error: dilated_im3d2col_kernel: %s.", "gpuarray error: dilated_im3d2col_kernel: %s.",
...@@ -271,10 +331,11 @@ int im3d2col( ...@@ -271,10 +331,11 @@ int im3d2col(
} else { } else {
err = im3d2col_kernel_scall( err = im3d2col_kernel_scall(
1, &num_kernels, 0, 1, &num_kernels, 0,
num_kernels, data_im, data_im_offset, height, width, depth, num_kernels, data_im->data, data_im->offset,
data_im_offset, height, width, depth,
kernel_h, kernel_w, kernel_d, pad_h, pad_w, pad_d, kernel_h, kernel_w, kernel_d, pad_h, pad_w, pad_d,
stride_h, stride_w, stride_d, height_col, width_col, depth_col, stride_h, stride_w, stride_d, height_col, width_col, depth_col,
data_col); data_col->data, data_col->offset);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"gpuarray error: im3d2col_kernel: %s.", "gpuarray error: im3d2col_kernel: %s.",
...@@ -284,13 +345,13 @@ int im3d2col( ...@@ -284,13 +345,13 @@ int im3d2col(
return err; return err;
} }
int col2im3d(gpudata * data_col, const size_t channels, int col2im3d(GpuArray *data_col, const size_t channels,
const size_t height, const size_t width, const size_t depth, const size_t height, const size_t width, const size_t depth,
const size_t patch_h, const size_t patch_w, const size_t patch_d, const size_t patch_h, const size_t patch_w, const size_t patch_d,
const size_t dilation_h, const size_t dilation_w, const size_t dilation_d, const size_t dilation_h, const size_t dilation_w, const size_t dilation_d,
const size_t pad_h, const size_t pad_w, const size_t pad_d, const size_t pad_h, const size_t pad_w, const size_t pad_d,
const size_t stride_h, const size_t stride_w, const size_t stride_d, const size_t stride_h, const size_t stride_w, const size_t stride_d,
gpudata * data_im, const size_t data_im_offset) { GpuArray *data_im, const size_t data_im_offset) {
size_t dil_patch_h = (patch_h - 1) * dilation_h + 1; size_t dil_patch_h = (patch_h - 1) * dilation_h + 1;
size_t dil_patch_w = (patch_w - 1) * dilation_w + 1; size_t dil_patch_w = (patch_w - 1) * dilation_w + 1;
size_t dil_patch_d = (patch_d - 1) * dilation_d + 1; size_t dil_patch_d = (patch_d - 1) * dilation_d + 1;
...@@ -304,10 +365,11 @@ int col2im3d(gpudata * data_col, const size_t channels, ...@@ -304,10 +365,11 @@ int col2im3d(gpudata * data_col, const size_t channels,
if (dilation_h != 1 || dilation_w != 1 || dilation_d != 1) { if (dilation_h != 1 || dilation_w != 1 || dilation_d != 1) {
err = dilated_col2im3d_kernel_scall( err = dilated_col2im3d_kernel_scall(
1, &num_kernels, 0, 1, &num_kernels, 0,
num_kernels, data_col, height, width, depth, channels, patch_h, patch_w, num_kernels, data_col->data, data_col->offset,
height, width, depth, channels, patch_h, patch_w,
patch_d, dilation_h, dilation_w, dilation_d, pad_h, pad_w, pad_d, patch_d, dilation_h, dilation_w, dilation_d, pad_h, pad_w, pad_d,
stride_h, stride_w, stride_d, height_col, width_col, depth_col, stride_h, stride_w, stride_d, height_col, width_col, depth_col,
data_im, data_im_offset); data_im->data, data_im->offset, data_im_offset);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"gpuarray error: dilated_col2im3d_kernel: %s.", "gpuarray error: dilated_col2im3d_kernel: %s.",
...@@ -317,9 +379,11 @@ int col2im3d(gpudata * data_col, const size_t channels, ...@@ -317,9 +379,11 @@ int col2im3d(gpudata * data_col, const size_t channels,
else{ else{
err = col2im3d_kernel_scall( err = col2im3d_kernel_scall(
1, &num_kernels, 0, 1, &num_kernels, 0,
num_kernels, data_col, height, width, depth, channels, patch_h, patch_w, num_kernels, data_col->data, data_col->offset,
height, width, depth, channels, patch_h, patch_w,
patch_d, pad_h, pad_w, pad_d, stride_h, stride_w, stride_d, patch_d, pad_h, pad_w, pad_d, stride_h, stride_w, stride_d,
height_col, width_col, depth_col, data_im, data_im_offset); height_col, width_col, depth_col,
data_im->data, data_im->offset, data_im_offset);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"gpuarray error: col2im3d_kernel: %s.", "gpuarray error: col2im3d_kernel: %s.",
...@@ -503,42 +567,20 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom, ...@@ -503,42 +567,20 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
for (size_t n = 0; n < batchSize; n++) { for (size_t n = 0; n < batchSize; n++) {
// First, im3d2col // First, im3d2col
err = im3d2col( err = im3d2col(
bottom->ga.data, n * bottom_stride, nChannels, bottomHeight, &bottom->ga, n * bottom_stride, nChannels, bottomHeight,
bottomWidth, bottomDepth, kH, kW, kD, dilH, dilW, dilD, bottomWidth, bottomDepth, kH, kW, kD, dilH, dilW, dilD,
padH, padW, padD, dH, dW, dD, col->ga.data); padH, padW, padD, dH, dW, dD, &col->ga);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
Py_DECREF(col); Py_DECREF(col);
return NULL; return NULL;
} }
// Second, gemm // Second, gemm
switch (col->ga.typecode) { err = rgemm(cb_fortran, cb_no_trans, cb_no_trans,
case GA_FLOAT: N_, M_, K_, 1,
err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_no_trans, &col->ga, 0, N_,
N_, M_, K_, 1, &weight->ga, 0, K_,
col->ga.data, 0, N_, 0,
weight->ga.data, 0, K_, &top->ga, n * top_stride, N_);
0,
top->ga.data, n * top_stride, N_);
break;
case GA_DOUBLE:
err = gpublas_dgemm(cb_fortran, cb_no_trans, cb_no_trans,
N_, M_, K_, 1,
col->ga.data, 0, N_,
weight->ga.data, 0, K_,
0,
top->ga.data, n * top_stride, N_);
break;
case GA_HALF:
err = gpublas_hgemm(cb_fortran, cb_no_trans, cb_no_trans,
N_, M_, K_, 1,
col->ga.data, 0, N_,
weight->ga.data, 0, K_,
0,
top->ga.data, n * top_stride, N_);
break;
default:
err = GA_UNSUPPORTED_ERROR;
}
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuCorr3dMM forward encountered an error running gemm."); "GpuCorr3dMM forward encountered an error running gemm.");
...@@ -565,9 +607,9 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom, ...@@ -565,9 +607,9 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
for (size_t n = 0; n < batchSize; n++) { for (size_t n = 0; n < batchSize; n++) {
// First, im3d2col // First, im3d2col
err = im3d2col( err = im3d2col(
bottom->ga.data, n * bottom_stride, nChannels, bottomHeight, &bottom->ga, n * bottom_stride, nChannels, bottomHeight,
bottomWidth, bottomDepth, kH, kW, kD, dilH, dilW, dilD, bottomWidth, bottomDepth, kH, kW, kD, dilH, dilW, dilD,
padH, padW, padD, dH, dW, dD, col->ga.data); padH, padW, padD, dH, dW, dD, &col->ga);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
Py_DECREF(col); Py_DECREF(col);
return NULL; return NULL;
...@@ -576,34 +618,12 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom, ...@@ -576,34 +618,12 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
// Note that we accumulate into weight. We do so by setting beta = 0 // Note that we accumulate into weight. We do so by setting beta = 0
// for the first iteration and beta = 1 for subsequent ones. (This // for the first iteration and beta = 1 for subsequent ones. (This
// is faster than setting weight to all zeros before the loop.) // is faster than setting weight to all zeros before the loop.)
switch (col->ga.typecode) { err = rgemm(cb_fortran, cb_trans, cb_no_trans,
case GA_FLOAT: K_, M_, N_, 1,
err = gpublas_sgemm(cb_fortran, cb_trans, cb_no_trans, &col->ga, 0, N_,
K_, M_, N_, 1, &top->ga, n * top_stride, N_,
col->ga.data, 0, N_, (n == 0) ? 0 : 1,
top->ga.data, n * top_stride, N_, &weight->ga, 0, K_);
(n == 0) ? 0 : 1,
weight->ga.data, 0, K_);
break;
case GA_DOUBLE:
err = gpublas_dgemm(cb_fortran, cb_trans, cb_no_trans,
K_, M_, N_, 1,
col->ga.data, 0, N_,
top->ga.data, n * top_stride, N_,
(n == 0) ? 0 : 1,
weight->ga.data, 0, K_);
break;
case GA_HALF:
err = gpublas_hgemm(cb_fortran, cb_trans, cb_no_trans,
K_, M_, N_, 1,
col->ga.data, 0, N_,
top->ga.data, n * top_stride, N_,
(n == 0) ? 0 : 1,
weight->ga.data, 0, K_);
break;
default:
err = GA_UNSUPPORTED_ERROR;
}
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuCorr3dMM grad weights encountered an error running gemm."); "GpuCorr3dMM grad weights encountered an error running gemm.");
...@@ -638,34 +658,12 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom, ...@@ -638,34 +658,12 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
// Iterate over batch // Iterate over batch
for (size_t n = 0; n < batchSize; n++) { for (size_t n = 0; n < batchSize; n++) {
// gemm into columns // gemm into columns
switch (top->ga.typecode) { err = rgemm(cb_fortran, cb_no_trans, cb_trans,
case GA_FLOAT: N_, K_, M_, 1,
err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_trans, &top->ga, n * top_stride, N_,
N_, K_, M_, 1, &weight->ga, 0, K_,
top->ga.data, n * top_stride, N_, 0,
weight->ga.data, 0, K_, &col->ga, 0, N_);
0,
col->ga.data, 0, N_);
break;
case GA_DOUBLE:
err = gpublas_dgemm(cb_fortran, cb_no_trans, cb_trans,
N_, K_, M_, 1,
top->ga.data, n * top_stride, N_,
weight->ga.data, 0, K_,
0,
col->ga.data, 0, N_);
break;
case GA_HALF:
err = gpublas_hgemm(cb_fortran, cb_no_trans, cb_trans,
N_, K_, M_, 1,
top->ga.data, n * top_stride, N_,
weight->ga.data, 0, K_,
0,
col->ga.data, 0, N_);
break;
default:
err = GA_UNSUPPORTED_ERROR;
}
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuCorr3dMM grad inputs encountered an error running gemm."); "GpuCorr3dMM grad inputs encountered an error running gemm.");
...@@ -673,10 +671,10 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom, ...@@ -673,10 +671,10 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
return NULL; return NULL;
} }
// col2im3d back to the data // col2im3d back to the data
err = col2im3d(col->ga.data, nChannels, err = col2im3d(&col->ga, nChannels,
bottomHeight, bottomWidth, bottomDepth, bottomHeight, bottomWidth, bottomDepth,
kH, kW, kD, dilH, dilW, dilD, padH, padW, padD, kH, kW, kD, dilH, dilW, dilD, padH, padW, padD,
dH, dW, dD, bottom->ga.data, n * bottom_stride); dH, dW, dD, &bottom->ga, n * bottom_stride);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
Py_DECREF(col); Py_DECREF(col);
return NULL; return NULL;
......
#section kernels #section kernels
#kernel dilated_im2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, * : #kernel dilated_im2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
// TODO check kernel flags // TODO check kernel flags
// This uses a lot of code from Caffe (http://caffe.berkeleyvision.org/); // This uses a lot of code from Caffe (http://caffe.berkeleyvision.org/);
// sources are clearly marked. Below we reproduce the original license of // sources are clearly marked. Below we reproduce the original license of
...@@ -35,14 +35,20 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ...@@ -35,14 +35,20 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// GPU kernel for the case of dilation // GPU kernel for the case of dilation
KERNEL void dilated_im2col_kernel(const ga_size n, KERNEL void dilated_im2col_kernel(const ga_size n,
GLOBAL_MEM const DTYPE_INPUT_0 * data_im, GLOBAL_MEM const DTYPE_INPUT_0 * data_im,
const ga_size offset_im,
const ga_size data_im_offset, const ga_size data_im_offset,
// offset_im is the pointer offset for data_im.
// data_im_offset is an offset of elements in the array
const ga_size height, const ga_size width, const ga_size height, const ga_size width,
const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_h, const ga_size kernel_w,
const ga_size dilation_h, const ga_size dilation_w, const ga_size dilation_h, const ga_size dilation_w,
const ga_size pad_h, const ga_size pad_w, const ga_size pad_h, const ga_size pad_w,
const ga_size stride_h, const ga_size stride_w, const ga_size stride_h, const ga_size stride_w,
const ga_size height_col, const ga_size width_col, const ga_size height_col, const ga_size width_col,
GLOBAL_MEM DTYPE_INPUT_0 * data_col) { GLOBAL_MEM DTYPE_INPUT_0 * data_col,
const ga_size offset_col) {
data_im = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)data_im) + offset_im);
data_col = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)data_col) + offset_col);
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < (n); index += LDIM_0 * GDIM_0) { index < (n); index += LDIM_0 * GDIM_0) {
...@@ -70,16 +76,23 @@ KERNEL void dilated_im2col_kernel(const ga_size n, ...@@ -70,16 +76,23 @@ KERNEL void dilated_im2col_kernel(const ga_size n,
} }
} }
#kernel im2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, * : #kernel im2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
KERNEL void im2col_kernel(const ga_size n, KERNEL void im2col_kernel(const ga_size n,
GLOBAL_MEM const DTYPE_INPUT_0 * data_im, GLOBAL_MEM const DTYPE_INPUT_0 * data_im,
const ga_size offset_im,
const ga_size data_im_offset, const ga_size data_im_offset,
// offset_im is the pointer offset for data_im.
// data_im_offset is an offset of elements in the array
const ga_size height, const ga_size width, const ga_size height, const ga_size width,
const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_h, const ga_size kernel_w,
const ga_size pad_h, const ga_size pad_w, const ga_size pad_h, const ga_size pad_w,
const ga_size stride_h, const ga_size stride_w, const ga_size stride_h, const ga_size stride_w,
const ga_size height_col, const ga_size width_col, const ga_size height_col, const ga_size width_col,
GLOBAL_MEM DTYPE_INPUT_0 * data_col) { GLOBAL_MEM DTYPE_INPUT_0 * data_col,
const ga_size offset_col) {
data_im = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)data_im) + offset_im);
data_col = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)data_col) + offset_col);
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < (n); index += LDIM_0 * GDIM_0) { index < (n); index += LDIM_0 * GDIM_0) {
...@@ -108,9 +121,9 @@ KERNEL void im2col_kernel(const ga_size n, ...@@ -108,9 +121,9 @@ KERNEL void im2col_kernel(const ga_size n,
} }
// GPU kernel for the case of dilation // GPU kernel for the case of dilation
#kernel dilated_col2im_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size : #kernel dilated_col2im_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size :
KERNEL void dilated_col2im_kernel(const ga_size n, KERNEL void dilated_col2im_kernel(const ga_size n,
GLOBAL_MEM const DTYPE_INPUT_0 * data_col, GLOBAL_MEM const DTYPE_INPUT_0 * data_col, const ga_size offset_col,
const ga_size height, const ga_size width, const ga_size channels, const ga_size height, const ga_size width, const ga_size channels,
const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_h, const ga_size kernel_w,
const ga_size dilation_h, const ga_size dilation_w, const ga_size dilation_h, const ga_size dilation_w,
...@@ -118,7 +131,12 @@ KERNEL void dilated_col2im_kernel(const ga_size n, ...@@ -118,7 +131,12 @@ KERNEL void dilated_col2im_kernel(const ga_size n,
const ga_size stride_h, const ga_size stride_w, const ga_size stride_h, const ga_size stride_w,
const ga_size height_col, const ga_size width_col, const ga_size height_col, const ga_size width_col,
GLOBAL_MEM DTYPE_INPUT_0 * data_im, GLOBAL_MEM DTYPE_INPUT_0 * data_im,
const ga_size offset_im,
const ga_size data_im_offset) { const ga_size data_im_offset) {
// offset_im is the pointer offset for data_im.
// data_im_offset is an offset of elements in the array
data_col = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)data_col) + offset_col);
data_im = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)data_im) + offset_im);
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < (n); index += LDIM_0 * GDIM_0) { index < (n); index += LDIM_0 * GDIM_0) {
...@@ -153,16 +171,22 @@ KERNEL void dilated_col2im_kernel(const ga_size n, ...@@ -153,16 +171,22 @@ KERNEL void dilated_col2im_kernel(const ga_size n,
} }
} }
#kernel col2im_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, *, size : #kernel col2im_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size :
KERNEL void col2im_kernel(const ga_size n, KERNEL void col2im_kernel(const ga_size n,
GLOBAL_MEM const DTYPE_INPUT_0 * data_col, GLOBAL_MEM const DTYPE_INPUT_0 * data_col, const ga_size offset_col,
const ga_size height, const ga_size width, const ga_size channels, const ga_size height, const ga_size width, const ga_size channels,
const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_h, const ga_size kernel_w,
const ga_size pad_h, const ga_size pad_w, const ga_size pad_h, const ga_size pad_w,
const ga_size stride_h, const ga_size stride_w, const ga_size stride_h, const ga_size stride_w,
const ga_size height_col, const ga_size width_col, const ga_size height_col, const ga_size width_col,
GLOBAL_MEM DTYPE_INPUT_0 * data_im, GLOBAL_MEM DTYPE_INPUT_0 * data_im,
const ga_size offset_im,
const ga_size data_im_offset) { const ga_size data_im_offset) {
// offset_im is the pointer offset for data_im.
// data_im_offset is an offset of elements in the array
data_col = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)data_col) + offset_col);
data_im = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)data_im) + offset_im);
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < (n); index += LDIM_0 * GDIM_0) { index < (n); index += LDIM_0 * GDIM_0) {
...@@ -191,16 +215,48 @@ KERNEL void col2im_kernel(const ga_size n, ...@@ -191,16 +215,48 @@ KERNEL void col2im_kernel(const ga_size n,
} }
} }
#section support_code
/* Run a GEMM (C = alpha*op(A)*op(B) + beta*C) with the gpublas routine
 * matching A's typecode.
 *
 * o        : storage order passed straight through to gpublas.
 * tA, tB   : transpose flags for A and B.
 * M, N, K  : usual GEMM dimensions.
 * alpha    : scale applied to the A*B product.
 * A, B, C  : GpuArray operands; each call site passes an extra per-call
 *            element offset (offA/offB/offC) on top of the array's own
 *            byte offset.
 * lda/ldb/ldc : leading dimensions, in elements.
 *
 * gpublas takes offsets in ELEMENTS while GpuArray stores a BYTE offset,
 * so the byte offset is divided by the element size of the dtype before
 * being added to the caller-supplied element offset.
 *
 * Returns a gpuarray error code (GA_NO_ERROR on success,
 * GA_UNSUPPORTED_ERROR for dtypes other than float/double/half).
 */
int rgemm(cb_order o, cb_transpose tA, cb_transpose tB,
          size_t M, size_t N, size_t K, double alpha,
          GpuArray *A, size_t offA, size_t lda,
          GpuArray *B, size_t offB, size_t ldb,
          double beta, GpuArray *C, size_t offC, size_t ldc) {
  size_t elsize;  /* element size in bytes for A's dtype */
  switch (A->typecode) {
  case GA_FLOAT:
    elsize = 4;
    break;
  case GA_DOUBLE:
    elsize = 8;
    break;
  case GA_HALF:
    elsize = 2;
    break;
  default:
    return GA_UNSUPPORTED_ERROR;
  }
  /* Convert each operand's byte offset to elements, then add the
   * caller-supplied per-call element offset. All three operands are
   * assumed to share A's dtype, as in the original dispatch. */
  {
    size_t a_off = (A->offset / elsize) + offA;
    size_t b_off = (B->offset / elsize) + offB;
    size_t c_off = (C->offset / elsize) + offC;
    if (A->typecode == GA_FLOAT)
      return gpublas_sgemm(o, tA, tB, M, N, K, alpha,
                           A->data, a_off, lda,
                           B->data, b_off, ldb,
                           beta,
                           C->data, c_off, ldc);
    if (A->typecode == GA_DOUBLE)
      return gpublas_dgemm(o, tA, tB, M, N, K, alpha,
                           A->data, a_off, lda,
                           B->data, b_off, ldb,
                           beta,
                           C->data, c_off, ldc);
    /* Only GA_HALF can reach here. */
    return gpublas_hgemm(o, tA, tB, M, N, K, alpha,
                         A->data, a_off, lda,
                         B->data, b_off, ldb,
                         beta,
                         C->data, c_off, ldc);
  }
}
#section support_code_struct #section support_code_struct
int im2col(gpudata *data_im, const size_t data_im_offset, const size_t channels, int im2col(GpuArray *data_im, const size_t data_im_offset, const size_t channels,
const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w,
const size_t dilation_h, const size_t dilation_w, const size_t dilation_h, const size_t dilation_w,
const size_t pad_h, const size_t pad_w, const size_t pad_h, const size_t pad_w,
const size_t stride_h, const size_t stride_w, const size_t stride_h, const size_t stride_w,
gpudata * data_col) { GpuArray *data_col) {
// We are going to launch channels * height_col * width_col kernels, each // We are going to launch channels * height_col * width_col kernels, each
// kernel responsible for copying a single-channel grid. // kernel responsible for copying a single-channel grid.
size_t dil_kernel_h = (kernel_h - 1) * dilation_h + 1; size_t dil_kernel_h = (kernel_h - 1) * dilation_h + 1;
...@@ -212,9 +268,10 @@ int im2col(gpudata *data_im, const size_t data_im_offset, const size_t channels, ...@@ -212,9 +268,10 @@ int im2col(gpudata *data_im, const size_t data_im_offset, const size_t channels,
if (dilation_h != 1 || dilation_w != 1) { if (dilation_h != 1 || dilation_w != 1) {
err = dilated_im2col_kernel_scall( err = dilated_im2col_kernel_scall(
1, &num_kernels, 0, 1, &num_kernels, 0,
num_kernels, data_im, data_im_offset, height, width, kernel_h, kernel_w, num_kernels, data_im->data, data_im->offset, data_im_offset,
height, width, kernel_h, kernel_w,
dilation_h, dilation_w, pad_h, pad_w, stride_h, stride_w, height_col, dilation_h, dilation_w, pad_h, pad_w, stride_h, stride_w, height_col,
width_col, data_col); width_col, data_col->data, data_col->offset);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"gpuarray error: dilated_im2col_kernel: %s.", "gpuarray error: dilated_im2col_kernel: %s.",
...@@ -223,9 +280,10 @@ int im2col(gpudata *data_im, const size_t data_im_offset, const size_t channels, ...@@ -223,9 +280,10 @@ int im2col(gpudata *data_im, const size_t data_im_offset, const size_t channels,
} else { } else {
err = im2col_kernel_scall( err = im2col_kernel_scall(
1, &num_kernels, 0, 1, &num_kernels, 0,
num_kernels, data_im, data_im_offset, height, width, kernel_h, kernel_w, num_kernels, data_im->data, data_im->offset, data_im_offset,
height, width, kernel_h, kernel_w,
pad_h, pad_w, stride_h, stride_w, height_col, pad_h, pad_w, stride_h, stride_w, height_col,
width_col, data_col); width_col, data_col->data, data_col->offset);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"gpuarray error: im2col_kernel: %s.", "gpuarray error: im2col_kernel: %s.",
...@@ -235,11 +293,11 @@ int im2col(gpudata *data_im, const size_t data_im_offset, const size_t channels, ...@@ -235,11 +293,11 @@ int im2col(gpudata *data_im, const size_t data_im_offset, const size_t channels,
return err; return err;
} }
int col2im(gpudata * data_col, const size_t channels, int col2im(GpuArray *data_col, const size_t channels,
const size_t height, const size_t width, const size_t patch_h, const size_t patch_w, const size_t height, const size_t width, const size_t patch_h, const size_t patch_w,
const size_t dilation_h, const size_t dilation_w, const size_t dilation_h, const size_t dilation_w,
const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t pad_h, const size_t pad_w, const size_t stride_h,
const size_t stride_w, gpudata * data_im, const size_t data_im_offset) { const size_t stride_w, GpuArray *data_im, const size_t data_im_offset) {
size_t dil_patch_h = (patch_h - 1) * dilation_h + 1; size_t dil_patch_h = (patch_h - 1) * dilation_h + 1;
size_t dil_patch_w = (patch_w - 1) * dilation_w + 1; size_t dil_patch_w = (patch_w - 1) * dilation_w + 1;
size_t height_col = (height + 2 * pad_h - dil_patch_h) / stride_h + 1; size_t height_col = (height + 2 * pad_h - dil_patch_h) / stride_h + 1;
...@@ -251,9 +309,10 @@ int col2im(gpudata * data_col, const size_t channels, ...@@ -251,9 +309,10 @@ int col2im(gpudata * data_col, const size_t channels,
if (dilation_h != 1 || dilation_w != 1) { if (dilation_h != 1 || dilation_w != 1) {
err = dilated_col2im_kernel_scall( err = dilated_col2im_kernel_scall(
1, &num_kernels, 0, 1, &num_kernels, 0,
num_kernels, data_col, height, width, channels, patch_h, patch_w, num_kernels, data_col->data, data_col->offset,
height, width, channels, patch_h, patch_w,
dilation_h, dilation_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, pad_h, pad_w, stride_h, stride_w,
height_col, width_col, data_im, data_im_offset); height_col, width_col, data_im->data, data_im->offset, data_im_offset);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"gpuarray error: dilated_col2im_kernel: %s.", "gpuarray error: dilated_col2im_kernel: %s.",
...@@ -262,9 +321,10 @@ int col2im(gpudata * data_col, const size_t channels, ...@@ -262,9 +321,10 @@ int col2im(gpudata * data_col, const size_t channels,
} else { } else {
err = col2im_kernel_scall( err = col2im_kernel_scall(
1, &num_kernels, 0, 1, &num_kernels, 0,
num_kernels, data_col, height, width, channels, patch_h, patch_w, num_kernels, data_col->data, data_col->offset,
height, width, channels, patch_h, patch_w,
pad_h, pad_w, stride_h, stride_w, pad_h, pad_w, stride_h, stride_w,
height_col, width_col, data_im, data_im_offset); height_col, width_col, data_im->data, data_im->offset, data_im_offset);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"gpuarray error: col2im_kernel: %s.", "gpuarray error: col2im_kernel: %s.",
...@@ -433,43 +493,21 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom, ...@@ -433,43 +493,21 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
// Iterate over batch // Iterate over batch
for (size_t n = 0; n < batchSize; n++) { for (size_t n = 0; n < batchSize; n++) {
// First, im2col // First, im2col
err = im2col(bottom->ga.data, n * bottom_stride, err = im2col(&bottom->ga, n * bottom_stride,
nChannels, bottomHeight, nChannels, bottomHeight,
bottomWidth, kH, kW, dilH, dilW, bottomWidth, kH, kW, dilH, dilW,
padH, padW, dH, dW, col->ga.data); padH, padW, dH, dW, &col->ga);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
Py_DECREF(col); Py_DECREF(col);
return NULL; return NULL;
} }
// Second, gemm // Second, gemm
switch (col->ga.typecode) { err = rgemm(cb_fortran, cb_no_trans, cb_no_trans,
case GA_FLOAT: N_, M_, K_, 1,
err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_no_trans, &col->ga, 0, N_,
N_, M_, K_, 1, &weight->ga, 0, K_,
col->ga.data, 0, N_, 0,
weight->ga.data, 0, K_, &top->ga, n * top_stride, N_);
0,
top->ga.data, n * top_stride, N_);
break;
case GA_DOUBLE:
err = gpublas_dgemm(cb_fortran, cb_no_trans, cb_no_trans,
N_, M_, K_, 1,
col->ga.data, 0, N_,
weight->ga.data, 0, K_,
0,
top->ga.data, n * top_stride, N_);
break;
case GA_HALF:
err = gpublas_hgemm(cb_fortran, cb_no_trans, cb_no_trans,
N_, M_, K_, 1,
col->ga.data, 0, N_,
weight->ga.data, 0, K_,
0,
top->ga.data, n * top_stride, N_);
break;
default:
err = GA_UNSUPPORTED_ERROR;
}
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuCorrMM forward encountered an error running gemm: %d", err); "GpuCorrMM forward encountered an error running gemm: %d", err);
...@@ -495,10 +533,10 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom, ...@@ -495,10 +533,10 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
// Iterate over batch // Iterate over batch
for (size_t n = 0; n < batchSize; n++) { for (size_t n = 0; n < batchSize; n++) {
// First, im2col // First, im2col
err = im2col(bottom->ga.data, n * bottom_stride, err = im2col(&bottom->ga, n * bottom_stride,
nChannels, bottomHeight, nChannels, bottomHeight,
bottomWidth, kH, kW, dilH, dilW, bottomWidth, kH, kW, dilH, dilW,
padH, padW, dH, dW, col->ga.data); padH, padW, dH, dW, &col->ga);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
Py_DECREF(col); Py_DECREF(col);
return NULL; return NULL;
...@@ -507,34 +545,12 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom, ...@@ -507,34 +545,12 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
// Note that we accumulate into weight. We do so by setting beta = 0 // Note that we accumulate into weight. We do so by setting beta = 0
// for the first iteration and beta = 1 for subsequent ones. (This // for the first iteration and beta = 1 for subsequent ones. (This
// is faster than setting weight to all zeros before the loop.) // is faster than setting weight to all zeros before the loop.)
switch (col->ga.typecode) { err = rgemm(cb_fortran, cb_trans, cb_no_trans,
case GA_FLOAT: K_, M_, N_, 1,
err = gpublas_sgemm(cb_fortran, cb_trans, cb_no_trans, &col->ga, 0, N_,
K_, M_, N_, 1, &top->ga, n * top_stride, N_,
col->ga.data, 0, N_, (n == 0) ? 0 : 1,
top->ga.data, n * top_stride, N_, &weight->ga, 0, K_);
(n == 0) ? 0 : 1,
weight->ga.data, 0, K_);
break;
case GA_DOUBLE:
err = gpublas_dgemm(cb_fortran, cb_trans, cb_no_trans,
K_, M_, N_, 1,
col->ga.data, 0, N_,
top->ga.data, n * top_stride, N_,
(n == 0) ? 0 : 1,
weight->ga.data, 0, K_);
break;
case GA_HALF:
err = gpublas_hgemm(cb_fortran, cb_trans, cb_no_trans,
K_, M_, N_, 1,
col->ga.data, 0, N_,
top->ga.data, n * top_stride, N_,
(n == 0) ? 0 : 1,
weight->ga.data, 0, K_);
break;
default:
err = GA_UNSUPPORTED_ERROR;
}
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuCorrMM grad weights encountered an error running gemm: %d", err); "GpuCorrMM grad weights encountered an error running gemm: %d", err);
...@@ -559,35 +575,13 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom, ...@@ -559,35 +575,13 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
// full convolution: gemm, then col2im // full convolution: gemm, then col2im
// Iterate over batch // Iterate over batch
for (size_t n = 0; n < batchSize; n++) { for (size_t n = 0; n < batchSize; n++) {
// gemm into columns // gemm into columns
switch (top->ga.typecode) { err = rgemm(cb_fortran, cb_no_trans, cb_trans,
case GA_FLOAT: N_, K_, M_, 1,
err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_trans, &top->ga, n * top_stride, N_,
N_, K_, M_, 1, &weight->ga, 0, K_,
top->ga.data, n * top_stride, N_, 0,
weight->ga.data, 0, K_, &col->ga, 0, N_);
0,
col->ga.data, 0, N_);
break;
case GA_DOUBLE:
err = gpublas_dgemm(cb_fortran, cb_no_trans, cb_trans,
N_, K_, M_, 1,
top->ga.data, n * top_stride, N_,
weight->ga.data, 0, K_,
0,
col->ga.data, 0, N_);
break;
case GA_HALF:
err = gpublas_hgemm(cb_fortran, cb_no_trans, cb_trans,
N_, K_, M_, 1,
top->ga.data, n * top_stride, N_,
weight->ga.data, 0, K_,
0,
col->ga.data, 0, N_);
break;
default:
err = GA_UNSUPPORTED_ERROR;
}
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuCorrMM grad inputs encountered an error running gemm: %d", err); "GpuCorrMM grad inputs encountered an error running gemm: %d", err);
...@@ -595,9 +589,9 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom, ...@@ -595,9 +589,9 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
return NULL; return NULL;
} }
// col2im back to the data // col2im back to the data
err = col2im(col->ga.data, nChannels, bottomHeight, bottomWidth, err = col2im(&col->ga, nChannels, bottomHeight, bottomWidth,
kH, kW, dilH, dilW, padH, padW, kH, kW, dilH, dilW, padH, padW,
dH, dW, bottom->ga.data, n * bottom_stride); dH, dW, &bottom->ga, n * bottom_stride);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
Py_DECREF(col); Py_DECREF(col);
return NULL; return NULL;
......
...@@ -35,7 +35,7 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -35,7 +35,7 @@ class GpuCumOp(GpuKernelBase, Op):
return hash(self.axis) ^ hash(self.mode) return hash(self.axis) ^ hash(self.mode)
def c_code_cache_version(self): def c_code_cache_version(self):
return (5,) return (6,)
def c_headers(self): def c_headers(self):
return ['<numpy_compat.h>', '<gpuarray/types.h>', '<gpuarray_helper.h>'] return ['<numpy_compat.h>', '<gpuarray/types.h>', '<gpuarray_helper.h>']
...@@ -69,11 +69,9 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -69,11 +69,9 @@ class GpuCumOp(GpuKernelBase, Op):
code = """ code = """
KERNEL void %(kname)s(float* input, ga_size input_offset, KERNEL void %(kname)s(float* input, ga_size input_offset,
float* output, ga_size output_offset, float* output, ga_size output_offset,
ga_ssize inputStrides_x, ga_ssize inputStrides_x, ga_ssize inputStrides_y, ga_ssize inputStrides_z,
ga_ssize inputStrides_y, ga_ssize outputStrides_x, ga_ssize outputStrides_y, ga_ssize outputStrides_z,
ga_ssize inputStrides_z, const int offsetY, const int offsetZ,
ga_ssize outputStrides_x, ga_ssize outputStrides_y,
ga_ssize outputStrides_z, const int offsetY, const int offsetZ,
const int beforeLastElementIdx, const int lastElementIdx){ const int beforeLastElementIdx, const int lastElementIdx){
input = (float *)(((char *)input) + input_offset); input = (float *)(((char *)input) + input_offset);
output = (float *)(((char *)output) + output_offset); output = (float *)(((char *)output) + output_offset);
...@@ -216,6 +214,7 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -216,6 +214,7 @@ class GpuCumOp(GpuKernelBase, Op):
output = (float *)(((char *)output) + output_offset); output = (float *)(((char *)output) + output_offset);
blockSum = (float *)(((char *)blockSum) + blockSum_offset); blockSum = (float *)(((char *)blockSum) + blockSum_offset);
int globalThreadID = (blockIdx.x + 1) * blockDim.x + threadIdx.x; int globalThreadID = (blockIdx.x + 1) * blockDim.x + threadIdx.x;
// Check if current has data to process. // Check if current has data to process.
...@@ -397,23 +396,8 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -397,23 +396,8 @@ class GpuCumOp(GpuKernelBase, Op):
size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ}; size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ};
size_t dimBlock[3] = {dimBlockX, 1, 1}; // One cum op per block. size_t dimBlock[3] = {dimBlockX, 1, 1}; // One cum op per block.
size_t sharedBytes = (2*dimBlockX) * sizeof(float); size_t sharedBytes = (2*dimBlockX) * sizeof(float);
void* kernel_params[] = {(void*) input->ga.data,
(void*) &(input->ga.offset), int err = k_blockCumOp_call(3, dimGrid, dimBlock, sharedBytes, input->ga.data, input->ga.offset, output->ga.data, output->ga.offset, nbElementsPerCumOp, inputStrides_x, inputStrides_y, inputStrides_z, outputStrides_x, outputStrides_y, outputStrides_z, offsetY, offsetZ, deviceBlockSum->ga.data, deviceBlockSum->ga.offset);
(void*) output->ga.data,
(void*) &(output->ga.offset),
(void*) &nbElementsPerCumOp,
(void*) &inputStrides_x,
(void*) &inputStrides_y,
(void*) &inputStrides_z,
(void*) &outputStrides_x,
(void*) &outputStrides_y,
(void*) &outputStrides_z,
(void*) &offsetY,
(void*) &offsetZ,
(void*) deviceBlockSum->ga.data,
(void*) &(deviceBlockSum->ga.offset)
};
int err = GpuKernel_call(&k_blockCumOp_%(nodename)s, 3, dimGrid, dimBlock, sharedBytes, kernel_params);
if (err != GA_NO_ERROR){ if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "blockCumOp call failed"); PyErr_SetString(PyExc_RuntimeError, "blockCumOp call failed");
return -1; return -1;
...@@ -429,18 +413,8 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -429,18 +413,8 @@ class GpuCumOp(GpuKernelBase, Op):
// report partial cum ops of previous blocks to subsequents ones. // report partial cum ops of previous blocks to subsequents ones.
size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ}; size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ};
size_t dimBlock[3] = {dimBlockX, 1, 1}; size_t dimBlock[3] = {dimBlockX, 1, 1};
void* kernel_params[] = {(void*) output->ga.data,
(void*) &(output->ga.offset), int err = k_finalCumOp_call(3, dimGrid, dimBlock, sharedBytes, output->ga.data, output->ga.offset, deviceBlockSum->ga.data, deviceBlockSum->ga.offset, nbElementsPerCumOp, outputStrides_x, outputStrides_y, outputStrides_z, offsetY, offsetZ);
(void*) deviceBlockSum->ga.data,
(void*) &(deviceBlockSum->ga.offset),
(void*) &nbElementsPerCumOp,
(void*) &outputStrides_x,
(void*) &outputStrides_y,
(void*) &outputStrides_z,
(void*) &offsetY,
(void*) &offsetZ
};
int err = GpuKernel_call(&k_finalCumOp_%(nodename)s, 3, dimGrid, dimBlock, sharedBytes, kernel_params);
if (err != GA_NO_ERROR){ if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "finalCumOp call failed"); PyErr_SetString(PyExc_RuntimeError, "finalCumOp call failed");
return -1; return -1;
...@@ -450,24 +424,8 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -450,24 +424,8 @@ class GpuCumOp(GpuKernelBase, Op):
if (shape[axis] != nbElementsPerCumOp){ if (shape[axis] != nbElementsPerCumOp){
size_t dimGrid[3] = {1, localDimGridY, localDimGridZ}; size_t dimGrid[3] = {1, localDimGridY, localDimGridZ};
size_t dimBlock[3] = {1, 1, 1}; size_t dimBlock[3] = {1, 1, 1};
size_t tmp0 = shape[axis]-2;
size_t tmp1 = shape[axis]-1; int err = k_cumadd_call(3, dimGrid, dimBlock, sharedBytes, input->ga.data, input->ga.offset, output->ga.data, output->ga.offset, inputStrides_x, inputStrides_y, inputStrides_z, outputStrides_x, outputStrides_y, outputStrides_z, offsetY, offsetZ, shape[axis] - 2, shape[axis] - 1);
void* kernel_params[] = {(void*) input->ga.data,
(void*) &(input->ga.offset),
(void*) output->ga.data,
(void*) &(output->ga.offset),
(void*) &inputStrides_x,
(void*) &inputStrides_y,
(void*) &inputStrides_z,
(void*) &outputStrides_x,
(void*) &outputStrides_y,
(void*) &outputStrides_z,
(void*) &offsetY,
(void*) &offsetZ,
(void*) &(tmp0),
(void*) &(tmp1)
};
int err = GpuKernel_call(&k_cumadd_%(nodename)s, 3, dimGrid, dimBlock, sharedBytes, kernel_params);
if (err != GA_NO_ERROR){ if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "cumadd call failed"); PyErr_SetString(PyExc_RuntimeError, "cumadd call failed");
return -1; return -1;
......
...@@ -71,16 +71,22 @@ class GPUAMultinomialFromUniform(GpuKernelBase, Op): ...@@ -71,16 +71,22 @@ class GPUAMultinomialFromUniform(GpuKernelBase, Op):
KERNEL void k_multi_warp_multinomial( KERNEL void k_multi_warp_multinomial(
const ga_size nb_multi, const ga_size nb_multi,
const ga_size nb_outcomes, const ga_size nb_outcomes,
GLOBAL_MEM %(in_ctype)s * global_pvals, GLOBAL_MEM %(in_ctype)s *global_pvals,
const ga_size global_pvals_offset,
const ga_ssize pvals_row_stride, const ga_ssize pvals_row_stride,
const ga_ssize pvals_col_stride, const ga_ssize pvals_col_stride,
GLOBAL_MEM %(in_ctype)s * global_unis, GLOBAL_MEM %(in_ctype)s *global_unis,
const ga_size global_unis_offset,
const ga_ssize unis_stride, const ga_ssize unis_stride,
GLOBAL_MEM %(out_ctype)s * global_outs, GLOBAL_MEM %(out_ctype)s *global_outs,
const ga_size global_outs_offset,
const ga_ssize outs_row_stride, const ga_ssize outs_row_stride,
const ga_ssize outs_col_stride const ga_ssize outs_col_stride
) )
{ {
global_pvals = (GLOBAL_MEM %(in_ctype)s *)(((char *)global_pvals) + global_pvals_offset);
global_unis = (GLOBAL_MEM %(in_ctype)s *)(((char *)global_unis) + global_unis_offset);
global_outs = (GLOBAL_MEM %(out_ctype)s *)(((char *)global_outs) + global_outs_offset);
// each thread takes care of one multinomial draw // each thread takes care of one multinomial draw
int n = LDIM_0*GID_0 + LID_0; int n = LDIM_0*GID_0 + LID_0;
if (n < nb_multi) if (n < nb_multi)
...@@ -113,11 +119,14 @@ KERNEL void k_multi_warp_multinomial( ...@@ -113,11 +119,14 @@ KERNEL void k_multi_warp_multinomial(
params=[pygpu.gpuarray.SIZE, params=[pygpu.gpuarray.SIZE,
pygpu.gpuarray.SIZE, pygpu.gpuarray.SIZE,
pygpu.gpuarray.GpuArray, pygpu.gpuarray.GpuArray,
pygpu.gpuarray.SIZE,
pygpu.gpuarray.SSIZE, pygpu.gpuarray.SSIZE,
pygpu.gpuarray.SSIZE, pygpu.gpuarray.SSIZE,
pygpu.gpuarray.GpuArray, pygpu.gpuarray.GpuArray,
pygpu.gpuarray.SIZE,
pygpu.gpuarray.SSIZE, pygpu.gpuarray.SSIZE,
pygpu.gpuarray.GpuArray, pygpu.gpuarray.GpuArray,
pygpu.gpuarray.SIZE,
pygpu.gpuarray.SSIZE, pygpu.gpuarray.SSIZE,
pygpu.gpuarray.SSIZE], pygpu.gpuarray.SSIZE],
flags=Kernel.get_flags(node.outputs[0].dtype), flags=Kernel.get_flags(node.outputs[0].dtype),
...@@ -193,27 +202,8 @@ KERNEL void k_multi_warp_multinomial( ...@@ -193,27 +202,8 @@ KERNEL void k_multi_warp_multinomial(
assert(nb_blocks*nb_threads >= nb_multi); assert(nb_blocks*nb_threads >= nb_multi);
void *args[10]; int err = k_multi_warp_multinomial_call(1, &nb_blocks, &nb_threads, 0, PyGpuArray_DIMS(out)[1], PyGpuArray_DIMS(out)[0], pvals->ga.data, pvals->ga.offset, PyGpuArray_STRIDES(pvals)[0]/gpuarray_get_elsize(%(in_typecode)s), PyGpuArray_STRIDES(pvals)[1]/gpuarray_get_elsize(%(in_typecode)s), unis->ga.data, unis->ga.offset, PyGpuArray_STRIDES(unis)[0]/gpuarray_get_elsize(%(in_typecode)s), out->ga.data, out->ga.offset, PyGpuArray_STRIDES(out)[0]/gpuarray_get_elsize(%(out_typecode)s), PyGpuArray_STRIDES(out)[1]/gpuarray_get_elsize(%(out_typecode)s));
ssize_t strides[5] = {
PyGpuArray_STRIDES(pvals)[0]/gpuarray_get_elsize(%(in_typecode)s),
PyGpuArray_STRIDES(pvals)[1]/gpuarray_get_elsize(%(in_typecode)s),
PyGpuArray_STRIDES(unis)[0]/gpuarray_get_elsize(%(in_typecode)s),
PyGpuArray_STRIDES(out)[0]/gpuarray_get_elsize(%(out_typecode)s),
PyGpuArray_STRIDES(out)[1]/gpuarray_get_elsize(%(out_typecode)s)
};
int err;
args[0] = (void*)&PyGpuArray_DIMS(out)[1];
args[1] = (void*)&PyGpuArray_DIMS(out)[0];
args[2] = pvals->ga.data; //PyGpuArray_DEV_DATA(pvals);
args[3] = (void*)&strides[0];
args[4] = (void*)&strides[1];
args[5] = unis->ga.data; //PyGpuArray_DEV_DATA(unis);
args[6] = (void*)&strides[2];
args[7] = out->ga.data; //PyGpuArray_DEV_DATA(out);
args[8] = (void*)&strides[3];
args[9] = (void*)&strides[4];
err = GpuKernel_call(&%(kname)s, 1, &nb_blocks, &nb_threads, 0, args);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format( PyErr_Format(
PyExc_RuntimeError, PyExc_RuntimeError,
...@@ -230,7 +220,7 @@ KERNEL void k_multi_warp_multinomial( ...@@ -230,7 +220,7 @@ KERNEL void k_multi_warp_multinomial(
return s return s
def c_code_cache_version(self): def c_code_cache_version(self):
return (3,) return (4,)
class GPUAChoiceFromUniform(GpuKernelBase, Op): class GPUAChoiceFromUniform(GpuKernelBase, Op):
...@@ -295,15 +285,21 @@ KERNEL void k_multi_warp_multinomial_wor( ...@@ -295,15 +285,21 @@ KERNEL void k_multi_warp_multinomial_wor(
const ga_size nb_outcomes, const ga_size nb_outcomes,
const ga_size n_samples, const ga_size n_samples,
GLOBAL_MEM float * global_pvals_copy, GLOBAL_MEM float * global_pvals_copy,
const ga_size global_pvals_offset,
const ga_ssize pvals_row_stride, const ga_ssize pvals_row_stride,
const ga_ssize pvals_col_stride, const ga_ssize pvals_col_stride,
GLOBAL_MEM float * global_unis, GLOBAL_MEM float * global_unis,
const ga_size global_unis_offset,
const ga_ssize unis_stride, const ga_ssize unis_stride,
GLOBAL_MEM ga_long * global_outs, GLOBAL_MEM ga_long * global_outs,
const ga_size global_outs_offset,
const ga_ssize outs_row_stride, const ga_ssize outs_row_stride,
const ga_ssize outs_col_stride const ga_ssize outs_col_stride
) )
{ {
global_pvals_copy = (GLOBAL_MEM float *)(((char *)global_pvals_copy) + global_pvals_offset);
global_unis = (GLOBAL_MEM float *)(((char *)global_unis) + global_unis_offset);
global_outs = (GLOBAL_MEM ga_long *)(((char *)global_outs) + global_outs_offset);
// each thread takes care of one multinomial-wor n_samples-draw // each thread takes care of one multinomial-wor n_samples-draw
int n = LDIM_0*GID_0 + LID_0; int n = LDIM_0*GID_0 + LID_0;
...@@ -344,11 +340,14 @@ KERNEL void k_multi_warp_multinomial_wor( ...@@ -344,11 +340,14 @@ KERNEL void k_multi_warp_multinomial_wor(
pygpu.gpuarray.SIZE, pygpu.gpuarray.SIZE,
pygpu.gpuarray.SIZE, pygpu.gpuarray.SIZE,
pygpu.gpuarray.GpuArray, pygpu.gpuarray.GpuArray,
pygpu.gpuarray.SIZE,
pygpu.gpuarray.SSIZE, pygpu.gpuarray.SSIZE,
pygpu.gpuarray.SSIZE, pygpu.gpuarray.SSIZE,
pygpu.gpuarray.GpuArray, pygpu.gpuarray.GpuArray,
pygpu.gpuarray.SIZE,
pygpu.gpuarray.SSIZE, pygpu.gpuarray.SSIZE,
pygpu.gpuarray.GpuArray, pygpu.gpuarray.GpuArray,
pygpu.gpuarray.SIZE,
pygpu.gpuarray.SSIZE, pygpu.gpuarray.SSIZE,
pygpu.gpuarray.SSIZE pygpu.gpuarray.SSIZE
], ],
...@@ -438,28 +437,7 @@ KERNEL void k_multi_warp_multinomial_wor( ...@@ -438,28 +437,7 @@ KERNEL void k_multi_warp_multinomial_wor(
assert(nb_blocks*nb_threads >= nb_multi); assert(nb_blocks*nb_threads >= nb_multi);
void *args[11]; int err = k_multi_warp_multinomial_wor_call(1, &nb_blocks, &nb_threads, 0, PyGpuArray_DIMS(pvals)[0], PyGpuArray_DIMS(pvals)[1], n_samples, pvals_copy->ga.data, pvals_copy->ga.offset, PyGpuArray_STRIDES(pvals)[0]/sizeof(float), PyGpuArray_STRIDES(pvals)[1]/sizeof(float), unis->ga.data, unis->ga.offset, PyGpuArray_STRIDES(unis)[0]/sizeof(float), out->ga.data, out->ga.offset, PyGpuArray_STRIDES(out)[0]/8, PyGpuArray_STRIDES(out)[1]/8);
ssize_t strides[5] = {
PyGpuArray_STRIDES(pvals)[0]/sizeof(float),
PyGpuArray_STRIDES(pvals)[1]/sizeof(float),
PyGpuArray_STRIDES(unis)[0]/sizeof(float),
PyGpuArray_STRIDES(out)[0]/8,
PyGpuArray_STRIDES(out)[1]/8
};
int err;
args[0] = (void*)&PyGpuArray_DIMS(pvals)[0];
args[1] = (void*)&PyGpuArray_DIMS(pvals)[1];
args[2] = (void*)&n_samples;
args[3] = pvals_copy->ga.data; //PyGpuArray_DEV_DATA(pvals);
args[4] = (void*)&strides[0];
args[5] = (void*)&strides[1];
args[6] = unis->ga.data; //PyGpuArray_DEV_DATA(unis);
args[7] = (void*)&strides[2];
args[8] = out->ga.data; //PyGpuArray_DEV_DATA(out);
args[9] = (void*)&strides[3];
args[10] = (void*)&strides[4];
err = GpuKernel_call(&%(kname)s, 1, &nb_blocks, &nb_threads, 0, args);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format( PyErr_Format(
PyExc_RuntimeError, PyExc_RuntimeError,
...@@ -477,7 +455,7 @@ KERNEL void k_multi_warp_multinomial_wor( ...@@ -477,7 +455,7 @@ KERNEL void k_multi_warp_multinomial_wor(
return s return s
def c_code_cache_version(self): def c_code_cache_version(self):
return (7,) return (8,)
@register_opt('fast_compile') @register_opt('fast_compile')
......
#section kernels #section kernels
#kernel max_pool2d_kernel : size, size, size, size, size, size, size, *, size, size, size, size, size, size, * : #kernel max_pool2d_kernel : size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, *, size :
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu) // (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void max_pool2d_kernel(const ga_size nthreads, KERNEL void max_pool2d_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size pooled_height, const ga_size num, const ga_size channels, const ga_size pooled_height,
const ga_size pooled_width, const ga_size height, const ga_size width, const ga_size pooled_width, const ga_size height, const ga_size width,
GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size kernel_h, const ga_size kernel_w, GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, const ga_size kernel_h, const ga_size kernel_w,
const ga_size stride_h, const ga_size stride_w, const ga_size pad_h, const ga_size pad_w, const ga_size stride_h, const ga_size stride_w, const ga_size pad_h, const ga_size pad_w,
GLOBAL_MEM DTYPE_OUTPUT_0 *z) GLOBAL_MEM DTYPE_OUTPUT_0 *z, const ga_size z_off)
{ {
x = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)x) + x_off);
z = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((char *)z) + z_off);
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < nthreads; index < nthreads;
...@@ -41,18 +43,20 @@ KERNEL void max_pool2d_kernel(const ga_size nthreads, ...@@ -41,18 +43,20 @@ KERNEL void max_pool2d_kernel(const ga_size nthreads,
} }
} }
#kernel max_pool3d_kernel : size, size, size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, size, size, * : #kernel max_pool3d_kernel : size, size, size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, size, size, size, *, size :
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu) // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void max_pool3d_kernel(const ga_size nthreads, KERNEL void max_pool3d_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size pooled_depth, const ga_size num, const ga_size channels, const ga_size pooled_depth,
const ga_size pooled_height, const ga_size pooled_width, const ga_size pooled_height, const ga_size pooled_width,
const ga_size depth, const ga_size height, const ga_size width, const ga_size depth, const ga_size height, const ga_size width,
GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size kernel_d, const ga_size kernel_h, GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, const ga_size kernel_d, const ga_size kernel_h,
const ga_size kernel_w, const ga_size stride_d, const ga_size stride_h, const ga_size kernel_w, const ga_size stride_d, const ga_size stride_h,
const ga_size stride_w, const ga_size pad_d, const ga_size pad_h, const ga_size pad_w, const ga_size stride_w, const ga_size pad_d, const ga_size pad_h, const ga_size pad_w,
GLOBAL_MEM DTYPE_OUTPUT_0 *z) GLOBAL_MEM DTYPE_OUTPUT_0 *z, const ga_size z_off)
{ {
x = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)x) + x_off);
z = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((char *)z) + z_off);
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < nthreads; index < nthreads;
...@@ -90,17 +94,19 @@ KERNEL void max_pool3d_kernel(const ga_size nthreads, ...@@ -90,17 +94,19 @@ KERNEL void max_pool3d_kernel(const ga_size nthreads,
} }
} }
#kernel ave_pool2d_kernel : size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, size, * : #kernel ave_pool2d_kernel : size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, size, size, *, size:
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu) // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void ave_pool2d_kernel(const ga_size nthreads, KERNEL void ave_pool2d_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size pooled_height, const ga_size num, const ga_size channels, const ga_size pooled_height,
const ga_size pooled_width, const ga_size height, const ga_size width, const ga_size pooled_width, const ga_size height, const ga_size width,
GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size kernel_h, const ga_size kernel_w, GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, const ga_size kernel_h, const ga_size kernel_w,
const ga_size stride_h, const ga_size stride_w, const ga_size pad_h, const ga_size pad_w, const ga_size stride_h, const ga_size stride_w, const ga_size pad_h, const ga_size pad_w,
const ga_bool inc_pad, const ga_bool sum_mode, const ga_bool inc_pad, const ga_bool sum_mode,
GLOBAL_MEM DTYPE_OUTPUT_0 *z) GLOBAL_MEM DTYPE_OUTPUT_0 *z, const ga_size z_off)
{ {
x = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)x) + x_off);
z = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((char *)z) + z_off);
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < nthreads; index < nthreads;
...@@ -143,20 +149,22 @@ KERNEL void ave_pool2d_kernel(const ga_size nthreads, ...@@ -143,20 +149,22 @@ KERNEL void ave_pool2d_kernel(const ga_size nthreads,
} }
} }
#kernel ave_pool3d_kernel : size, size, size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, size, size, size, size, * : #kernel ave_pool3d_kernel : size, size, size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu) // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void ave_pool3d_kernel(const ga_size nthreads, KERNEL void ave_pool3d_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size pooled_depth, const ga_size num, const ga_size channels, const ga_size pooled_depth,
const ga_size pooled_height, const ga_size pooled_width, const ga_size pooled_height, const ga_size pooled_width,
const ga_size depth, const ga_size height, const ga_size width, const ga_size depth, const ga_size height, const ga_size width,
GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size kernel_d, const ga_size kernel_h, GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, const ga_size kernel_d, const ga_size kernel_h,
const ga_size kernel_w, const ga_size stride_d, const ga_size stride_h, const ga_size kernel_w, const ga_size stride_d, const ga_size stride_h,
const ga_size stride_w, const ga_size pad_d, const ga_size pad_h, const ga_size pad_w, const ga_size stride_w, const ga_size pad_d, const ga_size pad_h, const ga_size pad_w,
const ga_bool inc_pad, const ga_bool sum_mode, const ga_bool inc_pad, const ga_bool sum_mode,
GLOBAL_MEM DTYPE_OUTPUT_0 *z) GLOBAL_MEM DTYPE_OUTPUT_0 *z, const ga_size z_off)
{ {
// grid stride looping // grid stride looping
x = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)x) + x_off);
z = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((char *)z) + z_off);
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < nthreads; index < nthreads;
index += LDIM_0 * GDIM_0) { index += LDIM_0 * GDIM_0) {
...@@ -273,8 +281,8 @@ int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x, ...@@ -273,8 +281,8 @@ int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x,
err = max_pool2d_kernel_scall(1, &num_kernels, 0, num_kernels, err = max_pool2d_kernel_scall(1, &num_kernels, 0, num_kernels,
z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[0], z_dims[1], z_dims[2], z_dims[3],
x_dims[2], x_dims[3], x_dims[2], x_dims[3],
x->ga.data, w[0], w[1], s[0], s[1], p[0], p[1], x->ga.data, x->ga.offset, w[0], w[1], s[0], s[1], p[0], p[1],
(*z)->ga.data); (*z)->ga.data, (*z)->ga.offset);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuPool: max_pool2d_kernel %s.", "GpuPool: max_pool2d_kernel %s.",
...@@ -285,8 +293,10 @@ int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x, ...@@ -285,8 +293,10 @@ int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x,
err = ave_pool2d_kernel_scall(1, &num_kernels, 0, num_kernels, err = ave_pool2d_kernel_scall(1, &num_kernels, 0, num_kernels,
z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[0], z_dims[1], z_dims[2], z_dims[3],
x_dims[2], x_dims[3], x_dims[2], x_dims[3],
x->ga.data, w[0], w[1], s[0], s[1], p[0], p[1], x->ga.data, x->ga.offset,
INC_PAD, SUM_MODE, (*z)->ga.data); w[0], w[1], s[0], s[1], p[0], p[1],
INC_PAD, SUM_MODE,
(*z)->ga.data, (*z)->ga.offset);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuPool: ave_pool2d_kernel %s.", "GpuPool: ave_pool2d_kernel %s.",
...@@ -301,8 +311,8 @@ int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x, ...@@ -301,8 +311,8 @@ int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x,
err = max_pool3d_kernel_scall(1, &num_kernels, 0, num_kernels, err = max_pool3d_kernel_scall(1, &num_kernels, 0, num_kernels,
z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4], z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4],
x_dims[2], x_dims[3], x_dims[4], x_dims[2], x_dims[3], x_dims[4],
x->ga.data, w[0], w[1], w[2], s[0], s[1], s[2], x->ga.data, x->ga.offset, w[0], w[1], w[2], s[0], s[1], s[2],
p[0], p[1], p[2], (*z)->ga.data); p[0], p[1], p[2], (*z)->ga.data, (*z)->ga.offset);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuPool: max_pool3d_kernel %s.", "GpuPool: max_pool3d_kernel %s.",
...@@ -313,9 +323,11 @@ int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x, ...@@ -313,9 +323,11 @@ int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x,
err = ave_pool3d_kernel_scall(1, &num_kernels, 0, num_kernels, err = ave_pool3d_kernel_scall(1, &num_kernels, 0, num_kernels,
z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4], z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4],
x_dims[2], x_dims[3], x_dims[4], x_dims[2], x_dims[3], x_dims[4],
x->ga.data, w[0], w[1], w[2], s[0], s[1], s[2], x->ga.data, x->ga.offset,
w[0], w[1], w[2], s[0], s[1], s[2],
p[0], p[1], p[2], p[0], p[1], p[2],
INC_PAD, SUM_MODE, (*z)->ga.data); INC_PAD, SUM_MODE,
(*z)->ga.data, (*z)->ga.offset);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuPool: ave_pool3d_kernel %s.", "GpuPool: ave_pool3d_kernel %s.",
......
#section kernels #section kernels
#kernel ave_pool2d_grad_kernel : size, size, size, size, size, size, size, *, *, size, size, size, size, size, size, size, size, * : #kernel ave_pool2d_grad_kernel : size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, size, size, *, size :
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu) // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void ave_pool2d_grad_kernel(const ga_size nthreads, KERNEL void ave_pool2d_grad_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size height, const ga_size num, const ga_size channels, const ga_size height,
const ga_size width, const ga_size pooled_height, const ga_size pooled_width, const ga_size width, const ga_size pooled_height, const ga_size pooled_width,
GLOBAL_MEM const DTYPE_INPUT_0 *x, GLOBAL_MEM const DTYPE_INPUT_1 *gz, GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, GLOBAL_MEM const DTYPE_INPUT_1 *gz, const ga_size gz_off,
const ga_size kernel_h, const ga_size kernel_w, const ga_size stride_h, const ga_size stride_w, const ga_size kernel_h, const ga_size kernel_w, const ga_size stride_h, const ga_size stride_w,
const ga_size pad_h, const ga_size pad_w, const ga_bool inc_pad, const ga_bool sum_mode, const ga_size pad_h, const ga_size pad_w, const ga_bool inc_pad, const ga_bool sum_mode,
GLOBAL_MEM DTYPE_OUTPUT_0 *gx) GLOBAL_MEM DTYPE_OUTPUT_0 *gx, const ga_size gx_off)
{ {
x = (GLOBAL_MEM const DTYPE_INPUT_0 *)(((char *)x) + x_off);
gz = (GLOBAL_MEM const DTYPE_INPUT_1 *)(((char *)gz) + gz_off);
gx = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((char *)gx) + gx_off);
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < nthreads; index += LDIM_0 * GDIM_0) { index < nthreads; index += LDIM_0 * GDIM_0) {
...@@ -46,19 +49,22 @@ KERNEL void ave_pool2d_grad_kernel(const ga_size nthreads, ...@@ -46,19 +49,22 @@ KERNEL void ave_pool2d_grad_kernel(const ga_size nthreads,
} }
} }
#kernel ave_pool3d_grad_kernel : size, size, size, size, size, size, size, size, size, *, *, size, size, size, size, size, size, size, size, size, size, size, * : #kernel ave_pool3d_grad_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu) // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void ave_pool3d_grad_kernel(const ga_size nthreads, KERNEL void ave_pool3d_grad_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size depth, const ga_size num, const ga_size channels, const ga_size depth,
const ga_size height, const ga_size width, const ga_size pooled_depth, const ga_size height, const ga_size width, const ga_size pooled_depth,
const ga_size pooled_height, const ga_size pooled_width, const ga_size pooled_height, const ga_size pooled_width,
GLOBAL_MEM const DTYPE_INPUT_0 *x, GLOBAL_MEM const DTYPE_INPUT_1 *gz, GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, GLOBAL_MEM const DTYPE_INPUT_1 *gz, const ga_size gz_off,
const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w,
const ga_size stride_d, const ga_size stride_h, const ga_size stride_w, const ga_size stride_d, const ga_size stride_h, const ga_size stride_w,
const ga_size pad_d, const ga_size pad_h, const ga_size pad_w, const ga_size pad_d, const ga_size pad_h, const ga_size pad_w,
const ga_bool inc_pad, const ga_bool sum_mode, GLOBAL_MEM DTYPE_OUTPUT_0 *gx) const ga_bool inc_pad, const ga_bool sum_mode, GLOBAL_MEM DTYPE_OUTPUT_0 *gx, const ga_size gx_off)
{ {
x = (GLOBAL_MEM const DTYPE_INPUT_0 *)(((char *)x) + x_off);
gz = (GLOBAL_MEM const DTYPE_INPUT_1 *)(((char *)gz) + gz_off);
gx = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((char *)gx) + gx_off);
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < nthreads; index += LDIM_0 * GDIM_0) { index < nthreads; index += LDIM_0 * GDIM_0) {
...@@ -152,9 +158,11 @@ int APPLY_SPECIFIC(ave_pool_grad)(PyGpuArrayObject *x, ...@@ -152,9 +158,11 @@ int APPLY_SPECIFIC(ave_pool_grad)(PyGpuArrayObject *x,
err = ave_pool2d_grad_kernel_scall(1, &num_kernels, 0, num_kernels, err = ave_pool2d_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
x_dims[0], x_dims[1], x_dims[2], x_dims[3], x_dims[0], x_dims[1], x_dims[2], x_dims[3],
z_dims[2], z_dims[3], z_dims[2], z_dims[3],
x->ga.data, gz->ga.data, x->ga.data, x->ga.offset,
gz->ga.data, gz->ga.offset,
w[0], w[1], s[0], s[1], p[0], p[1], w[0], w[1], s[0], s[1], p[0], p[1],
INC_PAD, SUM_MODE, (*gx)->ga.data); INC_PAD, SUM_MODE,
(*gx)->ga.data, (*gx)->ga.offset);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuAveragePoolGrad: ave_pool2d_grad_kernel %s.", "GpuAveragePoolGrad: ave_pool2d_grad_kernel %s.",
...@@ -166,10 +174,11 @@ int APPLY_SPECIFIC(ave_pool_grad)(PyGpuArrayObject *x, ...@@ -166,10 +174,11 @@ int APPLY_SPECIFIC(ave_pool_grad)(PyGpuArrayObject *x,
err = ave_pool3d_grad_kernel_scall(1, &num_kernels, 0, num_kernels, err = ave_pool3d_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
x_dims[0], x_dims[1], x_dims[2], x_dims[3], x_dims[4], x_dims[0], x_dims[1], x_dims[2], x_dims[3], x_dims[4],
z_dims[2], z_dims[3], z_dims[4], z_dims[2], z_dims[3], z_dims[4],
x->ga.data, gz->ga.data, x->ga.data, x->ga.offset,
gz->ga.data, gz->ga.offset,
w[0], w[1], w[2], s[0], s[1], s[2], w[0], w[1], w[2], s[0], s[1], s[2],
p[0], p[1], p[2], INC_PAD, SUM_MODE, p[0], p[1], p[2], INC_PAD, SUM_MODE,
(*gx)->ga.data); (*gx)->ga.data, (*gx)->ga.offset);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuAveragePoolGrad: ave_pool3d_grad_kernel %s.", "GpuAveragePoolGrad: ave_pool3d_grad_kernel %s.",
......
#section kernels #section kernels
#kernel max_pool2d_grad_grad_kernel : size, size, size, size, size, size, size, *, *, *, size, size, size, size, size, size, * : #kernel max_pool2d_grad_grad_kernel : size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, *, size :
KERNEL void max_pool2d_grad_grad_kernel(const ga_size nthreads, KERNEL void max_pool2d_grad_grad_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size pooled_height, const ga_size num, const ga_size channels, const ga_size pooled_height,
const ga_size pooled_width, const ga_size height, const ga_size width, const ga_size pooled_width, const ga_size height, const ga_size width,
GLOBAL_MEM const DTYPE_INPUT_0 *x, GLOBAL_MEM const DTYPE_INPUT_1 *z, GLOBAL_MEM const DTYPE_INPUT_2 *gx, GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, GLOBAL_MEM const DTYPE_INPUT_1 *z, const ga_size z_off, GLOBAL_MEM const DTYPE_INPUT_2 *gx, const ga_size gx_off,
const ga_size kernel_h, const ga_size kernel_w, const ga_size stride_h, const ga_size stride_w, const ga_size kernel_h, const ga_size kernel_w, const ga_size stride_h, const ga_size stride_w,
const ga_size pad_h, const ga_size pad_w, const ga_size pad_h, const ga_size pad_w,
GLOBAL_MEM DTYPE_OUTPUT_0 *gz) GLOBAL_MEM DTYPE_OUTPUT_0 *gz, const ga_size gz_off)
{ {
x = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)x) + x_off);
z = (GLOBAL_MEM DTYPE_INPUT_1 *)(((char *)z) + z_off);
gx = (GLOBAL_MEM DTYPE_INPUT_2 *)(((char *)gx) + gx_off);
gz = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((char *)gz) + gz_off);
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < nthreads; index += LDIM_0 * GDIM_0) { index < nthreads; index += LDIM_0 * GDIM_0) {
...@@ -42,18 +46,22 @@ KERNEL void max_pool2d_grad_grad_kernel(const ga_size nthreads, ...@@ -42,18 +46,22 @@ KERNEL void max_pool2d_grad_grad_kernel(const ga_size nthreads,
} }
} }
#kernel max_pool3d_grad_grad_kernel : size, size, size, size, size, size, size, size, size, *, *, *, size, size, size, size, size, size, size, size, size, * : #kernel max_pool3d_grad_grad_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, size, size, size, *, size :
KERNEL void max_pool3d_grad_grad_kernel(const ga_size nthreads, KERNEL void max_pool3d_grad_grad_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size pooled_depth, const ga_size num, const ga_size channels, const ga_size pooled_depth,
const ga_size pooled_height, const ga_size pooled_width, const ga_size pooled_height, const ga_size pooled_width,
const ga_size depth, const ga_size height, const ga_size width, const ga_size depth, const ga_size height, const ga_size width,
GLOBAL_MEM const DTYPE_INPUT_0 *x, GLOBAL_MEM const DTYPE_INPUT_1 *z, GLOBAL_MEM const DTYPE_INPUT_2 *gx, GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, GLOBAL_MEM const DTYPE_INPUT_1 *z, const ga_size z_off, GLOBAL_MEM const DTYPE_INPUT_2 *gx, const ga_size gx_off,
const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w,
const ga_size stride_d, const ga_size stride_h, const ga_size stride_w, const ga_size stride_d, const ga_size stride_h, const ga_size stride_w,
const ga_size pad_d, const ga_size pad_h, const ga_size pad_w, const ga_size pad_d, const ga_size pad_h, const ga_size pad_w,
GLOBAL_MEM DTYPE_OUTPUT_0 *gz) GLOBAL_MEM DTYPE_OUTPUT_0 *gz, const ga_size gz_off)
{ {
x = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)x) + x_off);
z = (GLOBAL_MEM DTYPE_INPUT_1 *)(((char *)z) + z_off);
gx = (GLOBAL_MEM DTYPE_INPUT_2 *)(((char *)gx) + gx_off);
gz = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((char *)gz) + gz_off);
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < nthreads; index += LDIM_0 * GDIM_0) { index < nthreads; index += LDIM_0 * GDIM_0) {
...@@ -146,9 +154,11 @@ int APPLY_SPECIFIC(pool_grad_grad)(PyGpuArrayObject *x, ...@@ -146,9 +154,11 @@ int APPLY_SPECIFIC(pool_grad_grad)(PyGpuArrayObject *x,
err = max_pool2d_grad_grad_kernel_scall(1, &num_kernels, 0, num_kernels, err = max_pool2d_grad_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[0], z_dims[1], z_dims[2], z_dims[3],
x_dims[2], x_dims[3], x_dims[2], x_dims[3],
x->ga.data, z->ga.data, gx->ga.data, x->ga.data, x->ga.offset,
z->ga.data, z->ga.offset,
gx->ga.data, gx->ga.offset,
w[0], w[1], s[0], s[1], p[0], p[1], w[0], w[1], s[0], s[1], p[0], p[1],
(*gz)->ga.data); (*gz)->ga.data, (*gz)->ga.offset);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuPoolingGradGrad: max_pool2d_grad_grad_kernel %s.", "GpuPoolingGradGrad: max_pool2d_grad_grad_kernel %s.",
...@@ -161,9 +171,11 @@ int APPLY_SPECIFIC(pool_grad_grad)(PyGpuArrayObject *x, ...@@ -161,9 +171,11 @@ int APPLY_SPECIFIC(pool_grad_grad)(PyGpuArrayObject *x,
err = max_pool3d_grad_grad_kernel_scall(1, &num_kernels, 0, num_kernels, err = max_pool3d_grad_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4], z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4],
x_dims[2], x_dims[3], x_dims[4], x_dims[2], x_dims[3], x_dims[4],
x->ga.data, z->ga.data, gx->ga.data, x->ga.data, x->ga.offset,
z->ga.data, z->ga.offset,
gx->ga.data, gx->ga.offset,
w[0], w[1], w[2], s[0], s[1], s[2], p[0], p[1], p[2], w[0], w[1], w[2], s[0], s[1], s[2], p[0], p[1], p[2],
(*gz)->ga.data); (*gz)->ga.data, (*gz)->ga.offset);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuPoolingGradGrad: max_pool3d_grad_grad_kernel %s.", "GpuPoolingGradGrad: max_pool3d_grad_grad_kernel %s.",
......
#section kernels #section kernels
#kernel max_pool2d_grad_kernel : size, size, size, size, size, size, size, *, *, *, size, size, size, size, size, size, * : #kernel max_pool2d_grad_kernel : size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, *, size :
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu) // (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void max_pool2d_grad_kernel(const ga_size nthreads, KERNEL void max_pool2d_grad_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size height, const ga_size num, const ga_size channels, const ga_size height,
const ga_size width, const ga_size pooled_height, const ga_size pooled_width, const ga_size width, const ga_size pooled_height, const ga_size pooled_width,
GLOBAL_MEM const DTYPE_INPUT_0 *x, GLOBAL_MEM const DTYPE_INPUT_1 *z, GLOBAL_MEM const DTYPE_INPUT_2 *gz, GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, GLOBAL_MEM const DTYPE_INPUT_1 *z, const ga_size z_off, GLOBAL_MEM const DTYPE_INPUT_2 *gz, const ga_size gz_off,
const ga_size kernel_h, const ga_size kernel_w, const ga_size stride_h, const ga_size stride_w, const ga_size kernel_h, const ga_size kernel_w, const ga_size stride_h, const ga_size stride_w,
const ga_size pad_h, const ga_size pad_w, GLOBAL_MEM DTYPE_OUTPUT_0 *gx) const ga_size pad_h, const ga_size pad_w, GLOBAL_MEM DTYPE_OUTPUT_0 *gx, const ga_size gx_off)
{ {
x = (GLOBAL_MEM const DTYPE_INPUT_0 *)(((char *)x) + x_off);
z = (GLOBAL_MEM const DTYPE_INPUT_1 *)(((char *)z) + z_off);
gz = (GLOBAL_MEM const DTYPE_INPUT_2 *)(((char *)gz) + gz_off);
gx = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((char *)gx) + gx_off);
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < nthreads; index += LDIM_0 * GDIM_0) { index < nthreads; index += LDIM_0 * GDIM_0) {
...@@ -38,19 +42,23 @@ KERNEL void max_pool2d_grad_kernel(const ga_size nthreads, ...@@ -38,19 +42,23 @@ KERNEL void max_pool2d_grad_kernel(const ga_size nthreads,
} }
} }
#kernel max_pool3d_grad_kernel : size, size, size, size, size, size, size, size, size, *, *, *, size, size, size, size, size, size, size, size, size, * : #kernel max_pool3d_grad_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, size, size, size, *, size :
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu) // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void max_pool3d_grad_kernel(const ga_size nthreads, KERNEL void max_pool3d_grad_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size depth, const ga_size num, const ga_size channels, const ga_size depth,
const ga_size height, const ga_size width, const ga_size pooled_depth, const ga_size height, const ga_size width, const ga_size pooled_depth,
const ga_size pooled_height, const ga_size pooled_width, const ga_size pooled_height, const ga_size pooled_width,
GLOBAL_MEM const DTYPE_INPUT_0 *x, GLOBAL_MEM const DTYPE_INPUT_1 *z, GLOBAL_MEM const DTYPE_INPUT_2 *gz, GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, GLOBAL_MEM const DTYPE_INPUT_1 *z, const ga_size z_off, GLOBAL_MEM const DTYPE_INPUT_2 *gz, const ga_size gz_off,
const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w,
const ga_size stride_d, const ga_size stride_h, const ga_size stride_w, const ga_size stride_d, const ga_size stride_h, const ga_size stride_w,
const ga_size pad_d, const ga_size pad_h, const ga_size pad_w, const ga_size pad_d, const ga_size pad_h, const ga_size pad_w,
GLOBAL_MEM DTYPE_OUTPUT_0 *gx) GLOBAL_MEM DTYPE_OUTPUT_0 *gx, const ga_size gx_off)
{ {
x = (GLOBAL_MEM const DTYPE_INPUT_0 *)(((char *)x) + x_off);
z = (GLOBAL_MEM const DTYPE_INPUT_1 *)(((char *)z) + z_off);
gz = (GLOBAL_MEM const DTYPE_INPUT_2 *)(((char *)gz) + gz_off);
gx = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((char *)gx) + gx_off);
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < nthreads; index += LDIM_0 * GDIM_0) { index < nthreads; index += LDIM_0 * GDIM_0) {
...@@ -138,9 +146,11 @@ int APPLY_SPECIFIC(max_pool_grad)(PyGpuArrayObject *x, ...@@ -138,9 +146,11 @@ int APPLY_SPECIFIC(max_pool_grad)(PyGpuArrayObject *x,
err = max_pool2d_grad_kernel_scall(1, &num_kernels, 0, num_kernels, err = max_pool2d_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
x_dims[0], x_dims[1], x_dims[2], x_dims[3], x_dims[0], x_dims[1], x_dims[2], x_dims[3],
z_dims[2], z_dims[3], z_dims[2], z_dims[3],
x->ga.data, z->ga.data, gz->ga.data, x->ga.data, x->ga.offset,
z->ga.data, z->ga.offset,
gz->ga.data, gz->ga.offset,
w[0], w[1], s[0], s[1], p[0], p[1], w[0], w[1], s[0], s[1], p[0], p[1],
(*gx)->ga.data); (*gx)->ga.data, (*gx)->ga.offset);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuMaxPoolGrad: max_pool2d_grad_kernel %s.", "GpuMaxPoolGrad: max_pool2d_grad_kernel %s.",
...@@ -152,9 +162,11 @@ int APPLY_SPECIFIC(max_pool_grad)(PyGpuArrayObject *x, ...@@ -152,9 +162,11 @@ int APPLY_SPECIFIC(max_pool_grad)(PyGpuArrayObject *x,
err = max_pool3d_grad_kernel_scall(1, &num_kernels, 0, num_kernels, err = max_pool3d_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
x_dims[0], x_dims[1], x_dims[2], x_dims[3], x_dims[4], x_dims[0], x_dims[1], x_dims[2], x_dims[3], x_dims[4],
z_dims[2], z_dims[3], z_dims[4], z_dims[2], z_dims[3], z_dims[4],
x->ga.data, z->ga.data, gz->ga.data, x->ga.data, x->ga.offset,
z->ga.data, z->ga.offset,
gz->ga.data, gz->ga.offset,
w[0], w[1], w[2], s[0], s[1], s[2], w[0], w[1], w[2], s[0], s[1], s[2],
p[0], p[1], p[2], (*gx)->ga.data); p[0], p[1], p[2], (*gx)->ga.data, (*gx)->ga.offset);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuMaxPoolGrad: max_pool3d_grad_kernel %s.", "GpuMaxPoolGrad: max_pool3d_grad_kernel %s.",
......
#section kernels #section kernels
#kernel max_pool2d_rop_kernel : size, size, size, size, size, size, size, *, *, size, size, size, size, size, size, * : #kernel max_pool2d_rop_kernel : size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, *, size :
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu) // (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void max_pool2d_rop_kernel(const ga_size nthreads, KERNEL void max_pool2d_rop_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size pooled_height, const ga_size num, const ga_size channels, const ga_size pooled_height,
const ga_size pooled_width, const ga_size height, const ga_size width, const ga_size pooled_width, const ga_size height, const ga_size width,
GLOBAL_MEM const DTYPE_INPUT_0 *x, GLOBAL_MEM const DTYPE_INPUT_1 *ex, GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, GLOBAL_MEM const DTYPE_INPUT_1 *ex, const ga_size ex_off,
const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_h, const ga_size kernel_w,
const ga_size stride_h, const ga_size stride_w, const ga_size stride_h, const ga_size stride_w,
const ga_size pad_h, const ga_size pad_w, const ga_size pad_h, const ga_size pad_w,
GLOBAL_MEM DTYPE_OUTPUT_0 *z) GLOBAL_MEM DTYPE_OUTPUT_0 *z, const ga_size z_off)
{ {
x = (GLOBAL_MEM DTYPE_INPUT_0 *x)(((char *)x) + x_off);
ex = (GLOBAL_MEM DTYPE_INPUT_1 *x)(((char *)ex) + ex_off);
z = (GLOBAL_MEM DTYPE_OUTPUT_0 *x)(((char *)z) + z_off);
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < nthreads; index < nthreads;
...@@ -46,19 +49,22 @@ KERNEL void max_pool2d_rop_kernel(const ga_size nthreads, ...@@ -46,19 +49,22 @@ KERNEL void max_pool2d_rop_kernel(const ga_size nthreads,
} }
} }
#kernel max_pool3d_rop_kernel : size, size, size, size, size, size, size, size, size, *, *, size, size, size, size, size, size, size, size, size, * : #kernel max_pool3d_rop_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, size, size, size, *, size :
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu) // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void max_pool3d_rop_kernel(const ga_size nthreads, KERNEL void max_pool3d_rop_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size pooled_depth, const ga_size num, const ga_size channels, const ga_size pooled_depth,
const ga_size pooled_height, const ga_size pooled_width, const ga_size pooled_height, const ga_size pooled_width,
const ga_size depth, const ga_size height, const ga_size width, const ga_size depth, const ga_size height, const ga_size width,
GLOBAL_MEM const DTYPE_INPUT_0 *x, GLOBAL_MEM const DTYPE_INPUT_1 *ex, GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, GLOBAL_MEM const DTYPE_INPUT_1 *ex, const ga_size ex_off,
const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w,
const ga_size stride_d, const ga_size stride_h, const ga_size stride_w, const ga_size stride_d, const ga_size stride_h, const ga_size stride_w,
const ga_size pad_d, const ga_size pad_h, const ga_size pad_w, const ga_size pad_d, const ga_size pad_h, const ga_size pad_w,
GLOBAL_MEM DTYPE_OUTPUT_0 *z) GLOBAL_MEM DTYPE_OUTPUT_0 *z, const ga_size x_off)
{ {
x = (GLOBAL_MEM DTYPE_INPUT_0 *x)(((char *)x) + x_off);
ex = (GLOBAL_MEM DTYPE_INPUT_1 *x)(((char *)ex) + ex_off);
z = (GLOBAL_MEM DTYPE_OUTPUT_0 *x)(((char *)z) + z_off);
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < nthreads; index < nthreads;
...@@ -167,9 +173,10 @@ int APPLY_SPECIFIC(max_pool_rop)(PyGpuArrayObject *x, ...@@ -167,9 +173,10 @@ int APPLY_SPECIFIC(max_pool_rop)(PyGpuArrayObject *x,
err = max_pool2d_rop_kernel_scall(1, &num_kernels, 0, num_kernels, err = max_pool2d_rop_kernel_scall(1, &num_kernels, 0, num_kernels,
z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[0], z_dims[1], z_dims[2], z_dims[3],
x_dims[2], x_dims[3], x_dims[2], x_dims[3],
x->ga.data, ex->ga.data, x->ga.data, x->ga.offset,
ex->ga.data, ex->ga.offset,
w[0], w[1], s[0], s[1], p[0], p[1], w[0], w[1], s[0], s[1], p[0], p[1],
(*z)->ga.data); (*z)->ga.data, (*z)->ga.offset);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuMaxPoolRop: max_pool2d_rop_kernel %s.", "GpuMaxPoolRop: max_pool2d_rop_kernel %s.",
...@@ -182,9 +189,11 @@ int APPLY_SPECIFIC(max_pool_rop)(PyGpuArrayObject *x, ...@@ -182,9 +189,11 @@ int APPLY_SPECIFIC(max_pool_rop)(PyGpuArrayObject *x,
err = max_pool3d_rop_kernel_scall(1, &num_kernels, 0, num_kernels, err = max_pool3d_rop_kernel_scall(1, &num_kernels, 0, num_kernels,
z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4], z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4],
x_dims[2], x_dims[3], x_dims[4], x_dims[2], x_dims[3], x_dims[4],
x->ga.data, ex->ga.data, x->ga.data, x->ga.offset,
ex->ga.data, ex->ga.offset,
w[0], w[1], w[2], s[0], s[1], s[2], w[0], w[1], w[2], s[0], s[1], s[2],
p[0], p[1], p[2], (*z)->ga.data); p[0], p[1], p[2],
(*z)->ga.data, (*z)->ga.offset);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuMaxPoolRop: max_pool3d_rop_kernel %s.", "GpuMaxPoolRop: max_pool3d_rop_kernel %s.",
......
...@@ -75,10 +75,14 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base): ...@@ -75,10 +75,14 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
code = """ code = """
KERNEL void mrg_uniform( KERNEL void mrg_uniform(
GLOBAL_MEM %(otype)s *sample_data, GLOBAL_MEM %(otype)s *sample_data,
ga_size sample_offset,
GLOBAL_MEM ga_int *state_data, GLOBAL_MEM ga_int *state_data,
ga_size state_offset,
const ga_uint Nsamples, const ga_uint Nsamples,
const ga_uint Nstreams_used) const ga_uint Nstreams_used)
{ {
sample_data = (GLOBAL_MEM %(otype)s *)(((char *)sample_data) + sample_offset);
state_data = (GLOBAL_MEM ga_int *)(((char *)state_data) + state_offset);
/* /*
* The cluda backend makes sure that ga_int corresponds to * The cluda backend makes sure that ga_int corresponds to
* a 32 bit signed type on the target device. It is not a * a 32 bit signed type on the target device. It is not a
...@@ -157,7 +161,8 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base): ...@@ -157,7 +161,8 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
from pygpu import gpuarray from pygpu import gpuarray
return [Kernel(code=code, name="mrg_uniform", return [Kernel(code=code, name="mrg_uniform",
params=[gpuarray.GpuArray, gpuarray.GpuArray, params=[gpuarray.GpuArray, gpuarray.SIZE,
gpuarray.GpuArray, gpuarray.SIZE,
'uint32', 'uint32'], 'uint32', 'uint32'],
flags=Kernel.get_flags(self.output_type.dtype, 'int32')) flags=Kernel.get_flags(self.output_type.dtype, 'int32'))
] ]
...@@ -273,7 +278,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base): ...@@ -273,7 +278,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
} }
// Make sure we run as many blocks as we need to cover the whole n_streams // Make sure we run as many blocks as we need to cover the whole n_streams
gs = (n_streams + ls - 1)/ls; gs = (n_streams + ls - 1)/ls;
err = mrg_uniform_call(1, &ls, &gs, 0, %(o_sample)s->ga.data, %(o_rstate)s->ga.data, n_elements, n_streams); err = mrg_uniform_call(1, &ls, &gs, 0, %(o_sample)s->ga.data, %(o_sample)s->ga.offset, %(o_rstate)s->ga.data, %(o_rstate)s->ga.offset, n_elements, n_streams);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "mrg_uniform_call: %%s\\n", PyErr_Format(PyExc_RuntimeError, "mrg_uniform_call: %%s\\n",
GpuKernel_error(&%(kname)s, err)); GpuKernel_error(&%(kname)s, err));
...@@ -283,7 +288,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base): ...@@ -283,7 +288,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
""" % locals() """ % locals()
def c_code_cache_version(self): def c_code_cache_version(self):
return (12,) return (13,)
@register_opt2([mrg_uniform], 'fast_compile') @register_opt2([mrg_uniform], 'fast_compile')
......
#section kernels #section kernels
#kernel eye : *, size, size : #kernel eye : *, size, size, size :
/* The eye name will be used to generate supporting objects. The only /* The eye name will be used to generate supporting objects. The only
you probably need to care about is the kernel object which will be you probably need to care about is the kernel object which will be
named 'k_' + <the name above> (k_eye in this case). This name also named 'k_' + <the name above> (k_eye in this case). This name also
has to match the kernel function name below. has to match the kernel function name below.
*/ */
KERNEL void eye(GLOBAL_MEM DTYPE_OUTPUT_0 *a, ga_size n, ga_size m) { KERNEL void eye(GLOBAL_MEM DTYPE_OUTPUT_0 *a, ga_size a_off, ga_size n, ga_size m) {
a = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((char *)a) + a_off);
ga_size nb = n < m ? n : m; ga_size nb = n < m ? n : m;
for (ga_size i = LID_0; i < nb; i += LDIM_0) { for (ga_size i = LID_0; i < nb; i += LDIM_0) {
a[i*m + i] = 1; a[i*m + i] = 1;
...@@ -37,7 +38,7 @@ int APPLY_SPECIFIC(tstgpueye)(PyArrayObject *n, PyArrayObject *m, ...@@ -37,7 +38,7 @@ int APPLY_SPECIFIC(tstgpueye)(PyArrayObject *n, PyArrayObject *m,
ls = 1; ls = 1;
gs = 256; gs = 256;
/* The eye_call name comes from the kernel declaration above. */ /* The eye_call name comes from the kernel declaration above. */
err = eye_call(1, &gs, &ls, 0, (*z)->ga.data, dims[0], dims[1]); err = eye_call(1, &gs, &ls, 0, (*z)->ga.data, (*z)->ga.offset, dims[0], dims[1]);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"gpuarray error: kEye: %s. n%lu, m=%lu.", "gpuarray error: kEye: %s. n%lu, m=%lu.",
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论