提交 9866ce20 authored 作者: Alexander Matyasko's avatar Alexander Matyasko

Implements gpu max pooling rop for 2d and 3d inputs

It reuses the GPU max pooling scheme: instead of returning the maximum of each pooling region, we collect the eval-point value at the index corresponding to the maximum of the input.
上级 c38f534f
...@@ -112,6 +112,22 @@ class GpuPool(CGpuKernelBase): ...@@ -112,6 +112,22 @@ class GpuPool(CGpuKernelBase):
def connection_pattern(self, node): def connection_pattern(self, node):
return [[1], [0], [0], [0]] return [[1], [0], [0], [0]]
def R_op(self, inputs, eval_points):
    """R-operator of the pooling op.

    For average/sum pooling the op is linear in its input, so the R-op is
    simply the pooling op applied to the eval point. For max pooling, a
    dedicated op collects the eval-point values at the argmax locations
    of the original input.
    """
    # R_op can receive None as eval_points: it means there is no
    # differentiable path through that input. If that implies some
    # outputs cannot be computed, return None for those.
    # NOTE: this check must come before any use of eval_points[0],
    # otherwise the non-max branch would build a node from None.
    if eval_points[0] is None:
        return [None]
    if self.mode != 'max':
        # R-op for average or sum is simply pooling evaluated at the
        # eval point (linearity of the op).
        eval_inputs = [eval_points[0]] + inputs[1:]
        return [self(*eval_inputs)]
    x, ws, stride, pad = inputs
    rop = GpuMaxPoolRop(ignore_border=self.ignore_border)
    return [rop(x, eval_points[0], ws, stride=stride, pad=pad)]
class GpuMaxPoolGrad(CGpuKernelBase): class GpuMaxPoolGrad(CGpuKernelBase):
""" """
...@@ -334,3 +350,72 @@ class GpuDownsampleFactorMaxGradGrad(CGpuKernelBase): ...@@ -334,3 +350,72 @@ class GpuDownsampleFactorMaxGradGrad(CGpuKernelBase):
def connection_pattern(self, node): def connection_pattern(self, node):
return [[1], [1], [1], [0], [0], [0]] return [[1], [1], [1], [0], [0], [0]]
class GpuMaxPoolRop(CGpuKernelBase):
    """
    Implements the R-operator for the downsample (max pooling) operation.

    It reuses the max pooling scheme: instead of emitting the maximum of
    each pooling region of the input, the kernel emits the eval-point
    value located at the index of that maximum.
    """
    __props__ = ('ignore_border', 'mode', 'ndim')

    def __init__(self, ignore_border, mode='max', ndim=2):
        self.ndim = ndim
        self.ignore_border = ignore_border
        self.mode = mode
        CGpuKernelBase.__init__(self, ['pool_max_rop.c'],
                                'APPLY_SPECIFIC(max_pool_rop)')
        # Only max pooling over 2d or 3d regions is implemented.
        assert mode == 'max'
        assert ndim in [2, 3]

    def c_headers(self):
        return ['gpuarray_api.h', 'gpuarray_helper.h', 'numpy_compat.h']

    def c_header_dirs(self):
        return [os.path.dirname(__file__), pygpu.get_include()]

    def make_node(self, inp, eval_point, ws, stride=None, pad=None):
        """Build the Apply node.

        Parameters
        ----------
        inp : gpu tensor of rank ndim + 2
            Input of the pooling op.
        eval_point : gpu tensor of rank ndim + 2
            Point at which to evaluate the R-operator; same rank as inp.
        ws : 1d int tensor
            Pooling window shape, one entry per pooled dimension.
        stride : 1d int tensor, optional
            Strides; defaults to ws (non-overlapping pooling).
        pad : 1d int tensor, optional
            Padding; defaults to zeros. Non-zero padding requires
            ignore_border=True and pad[i] < ws[i].
        """
        ctx_name = infer_context_name(inp)
        nd = self.ndim
        inp = as_gpuarray_variable(inp, ctx_name)
        assert (inp.ndim == nd + 2)
        eval_point = as_gpuarray_variable(eval_point, ctx_name)
        assert (eval_point.ndim == nd + 2)

        if stride is None:
            stride = ws
        if pad is None:
            pad = (0,) * nd
        elif isinstance(pad, (tuple, list)):
            if max(pad) != 0 and not self.ignore_border:
                raise ValueError('Padding works only with ignore_border=True')
            if isinstance(ws, (tuple, list)):
                if any(pad[i] >= ws[i] for i in range(nd)):
                    # The comparison is against the window shape, so the
                    # message must mention the window shape, not the stride.
                    raise ValueError('Padding must be smaller than window shape')

        ws = as_tensor_variable(ws)
        stride = as_tensor_variable(stride)
        pad = as_tensor_variable(pad)
        assert ws.ndim == stride.ndim and ws.ndim == pad.ndim
        assert ws.ndim == 1
        if not ws.dtype.startswith('int'):
            raise TypeError('Window shape parameters must be ints.')
        if not stride.dtype.startswith('int'):
            raise TypeError('Stride parameters must be ints.')
        if not pad.dtype.startswith('int'):
            raise TypeError('Padding parameters must be ints.')
        return Apply(self, [inp, eval_point, ws, stride, pad],
                     [eval_point.type()])

    def get_params(self, node):
        # The GPU context of the first input parameterizes the kernel call.
        return node.inputs[0].type.context

    def get_op_params(self):
        # Compile-time constant consumed by pool_max_rop.c.
        ignore_border = int(self.ignore_border)
        return [('IGNORE_BORDER', ignore_border)]

    def infer_shape(self, node, in_shapes):
        # The output has the regular pooled shape of the input.
        ws, stride, pad = [node.inputs[2], node.inputs[3], node.inputs[4]]
        shp = Pool.out_shape(in_shapes[0], ws, self.ignore_border, stride,
                             pad, self.ndim)
        return [shp]
#section kernels
#kernel max_pool2d_rop_kernel : size, size, size, size, size, size, size, *, *, size, size, size, size, size, size, * :
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
// R-op for 2d max pooling: for every output position, find the maximum of
// the pooling window in `x` and write the value of `ex` (the eval point)
// at that same index into `z`. One thread handles one output element.
KERNEL void max_pool2d_rop_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size pooled_height,
const ga_size pooled_width, const ga_size height, const ga_size width,
GLOBAL_MEM const DTYPE_i0 *x, GLOBAL_MEM const DTYPE_i1 *ex,
const ga_size kernel_h, const ga_size kernel_w,
const ga_size stride_h, const ga_size stride_w,
const ga_size pad_h, const ga_size pad_w,
GLOBAL_MEM DTYPE_o0 *z)
{
// grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < nthreads;
index += LDIM_0 * GDIM_0) {
// decompose the flat output index into (n, c, ph, pw)
const ga_size pw = index % pooled_width;
const ga_size ph = (index / pooled_width) % pooled_height;
const ga_size c = (index / pooled_width / pooled_height) % channels;
const ga_size n = (index / pooled_width / pooled_height / channels);
// window bounds in input coordinates; the start can go negative when
// padding is used, hence the signed intermediate
ga_int hstart = static_cast<ga_int>(ph*stride_h) - static_cast<ga_int>(pad_h);
const ga_size hend = min(hstart + kernel_h, height);
ga_int wstart = static_cast<ga_int>(pw*stride_w) - static_cast<ga_int>(pad_w);
const ga_size wend = min(wstart + kernel_w, width);
// clip the window start to the actual input
hstart = max(hstart, 0);
wstart = max(wstart, 0);
// base offset of this (n, c) feature map in x/ex
const ga_size offset = (n*channels + c) * height * width;
const DTYPE_i0* x_slice = x + offset;
const DTYPE_i1* ex_slice = ex + offset;
// track the running max of x and the eval-point value at its index;
// initialized from the first in-bounds element of the window
DTYPE_o0 maxval = x_slice[hstart*width + wstart];
DTYPE_o0 collector = ex_slice[hstart*width + wstart];
for (ga_size h=hstart; h < hend; ++h) {
for (ga_size w=wstart; w < wend; ++w) {
// maximum in the region
if (x_slice[h*width + w] > maxval) {
maxval = x_slice[h*width + w];
collector = ex_slice[h*width + w];
}
}
}
z[index] = collector;
}
}
#kernel max_pool3d_rop_kernel : size, size, size, size, size, size, size, size, size, *, *, size, size, size, size, size, size, size, size, size, * :
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
// R-op for 3d max pooling: same scheme as the 2d kernel with an extra
// depth dimension — emit the value of `ex` at the argmax location of `x`
// within each pooling window. One thread handles one output element.
KERNEL void max_pool3d_rop_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size pooled_depth,
const ga_size pooled_height, const ga_size pooled_width,
const ga_size depth, const ga_size height, const ga_size width,
GLOBAL_MEM const DTYPE_i0 *x, GLOBAL_MEM const DTYPE_i1 *ex,
const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w,
const ga_size stride_d, const ga_size stride_h, const ga_size stride_w,
const ga_size pad_d, const ga_size pad_h, const ga_size pad_w,
GLOBAL_MEM DTYPE_o0 *z)
{
// grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < nthreads;
index += LDIM_0 * GDIM_0) {
// decompose the flat output index into (n, c, pd, ph, pw)
const ga_size pw = index % pooled_width;
const ga_size ph = (index / pooled_width) % pooled_height;
const ga_size pd = (index / pooled_width / pooled_height) % pooled_depth;
const ga_size c = (index / pooled_width / pooled_height / pooled_depth) % channels;
const ga_size n = (index / pooled_width / pooled_height / pooled_depth / channels);
// window bounds; starts can go negative with padding (signed intermediates)
ga_int dstart = static_cast<ga_int>(pd*stride_d) - static_cast<ga_int>(pad_d);
const ga_size dend = min(dstart + kernel_d, depth);
ga_int hstart = static_cast<ga_int>(ph*stride_h) - static_cast<ga_int>(pad_h);
const ga_size hend = min(hstart + kernel_h, height);
ga_int wstart = static_cast<ga_int>(pw*stride_w) - static_cast<ga_int>(pad_w);
const ga_size wend = min(wstart + kernel_w, width);
// clip the window start to the actual input
dstart = max(dstart, 0);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
// base offset of this (n, c) volume in x/ex
const ga_size offset = (n*channels + c) * depth * height * width;
const DTYPE_i0* x_slice = x + offset;
const DTYPE_i1* ex_slice = ex + offset;
// running max of x and the eval-point value at its index, seeded with
// the first in-bounds element of the window
DTYPE_o0 maxval = x_slice[(dstart*height + hstart)*width + wstart];
DTYPE_o0 collector = ex_slice[(dstart*height + hstart)*width + wstart];
for (ga_size d=dstart; d < dend; ++d) {
for (ga_size h=hstart; h < hend; ++h) {
for (ga_size w=wstart; w < wend; ++w) {
// maximum in the region
if (x_slice[(d*height + h)*width + w] > maxval) {
maxval = x_slice[(d*height + h)*width + w];
collector = ex_slice[(d*height + h)*width + w];
}
}
}
}
z[index] = collector;
}
}
#section support_code
// output shape for a given input padded shape, window shape and stride
// With IGNORE_BORDER, only complete windows count; otherwise a partial
// window at the end contributes one extra output element (the st > ws
// case handles windows that are skipped entirely by a large stride).
#define OUTPUT_DIMS(in_dim, ws, st) \
(IGNORE_BORDER ? (in_dim - ws)/st + 1 : \
(st > ws ? (in_dim - 1)/st + 1 : \
std::max<size_t>(0, (in_dim - 1 - ws + st)/st) + 1))
#section support_code_struct
// R-op for GPU max pooling: for each pooling window of `x`, write the value
// of the eval point `ex` found at the window's argmax location into `z`.
//   x:  input, (batch, channels, [depth,] height, width), C-contiguous
//   ex: eval point, same rank as x
//   ws/stride/pad: 1d npy_intp arrays, one entry per pooled dimension
//   z:  output, allocated here with the pooled output shape
// Returns 0 on success, 1 on failure with a Python exception set.
int APPLY_SPECIFIC(max_pool_rop)(PyGpuArrayObject *x,
                                 PyGpuArrayObject *ex,
                                 PyArrayObject *ws,
                                 PyArrayObject *stride,
                                 PyArrayObject *pad,
                                 PyGpuArrayObject **z,
                                 PyGpuContextObject *ctx) {
  // the kernels index with raw flat offsets, so both inputs must be
  // C-contiguous
  if (!GpuArray_IS_C_CONTIGUOUS(&x->ga) || !GpuArray_IS_C_CONTIGUOUS(&ex->ga))
    {
      PyErr_Format(PyExc_ValueError,
                   "GpuMaxPoolRop: requires data to be C-contiguous");
      return 1;
    }
  size_t ndims = PyArray_DIM(ws, 0);
  if (PyGpuArray_NDIM(x) != ndims + 2 || PyGpuArray_NDIM(ex) != ndims + 2)
    {
      PyErr_SetString(PyExc_ValueError, "GpuMaxPoolRop: rank error");
      return 1;
    }
  // prepare output
  const size_t* x_dims = PyGpuArray_DIMS(x);
  size_t z_dims[5]; // avoid warning if use 2 + nd
  size_t w[3];
  size_t s[3];
  size_t p[3]; z_dims[0] = x_dims[0]; z_dims[1] = x_dims[1];
  int nonzero_padding = 0;
  // size_t index avoids a signed/unsigned comparison with ndims
  for (size_t i = 0; i < ndims; i++) {
    w[i] = *((npy_intp*)PyArray_GETPTR1(ws, i));
    s[i] = *((npy_intp*)PyArray_GETPTR1(stride, i));
    p[i] = *((npy_intp*)PyArray_GETPTR1(pad, i));
    z_dims[2 + i] = OUTPUT_DIMS(x_dims[2 + i] + 2*p[i], w[i], s[i]);
    if (p[i] > 0) {
      nonzero_padding = 1;
    }
  }
  if (!IGNORE_BORDER && nonzero_padding) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMaxPoolRop: padding works only with ignore_border=True");
    return 1;
  }
  // the output takes the dtype of the eval point
  if (theano_prep_output(z, PyGpuArray_NDIM(ex), z_dims,
                         ex->ga.typecode, GA_C_ORDER, ctx) != 0)
    {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMaxPoolRop: failed to allocate memory");
      return 1;
    }
  {
    // scope for running kernel
    int err;
    if (ndims == 2) {
      // one thread per output element
      size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3];
      err = max_pool2d_rop_kernel_scall(1, &num_kernels, 0, num_kernels,
                                        z_dims[0], z_dims[1], z_dims[2], z_dims[3],
                                        x_dims[2], x_dims[3],
                                        x->ga.data, ex->ga.data,
                                        w[0], w[1], s[0], s[1], p[0], p[1],
                                        (*z)->ga.data);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuMaxPoolRop: max_pool2d_rop_kernel %s.",
                     GpuKernel_error(&k_max_pool2d_rop_kernel, err));
        return 1;
      }
    }
    else if (ndims == 3) {
      // one thread per output element
      size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3] * z_dims[4];
      err = max_pool3d_rop_kernel_scall(1, &num_kernels, 0, num_kernels,
                                        z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4],
                                        x_dims[2], x_dims[3], x_dims[4],
                                        x->ga.data, ex->ga.data,
                                        w[0], w[1], w[2], s[0], s[1], s[2],
                                        p[0], p[1], p[2], (*z)->ga.data);
      if (err != GA_NO_ERROR) {
        // fix: report the 3d kernel's error, not the 2d kernel's
        // (was GpuKernel_error(&k_max_pool2d_rop_kernel, err))
        PyErr_Format(PyExc_RuntimeError,
                     "GpuMaxPoolRop: max_pool3d_rop_kernel %s.",
                     GpuKernel_error(&k_max_pool3d_rop_kernel, err));
        return 1;
      }
    }
  }
  return 0;
}
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论