提交 7b5919e9 authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #5323 from aam-at/max_pool_rop

Pooling rop
......@@ -51,7 +51,7 @@ from .blas import (gpu_dot22, GpuGemm, GpuGer, GpuGemmBatch,
gpugemv_no_inplace, gpugemv_inplace,
GpuCorrMM, GpuCorrMM_gradInputs, GpuCorrMM_gradWeights,
GpuCorr3dMM, GpuCorr3dMM_gradInputs, GpuCorr3dMM_gradWeights)
from .pool import (GpuPool, GpuMaxPoolGrad, GpuAveragePoolGrad,
from .pool import (GpuPool, GpuMaxPoolGrad, GpuAveragePoolGrad, GpuMaxPoolRop,
GpuDownsampleFactorMaxGradGrad)
from .blocksparse import (GpuSparseBlockGemv, GpuSparseBlockOuter,
gpu_sparse_block_outer,
......@@ -1747,6 +1747,29 @@ def local_gpu_downsample_factor_max_grad_grad(op, ctx_name, inputs, outputs):
return unpad_dims(ret_padded, inp, 2, nd)
@register_opt()
@op_lifter([pool.MaxPoolRop])
@register_opt2([pool.MaxPoolRop])
def local_gpu_max_pool_rop(op, ctx_name, inputs, outputs):
assert op.__props__ == ('ignore_border', 'mode', 'ndim')
inp, eval_inp, ws, stride, pad = inputs
nd = op.ndim
if nd not in (2, 3):
return
inp = gpu_contiguous(as_gpuarray_variable(inp, ctx_name))
eval_inp = gpu_contiguous(as_gpuarray_variable(eval_inp, ctx_name))
op = GpuMaxPoolRop(op.ignore_border, op.mode, op.ndim)
if inp.ndim == nd + 2:
return op(inp, eval_inp, ws, stride, pad)
else:
# reshape to 4D or 5D with 2 non-pooling dimensions
inp_padded = pad_dims(inp, 2, nd)
eval_inp_padded = pad_dims(eval_inp, 2, nd)
ret_padded = op(inp_padded, eval_inp_padded, ws, stride, pad)
return unpad_dims(ret_padded, inp, 2, nd)
@register_opt("low_memory")
@local_optimizer([GpuCAReduceCuda])
def local_gpu_elemwise_careduce(node):
......
......@@ -112,6 +112,26 @@ class GpuPool(CGpuKernelBase):
def connection_pattern(self, node):
return [[1], [0], [0], [0]]
def R_op(self, inputs, eval_points):
if self.mode != 'max':
# Rop for average or sum is simply pooling evaluated at eval point
eval_inputs = [eval_points[0]] + inputs[1:]
return [self(*eval_inputs)]
# R_op can receive None as eval_points.
# That mean there is no diferientiable path through that input
# If this imply that you cannot compute some outputs,
# return None for those.
if eval_points[0] is None:
return [None]
z = self(*inputs)
x, ws, stride, pad = inputs
return [
GpuDownsampleFactorMaxGradGrad(self.ignore_border, self.mode,
self.ndim)(x, z, eval_points[0], ws,
stride, pad)
]
class GpuMaxPoolGrad(CGpuKernelBase):
"""
......@@ -334,3 +354,72 @@ class GpuDownsampleFactorMaxGradGrad(CGpuKernelBase):
def connection_pattern(self, node):
return [[1], [1], [1], [0], [0], [0]]
class GpuMaxPoolRop(CGpuKernelBase):
"""
Implements the R-operator for the downsample operation.
"""
__props__ = ('ignore_border', 'mode', 'ndim')
def __init__(self, ignore_border, mode='max', ndim=2):
self.ndim = ndim
self.ignore_border = ignore_border
self.mode = mode
CGpuKernelBase.__init__(self, ['pool_max_rop.c'],
'APPLY_SPECIFIC(max_pool_rop)')
assert mode == 'max'
assert ndim in [2, 3]
def c_headers(self):
return ['gpuarray_api.h', 'gpuarray_helper.h', 'numpy_compat.h']
def c_header_dirs(self):
return [os.path.dirname(__file__), pygpu.get_include()]
def make_node(self, inp, eval_point, ws, stride=None, pad=None):
ctx_name = infer_context_name(inp)
nd = self.ndim
inp = as_gpuarray_variable(inp, ctx_name)
assert (inp.ndim == nd + 2)
eval_point = as_gpuarray_variable(eval_point, ctx_name)
assert (eval_point.ndim == nd + 2)
if stride is None:
stride = ws
if pad is None:
pad = (0,) * nd
elif isinstance(pad, (tuple, list)):
if max(pad) != 0 and not self.ignore_border:
raise ValueError('Padding works only with ignore_border=True')
if isinstance(ws, (tuple, list)):
if any(pad[i] >= ws[i] for i in range(nd)):
raise ValueError('Padding must be smaller than strides')
ws = as_tensor_variable(ws)
stride = as_tensor_variable(stride)
pad = as_tensor_variable(pad)
assert ws.ndim == stride.ndim and ws.ndim == pad.ndim
assert ws.ndim == 1
if not ws.dtype.startswith('int'):
raise TypeError('Window shape parameters must be ints.')
if not stride.dtype.startswith('int'):
raise TypeError('Stride parameters must be ints.')
if not pad.dtype.startswith('int'):
raise TypeError('Padding parameters must be ints.')
return Apply(self, [inp, eval_point, ws, stride, pad], [eval_point.type()])
def get_params(self, node):
return node.inputs[0].type.context
def get_op_params(self):
ignore_border = int(self.ignore_border)
return [('IGNORE_BORDER', ignore_border)]
def infer_shape(self, node, in_shapes):
ws, stride, pad = [node.inputs[2], node.inputs[3], node.inputs[4]]
shp = Pool.out_shape(in_shapes[0], ws, self.ignore_border, stride,
pad, self.ndim)
return [shp]
#section kernels
#kernel max_pool2d_rop_kernel : size, size, size, size, size, size, size, *, *, size, size, size, size, size, size, * :
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void max_pool2d_rop_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size pooled_height,
const ga_size pooled_width, const ga_size height, const ga_size width,
GLOBAL_MEM const DTYPE_i0 *x, GLOBAL_MEM const DTYPE_i1 *ex,
const ga_size kernel_h, const ga_size kernel_w,
const ga_size stride_h, const ga_size stride_w,
const ga_size pad_h, const ga_size pad_w,
GLOBAL_MEM DTYPE_o0 *z)
{
// grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < nthreads;
index += LDIM_0 * GDIM_0) {
const ga_size pw = index % pooled_width;
const ga_size ph = (index / pooled_width) % pooled_height;
const ga_size c = (index / pooled_width / pooled_height) % channels;
const ga_size n = (index / pooled_width / pooled_height / channels);
ga_int hstart = static_cast<ga_int>(ph*stride_h) - static_cast<ga_int>(pad_h);
const ga_size hend = min(hstart + kernel_h, height);
ga_int wstart = static_cast<ga_int>(pw*stride_w) - static_cast<ga_int>(pad_w);
const ga_size wend = min(wstart + kernel_w, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
const ga_size offset = (n*channels + c) * height * width;
const DTYPE_i0* x_slice = x + offset;
const DTYPE_i1* ex_slice = ex + offset;
DTYPE_o0 maxval = x_slice[hstart*width + wstart];
DTYPE_o0 collector = ex_slice[hstart*width + wstart];
for (ga_size h=hstart; h < hend; ++h) {
for (ga_size w=wstart; w < wend; ++w) {
// maximum in the region
if (x_slice[h*width + w] > maxval) {
maxval = x_slice[h*width + w];
collector = ex_slice[h*width + w];
}
}
}
z[index] = collector;
}
}
#kernel max_pool3d_rop_kernel : size, size, size, size, size, size, size, size, size, *, *, size, size, size, size, size, size, size, size, size, * :
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void max_pool3d_rop_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size pooled_depth,
const ga_size pooled_height, const ga_size pooled_width,
const ga_size depth, const ga_size height, const ga_size width,
GLOBAL_MEM const DTYPE_i0 *x, GLOBAL_MEM const DTYPE_i1 *ex,
const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w,
const ga_size stride_d, const ga_size stride_h, const ga_size stride_w,
const ga_size pad_d, const ga_size pad_h, const ga_size pad_w,
GLOBAL_MEM DTYPE_o0 *z)
{
// grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < nthreads;
index += LDIM_0 * GDIM_0) {
const ga_size pw = index % pooled_width;
const ga_size ph = (index / pooled_width) % pooled_height;
const ga_size pd = (index / pooled_width / pooled_height) % pooled_depth;
const ga_size c = (index / pooled_width / pooled_height / pooled_depth) % channels;
const ga_size n = (index / pooled_width / pooled_height / pooled_depth / channels);
ga_int dstart = static_cast<ga_int>(pd*stride_d) - static_cast<ga_int>(pad_d);
const ga_size dend = min(dstart + kernel_d, depth);
ga_int hstart = static_cast<ga_int>(ph*stride_h) - static_cast<ga_int>(pad_h);
const ga_size hend = min(hstart + kernel_h, height);
ga_int wstart = static_cast<ga_int>(pw*stride_w) - static_cast<ga_int>(pad_w);
const ga_size wend = min(wstart + kernel_w, width);
dstart = max(dstart, 0);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
const ga_size offset = (n*channels + c) * depth * height * width;
const DTYPE_i0* x_slice = x + offset;
const DTYPE_i1* ex_slice = ex + offset;
DTYPE_o0 maxval = x_slice[(dstart*height + hstart)*width + wstart];
DTYPE_o0 collector = ex_slice[(dstart*height + hstart)*width + wstart];
for (ga_size d=dstart; d < dend; ++d) {
for (ga_size h=hstart; h < hend; ++h) {
for (ga_size w=wstart; w < wend; ++w) {
// maximum in the region
if (x_slice[(d*height + h)*width + w] > maxval) {
maxval = x_slice[(d*height + h)*width + w];
collector = ex_slice[(d*height + h)*width + w];
}
}
}
}
z[index] = collector;
}
}
#section support_code
// output shape for a given input padded shape, window shape and stride
#define OUTPUT_DIMS(in_dim, ws, st) \
(IGNORE_BORDER ? (in_dim - ws)/st + 1 : \
(st > ws ? (in_dim - 1)/st + 1 : \
std::max<size_t>(0, (in_dim - 1 - ws + st)/st) + 1))
#section support_code_struct
int APPLY_SPECIFIC(max_pool_rop)(PyGpuArrayObject *x,
PyGpuArrayObject *ex,
PyArrayObject *ws,
PyArrayObject *stride,
PyArrayObject *pad,
PyGpuArrayObject **z,
PyGpuContextObject *ctx) {
if (!GpuArray_IS_C_CONTIGUOUS(&x->ga) || !GpuArray_IS_C_CONTIGUOUS(&ex->ga))
{
PyErr_Format(PyExc_ValueError,
"GpuMaxPoolRop: requires data to be C-contiguous");
return 1;
}
size_t ndims = PyArray_DIM(ws, 0);
if (PyGpuArray_NDIM(x) != ndims + 2 || PyGpuArray_NDIM(ex) != ndims + 2)
{
PyErr_SetString(PyExc_ValueError, "GpuMaxPoolRop: rank error");
return 1;
}
// prepare output
const size_t* x_dims = PyGpuArray_DIMS(x);
size_t z_dims[5]; // avoid warning if use 2 + nd
size_t w[3];
size_t s[3];
size_t p[3]; z_dims[0] = x_dims[0]; z_dims[1] = x_dims[1];
int nonzero_padding = 0;
for (int i = 0; i < ndims; i++) {
w[i] = *((npy_intp*)PyArray_GETPTR1(ws, i));
s[i] = *((npy_intp*)PyArray_GETPTR1(stride, i));
p[i] = *((npy_intp*)PyArray_GETPTR1(pad, i));
z_dims[2 + i] = OUTPUT_DIMS(x_dims[2 + i] + 2*p[i], w[i], s[i]);
if (p[i] > 0) {
nonzero_padding = 1;
}
}
if (!IGNORE_BORDER && nonzero_padding) {
PyErr_SetString(PyExc_ValueError,
"GpuMaxPoolRop: padding works only with ignore_border=True");
return 1;
}
if (theano_prep_output(z, PyGpuArray_NDIM(ex), z_dims,
ex->ga.typecode, GA_C_ORDER, ctx) != 0)
{
PyErr_SetString(PyExc_RuntimeError,
"GpuMaxPoolRop: failed to allocate memory");
return 1;
}
{
// scope for running kernel
int err;
if (ndims == 2) {
size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3];
err = max_pool2d_rop_kernel_scall(1, &num_kernels, 0, num_kernels,
z_dims[0], z_dims[1], z_dims[2], z_dims[3],
x_dims[2], x_dims[3],
x->ga.data, ex->ga.data,
w[0], w[1], s[0], s[1], p[0], p[1],
(*z)->ga.data);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuMaxPoolRop: max_pool2d_rop_kernel %s.",
GpuKernel_error(&k_max_pool2d_rop_kernel, err));
return 1;
}
}
else if (ndims == 3) {
size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3] * z_dims[4];
err = max_pool3d_rop_kernel_scall(1, &num_kernels, 0, num_kernels,
z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4],
x_dims[2], x_dims[3], x_dims[4],
x->ga.data, ex->ga.data,
w[0], w[1], w[2], s[0], s[1], s[2],
p[0], p[1], p[2], (*z)->ga.data);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuMaxPoolRop: max_pool3d_rop_kernel %s.",
GpuKernel_error(&k_max_pool2d_rop_kernel, err));
return 1;
}
}
}
return 0;
}
......@@ -133,11 +133,26 @@ def test_pool2d():
assert numpy.allclose(g(), g2()), (shp, ws, st, pad, mode, ignore_border)
# test grad grad for max pooling
# test rop and grad grad for max pooling
# for average pooling grad grad is just average pooling grad
if mode != 'max':
continue
ea = theano.shared(rand(*shp), 'ea')
gr = theano.function([], tensor.Rop(a_pooled, a, ea), mode=gpu_mode)
gr2 = theano.function([], tensor.Rop(a_pooled, a, ea), mode=ref_mode)
assert any([
isinstance(node.op, GpuDownsampleFactorMaxGradGrad)
for node in gr.maker.fgraph.toposort()
])
assert any([
isinstance(node.op, DownsampleFactorMaxGradGrad)
for node in gr2.maker.fgraph.toposort()
])
assert numpy.allclose(gr(), gr2()), (shp, ws, st, pad, mode, ignore_border)
ggf = gradient.Lop(tensor.grad((a_pooled**2).sum(), a), a, a)
gg = theano.function([], ggf, mode=gpu_mode)
......@@ -228,11 +243,26 @@ def test_pool3d():
assert numpy.allclose(g(), g2()), (shp, ws, st, pad, mode, ignore_border)
# test grad grad for max pooling
# test rop and grad grad for max pooling
# for average pooling grad grad is just average pooling grad
if mode != 'max':
continue
ea = theano.shared(rand(*shp), 'ea')
gr = theano.function([], tensor.Rop(a_pooled, a, ea), mode=gpu_mode)
gr2 = theano.function([], tensor.Rop(a_pooled, a, ea), mode=ref_mode)
assert any([
isinstance(node.op, GpuDownsampleFactorMaxGradGrad)
for node in gr.maker.fgraph.toposort()
])
assert any([
isinstance(node.op, DownsampleFactorMaxGradGrad)
for node in gr2.maker.fgraph.toposort()
])
assert numpy.allclose(gr(), gr2()), (shp, ws, st, pad, mode, ignore_border)
ggf = gradient.Lop(tensor.grad((a_pooled**2).sum(), a), a, a)
gg = theano.function([], ggf, mode=gpu_mode)
......
......@@ -17,10 +17,12 @@ from theano.tests import unittest_tools as utt
from theano import function
import theano
from theano import tensor
import itertools
import numpy
from theano.gof import Op, Apply
from theano.gradient import grad_undefined
from theano.tests.unittest_tools import SkipTest
from theano.tensor.signal.pool import Pool
from theano.tensor.nnet import conv, conv2d
'''
......@@ -255,6 +257,47 @@ class test_RopLop(RopLop_checker):
self.x[:4].dimshuffle('x', 0), 0).sum(axis=1),
(1,))
def test_downsample(self):
rng = numpy.random.RandomState(utt.fetch_seed())
# ws, shp
examples = (
((2,), (16,)),
((2,), (4, 16,)),
((2,), (4, 2, 16,)),
((1, 1), (4, 2, 16, 16)),
((2, 2), (4, 2, 16, 16)),
((3, 3), (4, 2, 16, 16)),
((3, 2), (4, 2, 16, 16)),
((3, 2, 2), (3, 2, 16, 16, 16)),
((2, 3, 2), (3, 2, 16, 16, 16)),
((2, 2, 3), (3, 2, 16, 16, 16)),
((2, 2, 3, 2), (3, 2, 6, 6, 6, 5)),
)
for example, ignore_border in itertools.product(examples, [True, False]):
(ws, shp) = example
vx = rng.rand(*shp)
vex = rng.rand(*shp)
x = theano.shared(vx)
ex = theano.shared(vex)
maxpool_op = Pool(ignore_border, ndim=len(ws))
a_pooled = maxpool_op(x, ws).flatten()
yv = tensor.Rop(a_pooled, x, ex)
mode = None
if theano.config.mode == "FAST_COMPILE":
mode = "FAST_RUN"
rop_f = function([], yv, on_unused_input='ignore', mode=mode)
sy, _ = theano.scan(lambda i, y, x, v:
(tensor.grad(y[i], x) * v).sum(),
sequences=tensor.arange(a_pooled.shape[0]),
non_sequences=[a_pooled, x, ex])
scan_f = function([], sy, on_unused_input='ignore', mode=mode)
v1 = rop_f()
v2 = scan_f()
assert numpy.allclose(v1, v2), ("Rop mismatch: %s %s" % (v1, v2))
def test_conv(self):
for conv_op in [conv.conv2d, conv2d]:
for border_mode in ['valid', 'full']:
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论