提交 7b5919e9 authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #5323 from aam-at/max_pool_rop

Pooling rop
...@@ -51,7 +51,7 @@ from .blas import (gpu_dot22, GpuGemm, GpuGer, GpuGemmBatch, ...@@ -51,7 +51,7 @@ from .blas import (gpu_dot22, GpuGemm, GpuGer, GpuGemmBatch,
gpugemv_no_inplace, gpugemv_inplace, gpugemv_no_inplace, gpugemv_inplace,
GpuCorrMM, GpuCorrMM_gradInputs, GpuCorrMM_gradWeights, GpuCorrMM, GpuCorrMM_gradInputs, GpuCorrMM_gradWeights,
GpuCorr3dMM, GpuCorr3dMM_gradInputs, GpuCorr3dMM_gradWeights) GpuCorr3dMM, GpuCorr3dMM_gradInputs, GpuCorr3dMM_gradWeights)
from .pool import (GpuPool, GpuMaxPoolGrad, GpuAveragePoolGrad, from .pool import (GpuPool, GpuMaxPoolGrad, GpuAveragePoolGrad, GpuMaxPoolRop,
GpuDownsampleFactorMaxGradGrad) GpuDownsampleFactorMaxGradGrad)
from .blocksparse import (GpuSparseBlockGemv, GpuSparseBlockOuter, from .blocksparse import (GpuSparseBlockGemv, GpuSparseBlockOuter,
gpu_sparse_block_outer, gpu_sparse_block_outer,
...@@ -1747,6 +1747,29 @@ def local_gpu_downsample_factor_max_grad_grad(op, ctx_name, inputs, outputs): ...@@ -1747,6 +1747,29 @@ def local_gpu_downsample_factor_max_grad_grad(op, ctx_name, inputs, outputs):
return unpad_dims(ret_padded, inp, 2, nd) return unpad_dims(ret_padded, inp, 2, nd)
@register_opt()
@op_lifter([pool.MaxPoolRop])
@register_opt2([pool.MaxPoolRop])
def local_gpu_max_pool_rop(op, ctx_name, inputs, outputs):
    """Lift a CPU ``MaxPoolRop`` node to its GPU counterpart.

    Only 2D and 3D pooling are handled; inputs with extra leading
    dimensions are reshaped to 4D/5D around the GPU op and restored
    afterwards.
    """
    assert op.__props__ == ('ignore_border', 'mode', 'ndim')
    x, eval_x, ws, stride, pad = inputs
    nd = op.ndim
    if nd not in (2, 3):
        return
    x = gpu_contiguous(as_gpuarray_variable(x, ctx_name))
    eval_x = gpu_contiguous(as_gpuarray_variable(eval_x, ctx_name))
    gpu_op = GpuMaxPoolRop(op.ignore_border, op.mode, op.ndim)
    if x.ndim == nd + 2:
        return gpu_op(x, eval_x, ws, stride, pad)
    # Fold the leading dimensions so the arrays are 4D/5D with exactly
    # two non-pooling dimensions, run the GPU op, then unfold the result.
    ret = gpu_op(pad_dims(x, 2, nd), pad_dims(eval_x, 2, nd),
                 ws, stride, pad)
    return unpad_dims(ret, x, 2, nd)
@register_opt("low_memory") @register_opt("low_memory")
@local_optimizer([GpuCAReduceCuda]) @local_optimizer([GpuCAReduceCuda])
def local_gpu_elemwise_careduce(node): def local_gpu_elemwise_careduce(node):
......
...@@ -112,6 +112,26 @@ class GpuPool(CGpuKernelBase): ...@@ -112,6 +112,26 @@ class GpuPool(CGpuKernelBase):
def connection_pattern(self, node): def connection_pattern(self, node):
return [[1], [0], [0], [0]] return [[1], [0], [0], [0]]
def R_op(self, inputs, eval_points):
    """Return the R-operator (Jacobian-vector product) of the pooling op.

    Returns a single-element list with the R-op of the pooling output
    with respect to the pooled input, evaluated at ``eval_points[0]``.
    """
    if self.mode != 'max':
        # Average/sum pooling is linear in its input, so the R-op is
        # simply the pooling op itself applied at the evaluation point.
        eval_inputs = [eval_points[0]] + inputs[1:]
        return [self(*eval_inputs)]
    # R_op can receive None as eval_points.
    # That means there is no differentiable path through that input.
    # If this implies that some outputs cannot be computed,
    # return None for those.
    if eval_points[0] is None:
        return [None]
    # For max pooling, the R-op gathers the evaluation-point values at
    # the argmax positions; GpuDownsampleFactorMaxGradGrad computes
    # exactly that given the input, the pooled output, and the point.
    z = self(*inputs)
    x, ws, stride, pad = inputs
    return [
        GpuDownsampleFactorMaxGradGrad(self.ignore_border, self.mode,
                                       self.ndim)(x, z, eval_points[0], ws,
                                                  stride, pad)
    ]
class GpuMaxPoolGrad(CGpuKernelBase): class GpuMaxPoolGrad(CGpuKernelBase):
""" """
...@@ -334,3 +354,72 @@ class GpuDownsampleFactorMaxGradGrad(CGpuKernelBase): ...@@ -334,3 +354,72 @@ class GpuDownsampleFactorMaxGradGrad(CGpuKernelBase):
def connection_pattern(self, node): def connection_pattern(self, node):
return [[1], [1], [1], [0], [0], [0]] return [[1], [1], [1], [0], [0], [0]]
class GpuMaxPoolRop(CGpuKernelBase):
    """
    Implements the R-operator for the downsample operation.

    Given an input and an evaluation point of the same shape, the op
    returns, for each pooling window, the evaluation-point value at the
    position where the input attains its maximum (see the kernels in
    ``pool_max_rop.c``).
    """
    # ignore_border: whether partial windows at the border are dropped
    # mode: pooling mode; only 'max' is supported (asserted in __init__)
    # ndim: number of pooled dimensions (2 or 3)
    __props__ = ('ignore_border', 'mode', 'ndim')

    def __init__(self, ignore_border, mode='max', ndim=2):
        self.ndim = ndim
        self.ignore_border = ignore_border
        self.mode = mode
        CGpuKernelBase.__init__(self, ['pool_max_rop.c'],
                                'APPLY_SPECIFIC(max_pool_rop)')
        # Only max pooling over 2 or 3 dimensions is implemented.
        assert mode == 'max'
        assert ndim in [2, 3]

    def c_headers(self):
        return ['gpuarray_api.h', 'gpuarray_helper.h', 'numpy_compat.h']

    def c_header_dirs(self):
        return [os.path.dirname(__file__), pygpu.get_include()]

    def make_node(self, inp, eval_point, ws, stride=None, pad=None):
        """Build the Apply node.

        Parameters
        ----------
        inp : tensor
            Input of the pooling, of rank ``ndim + 2``.
        eval_point : tensor
            Evaluation point of the R-op; same rank as `inp`.
        ws : tuple/list/int vector
            Window shape, one entry per pooled dimension.
        stride : tuple/list/int vector, optional
            Step between windows; defaults to `ws` (non-overlapping).
        pad : tuple/list/int vector, optional
            Zero padding per pooled dimension; defaults to no padding
            and requires ``ignore_border=True`` when nonzero.
        """
        ctx_name = infer_context_name(inp)
        nd = self.ndim
        inp = as_gpuarray_variable(inp, ctx_name)
        assert (inp.ndim == nd + 2)
        eval_point = as_gpuarray_variable(eval_point, ctx_name)
        assert (eval_point.ndim == nd + 2)

        if stride is None:
            stride = ws
        if pad is None:
            pad = (0,) * nd
        elif isinstance(pad, (tuple, list)):
            if max(pad) != 0 and not self.ignore_border:
                raise ValueError('Padding works only with ignore_border=True')
            if isinstance(ws, (tuple, list)):
                # NOTE(review): the condition compares padding against the
                # window shape, although the message says "strides".
                if any(pad[i] >= ws[i] for i in range(nd)):
                    raise ValueError('Padding must be smaller than strides')

        ws = as_tensor_variable(ws)
        stride = as_tensor_variable(stride)
        pad = as_tensor_variable(pad)
        assert ws.ndim == stride.ndim and ws.ndim == pad.ndim
        assert ws.ndim == 1
        if not ws.dtype.startswith('int'):
            raise TypeError('Window shape parameters must be ints.')
        if not stride.dtype.startswith('int'):
            raise TypeError('Stride parameters must be ints.')
        if not pad.dtype.startswith('int'):
            raise TypeError('Padding parameters must be ints.')
        # The output variable has the type of the evaluation point.
        return Apply(self, [inp, eval_point, ws, stride, pad],
                     [eval_point.type()])

    def get_params(self, node):
        # GPU context the kernels will run in.
        return node.inputs[0].type.context

    def get_op_params(self):
        # Compile-time define consumed by pool_max_rop.c.
        ignore_border = int(self.ignore_border)
        return [('IGNORE_BORDER', ignore_border)]

    def infer_shape(self, node, in_shapes):
        # The output shape is the regular pooled shape of the input.
        ws, stride, pad = [node.inputs[2], node.inputs[3], node.inputs[4]]
        shp = Pool.out_shape(in_shapes[0], ws, self.ignore_border, stride,
                             pad, self.ndim)
        return [shp]
#section kernels

#kernel max_pool2d_rop_kernel : size, size, size, size, size, size, size, *, *, size, size, size, size, size, size, * :

// R-op of 2D max pooling: one thread per output element. Each thread
// scans its pooling window in `x` and writes into `z` the element of
// `ex` (the evaluation point) located at the argmax position of `x`.
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void max_pool2d_rop_kernel(const ga_size nthreads,
   const ga_size num, const ga_size channels, const ga_size pooled_height,
   const ga_size pooled_width, const ga_size height, const ga_size width,
   GLOBAL_MEM const DTYPE_i0 *x, GLOBAL_MEM const DTYPE_i1 *ex,
   const ga_size kernel_h, const ga_size kernel_w,
   const ga_size stride_h, const ga_size stride_w,
   const ga_size pad_h, const ga_size pad_w,
   GLOBAL_MEM DTYPE_o0 *z)
{
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < nthreads;
       index += LDIM_0 * GDIM_0) {
    // Decode (n, c, ph, pw) from the flat C-contiguous output index.
    const ga_size pw = index % pooled_width;
    const ga_size ph = (index / pooled_width) % pooled_height;
    const ga_size c = (index / pooled_width / pooled_height) % channels;
    const ga_size n = (index / pooled_width / pooled_height / channels);
    // Window bounds in the unpadded input. With padding, hstart/wstart
    // may be negative before clamping; the unsigned min for hend/wend
    // still yields the right value because pad < kernel keeps the sum
    // non-negative (wrap-around cancels).
    ga_int hstart = static_cast<ga_int>(ph*stride_h) - static_cast<ga_int>(pad_h);
    const ga_size hend = min(hstart + kernel_h, height);
    ga_int wstart = static_cast<ga_int>(pw*stride_w) - static_cast<ga_int>(pad_w);
    const ga_size wend = min(wstart + kernel_w, width);
    hstart = max(hstart, 0);
    wstart = max(wstart, 0);
    // Offset of the (n, c) image plane in both inputs.
    const ga_size offset = (n*channels + c) * height * width;
    const DTYPE_i0* x_slice = x + offset;
    const DTYPE_i1* ex_slice = ex + offset;
    // Initialize with the first in-window element; the loop below
    // revisits it, which is harmless.
    DTYPE_o0 maxval = x_slice[hstart*width + wstart];
    DTYPE_o0 collector = ex_slice[hstart*width + wstart];
    for (ga_size h=hstart; h < hend; ++h) {
      for (ga_size w=wstart; w < wend; ++w) {
        // maximum in the region
        if (x_slice[h*width + w] > maxval) {
          maxval = x_slice[h*width + w];
          collector = ex_slice[h*width + w];
        }
      }
    }
    z[index] = collector;
  }
}
#kernel max_pool3d_rop_kernel : size, size, size, size, size, size, size, size, size, *, *, size, size, size, size, size, size, size, size, size, * :

// R-op of 3D max pooling: same scheme as the 2D kernel above, with an
// extra depth dimension. One thread per output element; writes the
// `ex` value at the argmax position of `x` within each window.
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
KERNEL void max_pool3d_rop_kernel(const ga_size nthreads,
   const ga_size num, const ga_size channels, const ga_size pooled_depth,
   const ga_size pooled_height, const ga_size pooled_width,
   const ga_size depth, const ga_size height, const ga_size width,
   GLOBAL_MEM const DTYPE_i0 *x, GLOBAL_MEM const DTYPE_i1 *ex,
   const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w,
   const ga_size stride_d, const ga_size stride_h, const ga_size stride_w,
   const ga_size pad_d, const ga_size pad_h, const ga_size pad_w,
   GLOBAL_MEM DTYPE_o0 *z)
{
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < nthreads;
       index += LDIM_0 * GDIM_0) {
    // Decode (n, c, pd, ph, pw) from the flat C-contiguous output index.
    const ga_size pw = index % pooled_width;
    const ga_size ph = (index / pooled_width) % pooled_height;
    const ga_size pd = (index / pooled_width / pooled_height) % pooled_depth;
    const ga_size c = (index / pooled_width / pooled_height / pooled_depth) % channels;
    const ga_size n = (index / pooled_width / pooled_height / pooled_depth / channels);
    // Window bounds in the unpadded input; see the 2D kernel for why
    // the unsigned min is safe with negative (padded) start values.
    ga_int dstart = static_cast<ga_int>(pd*stride_d) - static_cast<ga_int>(pad_d);
    const ga_size dend = min(dstart + kernel_d, depth);
    ga_int hstart = static_cast<ga_int>(ph*stride_h) - static_cast<ga_int>(pad_h);
    const ga_size hend = min(hstart + kernel_h, height);
    ga_int wstart = static_cast<ga_int>(pw*stride_w) - static_cast<ga_int>(pad_w);
    const ga_size wend = min(wstart + kernel_w, width);
    dstart = max(dstart, 0);
    hstart = max(hstart, 0);
    wstart = max(wstart, 0);
    // Offset of the (n, c) volume in both inputs.
    const ga_size offset = (n*channels + c) * depth * height * width;
    const DTYPE_i0* x_slice = x + offset;
    const DTYPE_i1* ex_slice = ex + offset;
    DTYPE_o0 maxval = x_slice[(dstart*height + hstart)*width + wstart];
    DTYPE_o0 collector = ex_slice[(dstart*height + hstart)*width + wstart];
    for (ga_size d=dstart; d < dend; ++d) {
      for (ga_size h=hstart; h < hend; ++h) {
        for (ga_size w=wstart; w < wend; ++w) {
          // maximum in the region
          if (x_slice[(d*height + h)*width + w] > maxval) {
            maxval = x_slice[(d*height + h)*width + w];
            collector = ex_slice[(d*height + h)*width + w];
          }
        }
      }
    }
    z[index] = collector;
  }
}
#section support_code

// output shape for a given input padded shape, window shape and stride.
// With IGNORE_BORDER only complete windows count; otherwise a partial
// window at the end contributes one extra output element.
#define OUTPUT_DIMS(in_dim, ws, st)                       \
  (IGNORE_BORDER ? (in_dim - ws)/st + 1 :                 \
   (st > ws ? (in_dim - 1)/st + 1 :                       \
    std::max<size_t>(0, (in_dim - 1 - ws + st)/st) + 1))
#section support_code_struct

// Compute the R-operator of max pooling on the GPU.
//
// For every pooling window, writes into *z the value of `ex` (the
// evaluation point) taken at the position where `x` attains its
// maximum. `ws`, `stride` and `pad` are 1D int vectors with one entry
// per pooled dimension (2 or 3).
//
// Returns 0 on success, 1 on error with a Python exception set.
int APPLY_SPECIFIC(max_pool_rop)(PyGpuArrayObject *x,
                                 PyGpuArrayObject *ex,
                                 PyArrayObject *ws,
                                 PyArrayObject *stride,
                                 PyArrayObject *pad,
                                 PyGpuArrayObject **z,
                                 PyGpuContextObject *ctx) {
  // The kernels index both inputs with flat C-contiguous arithmetic.
  if (!GpuArray_IS_C_CONTIGUOUS(&x->ga) || !GpuArray_IS_C_CONTIGUOUS(&ex->ga))
    {
      PyErr_Format(PyExc_ValueError,
                   "GpuMaxPoolRop: requires data to be C-contiguous");
      return 1;
    }
  size_t ndims = PyArray_DIM(ws, 0);
  if (PyGpuArray_NDIM(x) != ndims + 2 || PyGpuArray_NDIM(ex) != ndims + 2)
    {
      PyErr_SetString(PyExc_ValueError, "GpuMaxPoolRop: rank error");
      return 1;
    }
  // prepare output
  const size_t* x_dims = PyGpuArray_DIMS(x);
  size_t z_dims[5]; // avoid warning if use 2 + nd
  size_t w[3];
  size_t s[3];
  size_t p[3];
  // The two leading (non-pooled) dimensions are passed through.
  z_dims[0] = x_dims[0];
  z_dims[1] = x_dims[1];

  int nonzero_padding = 0;
  // size_t loop index avoids a signed/unsigned comparison with ndims.
  for (size_t i = 0; i < ndims; i++) {
    w[i] = *((npy_intp*)PyArray_GETPTR1(ws, i));
    s[i] = *((npy_intp*)PyArray_GETPTR1(stride, i));
    p[i] = *((npy_intp*)PyArray_GETPTR1(pad, i));
    z_dims[2 + i] = OUTPUT_DIMS(x_dims[2 + i] + 2*p[i], w[i], s[i]);
    if (p[i] > 0) {
      nonzero_padding = 1;
    }
  }
  if (!IGNORE_BORDER && nonzero_padding) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMaxPoolRop: padding works only with ignore_border=True");
    return 1;
  }
  // The output dtype follows the evaluation point.
  if (theano_prep_output(z, PyGpuArray_NDIM(ex), z_dims,
                         ex->ga.typecode, GA_C_ORDER, ctx) != 0)
    {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMaxPoolRop: failed to allocate memory");
      return 1;
    }
  {
    // scope for running kernel
    int err;
    if (ndims == 2) {
      // One kernel thread per output element.
      size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3];
      err = max_pool2d_rop_kernel_scall(1, &num_kernels, 0, num_kernels,
                                        z_dims[0], z_dims[1], z_dims[2], z_dims[3],
                                        x_dims[2], x_dims[3],
                                        x->ga.data, ex->ga.data,
                                        w[0], w[1], s[0], s[1], p[0], p[1],
                                        (*z)->ga.data);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuMaxPoolRop: max_pool2d_rop_kernel %s.",
                     GpuKernel_error(&k_max_pool2d_rop_kernel, err));
        return 1;
      }
    }
    else if (ndims == 3) {
      size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3] * z_dims[4];
      err = max_pool3d_rop_kernel_scall(1, &num_kernels, 0, num_kernels,
                                        z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4],
                                        x_dims[2], x_dims[3], x_dims[4],
                                        x->ga.data, ex->ga.data,
                                        w[0], w[1], w[2], s[0], s[1], s[2],
                                        p[0], p[1], p[2], (*z)->ga.data);
      if (err != GA_NO_ERROR) {
        // BUGFIX: previously queried k_max_pool2d_rop_kernel here, which
        // reported the wrong kernel's error string for the 3D path.
        PyErr_Format(PyExc_RuntimeError,
                     "GpuMaxPoolRop: max_pool3d_rop_kernel %s.",
                     GpuKernel_error(&k_max_pool3d_rop_kernel, err));
        return 1;
      }
    }
  }
  return 0;
}
...@@ -133,11 +133,26 @@ def test_pool2d(): ...@@ -133,11 +133,26 @@ def test_pool2d():
assert numpy.allclose(g(), g2()), (shp, ws, st, pad, mode, ignore_border) assert numpy.allclose(g(), g2()), (shp, ws, st, pad, mode, ignore_border)
# test grad grad for max pooling # test rop and grad grad for max pooling
# for average pooling grad grad is just average pooling grad # for average pooling grad grad is just average pooling grad
if mode != 'max': if mode != 'max':
continue continue
ea = theano.shared(rand(*shp), 'ea')
gr = theano.function([], tensor.Rop(a_pooled, a, ea), mode=gpu_mode)
gr2 = theano.function([], tensor.Rop(a_pooled, a, ea), mode=ref_mode)
assert any([
isinstance(node.op, GpuDownsampleFactorMaxGradGrad)
for node in gr.maker.fgraph.toposort()
])
assert any([
isinstance(node.op, DownsampleFactorMaxGradGrad)
for node in gr2.maker.fgraph.toposort()
])
assert numpy.allclose(gr(), gr2()), (shp, ws, st, pad, mode, ignore_border)
ggf = gradient.Lop(tensor.grad((a_pooled**2).sum(), a), a, a) ggf = gradient.Lop(tensor.grad((a_pooled**2).sum(), a), a, a)
gg = theano.function([], ggf, mode=gpu_mode) gg = theano.function([], ggf, mode=gpu_mode)
...@@ -228,11 +243,26 @@ def test_pool3d(): ...@@ -228,11 +243,26 @@ def test_pool3d():
assert numpy.allclose(g(), g2()), (shp, ws, st, pad, mode, ignore_border) assert numpy.allclose(g(), g2()), (shp, ws, st, pad, mode, ignore_border)
# test grad grad for max pooling # test rop and grad grad for max pooling
# for average pooling grad grad is just average pooling grad # for average pooling grad grad is just average pooling grad
if mode != 'max': if mode != 'max':
continue continue
ea = theano.shared(rand(*shp), 'ea')
gr = theano.function([], tensor.Rop(a_pooled, a, ea), mode=gpu_mode)
gr2 = theano.function([], tensor.Rop(a_pooled, a, ea), mode=ref_mode)
assert any([
isinstance(node.op, GpuDownsampleFactorMaxGradGrad)
for node in gr.maker.fgraph.toposort()
])
assert any([
isinstance(node.op, DownsampleFactorMaxGradGrad)
for node in gr2.maker.fgraph.toposort()
])
assert numpy.allclose(gr(), gr2()), (shp, ws, st, pad, mode, ignore_border)
ggf = gradient.Lop(tensor.grad((a_pooled**2).sum(), a), a, a) ggf = gradient.Lop(tensor.grad((a_pooled**2).sum(), a), a, a)
gg = theano.function([], ggf, mode=gpu_mode) gg = theano.function([], ggf, mode=gpu_mode)
......
...@@ -17,10 +17,12 @@ from theano.tests import unittest_tools as utt ...@@ -17,10 +17,12 @@ from theano.tests import unittest_tools as utt
from theano import function from theano import function
import theano import theano
from theano import tensor from theano import tensor
import itertools
import numpy import numpy
from theano.gof import Op, Apply from theano.gof import Op, Apply
from theano.gradient import grad_undefined from theano.gradient import grad_undefined
from theano.tests.unittest_tools import SkipTest from theano.tests.unittest_tools import SkipTest
from theano.tensor.signal.pool import Pool
from theano.tensor.nnet import conv, conv2d from theano.tensor.nnet import conv, conv2d
''' '''
...@@ -255,6 +257,47 @@ class test_RopLop(RopLop_checker): ...@@ -255,6 +257,47 @@ class test_RopLop(RopLop_checker):
self.x[:4].dimshuffle('x', 0), 0).sum(axis=1), self.x[:4].dimshuffle('x', 0), 0).sum(axis=1),
(1,)) (1,))
def test_downsample(self):
    """Check Pool's R-op against a scan-built Jacobian-vector reference.

    Covers 1D-4D pooling windows, varying numbers of leading
    non-pooling dimensions, and both values of ignore_border.
    """
    rng = numpy.random.RandomState(utt.fetch_seed())
    # ws, shp: pooling window shape and input shape.
    examples = (
        ((2,), (16,)),
        ((2,), (4, 16,)),
        ((2,), (4, 2, 16,)),
        ((1, 1), (4, 2, 16, 16)),
        ((2, 2), (4, 2, 16, 16)),
        ((3, 3), (4, 2, 16, 16)),
        ((3, 2), (4, 2, 16, 16)),
        ((3, 2, 2), (3, 2, 16, 16, 16)),
        ((2, 3, 2), (3, 2, 16, 16, 16)),
        ((2, 2, 3), (3, 2, 16, 16, 16)),
        ((2, 2, 3, 2), (3, 2, 6, 6, 6, 5)),
    )

    for example, ignore_border in itertools.product(examples, [True, False]):
        (ws, shp) = example
        vx = rng.rand(*shp)
        vex = rng.rand(*shp)

        x = theano.shared(vx)
        ex = theano.shared(vex)

        maxpool_op = Pool(ignore_border, ndim=len(ws))
        a_pooled = maxpool_op(x, ws).flatten()
        # R-op of the pooled output w.r.t. x, evaluated at ex.
        yv = tensor.Rop(a_pooled, x, ex)
        mode = None
        if theano.config.mode == "FAST_COMPILE":
            # NOTE(review): presumably FAST_COMPILE skips optimizations
            # this test relies on — confirm.
            mode = "FAST_RUN"
        rop_f = function([], yv, on_unused_input='ignore', mode=mode)
        # Reference: build J*v row by row via scan over the gradient of
        # each scalar output.
        sy, _ = theano.scan(lambda i, y, x, v:
                            (tensor.grad(y[i], x) * v).sum(),
                            sequences=tensor.arange(a_pooled.shape[0]),
                            non_sequences=[a_pooled, x, ex])
        scan_f = function([], sy, on_unused_input='ignore', mode=mode)
        v1 = rop_f()
        v2 = scan_f()
        assert numpy.allclose(v1, v2), ("Rop mismatch: %s %s" % (v1, v2))
def test_conv(self): def test_conv(self):
for conv_op in [conv.conv2d, conv2d]: for conv_op in [conv.conv2d, conv2d]:
for border_mode in ['valid', 'full']: for border_mode in ['valid', 'full']:
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论