Merge pull request #2543 from yaoli/pool_pad

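Support max pooling with padding: `max_pool_2d`, `DownsampleFactorMax` and `DownsampleFactorMaxGrad` gain a `padding=(pad_h, pad_w)` argument. Non-zero padding requires `ignore_border=True`, each pad size must be smaller than the corresponding pool size in `ds`, and the second-order gradient is not implemented when padding is non-zero.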

Merge pull request #2543 from yaoli/pool_pad
471a1711 · Pascal Lamblin · 787133a7 · 38da1f3a · 471a1711 · 471a1711
--- a/theano/tensor/signal/downsample.py
+++ b/theano/tensor/signal/downsample.py
@@ -4,7 +4,7 @@ Planned:
 DownsampleFactorMax, DownsampleAvg, DownsampleSoftmax.
 """
-#This file should move along with conv.py
+# This file should move along with conv.py
 import __builtin__
 import numpy
@@ -19,7 +19,7 @@ def max_pool2D(*args, **kwargs):
    return max_pool_2d(*args, **kwargs)
-def max_pool_2d(input, ds, ignore_border=False, st=None):
+def max_pool_2d(input, ds, ignore_border=False, st=None, padding=(0, 0)):
    """
    Takes as input a N-D tensor, where N >= 2. It downscales the input image by
    the specified factor, by keeping only the maximum value of non-overlapping
@@ -39,6 +39,10 @@ def max_pool_2d(input, ds, ignore_border=False, st=None):
        over rows/cols to get the next pool region.
        if st is None, it is considered equal to ds
        (no overlap on pooling regions)
+    :param padding: (pad_h, pad_w), margins added around the images; pad_h
+            is the top/bottom margin size, pad_w the left/right margin size.
+            The margins are filled with ``input.min() - 1``, not with zeros.
+    :type padding: tuple of two ints
    """
    if input.ndim < 2:
@@ -62,7 +66,7 @@ def max_pool_2d(input, ds, ignore_border=False, st=None):
    input_4D = tensor.reshape(input, new_shape, ndim=4)
    # downsample mini-batch of images
-    op = DownsampleFactorMax(ds, ignore_border, st=st)
+    op = DownsampleFactorMax(ds, ignore_border, st=st, padding=padding)
    output = op(input_4D)
    # restore to original shape
@@ -77,10 +81,10 @@ class DownsampleFactorMax(Op):
    regions.
    """
-    __props__ = ('ds', 'ignore_border', 'st')
+    __props__ = ('ds', 'ignore_border', 'st', 'padding')
    @staticmethod
-    def out_shape(imgshape, ds, ignore_border=False, st=None):
+    def out_shape(imgshape, ds, ignore_border=False, st=None, padding=(0, 0)):
        """Return the shape of the output from this op, for input of given
        shape and flags.
@@ -101,6 +105,11 @@ class DownsampleFactorMax(Op):
            extra row/col of partial downsampling (False) or ignore it (True).
        :type ignore_border: bool
+        :param padding: (pad_h, pad_w), margins added around the images
+            before pooling; pad_h is added to both the top and the bottom,
+            and pad_w is added to both the left and the right.
+        :type padding: tuple of two ints
        :rtype: list
        :returns: the shape of the output from this op, for input of given
            shape.  This will have the same length as imgshape, but with last
@@ -113,6 +122,8 @@ class DownsampleFactorMax(Op):
        if st is None:
            st = ds
        r, c = imgshape[-2:]
+        r += padding[0] * 2
+        c += padding[1] * 2
        if ignore_border:
            out_r = (r - ds[0]) // st[0] + 1
@@ -149,7 +160,7 @@ class DownsampleFactorMax(Op):
        rval = list(imgshape[:-2]) + [nr, nc]
        return rval
-    def __init__(self, ds, ignore_border=False, st=None):
+    def __init__(self, ds, ignore_border=False, st=None, padding=(0, 0)):
        """
        :param ds: downsample factor over rows and column.
                   ds indicates the pool region size.
@@ -166,6 +177,11 @@ class DownsampleFactorMax(Op):
            (no overlap on pooling regions)
        :type st: list or tuple of two ints
+        :param padding: (pad_h, pad_w), margins added around the images;
+            pad_h is the top/bottom margin size and pad_w the left/right
+            margin size. Must be (0, 0) unless ignore_border is True, and
+            each pad must be smaller than the matching entry of ds.
+        :type padding: tuple of two ints
        """
        self.ds = tuple(ds)
        if not all([isinstance(d, int) for d in ds]):
@@ -176,10 +192,19 @@ class DownsampleFactorMax(Op):
            st = ds
        self.st = tuple(st)
        self.ignore_border = ignore_border
+        self.padding = tuple(padding)
+        self.padding = padding
+        if padding != (0, 0) and not ignore_border:
+            raise NotImplementedError(
+                'padding works only with ignore_boarder=True')
+        if self.padding[0] >= self.ds[0] or self.padding[1] >= self.ds[1]:
+            raise NotImplementedError(
+                'padding_h and padding_w must be smaller than strides')
    def __str__(self):
-        return '%s{%s,%s,%s}' % (self.__class__.__name__,
+        return '%s{%s, %s, %s, %s}' % (
-                                 self.ds, self.st, self.ignore_border)
+            self.__class__.__name__,
+            self.ds, self.st, self.ignore_border, self.padding)
    def make_node(self, x):
        if x.type.ndim != 4:
@@ -195,22 +220,33 @@ class DownsampleFactorMax(Op):
        if len(x.shape) != 4:
            raise NotImplementedError(
                'DownsampleFactorMax requires 4D input for now')
-        z_shape = self.out_shape(x.shape, self.ds, self.ignore_border, self.st)
+        z_shape = self.out_shape(x.shape, self.ds, self.ignore_border, self.st,
+                                 self.padding)
        if (z[0] is None) or (z[0].shape != z_shape):
-            z[0] = numpy.empty(self.out_shape(x.shape, self.ds,
+            z[0] = numpy.empty(
-                                              self.ignore_border, self.st),
+                self.out_shape(x.shape, self.ds, self.ignore_border,
+                               self.st, self.padding),
                dtype=x.dtype)
        zz = z[0]
+        # number of pooling output rows
-        #number of pooling output rows
        pr = zz.shape[-2]
-        #number of pooling output cols
+        # number of pooling output cols
        pc = zz.shape[-1]
        ds0, ds1 = self.ds
        st0, st1 = self.st
-        img_rows = x.shape[-2]
+        pad_h = self.padding[0]
-        img_cols = x.shape[-1]
+        pad_w = self.padding[1]
+        img_rows = x.shape[-2] + 2 * pad_h
+        img_cols = x.shape[-1] + 2 * pad_w
+        # pad the image
+        fill = x.min()-1.
+        y = numpy.zeros(
+            (x.shape[0], x.shape[1], img_rows, img_cols),
+            dtype=x.dtype) + fill
+        y[:, :, pad_h:(img_rows-pad_h), pad_w:(img_cols-pad_w)] = x
+        # max pooling
        for n in xrange(x.shape[0]):
            for k in xrange(x.shape[1]):
                for r in xrange(pr):
@@ -219,7 +255,7 @@ class DownsampleFactorMax(Op):
                    for c in xrange(pc):
                        col_st = c * st1
                        col_end = __builtin__.min(col_st + ds1, img_cols)
-                        zz[n, k, r, c] = x[
+                        zz[n, k, r, c] = y[
                            n, k, row_st:row_end, col_st:col_end].max()
    def infer_shape(self, node, in_shapes):
@@ -233,7 +269,7 @@ class DownsampleFactorMax(Op):
        maxout = self(x)
        return [DownsampleFactorMaxGrad(self.ds,
                                        ignore_border=self.ignore_border,
-                                        st=self.st)(
+                                        st=self.st, padding=self.padding)(
                                            x, maxout, gz)]
    def c_code(self, node, name, inp, out, sub):
@@ -318,18 +354,20 @@ class DownsampleFactorMax(Op):
 class DownsampleFactorMaxGrad(Op):
-    __props__ = ('ds', 'ignore_border', 'st')
+    __props__ = ('ds', 'ignore_border', 'st', 'padding')
-    def __init__(self, ds, ignore_border, st=None):
+    def __init__(self, ds, ignore_border, st=None, padding=(0, 0)):
        self.ds = tuple(ds)
        self.ignore_border = ignore_border
        if st is None:
            st = ds
        self.st = tuple(st)
+        self.padding = tuple(padding)
    def __str__(self):
-        return '%s{%s,%s,%s}' % (self.__class__.__name__,
+        return '%s{%s, %s, %s, %s}' % (
-                                 self.ds, self.st, self.ignore_border)
+            self.__class__.__name__,
+            self.ds, self.st, self.ignore_border, self.padding)
    def make_node(self, x, maxout, gz):
        # make_node should only be called by the grad function of
@@ -343,17 +381,23 @@ class DownsampleFactorMaxGrad(Op):
    def perform(self, node, inp, out):
        x, maxout, gz = inp
        gx_stg, = out
-        gx = numpy.zeros_like(x)
+        # number of pooling output rows
-        #number of pooling output rows
        pr = maxout.shape[-2]
-        #number of pooling output cols
+        # number of pooling output cols
        pc = maxout.shape[-1]
        ds0, ds1 = self.ds
        st0, st1 = self.st
-        img_rows = x.shape[-2]
+        pad_h = self.padding[0]
-        img_cols = x.shape[-1]
+        pad_w = self.padding[1]
+        img_rows = x.shape[-2] + 2 * pad_h
+        img_cols = x.shape[-1] + 2 * pad_w
+        # pad the image
+        fill = x.min()-1
+        y = numpy.zeros(
+            (x.shape[0], x.shape[1], img_rows, img_cols), dtype=x.dtype) + fill
+        y[:, :, pad_h:(img_rows-pad_h), pad_w:(img_cols-pad_w)] = x
+        gx = numpy.zeros_like(y)
        for n in xrange(x.shape[0]):
            for k in xrange(x.shape[1]):
                for r in xrange(pr):
@@ -364,8 +408,10 @@ class DownsampleFactorMaxGrad(Op):
                        col_end = __builtin__.min(col_st + ds1, img_cols)
                        for row_ind in xrange(row_st, row_end):
                            for col_ind in xrange(col_st, col_end):
-                                if (maxout[n, k, r, c] == x[n, k, row_ind, col_ind]):
+                                if (maxout[n, k, r, c] == y[n, k, row_ind, col_ind]):
                                    gx[n, k, row_ind, col_ind] += gz[n, k, r, c]
+        # unpad the image
+        gx = gx[:, :, pad_h:(img_rows-pad_h), pad_w:(img_cols-pad_w)]
        gx_stg[0] = gx
    def infer_shape(self, node, in_shapes):
@@ -374,10 +420,17 @@ class DownsampleFactorMaxGrad(Op):
    def grad(self, inp, grads):
        x, maxout, gz = inp
        ggx, = grads
+        if self.padding == (0, 0):
            return [theano.tensor.zeros_like(x),
                    theano.tensor.zeros_like(maxout),
                    DownsampleFactorMaxGradGrad(
-                    self.ds, ignore_border=self.ignore_border, st=self.st)(x, maxout, ggx)]
+                        self.ds, ignore_border=self.ignore_border,
+                        st=self.st)(x, maxout, ggx)]
+        else:
+            return [theano.tensor.zeros_like(x),
+                    theano.tensor.zeros_like(maxout),
+                    theano.gradients.grad_not_implemented(
+                        self, 2, gz, 'Hessian not implemented with padding')]
    def c_code(self, node, name, inp, out, sub):
        if self.ds != self.st:
@@ -593,9 +646,9 @@ class DownsampleFactorMaxGradGrad(Op):
                               dtype=x.dtype)
        ggz = z[0]
-        #number of pooling output rows
+        # number of pooling output rows
        pr = ggz.shape[-2]
-        #number of pooling output cols
+        # number of pooling output cols
        pc = ggz.shape[-1]
        ds0, ds1 = self.ds
        st0, st1 = self.st

--- a/theano/tensor/signal/tests/test_downsample.py
+++ b/theano/tensor/signal/tests/test_downsample.py
@@ -38,6 +38,49 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
                    output_val[k][i, j] = numpy.max(patch)
        return output_val
+    @staticmethod
+    def numpy_max_pool_2d_stride_padding(
+            x, ds, ignore_border=True, st=None, padding=(0, 0)):
+        pad_h = padding[0]
+        pad_w = padding[1]
+        h = x.shape[-2]
+        w = x.shape[-1]
+        assert ds[0] > pad_h
+        assert ds[1] > pad_w
+        def pad_img(x):
+            fill = x.min()-1
+            t = numpy.ones((x.shape[0], x.shape[1], 1, 1))
+            ud_bar = (numpy.zeros((pad_h, w)) + fill)[
+                numpy.newaxis, numpy.newaxis, :, :] * t
+            lr_bar = (numpy.zeros((pad_h * 2 + h, pad_w)) + fill)[
+                numpy.newaxis, numpy.newaxis, :, :] * t
+            y = numpy.concatenate([ud_bar, x, ud_bar], axis=2)
+            y = numpy.concatenate([lr_bar, y, lr_bar], axis=3)
+            return y
+        img_rows = h + 2 * pad_h
+        img_cols = w + 2 * pad_w
+        out_r = (img_rows - ds[0]) // st[0] + 1
+        out_c = (img_cols - ds[1]) // st[1] + 1
+        out_shp = list(x.shape[:-2])
+        out_shp.append(out_r)
+        out_shp.append(out_c)
+        ds0, ds1 = ds
+        st0, st1 = st
+        output_val = numpy.zeros(out_shp)
+        tt = []
+        y = pad_img(x)
+        for k in numpy.ndindex(*x.shape[:-2]):
+            for i in range(output_val.shape[-2]):
+                ii_st = i * st[0]
+                ii_end = __builtin__.min(ii_st + ds[0], img_rows)
+                for j in range(output_val.shape[-1]):
+                    jj_st = j * st[1]
+                    jj_end = __builtin__.min(jj_st + ds[1], img_cols)
+                    patch = y[k][ii_st:ii_end, jj_st:jj_end]
+                    output_val[k][i, j] = numpy.max(patch)
+        return output_val
    @staticmethod
    def numpy_max_pool_2d_stride(input, ds, ignore_border=False, st=None):
        '''Helper function, implementing max_pool_2d in pure numpy
@@ -196,6 +239,53 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
                output_val = f(imval)
                utt.assert_allclose(output_val, numpy_output_val)
+    def test_DownsampleFactorMaxPaddingStride(self):
+        ignore_border = True  # padding does not support ignore_border=False
+        rng = numpy.random.RandomState(utt.fetch_seed())
+        maxpoolsizes = [(3, 3), (4, 4), (3, 4), (4, 3)]
+        stridesizes = [(2, 2), (2, 2), (1, 1), (1, 2)]
+        paddingsizes = [(2, 2), (1, 2), (2, 1), (0, 0)]
+        imgsizes = [(5, 5), (5, 5), (5, 6), (6, 5)]
+        m = 4 # minibatch
+        c = 10 # channel size
+        images = tensor.dtensor4()
+        for indx in numpy.arange(len(maxpoolsizes)):
+            imgsize = imgsizes[indx]
+            imval = rng.rand(m, c, imgsize[0], imgsize[1])
+            stridesize = stridesizes[indx]
+            maxpoolsize = maxpoolsizes[indx]
+            paddingsize = paddingsizes[indx]
+            numpy_output_val = self.numpy_max_pool_2d_stride_padding(
+                    imval, maxpoolsize, ignore_border, stridesize, paddingsize)
+            maxpool_op = DownsampleFactorMax(
+                maxpoolsize,
+                ignore_border=ignore_border,
+                st=stridesize, padding=paddingsize)(images)
+            f = function([images], maxpool_op)
+            output_val = f(imval)
+            utt.assert_allclose(output_val, numpy_output_val)
+    def test_DownsampleFactorMaxPaddingStride_grad(self):
+        rng = numpy.random.RandomState(utt.fetch_seed())
+        imgsizes = ((10, 10), (10, 5))
+        maxpoolsizes = ((5, 3),(3, 5))
+        stridesizes = ((3, 2), (2, 3))
+        paddingsizes = ((2, 2),(2, 1))
+        for i in range(len(imgsizes)):
+            imgsize = imgsizes[i]
+            imval = rng.rand(1, 1, imgsize[0], imgsize[1]) * 10.0
+            maxpoolsize = maxpoolsizes[i]
+            stridesize = stridesizes[i]
+            paddingsize = paddingsizes[i]
+            def mp(input):
+                return DownsampleFactorMax(
+                    maxpoolsize, ignore_border=True,
+                    st=stridesize,
+                    padding=paddingsize,
+                    )(input)
+            utt.verify_grad(mp, [imval], rng=rng)
    def test_DownsampleFactorMax_grad(self):
        rng = numpy.random.RandomState(utt.fetch_seed())
        maxpoolshps = ((1, 1), (3, 2), (2, 3))