Merge pull request #2783 from nouiz/pool_average

Average pool CPU with python code

Merge pull request #2783 from nouiz/pool_average
9dc07802 · abergeron · 54363a8d · 8df6d348 · 9dc07802 · 9dc07802
--- a/theano/sandbox/cuda/dnn.py
+++ b/theano/sandbox/cuda/dnn.py
@@ -721,7 +721,8 @@ class GpuDnnPoolDesc(GpuOp):
    :param ws: windows size
    :param stride: (dx, dy)
-    :param mode: 'max' or 'average'
+    :param mode: 'max', 'average_inc_pad' or 'average_exc_pad'
+        The old deprecated name 'average' corresponds to 'average_inc_pad'.
    :param pad: (padX, padY) padding information.
        padX is the size of the left and right borders,
        padY is the size of the top and bottom borders.
@@ -744,7 +745,9 @@ class GpuDnnPoolDesc(GpuOp):
        return False
    def __init__(self, ws=(1, 1), stride=(1, 1), mode='max', pad=(0, 0)):
-        assert mode in ('max', 'average')
+        if mode == 'average':
+            mode = 'average_inc_pad'
+        assert mode in ('max', 'average_inc_pad', 'average_exc_pad')
        self.mode = mode
        assert len(ws) == 2
        self.ws = ws
@@ -772,8 +775,12 @@ class GpuDnnPoolDesc(GpuOp):
        if self.mode == 'max':
            mode_flag = 'CUDNN_POOLING_MAX'
-        elif self.mode == "average":
+        elif self.mode == "average_inc_pad":
            mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING'
+        elif self.mode == "average_exc_pad":
+            mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING'
+            if version() == -1:
+                raise Exception("cudnn v1 do not support average_exc_pad")
        else:
            raise NotImplementedError("Unsupported pooling model.")
@@ -1194,7 +1201,8 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)):
    :param img: images to do the pooling over
    :param ws: subsampling window size
    :param stride: subsampling stride (default: (1, 1))
-    :param mode: one of 'max', 'average' (default: 'max')
+    :param mode: one of 'max', 'average_inc_pad' or 'average_exc_pad'
+        (default: 'max')
    :param pad: (padX, padY) padding information.
        padX is the size of the left and right borders,
        padY is the size of the top and bottom borders.
@@ -1625,7 +1633,7 @@ if True:
    @register_opt('cudnn')
    @local_optimizer([DownsampleFactorMax])
-    def local_pool_dnn_stride(node):
+    def local_pool_dnn_alternative(node):
        if not dnn_available():
            return
        if isinstance(node.op, DownsampleFactorMax):
@@ -1635,9 +1643,10 @@ if True:
            ds = node.op.ds
            stride = node.op.st
            pad = node.op.padding
+            mode = node.op.mode
            if (img.owner and isinstance(img.owner.op, HostFromGpu)):
                ret = dnn_pool(gpu_contiguous(img.owner.inputs[0]),
-                               ds, stride=stride, pad=pad)
+                               ds, stride=stride, pad=pad, mode=mode)
                return [host_from_gpu(ret)]
    @register_opt('cudnn')
@@ -1667,12 +1676,13 @@ if True:
            ds = node.op.ds
            st = node.op.st
            pad = node.op.padding
+            mode = node.op.mode
            if ((inp.owner and isinstance(inp.owner.op, HostFromGpu)) or
                (out.owner and isinstance(out.owner.op, HostFromGpu)) or
                (inp_grad.owner and isinstance(inp_grad.owner.op,
                                               HostFromGpu))):
-                desc = GpuDnnPoolDesc(ws=ds, stride=st, mode="max", pad=pad)()
+                desc = GpuDnnPoolDesc(ws=ds, stride=st, mode=mode, pad=pad)()
                if not node.op.ignore_border:
                    return
                ret = GpuDnnPoolGrad()(gpu_contiguous(inp),

--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -1648,8 +1648,9 @@ import theano.tensor.signal.downsample as downsample
 def local_gpu_downsample_factor_max(node):
    if (isinstance(node.op, downsample.DownsampleFactorMax)
        and node.op.ds == node.op.st):
-        assert node.op.__props__ == ('ds', 'ignore_border', 'st', 'padding')
+        assert node.op.__props__ == ('ds', 'ignore_border', 'st', 'padding',
-        if node.op.padding != (0, 0):
+                                     'mode')
+        if node.op.padding != (0, 0) or node.op.mode != 'max':
            return
        x, = node.inputs
        if (x.owner and isinstance(x.owner.op, HostFromGpu)):
@@ -1662,8 +1663,9 @@ def local_gpu_downsample_factor_max(node):
 def local_gpu_downsample_factor_max_grad(node):
    if (isinstance(node.op, downsample.DownsampleFactorMaxGrad) and
        node.op.ds == node.op.st):
-        assert node.op.__props__ == ('ds', 'ignore_border', 'st', 'padding')
+        assert node.op.__props__ == ('ds', 'ignore_border', 'st', 'padding',
-        if node.op.padding != (0, 0):
+                                     'mode')
+        if node.op.padding != (0, 0) or node.op.mode != 'max':
            return
        x, z, gz = node.inputs
        if (x.owner and isinstance(x.owner.op, HostFromGpu)):
@@ -1678,6 +1680,8 @@ def local_gpu_downsample_factor_max_grad(node):
 @local_optimizer([downsample.DownsampleFactorMaxGradGrad])
 def local_gpu_downsample_factor_max_grad_grad(node):
    if isinstance(node.op, downsample.DownsampleFactorMaxGradGrad):
+        assert node.op.__props__ == ('ds', 'ignore_border', 'st')
        x, z, gx = node.inputs
        if (x.owner and isinstance(x.owner.op, HostFromGpu)):
            op = GpuDownsampleFactorMaxGradGrad(node.op.ds,

--- a/theano/sandbox/cuda/tests/test_dnn.py
+++ b/theano/sandbox/cuda/tests/test_dnn.py
@@ -183,8 +183,12 @@ def test_pooling():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    x = T.ftensor4()
-    for func, pad in product((T.max, T.mean),
+    for mode, pad in product(('max', 'average_inc_pad', 'average_exc_pad'),
                             ((0, 0), (1, 0), (1, 0), (2, 3), (3, 2))):
+        if mode == 'max':
+            func = T.max
+        else:
+            func = T.mean
        if pad != (0, 0) and cuda.dnn.version() == -1:
            continue
@@ -195,7 +199,6 @@ def test_pooling():
            for stride in (2, 3):
                if stride > ws:
                    continue
-                if func is T.max:
                if pad[0] > stride or pad[1] > stride:
                    # Not implemented
                    continue
@@ -203,21 +206,16 @@ def test_pooling():
                out1 = max_pool_2d(x, (ws, ws),
                                   st=(stride, stride),
                                   ignore_border=True,
-                                       padding=pad)
+                                   padding=pad, mode=mode)
-                else:
-                    out1 = cuda.dnn.dnn_pool(
-                        x, ws=(ws, ws),
-                        stride=(stride, stride),
-                        pad=pad,
-                        mode='max' if func is T.max else "average")
                out2 = pool_2d_i2n(x, ds=(ws, ws), strides=(stride, stride),
                                   pad=pad,
                                   pool_function=func)
+                mode_without_gpu2 = mode_without_gpu.including()
+                mode_without_gpu2.check_isfinite = False
                f1 = theano.function([x], out1, mode=mode_with_gpu)
                assert any([isinstance(node.op, cuda.dnn.GpuDnnPool)
                            for node in f1.maker.fgraph.apply_nodes])
-                f2 = theano.function([x], out2, mode=mode_without_gpu)
+                f2 = theano.function([x], out2, mode=mode_without_gpu2)
                assert not any([isinstance(node.op, cuda.dnn.GpuDnnPool)
                                for node in f2.maker.fgraph.apply_nodes])
                for shp in [(1, 10, 100, 100),
@@ -245,7 +243,7 @@ def test_pooling():
            # This test the CPU grad + opt + GPU implemtentation
            def fn(x):
                return max_pool_2d(x, (ws, ws), ignore_border=True,
-                                   padding=pad)
+                                   padding=pad, mode=mode)
            theano.tests.unittest_tools.verify_grad(fn, [data],
                                                    cast_to_output_type=False,
                                                    mode=mode_with_gpu)
@@ -261,7 +259,7 @@ def test_pooling():
                    x, ws=(ws, ws),
                    stride=(stride, stride),
                    pad=pad,
-                    mode='max' if func is T.max else "average")
+                    mode=mode)
                return dnn_op
            theano.tests.unittest_tools.verify_grad(
                fn, [data],
@@ -274,11 +272,10 @@ def test_pooling():
                        for node in fg.maker.fgraph.toposort()])
            g_out = fg(data)
-            if func is T.max:
            # Compare again the CPU result
            out = max_pool_2d(x, (ws, ws),
                              padding=pad,
-                                  ignore_border=True)
+                              ignore_border=True, mode=mode)
            fc = theano.function([x], theano.grad(out.sum(), x),
                                 mode=mode_without_gpu)
            assert any([isinstance(node.op, DownsampleFactorMaxGrad)
@@ -523,7 +520,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
        for params in product(
            [(1, 1), (2, 2), (3, 3)],
            [(1, 1), (2, 2), (3, 3)],
-            ['max', 'average']
+            ['max', 'average_inc_pad', 'average_exc_pad']
        ):
            desc = dnn.GpuDnnPoolDesc(
                ws=params[0],
@@ -559,7 +556,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
        for params in product(
            [(1, 1), (2, 2), (3, 3)],
            [(1, 1), (2, 2), (3, 3)],
-            ['max', 'average']
+            ['max', 'average_inc_pad']
        ):
            desc = dnn.GpuDnnPoolDesc(
                ws=params[0],

--- a/theano/tensor/signal/downsample.py
+++ b/theano/tensor/signal/downsample.py
@@ -38,7 +38,8 @@ def max_pool_2d_same_size(input, patch_size):
    return outs
-def max_pool_2d(input, ds, ignore_border=False, st=None, padding=(0, 0)):
+def max_pool_2d(input, ds, ignore_border=False, st=None, padding=(0, 0),
+                mode='max'):
    """
    Takes as input a N-D tensor, where N >= 2. It downscales the input image by
    the specified factor, by keeping only the maximum value of non-overlapping
@@ -62,11 +63,17 @@ def max_pool_2d(input, ds, ignore_border=False, st=None, padding=(0, 0)):
            of the images, pad_h is the size of the top and bottom margins,
            and pad_w is the size of the left and right margins.
    :type padding: tuple of two ints
+    :param mode: 'max', 'average_inc_pad' or 'average_exc_pad'.
+        Operation executed on each window.  `max` always excludes the padding
+        in the computation. `average` gives you the choice to include or
+        exclude it.
+    :type mode: string
    """
    if input.ndim < 2:
        raise NotImplementedError('max_pool_2d requires a dimension >= 2')
    if input.ndim == 4:
-        op = DownsampleFactorMax(ds, ignore_border, st=st, padding=padding)
+        op = DownsampleFactorMax(ds, ignore_border, st=st, padding=padding,
+                                 mode=mode)
        output = op(input)
        return output
@@ -84,7 +91,8 @@ def max_pool_2d(input, ds, ignore_border=False, st=None, padding=(0, 0)):
    input_4D = tensor.reshape(input, new_shape, ndim=4)
    # downsample mini-batch of images
-    op = DownsampleFactorMax(ds, ignore_border, st=st, padding=padding)
+    op = DownsampleFactorMax(ds, ignore_border, st=st, padding=padding,
+                             mode=mode)
    output = op(input_4D)
    # restore to original shape
@@ -94,12 +102,11 @@ def max_pool_2d(input, ds, ignore_border=False, st=None, padding=(0, 0)):
 class DownsampleFactorMax(Op):
    """For N-dimensional tensors, consider that the last two
-    dimensions span images.  This Op downsamples these images by a
+    dimensions span images.  This Op downsamples these images by
-    factor ds, by taking the max over non- overlapping rectangular
+    taking the max or average over different patches.
-    regions.
    """
-    __props__ = ('ds', 'ignore_border', 'st', 'padding')
+    __props__ = ('ds', 'ignore_border', 'st', 'padding', 'mode')
    @staticmethod
    def out_shape(imgshape, ds, ignore_border=False, st=None, padding=(0, 0)):
@@ -178,8 +185,10 @@ class DownsampleFactorMax(Op):
        rval = list(imgshape[:-2]) + [nr, nc]
        return rval
-    def __init__(self, ds, ignore_border=False, st=None, padding=(0, 0)):
+    def __init__(self, ds, ignore_border=False, st=None, padding=(0, 0),
-        """
+                 mode='max'):
+        """ Take the max or average over different input patches.
        :param ds: downsample factor over rows and column.
                   ds indicates the pool region size.
        :type ds: list or tuple of two ints
@@ -193,13 +202,17 @@ class DownsampleFactorMax(Op):
            over rows/cols to get the the next pool region.
            if st is None, it is considered equal to ds
            (no overlap on pooling regions)
-        : type st: list or tuple of two ints
+        :type st: list or tuple of two ints or None
        :param padding: (pad_h, pad_w), pad zeros to extend beyond four borders
            of the images, pad_h is the size of the top and bottom margins,
            and pad_w is the size of the left and right margins.
        :type padding: tuple of two ints
+        :param mode: 'max', 'average_inc_pad', 'average_exc_pad'.
+            ('average_inc_pad' includes the padding in the count,
+            'average_exc_pad' excludes it)
        """
        self.ds = tuple(ds)
        if not all([isinstance(d, int) for d in ds]):
@@ -208,6 +221,7 @@ class DownsampleFactorMax(Op):
                " Got %s" % str(ds))
        if st is None:
            st = ds
+        assert isinstance(st, (tuple, list))
        self.st = tuple(st)
        self.ignore_border = ignore_border
        self.padding = tuple(padding)
@@ -217,11 +231,11 @@ class DownsampleFactorMax(Op):
        if self.padding[0] >= self.ds[0] or self.padding[1] >= self.ds[1]:
            raise NotImplementedError(
                'padding_h and padding_w must be smaller than strides')
+        if mode not in ['max', 'average_inc_pad', 'average_exc_pad']:
-    def __str__(self):
+            raise ValueError(
-        return '%s{%s, %s, %s, %s}' % (
+                "DownsampleFactorMax mode parameter only support 'max',"
-            self.__class__.__name__,
+                " 'average_inc_pad' and 'average_exc_pad'. Got %s" % mode)
-            self.ds, self.st, self.ignore_border, self.padding)
+        self.mode = mode
    def make_node(self, x):
        if x.type.ndim != 4:
@@ -251,27 +265,37 @@ class DownsampleFactorMax(Op):
        pad_w = self.padding[1]
        img_rows = x.shape[-2] + 2 * pad_h
        img_cols = x.shape[-1] + 2 * pad_w
+        inc_pad = self.mode == 'average_inc_pad'
        # pad the image
        if self.padding != (0, 0):
-            fill = x.min()-1.
            y = numpy.zeros(
                (x.shape[0], x.shape[1], img_rows, img_cols),
-                dtype=x.dtype) + fill
+                dtype=x.dtype)
            y[:, :, pad_h:(img_rows-pad_h), pad_w:(img_cols-pad_w)] = x
        else:
            y = x
-        # max pooling
+        func = numpy.max
+        if self.mode != 'max':
+            func = numpy.average
        for n in xrange(x.shape[0]):
            for k in xrange(x.shape[1]):
                for r in xrange(pr):
                    row_st = r * st0
                    row_end = __builtin__.min(row_st + ds0, img_rows)
+                    if not inc_pad:
+                        row_st = __builtin__.max(row_st, self.padding[0])
+                        row_end = __builtin__.min(row_end, x.shape[-2] + pad_h)
                    for c in xrange(pc):
                        col_st = c * st1
                        col_end = __builtin__.min(col_st + ds1, img_cols)
-                        zz[n, k, r, c] = y[
+                        if not inc_pad:
-                            n, k, row_st:row_end, col_st:col_end].max()
+                            col_st = __builtin__.max(col_st, self.padding[1])
+                            col_end = __builtin__.min(col_end,
+                                                      x.shape[-1] + pad_w)
+                        zz[n, k, r, c] = func(y[
+                            n, k, row_st:row_end, col_st:col_end])
    def infer_shape(self, node, in_shapes):
        shp = self.out_shape(in_shapes[0], self.ds,
@@ -284,13 +308,16 @@ class DownsampleFactorMax(Op):
        maxout = self(x)
        return [DownsampleFactorMaxGrad(self.ds,
                                        ignore_border=self.ignore_border,
-                                        st=self.st, padding=self.padding)(
+                                        st=self.st, padding=self.padding,
+                                        mode=self.mode)(
                                            x, maxout, gz)]
    def c_headers(self):
        return ['<algorithm>']
    def c_code(self, node, name, inp, out, sub):
+        if self.mode != 'max':
+            raise theano.gof.utils.MethodNotDefined()
        x, = inp
        z, = out
        fail = sub['fail']
@@ -441,20 +468,20 @@ class DownsampleFactorMax(Op):
 class DownsampleFactorMaxGrad(Op):
-    __props__ = ('ds', 'ignore_border', 'st', 'padding')
+    __props__ = ('ds', 'ignore_border', 'st', 'padding', 'mode')
-    def __init__(self, ds, ignore_border, st=None, padding=(0, 0)):
+    def __init__(self, ds, ignore_border, st=None, padding=(0, 0), mode='max'):
        self.ds = tuple(ds)
        self.ignore_border = ignore_border
        if st is None:
            st = ds
        self.st = tuple(st)
        self.padding = tuple(padding)
+        if mode not in ['max', 'average_inc_pad', 'average_exc_pad']:
-    def __str__(self):
+            raise ValueError(
-        return '%s{%s, %s, %s, %s}' % (
+                "DownsampleFactorMax mode parameter only support 'max',"
-            self.__class__.__name__,
+                " 'average_inc_pad' and 'average_exc_pad'. Got %s" % mode)
-            self.ds, self.st, self.ignore_border, self.padding)
+        self.mode = mode
    def make_node(self, x, maxout, gz):
        # make_node should only be called by the grad function of
@@ -469,6 +496,8 @@ class DownsampleFactorMaxGrad(Op):
        return Apply(self, [x, maxout, gz], [x.type()])
    def perform(self, node, inp, out):
+        if self.mode != 'max' and self.padding != (0, 0):
+            raise NotImplementedError()
        x, maxout, gz = inp
        gx_stg, = out
        # number of pooling output rows
@@ -481,28 +510,49 @@ class DownsampleFactorMaxGrad(Op):
        pad_w = self.padding[1]
        img_rows = x.shape[-2] + 2 * pad_h
        img_cols = x.shape[-1] + 2 * pad_w
+        inc_pad = self.mode == 'average_inc_pad'
        # pad the image
        if self.padding != (0, 0):
-            fill = x.min()-1
            y = numpy.zeros(
                (x.shape[0], x.shape[1], img_rows, img_cols),
-                dtype=x.dtype) + fill
+                dtype=x.dtype)
            y[:, :, pad_h:(img_rows-pad_h), pad_w:(img_cols-pad_w)] = x
        else:
            y = x
        gx = numpy.zeros_like(y)
+        if self.mode == 'max':
            for n in xrange(x.shape[0]):
                for k in xrange(x.shape[1]):
                    for r in xrange(pr):
-                    row_st = r * st0
+                        row_st = __builtin__.max(r * st0, self.padding[0])
                        row_end = __builtin__.min(row_st + ds0, img_rows)
                        for c in xrange(pc):
-                        col_st = c * st1
+                            col_st = __builtin__.max(c * st1, self.padding[1])
                            col_end = __builtin__.min(col_st + ds1, img_cols)
                            for row_ind in xrange(row_st, row_end):
                                for col_ind in xrange(col_st, col_end):
                                    if (maxout[n, k, r, c] == y[n, k, row_ind, col_ind]):
                                        gx[n, k, row_ind, col_ind] += gz[n, k, r, c]
+        else:
+            for n in xrange(x.shape[0]):
+                for k in xrange(x.shape[1]):
+                    for r in xrange(pr):
+                        if inc_pad:
+                            row_st = r * st0
+                        else:
+                            row_st = __builtin__.max(r * st0, self.padding[0])
+                        row_end = __builtin__.min(row_st + ds0, img_rows)
+                        for c in xrange(pc):
+                            if inc_pad:
+                                col_st = c * st1
+                            else:
+                                col_st = __builtin__.max(c * st1,
+                                                         self.padding[1])
+                            col_end = __builtin__.min(col_st + ds1, img_cols)
+                            val = gz[n, k, r, c] / ((row_end - row_st) *
+                                                    (col_end - col_st))
+                            gx[n, k, row_st:row_end, col_st:col_end] += val
        # unpad the image
        gx = gx[:, :, pad_h:(img_rows-pad_h), pad_w:(img_cols-pad_w)]
        gx_stg[0] = gx
@@ -513,7 +563,7 @@ class DownsampleFactorMaxGrad(Op):
    def grad(self, inp, grads):
        x, maxout, gz = inp
        ggx, = grads
-        if self.padding == (0, 0):
+        if self.padding == (0, 0) and self.mode == 'max':
            return [theano.tensor.zeros_like(x),
                    theano.tensor.zeros_like(maxout),
                    DownsampleFactorMaxGradGrad(
@@ -528,6 +578,8 @@ class DownsampleFactorMaxGrad(Op):
    def c_code(self, node, name, inp, out, sub):
        if self.ds != self.st or self.padding != (0, 0):
            raise theano.gof.utils.MethodNotDefined()
+        if self.mode != 'max':
+            raise theano.gof.utils.MethodNotDefined()
        x, z, gz = inp
        gx, = out
        fail = sub['fail']
@@ -624,6 +676,7 @@ class DownsampleFactorMaxGrad(Op):
 class DownsampleFactorMaxGradGrad(Op):
+    __props__ = ('ds', 'ignore_border', 'st')
    @staticmethod
    def out_shape(imgshape, ds, ignore_border=False, st=None):
@@ -702,20 +755,6 @@ class DownsampleFactorMaxGradGrad(Op):
            st = ds
        self.st = tuple(st)
-    def __eq__(self, other):
-        return (type(self) == type(other)
-                and self.ds == other.ds
-                and self.st == other.st
-                and self.ignore_border == other.ignore_border)
-    def __hash__(self):
-        return hash(type(self)) ^ hash(self.ds) ^ \
-            hash(self.st) ^ hash(self.ignore_border)
-    def __str__(self):
-        return '%s{%s,%s,%s}' % (self.__class__.__name__,
-                                 self.ds, self.st, self.ignore_border)
    def make_node(self, x, maxout, gz):
        # make_node should only be called by the grad function of
        # DownsampleFactorMaxGrad, so these asserts should not fail.

--- a/theano/tensor/signal/tests/test_downsample.py
+++ b/theano/tensor/signal/tests/test_downsample.py
+from itertools import product
 import unittest
 import __builtin__
 import numpy
 import theano
 import theano.tensor as tensor
 from theano.tests import unittest_tools as utt
@@ -12,7 +15,7 @@ from theano import function
 class TestDownsampleFactorMax(utt.InferShapeTester):
    @staticmethod
-    def numpy_max_pool_2d(input, ds, ignore_border=False):
+    def numpy_max_pool_2d(input, ds, ignore_border=False, mode='max'):
        '''Helper function, implementing max_pool_2d in pure numpy'''
        if len(input.shape) < 2:
            raise NotImplementedError('input should have at least 2 dim,'
@@ -29,6 +32,9 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
        out_shp.append(input.shape[-2] / ds[0] + xi)
        out_shp.append(input.shape[-1] / ds[1] + yi)
        output_val = numpy.zeros(out_shp)
+        func = numpy.max
+        if mode != 'max':
+            func = numpy.average
        for k in numpy.ndindex(*input.shape[:-2]):
            for i in range(output_val.shape[-2]):
@@ -36,12 +42,12 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
                for j in range(output_val.shape[-1]):
                    jj = j * ds[1]
                    patch = input[k][ii:ii + ds[0], jj:jj + ds[1]]
-                    output_val[k][i, j] = numpy.max(patch)
+                    output_val[k][i, j] = func(patch)
        return output_val
    @staticmethod
    def numpy_max_pool_2d_stride_padding(
-            x, ds, ignore_border=True, st=None, padding=(0, 0)):
+            x, ds, ignore_border=True, st=None, padding=(0, 0), mode='max'):
        pad_h = padding[0]
        pad_w = padding[1]
        h = x.shape[-2]
@@ -50,14 +56,12 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
        assert ds[1] > pad_w
        def pad_img(x):
-            fill = x.min()-1
+            y = numpy.zeros(
-            t = numpy.ones((x.shape[0], x.shape[1], 1, 1))
+                (x.shape[0], x.shape[1],
-            ud_bar = (numpy.zeros((pad_h, w)) + fill)[
+                 x.shape[2]+pad_h*2, x.shape[3]+pad_w*2),
-                numpy.newaxis, numpy.newaxis, :, :] * t
+                dtype=x.dtype)
-            lr_bar = (numpy.zeros((pad_h * 2 + h, pad_w)) + fill)[
+            y[:, :, pad_h:(x.shape[2]+pad_h), pad_w:(x.shape[3]+pad_w)] = x
-                numpy.newaxis, numpy.newaxis, :, :] * t
-            y = numpy.concatenate([ud_bar, x, ud_bar], axis=2)
-            y = numpy.concatenate([lr_bar, y, lr_bar], axis=3)
            return y
        img_rows = h + 2 * pad_h
        img_cols = w + 2 * pad_w
@@ -71,19 +75,31 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
        output_val = numpy.zeros(out_shp)
        tt = []
        y = pad_img(x)
+        func = numpy.max
+        if mode != 'max':
+            func = numpy.average
+        inc_pad = mode == 'average_inc_pad'
        for k in numpy.ndindex(*x.shape[:-2]):
            for i in range(output_val.shape[-2]):
                ii_st = i * st[0]
                ii_end = __builtin__.min(ii_st + ds[0], img_rows)
+                if not inc_pad:
+                    ii_st = __builtin__.max(ii_st, pad_h)
+                    ii_end = __builtin__.min(ii_end, h + pad_h)
                for j in range(output_val.shape[-1]):
                    jj_st = j * st[1]
                    jj_end = __builtin__.min(jj_st + ds[1], img_cols)
+                    if not inc_pad:
+                        jj_st = __builtin__.max(jj_st, pad_w)
+                        jj_end = __builtin__.min(jj_end, w + pad_w)
                    patch = y[k][ii_st:ii_end, jj_st:jj_end]
-                    output_val[k][i, j] = numpy.max(patch)
+                    output_val[k][i, j] = func(patch)
        return output_val
    @staticmethod
-    def numpy_max_pool_2d_stride(input, ds, ignore_border=False, st=None):
+    def numpy_max_pool_2d_stride(input, ds, ignore_border=False, st=None,
+                                 mode='max'):
        '''Helper function, implementing max_pool_2d in pure numpy
           this function provides st input to indicate the stide size
           for the pooling regions. if not indicated, st == sd.'''
@@ -128,6 +144,10 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
        out_shp.append(out_r)
        out_shp.append(out_c)
+        func = numpy.max
+        if mode != 'max':
+            func = numpy.average
        output_val = numpy.zeros(out_shp)
        for k in numpy.ndindex(*input.shape[:-2]):
            for i in range(output_val.shape[-2]):
@@ -137,32 +157,37 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
                    jj_st = j * st[1]
                    jj_end = __builtin__.min(jj_st + ds[1], img_cols)
                    patch = input[k][ii_st:ii_end, jj_st:jj_end]
-                    output_val[k][i, j] = numpy.max(patch)
+                    output_val[k][i, j] = func(patch)
        return output_val
    def test_DownsampleFactorMax(self):
        rng = numpy.random.RandomState(utt.fetch_seed())
        # generate random images
        maxpoolshps = ((1, 1), (2, 2), (3, 3), (2, 3))
-        imval = rng.rand(4, 10, 64, 64)
+        imval = rng.rand(4, 2, 16, 16)
        images = tensor.dtensor4()
+        for maxpoolshp, ignore_border, mode in product(maxpoolshps,
-        for maxpoolshp in maxpoolshps:
+                                                       [True, False],
-            for ignore_border in [True, False]:
+                                                       ['max',
+                                                        'average_inc_pad',
+                                                        'average_exc_pad']):
                # print 'maxpoolshp =', maxpoolshp
                # print 'ignore_border =', ignore_border
                # Pure Numpy computation
                numpy_output_val = self.numpy_max_pool_2d(imval, maxpoolshp,
-                                                          ignore_border)
+                                                          ignore_border,
-                output = max_pool_2d(images, maxpoolshp, ignore_border)
+                                                          mode=mode)
+                output = max_pool_2d(images, maxpoolshp, ignore_border,
+                                     mode=mode)
                f = function([images, ], [output, ])
                output_val = f(imval)
                assert numpy.all(output_val == numpy_output_val)
                # DownsampleFactorMax op
                maxpool_op = DownsampleFactorMax(maxpoolshp,
-                                                 ignore_border=ignore_border)(images)
+                                                 ignore_border=ignore_border,
+                                                 mode=mode)(images)
                f = function([images], maxpool_op)
                output_val = f(imval)
                utt.assert_allclose(output_val, numpy_output_val)
@@ -179,24 +204,30 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
                      (4, 10, 14, 14), (4, 10, 6, 6), (4, 10, 4, 3),
                      (4, 10, 12, 14), (4, 10, 4, 5), (4, 10, 3, 2),
                      (4, 10, 12, 14), (4, 10, 5, 6), (4, 10, 4, 3))
+        # The same for each mode
+        outputshps = outputshps + outputshps + outputshps
        images = tensor.dtensor4()
        indx = 0
-        for maxpoolshp in maxpoolshps:
+        for mode, maxpoolshp, ignore_border in product(['max',
-            for ignore_border in [True, False]:
+                                                        'average_inc_pad',
+                                                        'average_exc_pad'],
+                                                       maxpoolshps,
+                                                       [True, False]):
                for stride in stridesizes:
                    outputshp = outputshps[indx]
                    indx += 1
                    # DownsampleFactorMax op
                    numpy_output_val = \
                        self.numpy_max_pool_2d_stride(imval, maxpoolshp,
-                                                      ignore_border, stride)
+                                                      ignore_border, stride,
+                                                      mode)
                    assert numpy_output_val.shape == outputshp, (
                        "outshape is %s, calculated shape is %s"
                        % (outputshp, numpy_output_val.shape))
                    maxpool_op = \
                        DownsampleFactorMax(maxpoolshp,
                                            ignore_border=ignore_border,
-                                            st=stride)(images)
+                                            st=stride, mode=mode)(images)
                    f = function([images], maxpool_op)
                    output_val = f(imval)
                    utt.assert_allclose(output_val, numpy_output_val)
@@ -219,7 +250,9 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
            imval = rng.rand(4, 10, imvsize[0], imvsize[1])
            stride = stridesizes[indx]
            maxpoolshp = maxpoolshps[indx]
-            for ignore_border in [True, False]:
+            for ignore_border, mode in product([True, False],
+                                               ['max', 'average_inc_pad',
+                                                'average_exc_pad']):
                indx_out = indx * 2
                if not ignore_border:
                    indx_out += 1
@@ -227,14 +260,14 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
                # DownsampleFactorMax op
                numpy_output_val = \
                    self.numpy_max_pool_2d_stride(imval, maxpoolshp,
-                                                  ignore_border, stride)
+                                                  ignore_border, stride, mode)
                assert numpy_output_val.shape == outputshp, (
                    "outshape is %s, calculated shape is %s"
                    % (outputshp, numpy_output_val.shape))
                maxpool_op = \
                    DownsampleFactorMax(maxpoolshp,
                                        ignore_border=ignore_border,
-                                        st=stride)(images)
+                                        st=stride, mode=mode)(images)
                f = function([images], maxpool_op)
                output_val = f(imval)
                utt.assert_allclose(output_val, numpy_output_val)
@@ -247,20 +280,24 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
        paddingsizes = [(2, 2), (1, 2), (2, 1), (0, 0), (1, 1)]
        imgsizes = [(5, 5), (5, 5), (5, 6), (6, 5), (5, 5)]
        m = 4  # minibatch
-        c = 10  # channel size
+        c = 2  # channel size
        images = tensor.dtensor4()
-        for indx in numpy.arange(len(maxpoolsizes)):
+        for indx, mode in product(numpy.arange(len(maxpoolsizes)),
+                                  ['max', 'average_inc_pad',
+                                   'average_exc_pad']):
            imgsize = imgsizes[indx]
-            imval = rng.rand(m, c, imgsize[0], imgsize[1])
+            imval = rng.rand(m, c, imgsize[0], imgsize[1]) - 0.5
            stridesize = stridesizes[indx]
            maxpoolsize = maxpoolsizes[indx]
            paddingsize = paddingsizes[indx]
            numpy_output_val = self.numpy_max_pool_2d_stride_padding(
-                    imval, maxpoolsize, ignore_border, stridesize, paddingsize)
+                imval, maxpoolsize, ignore_border,
+                stridesize, paddingsize, mode)
            maxpool_op = DownsampleFactorMax(
                maxpoolsize,
                ignore_border=ignore_border,
-                st=stridesize, padding=paddingsize)(images)
+                st=stridesize, padding=paddingsize, mode=mode)(images)
            f = function([images], maxpool_op)
            output_val = f(imval)
            utt.assert_allclose(output_val, numpy_output_val)
@@ -447,20 +484,26 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
        imval = rng.rand(4, 5)
        images = tensor.dmatrix()
-        for maxpoolshp in maxpoolshps:
+        for maxpoolshp, ignore_border, mode in product(maxpoolshps,
-            for ignore_border in [True, False]:
+                                                       [True, False],
+                                                       ['max',
+                                                        'average_inc_pad',
+                                                        'average_exc_pad']):
                # print 'maxpoolshp =', maxpoolshp
                # print 'ignore_border =', ignore_border
                numpy_output_val = self.numpy_max_pool_2d(imval, maxpoolshp,
-                                                          ignore_border)
+                                                          ignore_border,
-                output = max_pool_2d(images, maxpoolshp, ignore_border)
+                                                          mode=mode)
+                output = max_pool_2d(images, maxpoolshp, ignore_border,
+                                     mode=mode)
                output_val = function([images], output)(imval)
                assert numpy.all(output_val == numpy_output_val), (
                    "output_val is %s, numpy_output_val is %s"
                    % (output_val, numpy_output_val))
                def mp(input):
-                    return max_pool_2d(input, maxpoolshp, ignore_border)
+                    return max_pool_2d(input, maxpoolshp, ignore_border,
+                                       mode=mode)
                utt.verify_grad(mp, [imval], rng=rng)
    def test_max_pool_2d_2D_same_size(self):
@@ -492,13 +535,18 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
        imval = rng.rand(2, 3, 4)
        images = tensor.dtensor3()
-        for maxpoolshp in maxpoolshps:
+        for maxpoolshp, ignore_border, mode in product(maxpoolshps,
-            for ignore_border in [True, False]:
+                                                       [True, False],
+                                                       ['max',
+                                                        'average_inc_pad',
+                                                        'average_exc_pad']):
                # print 'maxpoolshp =', maxpoolshp
                # print 'ignore_border =', ignore_border
                numpy_output_val = self.numpy_max_pool_2d(imval, maxpoolshp,
-                                                          ignore_border)
+                                                          ignore_border,
-                output = max_pool_2d(images, maxpoolshp, ignore_border)
+                                                          mode)
+                output = max_pool_2d(images, maxpoolshp, ignore_border,
+                                     mode=mode)
                output_val = function([images], output)(imval)
                assert numpy.all(output_val == numpy_output_val), (
                    "output_val is %s, numpy_output_val is %s"
@@ -524,13 +572,18 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
        imval = rng.rand(2, 1, 1, 1, 3, 4)
        images = tensor.TensorType('float64', [False] * 6)()
-        for maxpoolshp in maxpoolshps:
+        for maxpoolshp, ignore_border, mode in product(maxpoolshps,
-            for ignore_border in [True, False]:
+                                                       [True, False],
+                                                       ['max',
+                                                        'average_inc_pad',
+                                                        'average_exc_pad']):
                # print 'maxpoolshp =', maxpoolshp
                # print 'ignore_border =', ignore_border
                numpy_output_val = self.numpy_max_pool_2d(imval, maxpoolshp,
-                                                          ignore_border)
+                                                          ignore_border,
-                output = max_pool_2d(images, maxpoolshp, ignore_border)
+                                                          mode=mode)
+                output = max_pool_2d(images, maxpoolshp, ignore_border,
+                                     mode=mode)
                output_val = function([images], output)(imval)
                assert numpy.all(output_val == numpy_output_val)