Commit adf81d30 authored by vdumoulin

Merge pull request #3679 from shabanian/tensor_signal_pep8

Renamed max_pool_2d to pool_2d and DownsampleFactorMax to Pool
...@@ -12,8 +12,8 @@ from theano.compile import optdb ...@@ -12,8 +12,8 @@ from theano.compile import optdb
from theano.compile.ops import shape_i from theano.compile.ops import shape_i
from theano.tensor.nnet import SoftmaxGrad from theano.tensor.nnet import SoftmaxGrad
from theano.tensor.nnet.abstract_conv import get_conv_output_shape from theano.tensor.nnet.abstract_conv import get_conv_output_shape
from theano.tensor.signal.downsample import ( from theano.tensor.signal.pool import (
DownsampleFactorMax, MaxPoolGrad, AveragePoolGrad) Pool, MaxPoolGrad, AveragePoolGrad)
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda import GpuOp from theano.sandbox.cuda import GpuOp
...@@ -2299,11 +2299,11 @@ if True: ...@@ -2299,11 +2299,11 @@ if True:
return [dnn_pool(gpu_contiguous(img), ds, ds)] return [dnn_pool(gpu_contiguous(img), ds, ds)]
@register_opt('cudnn') @register_opt('cudnn')
@local_optimizer([DownsampleFactorMax]) @local_optimizer([Pool])
def local_pool_dnn_alternative(node): def local_pool_dnn_alternative(node):
if not dnn_available(): if not dnn_available():
return return
if isinstance(node.op, DownsampleFactorMax): if isinstance(node.op, Pool):
if not node.op.ignore_border: if not node.op.ignore_border:
return return
img, = node.inputs img, = node.inputs
......
...@@ -137,14 +137,14 @@ register_opt(name='local_gpu_reshape_chain')( ...@@ -137,14 +137,14 @@ register_opt(name='local_gpu_reshape_chain')(
# This is a partial list of CPU ops that can be in some circonstance # This is a partial list of CPU ops that can be in some circonstance
# moved to the GPU. This list is used by an optimization. # moved to the GPU. This list is used by an optimization.
# Hopefully, we can keep this list up to date. # Hopefully, we can keep this list up to date.
import theano.tensor.signal.downsample import theano.tensor.signal.pool
import theano.tensor.nnet.neighbours import theano.tensor.nnet.neighbours
cpu_ops_moved_to_gpu = [ cpu_ops_moved_to_gpu = [
tensor.blas.Dot22, tensor.blas.Dot22Scalar, tensor.blas.Gemm, tensor.blas.Dot22, tensor.blas.Dot22Scalar, tensor.blas.Gemm,
tensor.blas.Gemv, tensor.blas.Ger, tensor.nnet.conv.ConvOp, tensor.blas.Gemv, tensor.blas.Ger, tensor.nnet.conv.ConvOp,
tensor.signal.downsample.DownsampleFactorMax, tensor.signal.pool.Pool,
tensor.signal.downsample.MaxPoolGrad, tensor.signal.pool.MaxPoolGrad,
tensor.signal.downsample.AveragePoolGrad, tensor.signal.pool.AveragePoolGrad,
theano.tensor.nnet.neighbours.Images2Neibs, theano.tensor.nnet.neighbours.Images2Neibs,
tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias, tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias,
tensor.nnet.CrossentropySoftmax1HotWithBiasDx, tensor.nnet.CrossentropySoftmax1HotWithBiasDx,
...@@ -1848,13 +1848,13 @@ gpu_optimizer.register("convtransp3d_gemm", local_convtransp3d_gemm) ...@@ -1848,13 +1848,13 @@ gpu_optimizer.register("convtransp3d_gemm", local_convtransp3d_gemm)
# Pooling # Pooling
import theano.tensor.signal.downsample as downsample import theano.tensor.signal.pool as pool
@register_opt() @register_opt()
@local_optimizer([downsample.DownsampleFactorMax]) @local_optimizer([pool.Pool])
def local_gpu_downsample_factor_max(node): def local_gpu_downsample_factor_max(node):
if (isinstance(node.op, downsample.DownsampleFactorMax) if (isinstance(node.op, pool.Pool)
and node.op.ds == node.op.st): and node.op.ds == node.op.st):
assert node.op.__props__ == ('ds', 'ignore_border', 'st', 'padding', assert node.op.__props__ == ('ds', 'ignore_border', 'st', 'padding',
...@@ -1868,9 +1868,9 @@ def local_gpu_downsample_factor_max(node): ...@@ -1868,9 +1868,9 @@ def local_gpu_downsample_factor_max(node):
@register_opt() @register_opt()
@local_optimizer([downsample.MaxPoolGrad]) @local_optimizer([pool.MaxPoolGrad])
def local_gpu_downsample_factor_max_grad(node): def local_gpu_downsample_factor_max_grad(node):
if (isinstance(node.op, downsample.MaxPoolGrad) and if (isinstance(node.op, pool.MaxPoolGrad) and
node.op.ds == node.op.st): node.op.ds == node.op.st):
assert node.op.__props__ == ('ds', 'ignore_border', 'st', 'padding', assert node.op.__props__ == ('ds', 'ignore_border', 'st', 'padding',
...@@ -1890,9 +1890,9 @@ def local_gpu_downsample_factor_max_grad(node): ...@@ -1890,9 +1890,9 @@ def local_gpu_downsample_factor_max_grad(node):
@register_opt() @register_opt()
@local_optimizer([downsample.DownsampleFactorMaxGradGrad]) @local_optimizer([pool.DownsampleFactorMaxGradGrad])
def local_gpu_downsample_factor_max_grad_grad(node): def local_gpu_downsample_factor_max_grad_grad(node):
if isinstance(node.op, downsample.DownsampleFactorMaxGradGrad): if isinstance(node.op, pool.DownsampleFactorMaxGradGrad):
assert node.op.__props__ == ('ds', 'ignore_border', 'st', assert node.op.__props__ == ('ds', 'ignore_border', 'st',
'padding', 'mode') 'padding', 'mode')
if node.op.padding != (0, 0) or node.op.mode != 'max': if node.op.padding != (0, 0) or node.op.mode != 'max':
......
...@@ -16,7 +16,7 @@ if cuda_ndarray.cuda_available == False: ...@@ -16,7 +16,7 @@ if cuda_ndarray.cuda_available == False:
import theano.sandbox.cuda as tcn import theano.sandbox.cuda as tcn
from theano.tensor.signal.downsample import (DownsampleFactorMax, from theano.tensor.signal.pool import (Pool,
DownsampleFactorMaxGrad, DownsampleFactorMaxGradGrad) DownsampleFactorMaxGrad, DownsampleFactorMaxGradGrad)
import theano.compile.mode import theano.compile.mode
...@@ -280,7 +280,7 @@ class TestBlasStridesGpu(TestBlasStrides): ...@@ -280,7 +280,7 @@ class TestBlasStridesGpu(TestBlasStrides):
if 0: if 0:
# This is commented out because it doesn't make sense... # This is commented out because it doesn't make sense...
# tcn.blas has no op called DownsampleFactorMax # tcn.blas has no op called Pool
# tcn.blas has an op called GpuDownsampleFactorMax, but that op requires arguments that are # tcn.blas has an op called GpuDownsampleFactorMax, but that op requires arguments that are
# CudaNdarrayType variables... so rethink this test? # CudaNdarrayType variables... so rethink this test?
def test_maxpool(): def test_maxpool():
...@@ -290,7 +290,7 @@ if 0: ...@@ -290,7 +290,7 @@ if 0:
[[[[6, 8, 9], [ 16, 18, 19], [ 21, 23, 24]]]])]: [[[[6, 8, 9], [ 16, 18, 19], [ 21, 23, 24]]]])]:
for border, ret in [(True, r_true), (False, r_false)]: for border, ret in [(True, r_true), (False, r_false)]:
ret = numpy.array(ret) ret = numpy.array(ret)
a = tcn.blas.DownsampleFactorMax((2, 2), border) a = tcn.blas.Pool((2, 2), border)
dmatrix4 = tensor.TensorType("float32", (False, False, False, False)) dmatrix4 = tensor.TensorType("float32", (False, False, False, False))
b = dmatrix4() b = dmatrix4()
f = pfunc([b], [a(b)], mode=mode_with_gpu) f = pfunc([b], [a(b)], mode=mode_with_gpu)
...@@ -347,7 +347,7 @@ def test_downsample(): ...@@ -347,7 +347,7 @@ def test_downsample():
continue continue
for ignore_border in (True, False): for ignore_border in (True, False):
# print 'test_downsample', shp, ds, ignore_border # print 'test_downsample', shp, ds, ignore_border
ds_op = DownsampleFactorMax(ds, ignore_border=ignore_border) ds_op = Pool(ds, ignore_border=ignore_border)
a = tcn.shared_constructor(my_rand(*shp), 'a') a = tcn.shared_constructor(my_rand(*shp), 'a')
f = pfunc([], ds_op(tensor.as_tensor_variable(a)), f = pfunc([], ds_op(tensor.as_tensor_variable(a)),
...@@ -357,7 +357,7 @@ def test_downsample(): ...@@ -357,7 +357,7 @@ def test_downsample():
assert any([isinstance(node.op, assert any([isinstance(node.op,
tcn.blas.GpuDownsampleFactorMax) tcn.blas.GpuDownsampleFactorMax)
for node in f.maker.fgraph.toposort()]) for node in f.maker.fgraph.toposort()])
assert any([isinstance(node.op, DownsampleFactorMax) assert any([isinstance(node.op, Pool)
for node in f2.maker.fgraph.toposort()]) for node in f2.maker.fgraph.toposort()])
assert numpy.allclose(f(), f2()) assert numpy.allclose(f(), f2())
......
...@@ -9,8 +9,8 @@ from six import StringIO ...@@ -9,8 +9,8 @@ from six import StringIO
import theano.tensor as T import theano.tensor as T
import theano.tests.unittest_tools as utt import theano.tests.unittest_tools as utt
from theano.sandbox.neighbours import images2neibs from theano.sandbox.neighbours import images2neibs
from theano.tensor.signal.downsample import max_pool_2d from theano.tensor.signal.pool import pool_2d
from theano.tensor.signal.downsample import MaxPoolGrad, AveragePoolGrad from theano.tensor.signal.pool import MaxPoolGrad, AveragePoolGrad
import theano.sandbox.cuda.dnn as dnn import theano.sandbox.cuda.dnn as dnn
from theano.sandbox.cuda.basic_ops import GpuAllocEmpty, gpu_alloc_empty from theano.sandbox.cuda.basic_ops import GpuAllocEmpty, gpu_alloc_empty
from theano.sandbox.cuda import float32_shared_constructor as shared from theano.sandbox.cuda import float32_shared_constructor as shared
...@@ -256,10 +256,10 @@ def test_pooling(): ...@@ -256,10 +256,10 @@ def test_pooling():
# Not implemented # Not implemented
continue continue
# We will check that the opt introduced it. # We will check that the opt introduced it.
out1 = max_pool_2d(x, (ws, ws), out1 = pool_2d(x, (ws, ws),
st=(stride, stride), st=(stride, stride),
ignore_border=True, ignore_border=True,
padding=pad, mode=mode) padding=pad, mode=mode)
out2 = pool_2d_i2n(x, ds=(ws, ws), strides=(stride, stride), out2 = pool_2d_i2n(x, ds=(ws, ws), strides=(stride, stride),
pad=pad, pad=pad,
pool_function=func) pool_function=func)
...@@ -294,8 +294,8 @@ def test_pooling(): ...@@ -294,8 +294,8 @@ def test_pooling():
# This test the CPU grad + opt + GPU implemtentation # This test the CPU grad + opt + GPU implemtentation
def fn(x): def fn(x):
return max_pool_2d(x, (ws, ws), ignore_border=True, return pool_2d(x, (ws, ws), ignore_border=True,
padding=pad, mode=mode) padding=pad, mode=mode)
theano.tests.unittest_tools.verify_grad(fn, [data], theano.tests.unittest_tools.verify_grad(fn, [data],
cast_to_output_type=False, cast_to_output_type=False,
mode=mode_with_gpu) mode=mode_with_gpu)
...@@ -325,9 +325,9 @@ def test_pooling(): ...@@ -325,9 +325,9 @@ def test_pooling():
g_out = fg(data) g_out = fg(data)
# Compare again the CPU result # Compare again the CPU result
out = max_pool_2d(x, (ws, ws), out = pool_2d(x, (ws, ws),
padding=pad, padding=pad,
ignore_border=True, mode=mode) ignore_border=True, mode=mode)
fc = theano.function([x], theano.grad(out.sum(), x), fc = theano.function([x], theano.grad(out.sum(), x),
mode=mode_without_gpu) mode=mode_without_gpu)
if mode == 'max': if mode == 'max':
...@@ -453,7 +453,7 @@ def test_pooling_opt(): ...@@ -453,7 +453,7 @@ def test_pooling_opt():
f = theano.function( f = theano.function(
[x], [x],
max_pool_2d(x, ds=(2, 2), mode='average_inc_pad', ignore_border=True), pool_2d(x, ds=(2, 2), mode='average_inc_pad', ignore_border=True),
mode=mode_with_gpu) mode=mode_with_gpu)
assert any([isinstance(n.op, cuda.dnn.GpuDnnPool) assert any([isinstance(n.op, cuda.dnn.GpuDnnPool)
...@@ -463,8 +463,8 @@ def test_pooling_opt(): ...@@ -463,8 +463,8 @@ def test_pooling_opt():
f = theano.function( f = theano.function(
[x], [x],
T.grad(max_pool_2d(x, ds=(2, 2), mode='average_inc_pad', T.grad(pool_2d(x, ds=(2, 2), mode='average_inc_pad',
ignore_border=True).sum(), x), ignore_border=True).sum(), x),
mode=mode_with_gpu.including("cudnn")) mode=mode_with_gpu.including("cudnn"))
assert any([isinstance(n.op, cuda.dnn.GpuDnnPoolGrad) assert any([isinstance(n.op, cuda.dnn.GpuDnnPoolGrad)
...@@ -618,7 +618,7 @@ def test_dnn_tag(): ...@@ -618,7 +618,7 @@ def test_dnn_tag():
try: try:
f = theano.function( f = theano.function(
[x], [x],
max_pool_2d(x, ds=(2, 2), ignore_border=True), pool_2d(x, ds=(2, 2), ignore_border=True),
mode=mode_with_gpu.including("cudnn")) mode=mode_with_gpu.including("cudnn"))
except (AssertionError, RuntimeError): except (AssertionError, RuntimeError):
assert not cuda.dnn.dnn_available() assert not cuda.dnn.dnn_available()
......
...@@ -14,7 +14,7 @@ from theano.compile.pfunc import pfunc ...@@ -14,7 +14,7 @@ from theano.compile.pfunc import pfunc
from theano import tensor from theano import tensor
from theano import config from theano import config
import theano.tensor.nnet.conv as conv import theano.tensor.nnet.conv as conv
import theano.tensor.signal.downsample as downsample import theano.tensor.signal.pool as pool
import theano.sandbox.cuda as tcn import theano.sandbox.cuda as tcn
import theano.tests.unittest_tools as utt import theano.tests.unittest_tools as utt
...@@ -372,7 +372,7 @@ def build_conv_nnet2_classif(use_gpu, isize, ksize, n_batch, ...@@ -372,7 +372,7 @@ def build_conv_nnet2_classif(use_gpu, isize, ksize, n_batch,
(n_kern, logical_hid_shape[0] // 2, logical_hid_shape[1] // 2), (n_kern, logical_hid_shape[0] // 2, logical_hid_shape[1] // 2),
shape_kern1[2:], n_kern1, n_batch, 1, 1, verbose=verbose, version=version) shape_kern1[2:], n_kern1, n_batch, 1, 1, verbose=verbose, version=version)
ds_op = downsample.DownsampleFactorMax((2, 2), ignore_border=False) ds_op = pool.Pool((2, 2), ignore_border=False)
if downsample_ops: if downsample_ops:
hid = tensor.tanh(ds_op(conv_op(x, w0) + b0.dimshuffle((0, 'x', 'x')))) hid = tensor.tanh(ds_op(conv_op(x, w0) + b0.dimshuffle((0, 'x', 'x'))))
else: else:
...@@ -612,7 +612,7 @@ def test_lenet_32(): # CIFAR10 / Shapeset ...@@ -612,7 +612,7 @@ def test_lenet_32(): # CIFAR10 / Shapeset
def test_lenet_32_long(): # CIFAR10 / Shapeset def test_lenet_32_long(): # CIFAR10 / Shapeset
# this tests the gradient of downsample on the GPU, # this tests the gradient of pool on the GPU,
# which does not recieve specific testing # which does not recieve specific testing
cmp_run_conv_nnet2_classif(seed, 32, 5, 30, n_train=50, cmp_run_conv_nnet2_classif(seed, 32, 5, 30, n_train=50,
ignore_error=ignore_error, gpu_only=gpu_only, ignore_error=ignore_error, gpu_only=gpu_only,
......
...@@ -17,9 +17,8 @@ from theano.tensor.nnet.abstract_conv import (AbstractConv2d, ...@@ -17,9 +17,8 @@ from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
AbstractConv2d_gradWeights, AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs, AbstractConv2d_gradInputs,
get_conv_output_shape) get_conv_output_shape)
from theano.tensor.signal.downsample import (DownsampleFactorMax, from theano.tensor.signal.pool import (
MaxPoolGrad, AveragePoolGrad) Pool, MaxPoolGrad, AveragePoolGrad)
from . import pygpu from . import pygpu
from .type import get_context, gpu_context_type, list_contexts, GpuArrayType from .type import get_context, gpu_context_type, list_contexts, GpuArrayType
from .basic_ops import (as_gpuarray_variable, infer_context_name, from .basic_ops import (as_gpuarray_variable, infer_context_name,
...@@ -1383,7 +1382,7 @@ def local_dnn_convi_output_merge(node, *inputs): ...@@ -1383,7 +1382,7 @@ def local_dnn_convi_output_merge(node, *inputs):
@register_opt('cudnn') @register_opt('cudnn')
@op_lifter([DownsampleFactorMax]) @op_lifter([Pool])
def local_pool_dnn_alternative(node, ctx_name): def local_pool_dnn_alternative(node, ctx_name):
if not dnn_available(ctx_name): if not dnn_available(ctx_name):
return return
......
...@@ -9,8 +9,8 @@ from six import StringIO ...@@ -9,8 +9,8 @@ from six import StringIO
import theano.tensor as T import theano.tensor as T
import theano.tests.unittest_tools as utt import theano.tests.unittest_tools as utt
from theano.sandbox.neighbours import images2neibs from theano.sandbox.neighbours import images2neibs
from theano.tensor.signal.downsample import max_pool_2d from theano.tensor.signal.pool import pool_2d
from theano.tensor.signal.downsample import MaxPoolGrad, AveragePoolGrad from theano.tensor.signal.pool import MaxPoolGrad, AveragePoolGrad
from .. import dnn from .. import dnn
from ..basic_ops import GpuAllocEmpty from ..basic_ops import GpuAllocEmpty
...@@ -185,10 +185,10 @@ def test_pooling(): ...@@ -185,10 +185,10 @@ def test_pooling():
# Not implemented # Not implemented
continue continue
# We will check that the opt introduced it. # We will check that the opt introduced it.
out1 = max_pool_2d(x, (ws, ws), out1 = pool_2d(x, (ws, ws),
st=(stride, stride), st=(stride, stride),
ignore_border=True, ignore_border=True,
padding=pad, mode=mode) padding=pad, mode=mode)
out2 = pool_2d_i2n(x, ds=(ws, ws), strides=(stride, stride), out2 = pool_2d_i2n(x, ds=(ws, ws), strides=(stride, stride),
pad=pad, pad=pad,
pool_function=func) pool_function=func)
...@@ -223,8 +223,8 @@ def test_pooling(): ...@@ -223,8 +223,8 @@ def test_pooling():
# This test the CPU grad + opt + GPU implemtentation # This test the CPU grad + opt + GPU implemtentation
def fn(x): def fn(x):
return max_pool_2d(x, (ws, ws), ignore_border=True, return pool_2d(x, (ws, ws), ignore_border=True,
padding=pad, mode=mode) padding=pad, mode=mode)
utt.verify_grad(fn, [data], utt.verify_grad(fn, [data],
cast_to_output_type=False, cast_to_output_type=False,
mode=mode_with_gpu) mode=mode_with_gpu)
...@@ -253,9 +253,9 @@ def test_pooling(): ...@@ -253,9 +253,9 @@ def test_pooling():
g_out = fg(data) g_out = fg(data)
# Compare against the CPU result # Compare against the CPU result
out = max_pool_2d(x, (ws, ws), out = pool_2d(x, (ws, ws),
padding=pad, padding=pad,
ignore_border=True, mode=mode) ignore_border=True, mode=mode)
fc = theano.function([x], theano.grad(out.sum(), x), fc = theano.function([x], theano.grad(out.sum(), x),
mode=mode_without_gpu) mode=mode_without_gpu)
if mode == 'max': if mode == 'max':
...@@ -276,8 +276,8 @@ def test_pooling_opt(): ...@@ -276,8 +276,8 @@ def test_pooling_opt():
f = theano.function( f = theano.function(
[x], [x],
max_pool_2d(x, ds=(2, 2), mode='average_inc_pad', pool_2d(x, ds=(2, 2), mode='average_inc_pad',
ignore_border=True), ignore_border=True),
mode=mode_with_gpu) mode=mode_with_gpu)
assert any([isinstance(n.op, dnn.GpuDnnPool) assert any([isinstance(n.op, dnn.GpuDnnPool)
...@@ -287,8 +287,8 @@ def test_pooling_opt(): ...@@ -287,8 +287,8 @@ def test_pooling_opt():
f = theano.function( f = theano.function(
[x], [x],
T.grad(max_pool_2d(x, ds=(2, 2), mode='average_inc_pad', T.grad(pool_2d(x, ds=(2, 2), mode='average_inc_pad',
ignore_border=True).sum(), ignore_border=True).sum(),
x), x),
mode=mode_with_gpu.including("cudnn")) mode=mode_with_gpu.including("cudnn"))
...@@ -315,7 +315,7 @@ def test_dnn_tag(): ...@@ -315,7 +315,7 @@ def test_dnn_tag():
try: try:
f = theano.function( f = theano.function(
[x], [x],
max_pool_2d(x, ds=(2, 2), ignore_border=True), pool_2d(x, ds=(2, 2), ignore_border=True),
mode=mode_with_gpu.including("cudnn")) mode=mode_with_gpu.including("cudnn"))
except (AssertionError, RuntimeError): except (AssertionError, RuntimeError):
assert not dnn.dnn_available(test_ctx_name) assert not dnn.dnn_available(test_ctx_name)
......
...@@ -3,7 +3,7 @@ Contains an Op for convolving input images with a set of filters. This was ...@@ -3,7 +3,7 @@ Contains an Op for convolving input images with a set of filters. This was
developed especially for Convolutional Neural Networks. developed especially for Convolutional Neural Networks.
For related ops, including downsampling and subsampling, see For related ops, including downsampling and subsampling, see
tensor.signal and tensor.signal.downsample. tensor.signal and tensor.signal.pool.
See especially conv2d(). See especially conv2d().
""" """
......
"""
Ops for downsampling images.
Planned:
DownsampleFactorMax, DownsampleAvg, DownsampleSoftmax.
"""
from __future__ import print_function from __future__ import print_function
# This file should move along with conv.py import pool
from six.moves import xrange
import six.moves.builtins as builtins
import warnings import warnings
import numpy warnings.warn("downsample module has been moved to the pool module.")
max_pool2D = pool.max_pool2D
import theano max_pool_2d_same_size = pool.max_pool_2d_same_size
from theano import gof, Op, tensor, Variable, Apply max_pool_2d = pool.pool_2d
DownsampleFactorMax = pool.Pool
from theano.tensor.opt import register_canonicalize PoolGrad = pool.PoolGrad
MaxPoolGrad = pool.MaxPoolGrad
AveragePoolGrad = pool.AveragePoolGrad
def max_pool2D(*args, **kwargs): DownsampleFactorMaxGradGrad = pool.DownsampleFactorMaxGradGrad
import sys local_average_pool_grad = pool.local_average_pool_grad
print("DEPRECATION: max_pool2D renamed to max_pool_2d", file=sys.stderr)
return max_pool_2d(*args, **kwargs)
def max_pool_2d_same_size(input, patch_size):
    """
    Zero out every non-maximum value inside each non-overlapping
    (patch_size[0], patch_size[1]) patch of a 4-D tensor, keeping only
    the patch maxima. The output has the same dimensions as the input.

    Parameters
    ----------
    input : 4-D theano tensor of input images
        Input images. Max pooling will be done over the 2 last dimensions.
    patch_size : tuple of length 2
        Size of the patch (patch height, patch width).
        (2,2) will retain only one non-zero value per patch of 4 values.
    """
    # Pool forward to locate each patch's maximum ...
    pooled = DownsampleFactorMax(patch_size, True)(input)
    # ... then route the maxima back through the max-pool gradient, which
    # places each maximum at its original position and zeros the rest.
    return MaxPoolGrad(patch_size, True)(input, pooled, pooled)
def max_pool_2d(input, ds, ignore_border=None, st=None, padding=(0, 0),
                mode='max'):
    """
    Downscale an N-D tensor (N >= 2) over its two last dimensions by
    applying `mode` over patches of size (ds[0], ds[1]).

    Parameters
    ----------
    input : N-D theano tensor of input images
        Input images. Max pooling will be done over the 2 last dimensions.
    ds : tuple of length 2
        Factor by which to downscale (vertical ds, horizontal ds).
        (2,2) will halve the image in each dimension.
    ignore_border : bool (default None, will print a warning and set to False)
        When True, (5,5) input with ds=(2,2) will generate a (2,2) output.
        (3,3) otherwise.
    st : tuple of two ints
        Stride size, which is the number of shifts over rows/cols to get the
        next pool region. If st is None, it is considered equal to ds
        (no overlap on pooling regions).
    padding : tuple of two ints
        (pad_h, pad_w), pad zeros to extend beyond four borders
        of the images, pad_h is the size of the top and bottom margins,
        and pad_w is the size of the left and right margins.
    mode : {'max', 'sum', 'average_inc_pad', 'average_exc_pad'}
        Operation executed on each window. `max` and `sum` always exclude
        the padding in the computation. `average` gives you the choice to
        include or exclude it.
    """
    if input.ndim < 2:
        raise NotImplementedError('max_pool_2d requires a dimension >= 2')
    if ignore_border is None:
        # Warn about the pending default change, then keep the old default.
        warnings.warn(
            "max_pool_2d() will have the parameter ignore_border"
            " default value changed to True (currently"
            " False). To have consistent behavior with all Theano"
            " version, explicitly add the parameter ignore_border=True."
            " On the GPU, using ignore_border=False is needed to use CuDNN."
            " When using ignore_border=False and not using CuDNN, the only"
            " GPU combination supported is when"
            " `ds == st and padding == (0, 0) and mode == 'max'`."
            " Otherwise, the convolution will be executed on CPU.",
            stacklevel=2)
        ignore_border = False

    pool_op = DownsampleFactorMax(ds, ignore_border, st=st, padding=padding,
                                  mode=mode)
    if input.ndim == 4:
        return pool_op(input)

    # Fold all leading dimensions into one batch axis so the 4-D pooling op
    # can be applied, then restore the original leading shape afterwards.
    img_shape = input.shape[-2:]
    batch_size = tensor.shape_padright(tensor.prod(input.shape[:-2]), 1)
    four_d_shape = tensor.cast(
        tensor.join(0, batch_size, tensor.as_tensor([1]), img_shape),
        'int64')
    pooled = pool_op(tensor.reshape(input, four_d_shape, ndim=4))
    out_shape = tensor.join(0, input.shape[:-2], pooled.shape[-2:])
    return tensor.reshape(pooled, out_shape, ndim=input.ndim)
class DownsampleFactorMax(Op):
"""
For N-dimensional tensors, consider that the last two dimensions span
images. This Op downsamples these images by taking the max, sum or average
over different patch.
The constructor takes the max, sum or average or different input patches.
Parameters
----------
ds : list or tuple of two ints
Downsample factor over rows and column.
ds indicates the pool region size.
ignore_border : bool
If ds doesn't divide imgshape, do we include an extra row/col of partial
downsampling (False) or ignore it (True).
st : list or tuple of two ints or None
Stride size, which is the number of shifts over rows/cols to get the
next pool region. If st is None, it is considered equal to ds
(no overlap on pooling regions).
padding: tuple of two ints
(pad_h, pad_w), pad zeros to extend beyond four borders of the images,
pad_h is the size of the top and bottom margins, and pad_w is the size
of the left and right margins.
mode : {'max', 'sum', 'average_inc_pad', 'average_exc_pad'}
('average_inc_pad' excludes the padding from the count,
'average_exc_pad' include it)
"""
__props__ = ('ds', 'ignore_border', 'st', 'padding', 'mode')
@staticmethod
def out_shape(imgshape, ds, ignore_border=False, st=None, padding=(0, 0)):
    """
    Return the output shape of the pooling op for an input of `imgshape`.

    Parameters
    ----------
    imgshape : tuple, list, or similar of integer or scalar Theano variable
        The shape of a tensor of images. The last two elements are
        interpreted as the number of rows, and the number of cols.
    ds : list or tuple of two ints
        Pool region size over rows and columns.
    ignore_border : bool
        If ds doesn't divide imgshape, do we include an extra row/col of
        partial downsampling (False) or ignore it (True).
    st : list or tuple of two ints
        The stride size. This is the distance between the pooling regions.
        If it's set to None, it equals ds.
    padding : tuple of two ints
        (pad_h, pad_w), pad zeros to extend beyond four borders
        of the images, pad_h is the size of the top and bottom margins,
        and pad_w is the size of the left and right margins.

    Returns
    -------
    list
        Same length as imgshape, with the last two elements replaced by
        the pooled row/col counts.
    """
    if len(imgshape) < 2:
        raise TypeError('imgshape must have at least two elements '
                        '(rows, cols)')
    if st is None:
        st = ds
    r, c = imgshape[-2:]
    r += padding[0] * 2
    c += padding[1] * 2

    def pooled_extent(size, pool, stride):
        # Number of pool positions along one axis, handling both symbolic
        # (Theano variable) and concrete (int) sizes identically.
        if ignore_border:
            if pool == stride:
                return size // stride
            out = (size - pool) // stride + 1
            if isinstance(size, theano.Variable):
                return tensor.maximum(out, 0)
            return numpy.maximum(out, 0)
        if isinstance(size, theano.Variable):
            return tensor.switch(tensor.ge(stride, pool),
                                 (size - 1) // stride + 1,
                                 tensor.maximum(0, (size - 1 - pool) //
                                                stride + 1) + 1)
        if stride >= pool:
            return (size - 1) // stride + 1
        return max(0, (size - 1 - pool) // stride + 1) + 1

    nr = pooled_extent(r, ds[0], st[0])
    nc = pooled_extent(c, ds[1], st[1])
    return list(imgshape[:-2]) + [nr, nc]
def __init__(self, ds, ignore_border=False, st=None, padding=(0, 0),
             mode='max'):
    """
    Parameters
    ----------
    ds : list or tuple of two ints
        Pool region size over rows and columns.
    ignore_border : bool
        If ds doesn't divide imgshape, include an extra partial row/col
        (False) or ignore it (True).
    st : list or tuple of two ints or None
        Stride between pool regions; defaults to ds (no overlap).
    padding : tuple of two ints
        (pad_h, pad_w) zero padding added around the image borders.
    mode : {'max', 'sum', 'average_inc_pad', 'average_exc_pad'}
        Operation applied to each window.

    Raises
    ------
    ValueError
        If ds contains non-ints or mode is unknown.
    NotImplementedError
        If padding is used without ignore_border, or padding is not
        smaller than the pool region size.
    """
    self.ds = tuple(ds)
    if not all(isinstance(d, int) for d in ds):
        raise ValueError(
            "DownsampleFactorMax downsample parameters must be ints."
            " Got %s" % str(ds))
    if st is None:
        st = ds
    assert isinstance(st, (tuple, list))
    self.st = tuple(st)
    self.ignore_border = ignore_border
    self.padding = tuple(padding)
    if self.padding != (0, 0) and not ignore_border:
        raise NotImplementedError(
            'padding works only with ignore_border=True')
    if self.padding[0] >= self.ds[0] or self.padding[1] >= self.ds[1]:
        # The guard compares padding against the pool size (ds), not the
        # stride, so the message names ds (the old text said "strides").
        raise NotImplementedError(
            'padding_h and padding_w must be smaller than ds')
    if mode not in ['max', 'average_inc_pad', 'average_exc_pad', 'sum']:
        raise ValueError(
            "DownsampleFactorMax mode parameter only support 'max', 'sum',"
            " 'average_inc_pad' and 'average_exc_pad'. Got %s" % mode)
    self.mode = mode
def make_node(self, x):
    """Build the Apply node pooling the 4-D tensor `x`."""
    if x.type.ndim != 4:
        raise TypeError()
    # TODO: consider restricting the dtype?
    x = tensor.as_tensor_variable(x)
    # Keep the broadcast pattern of the two leading axes: a broadcastable
    # (size-1) input axis can yield a 0-sized output there. The two
    # spatial axes are always non-broadcastable.
    out_type = tensor.TensorType(x.dtype,
                                 x.broadcastable[:2] + (False, False))
    return gof.Apply(self, [x], [out_type()])
def perform(self, node, inp, out):
"""Numpy fallback: pool each (ds0, ds1) window of the 4-D input with
max/sum/average, honoring stride, zero padding and ignore_border."""
x, = inp
z, = out
if len(x.shape) != 4:
raise NotImplementedError(
'DownsampleFactorMax requires 4D input for now')
z_shape = self.out_shape(x.shape, self.ds, self.ignore_border, self.st,
self.padding)
# NOTE(review): out_shape returns a list while z[0].shape is a tuple, so
# this comparison is always True and the buffer is reallocated each call.
if (z[0] is None) or (z[0].shape != z_shape):
z[0] = numpy.empty(z_shape, dtype=x.dtype)
zz = z[0]
# number of pooling output rows
pr = zz.shape[-2]
# number of pooling output cols
pc = zz.shape[-1]
ds0, ds1 = self.ds
st0, st1 = self.st
pad_h = self.padding[0]
pad_w = self.padding[1]
# dimensions of the (conceptually) padded image
img_rows = x.shape[-2] + 2 * pad_h
img_cols = x.shape[-1] + 2 * pad_w
# only 'average_inc_pad' counts padding cells inside the window
inc_pad = self.mode == 'average_inc_pad'
# pad the image
if self.padding != (0, 0):
y = numpy.zeros(
(x.shape[0], x.shape[1], img_rows, img_cols),
dtype=x.dtype)
y[:, :, pad_h:(img_rows - pad_h), pad_w:(img_cols - pad_w)] = x
else:
y = x
# pick the reduction applied to each window (default: max)
func = numpy.max
if self.mode == 'sum':
func = numpy.sum
elif self.mode != 'max':
func = numpy.average
# Window bounds are computed in padded coordinates; when inc_pad is
# False they are clipped so padding cells are excluded from the window.
for n in xrange(x.shape[0]):
for k in xrange(x.shape[1]):
for r in xrange(pr):
row_st = r * st0
row_end = builtins.min(row_st + ds0, img_rows)
if not inc_pad:
row_st = builtins.max(row_st, self.padding[0])
row_end = builtins.min(row_end, x.shape[-2] + pad_h)
for c in xrange(pc):
col_st = c * st1
col_end = builtins.min(col_st + ds1, img_cols)
if not inc_pad:
col_st = builtins.max(col_st, self.padding[1])
col_end = builtins.min(col_end,
x.shape[-1] + pad_w)
zz[n, k, r, c] = func(y[
n, k, row_st:row_end, col_st:col_end])
def infer_shape(self, node, in_shapes):
    """Symbolic shape inference: delegate to `out_shape` on the input."""
    in_shape, = in_shapes
    return [self.out_shape(in_shape, self.ds, self.ignore_border,
                           self.st, self.padding)]
def grad(self, inp, grads):
    """Gradient: MaxPoolGrad for 'max' mode, AveragePoolGrad for the
    'sum' / 'average_inc_pad' / 'average_exc_pad' modes."""
    x, = inp
    gz, = grads
    if self.mode != 'max':
        grad_op = AveragePoolGrad(self.ds,
                                  ignore_border=self.ignore_border,
                                  st=self.st, padding=self.padding,
                                  mode=self.mode)
        return [grad_op(x, gz)]
    # The max gradient needs the forward output to locate the maxima.
    maxout = self(x)
    grad_op = MaxPoolGrad(self.ds,
                          ignore_border=self.ignore_border,
                          st=self.st, padding=self.padding)
    return [grad_op(x, maxout, gz)]
def c_headers(self):
# <algorithm> supplies std::max, used by the generated pooling C code.
return ['<algorithm>']
    def c_code(self, node, name, inp, out, sub):
        """Generate C code computing the pooled output for all four modes.

        The code mirrors :meth:`perform`: it derives the output shape from
        the (virtually padded) input, allocates ``z`` if needed, then fills
        each output cell by scanning its pool window in the unpadded input.
        """
        if self.mode not in ('max', 'sum', 'average_exc_pad', 'average_inc_pad'):
            raise theano.gof.utils.MethodNotDefined()
        x, = inp
        z, = out
        fail = sub['fail']
        ignore_border = int(self.ignore_border)
        ds0, ds1 = self.ds
        st0, st1 = self.st
        pd0, pd1 = self.padding
        # NOTE(review): the `&&` in the padding sanity check below means the
        # error only fires when BOTH pads are nonzero; `||` looks intended.
        # Unreachable in practice because __init__ already rejects
        # padding != (0, 0) with ignore_border=False — confirm before changing.
        ccode = """
        int typenum = PyArray_ObjectType((PyObject*)%(x)s, 0);
        int z_r, z_c; // shape of the output
        int r, c; // shape of the padded_input
        if(PyArray_NDIM(%(x)s)!=4)
        {
            PyErr_SetString(PyExc_ValueError, "x must be a 4d ndarray");
            %(fail)s;
        }
        r = PyArray_DIMS(%(x)s)[2];
        c = PyArray_DIMS(%(x)s)[3];
        r += %(pd0)s * 2;
        c += %(pd1)s * 2;
        if (%(pd0)s != 0 && %(pd1)s != 0 && !%(ignore_border)s)
            {
              PyErr_SetString(PyExc_ValueError,
                "padding must be (0,0) when ignore border is False");
              %(fail)s;
            }
        if (%(ignore_border)s)
        {
            // '/' in C is different from '/' in python
            if (r - %(ds0)s < 0)
            {
              z_r = 0;
            }
            else
            {
              z_r = (r - %(ds0)s) / %(st0)s + 1;
            }
            if (c - %(ds1)s < 0)
            {
              z_c = 0;
            }
            else
            {
              z_c = (c - %(ds1)s) / %(st1)s + 1;
            }
        }
        else
        {
            // decide how many rows the output has
            if (%(st0)s >= %(ds0)s)
            {
                z_r = (r - 1) / %(st0)s + 1;
            }
            else
            {
                z_r = std::max(0, (r - 1 - %(ds0)s) / %(st0)s + 1) + 1;
            }
            // decide how many columns the output has
            if (%(st1)s >= %(ds1)s)
            {
                z_c = (c - 1) / %(st1)s + 1;
            }
            else
            {
                z_c = std::max(0, (c - 1 - %(ds1)s) / %(st1)s + 1) + 1;
            }
        }
        // memory allocation of z if necessary
        if ((!%(z)s)
          || *PyArray_DIMS(%(z)s)!=4
          ||(PyArray_DIMS(%(z)s)[0] != PyArray_DIMS(%(x)s)[0])
          ||(PyArray_DIMS(%(z)s)[1] != PyArray_DIMS(%(x)s)[1])
          ||(PyArray_DIMS(%(z)s)[2] != z_r)
          ||(PyArray_DIMS(%(z)s)[3] != z_c)
          )
        {
          if (%(z)s) Py_XDECREF(%(z)s);
          npy_intp dims[4] = {0,0,0,0};
          dims[0]=PyArray_DIMS(%(x)s)[0];
          dims[1]=PyArray_DIMS(%(x)s)[1];
          dims[2]=z_r;
          dims[3]=z_c;
          //TODO: zeros not necessary
          %(z)s = (PyArrayObject*) PyArray_ZEROS(4, dims, typenum,0);
        }
        // used for indexing a pool region inside the input
        int r_st, r_end, c_st, c_end;
        dtype_%(x)s collector; // temp var for the value in a region
        if (z_r && z_c)
        {
            for(int b=0; b<PyArray_DIMS(%(x)s)[0]; b++){
              for(int k=0; k<PyArray_DIMS(%(x)s)[1]; k++){
                for(int i=0; i< z_r; i++){
                  r_st = i * %(st0)s;
                  r_end = r_st + %(ds0)s;
                  // skip the padding
                  r_st = r_st < %(pd0)s ? %(pd0)s : r_st;
                  r_end = r_end > (r - %(pd0)s) ? r - %(pd0)s : r_end;
                  // from padded_img space to img space
                  r_st -= %(pd0)s;
                  r_end -= %(pd0)s;
                  // handle the case where no padding, ignore border is True
                  if (%(ignore_border)s)
                  {
                    r_end = r_end > r ? r : r_end;
                  }
                  for(int j=0; j<z_c; j++){
                    c_st = j * %(st1)s;
                    c_end = c_st + %(ds1)s;
                    // skip the padding
                    c_st = c_st < %(pd1)s ? %(pd1)s : c_st;
                    c_end = c_end > (c - %(pd1)s) ? c - %(pd1)s : c_end;
                    dtype_%(z)s * z = (
                          (dtype_%(z)s*)(PyArray_GETPTR4(%(z)s, b, k, i, j)));
                    // change coordinates from padding_img space into img space
                    c_st -= %(pd1)s;
                    c_end -= %(pd1)s;
                    // handle the case where no padding, ignore border is True
                    if (%(ignore_border)s)
                    {
                      c_end = c_end > c ? c : c_end;
                    }
        """
        # Mode-specific inner loop: compute the value of one output cell.
        if self.mode == 'max':
            ccode += """
                    // use the first element as the initial value of collector
                    collector = ((dtype_%(x)s*)(PyArray_GETPTR4(%(x)s,b,k,r_st,c_st)))[0];
                    // go through the pooled region in the unpadded input
                    for(int m=r_st; m<r_end; m++)
                    {
                      for(int n=c_st; n<c_end; n++)
                      {
                        dtype_%(x)s a = ((dtype_%(x)s*)(PyArray_GETPTR4(%(x)s,b,k,m,n)))[0];
                        collector = (a > collector) ? a : collector;
                      }
                    }
                    z[0] = collector;
            """
        elif self.mode in ('sum', 'average_exc_pad', 'average_inc_pad'):
            ccode += """
                    // initialize the sum at zero
                    collector = ((dtype_%(x)s)(0));
                    // go through the pooled region in the unpadded input
                    for(int m=r_st; m<r_end; m++)
                    {
                      for(int n=c_st; n<c_end; n++)
                      {
                        dtype_%(x)s a = ((dtype_%(x)s*)(PyArray_GETPTR4(%(x)s,b,k,m,n)))[0];
                        collector += a;
                      }
                    }
            """
            if self.mode == "sum":
                ccode += """
                    z[0] = collector;
                """
            elif self.mode == 'average_inc_pad' and self.ignore_border:
                # Divide by the full window size, counting padded cells.
                ccode += """
                    z[0] = collector / (%(ds0)s * %(ds1)s);
                """
            else:
                # Divide by the number of cells actually visited.
                ccode += """
                    z[0] = collector / ((r_end-r_st)*(c_end-c_st));
                """
        ccode += """
                  }
                }
              }
            }
        }
        """
        return ccode % locals()
    def c_code_cache_version(self):
        # Bump this tuple whenever the generated C code changes, so cached
        # compiled modules are invalidated.
        return (0, 6, 8, 3)
class PoolGrad(Op):
    """Common base class for the gradients of the pooling op.

    Holds the shared parameters (``ds``, ``st``, ``padding``, ``mode``,
    ``ignore_border``) and the output-shape computation; concrete gradient
    behavior lives in the subclasses (MaxPoolGrad, AveragePoolGrad).
    """
    __props__ = ('ds', 'ignore_border', 'st', 'padding', 'mode')
    @staticmethod
    def out_shape(imgshape, ds, ignore_border=False, st=None, padding=(0, 0)):
        """Return the shape of the output from this op, for input of given
        shape and flags.
        :param imgshape: the shape of a tensor of images. The last two elements
        are interpreted as the number of rows, and the number of cols.
        :type imgshape: tuple, list, or similar of integer or
        scalar Theano variable.
        :param ds: downsample factor over rows and columns
        this parameter indicates the size of the pooling region
        :type ds: list or tuple of two ints
        :param st: the stride size. This is the distance between the pooling
        regions. If it's set to None, it equals ds.
        :type st: list or tuple of two ints
        :param ignore_border: if ds doesn't divide imgshape, do we include an
        extra row/col of partial downsampling (False) or ignore it (True).
        :type ignore_border: bool
        :param padding: (pad_h, pad_w), pad zeros to extend beyond four borders
        of the images, pad_h is the size of the top and bottom margins,
        and pad_w is the size of the left and right margins.
        :type padding: tuple of two ints
        :rtype: list
        :returns: the shape of the output from this op, for input of given
        shape. This will have the same length as imgshape, but with last
        two elements reduced as per the downsampling & ignore_border flags.
        """
        if len(imgshape) < 2:
            raise TypeError('imgshape must have at least two elements '
                            '(rows, cols)')
        if st is None:
            st = ds
        r, c = imgshape[-2:]
        # Work on the padded size of the image.
        r += padding[0] * 2
        c += padding[1] * 2
        if ignore_border:
            # Only complete pooling windows contribute to the output.
            out_r = (r - ds[0]) // st[0] + 1
            out_c = (c - ds[1]) // st[1] + 1
            # Symbolic vs. concrete shapes need different max functions.
            if isinstance(r, theano.Variable):
                nr = tensor.maximum(out_r, 0)
            else:
                nr = numpy.maximum(out_r, 0)
            if isinstance(c, theano.Variable):
                nc = tensor.maximum(out_c, 0)
            else:
                nc = numpy.maximum(out_c, 0)
        else:
            # A partial window at the border adds one extra output row/col.
            if isinstance(r, theano.Variable):
                nr = tensor.switch(tensor.ge(st[0], ds[0]),
                                   (r - 1) // st[0] + 1,
                                   tensor.maximum(0, (r - 1 - ds[0]) //
                                                  st[0] + 1) + 1)
            elif st[0] >= ds[0]:
                nr = (r - 1) // st[0] + 1
            else:
                nr = max(0, (r - 1 - ds[0]) // st[0] + 1) + 1
            if isinstance(c, theano.Variable):
                nc = tensor.switch(tensor.ge(st[1], ds[1]),
                                   (c - 1) // st[1] + 1,
                                   tensor.maximum(0, (c - 1 - ds[1]) //
                                                  st[1] + 1) + 1)
            elif st[1] >= ds[1]:
                nc = (c - 1) // st[1] + 1
            else:
                nc = max(0, (c - 1 - ds[1]) // st[1] + 1) + 1
        rval = list(imgshape[:-2]) + [nr, nc]
        return rval
    def __init__(self, ds, ignore_border, st=None, padding=(0, 0), mode='max'):
        self.ds = tuple(ds)
        self.ignore_border = ignore_border
        if st is None:
            # Default stride equals the window size: non-overlapping pooling.
            st = ds
        self.st = tuple(st)
        self.padding = tuple(padding)
        if mode not in ['max', 'sum', 'average_inc_pad', 'average_exc_pad']:
            # NOTE(review): message still says "DownsampleFactorMax" — the op
            # was renamed to Pool; consider updating the wording.
            raise ValueError(
                "DownsampleFactorMax mode parameter only support 'max', 'sum',"
                " 'average_inc_pad' and 'average_exc_pad'. Got %s" % mode)
        self.mode = mode
    def infer_shape(self, node, in_shapes):
        # The gradient has the same shape as the original (unpooled) input.
        return [in_shapes[0]]
class MaxPoolGrad(PoolGrad):
    """Gradient of max pooling.

    Each output gradient ``gz[n, k, r, c]`` is propagated to every input
    position inside the corresponding pool window whose value equals the
    pooled maximum; on ties, all maximal positions receive the gradient.
    """
    def __init__(self, ds, ignore_border, st=None, padding=(0, 0), mode='max'):
        PoolGrad.__init__(self, ds, ignore_border, st, padding, mode)
    def make_node(self, x, maxout, gz):
        # make_node should only be called by the grad function of
        # DownsampleFactorMax, so these asserts should not fail.
        assert isinstance(x, Variable) and x.ndim == 4
        assert isinstance(maxout, Variable) and maxout.ndim == 4
        assert isinstance(gz, Variable) and gz.ndim == 4
        x = tensor.as_tensor_variable(x)
        maxout = tensor.as_tensor_variable(maxout)
        gz = tensor.as_tensor_variable(gz)
        return Apply(self, [x, maxout, gz], [x.type()])
    def perform(self, node, inp, out):
        """Python fallback implementation of the max-pooling gradient."""
        assert self.mode == 'max'
        x, maxout, gz = inp
        gx_stg, = out
        # number of pooling output rows
        pr = maxout.shape[-2]
        # number of pooling output cols
        pc = maxout.shape[-1]
        ds0, ds1 = self.ds
        st0, st1 = self.st
        pad_h = self.padding[0]
        pad_w = self.padding[1]
        img_rows = x.shape[-2] + 2 * pad_h
        img_cols = x.shape[-1] + 2 * pad_w
        # pad the image
        if self.padding != (0, 0):
            y = numpy.zeros(
                (x.shape[0], x.shape[1], img_rows, img_cols),
                dtype=x.dtype)
            y[:, :, pad_h:(img_rows - pad_h), pad_w:(img_cols - pad_w)] = x
        else:
            y = x
        gx = numpy.zeros_like(y)
        for n in xrange(x.shape[0]):
            for k in xrange(x.shape[1]):
                for r in xrange(pr):
                    # Window bounds, clipped to exclude the zero padding.
                    row_st = builtins.max(r * st0, self.padding[0])
                    row_end = builtins.min(row_st + ds0, img_rows)
                    for c in xrange(pc):
                        col_st = builtins.max(c * st1, self.padding[1])
                        col_end = builtins.min(col_st + ds1, img_cols)
                        for row_ind in xrange(row_st, row_end):
                            for col_ind in xrange(col_st, col_end):
                                # Every entry equal to the max gets the grad.
                                if (maxout[n, k, r, c] == y[n, k, row_ind, col_ind]):
                                    gx[n, k, row_ind, col_ind] += gz[n, k, r, c]
        # unpad the image
        gx = gx[:, :, pad_h:(img_rows - pad_h), pad_w:(img_cols - pad_w)]
        gx_stg[0] = gx
    def grad(self, inp, grads):
        # Only gz carries a nontrivial second-order gradient; x and maxout
        # enter the computation through comparisons only.
        x, maxout, gz = inp
        ggx, = grads
        return [theano.tensor.zeros_like(x),
                theano.tensor.zeros_like(maxout),
                DownsampleFactorMaxGradGrad(
                    self.ds, ignore_border=self.ignore_border,
                    st=self.st, padding=self.padding)(x, maxout, ggx)]
    def c_code(self, node, name, inp, out, sub):
        """Generate C code equivalent to :meth:`perform` (max mode only)."""
        assert self.mode == 'max'
        x, z, gz = inp
        gx, = out
        fail = sub['fail']
        ignore_border = int(self.ignore_border)
        ds0, ds1 = self.ds
        st0, st1 = self.st
        pd0, pd1 = self.padding
        return """
        // sanity checks
        int x_typenum = PyArray_ObjectType((PyObject*)%(x)s, 0);
        int z_typenum = PyArray_ObjectType((PyObject*)%(z)s, 0);
        int gz_typenum = PyArray_ObjectType((PyObject*)%(gz)s, 0);
        if ((x_typenum != z_typenum) || (x_typenum != gz_typenum))
        {
            PyErr_SetString(PyExc_ValueError, "input types must all match");
            %(fail)s;
        }
        if(PyArray_NDIM(%(x)s)!=4)
        {
            PyErr_SetString(PyExc_ValueError, "x must be a 4d ndarray");
            %(fail)s;
        }
        if(PyArray_NDIM(%(z)s)!=4)
        {
            PyErr_SetString(PyExc_ValueError, "z must be a 4d ndarray");
            %(fail)s;
        }
        if(PyArray_NDIM(%(gz)s)!=4)
        {
            PyErr_SetString(PyExc_ValueError, "gz must be a 4d ndarray");
            %(fail)s;
        }
        int z_r, z_c;
        z_r = PyArray_DIMS(%(z)s)[2];
        z_c = PyArray_DIMS(%(z)s)[3];
        int r, c; // shape of the padded_input
        r = PyArray_DIMS(%(x)s)[2];
        c = PyArray_DIMS(%(x)s)[3];
        r += %(pd0)s * 2;
        c += %(pd1)s * 2;
        // allocating memory for gx
        if ((!%(gx)s)
          || !PyArray_ISCONTIGUOUS(%(gx)s)
          || *PyArray_DIMS(%(gx)s)!=4
          ||(PyArray_DIMS(%(gx)s)[0] != PyArray_DIMS(%(x)s)[0])
          ||(PyArray_DIMS(%(gx)s)[1] != PyArray_DIMS(%(x)s)[1])
          ||(PyArray_DIMS(%(gx)s)[2] != PyArray_DIMS(%(x)s)[2])
          ||(PyArray_DIMS(%(gx)s)[3] != PyArray_DIMS(%(x)s)[3])
          )
        {
          Py_XDECREF(%(gx)s);
          %(gx)s = (PyArrayObject*) PyArray_ZEROS(4, PyArray_DIMS(%(x)s), x_typenum,0);
        }
        else {
          PyArray_FILLWBYTE(%(gx)s, 0);
        }
        int r_st, r_end, c_st, c_end; // used to index into the input img x
        dtype_%(z)s maximum; // temp var for maximum value in a region
        if (z_r && z_c)
        {
            for(int b=0; b<PyArray_DIMS(%(x)s)[0]; b++){
              for(int k=0; k<PyArray_DIMS(%(x)s)[1]; k++){
                for(int i=0; i< z_r; i++){
                  r_st = i * %(st0)s;
                  r_end = r_st + %(ds0)s;
                  // skip the padding
                  r_st = r_st < %(pd0)s ? %(pd0)s : r_st;
                  r_end = r_end > (r - %(pd0)s) ? r - %(pd0)s : r_end;
                  // from padded_img space to img space
                  r_st -= %(pd0)s;
                  r_end -= %(pd0)s;
                  for(int j=0; j<z_c; j++){
                    c_st = j * %(st1)s;
                    c_end = c_st + %(ds1)s;
                    // skip the padding
                    c_st = c_st < %(pd1)s ? %(pd1)s : c_st;
                    c_end = c_end > (c - %(pd1)s) ? c - %(pd1)s : c_end;
                    // change coordinates from padding_img space into img space
                    c_st -= %(pd1)s;
                    c_end -= %(pd1)s;
                    // the maximum value
                    maximum = ((dtype_%(z)s*)(PyArray_GETPTR4(%(z)s,b,k,i,j)))[0];
                    // the gradient corresponding to this maximum value in z
                    dtype_%(gz)s * gz = (
                          (dtype_%(gz)s*)(PyArray_GETPTR4(%(gz)s, b, k, i, j)));
                    // go through the pooled region in the unpadded input
                    for(int m=r_st; m<r_end; m++)
                    {
                      for(int n=c_st; n<c_end; n++)
                      {
                        dtype_%(x)s a = ((dtype_%(x)s*)(PyArray_GETPTR4(%(x)s,b,k,m,n)))[0];
                        dtype_%(gx)s * gx = (
                          (dtype_%(gx)s*)(PyArray_GETPTR4(%(gx)s, b, k, m, n)));
                        if (a == maximum){
                          gx[0] = gx[0] + gz[0];
                        }
                      }
                    }
                  }
                }
              }
            }
        }
        """ % locals()
    def c_code_cache_version(self):
        # Bump when the generated C code changes.
        return (0, 7)
# Backward-compatibility alias for code written against the pre-rename API.
DownsampleFactorMaxGrad = MaxPoolGrad
class AveragePoolGrad(PoolGrad):
    """Gradient of sum/average pooling.

    Each output gradient is spread uniformly over its pool window
    (divided by the window size for the average modes, undivided for sum).
    """
    def __init__(self, ds, ignore_border, st=None, padding=(0, 0), mode='average_inc_pad'):
        # 'max' is handled by MaxPoolGrad, not here.
        assert mode in ['sum', 'average_inc_pad', 'average_exc_pad']
        PoolGrad.__init__(self, ds, ignore_border, st, padding, mode)
    def make_node(self, x, gz):
        # make_node should only be called by the grad function of
        # DownsampleFactorMax, so these asserts should not fail.
        assert isinstance(x, Variable) and x.ndim == 4
        assert isinstance(gz, Variable) and gz.ndim == 4
        x = tensor.as_tensor_variable(x)
        gz = tensor.as_tensor_variable(gz)
        return Apply(self, [x, gz], [x.type()])
    def perform(self, node, inp, out):
        """Python implementation of the sum/average-pooling gradient."""
        if self.mode == 'average_exc_pad' and self.padding != (0, 0):
            raise NotImplementedError()
        x, gz = inp
        gx_stg, = out
        z_shape = self.out_shape(x.shape, self.ds, self.ignore_border, self.st,
                                 self.padding)
        if (gx_stg[0] is None) or (gx_stg[0].shape != z_shape):
            gx_stg[0] = numpy.empty(z_shape, dtype=x.dtype)
        zz = gx_stg[0]
        # number of pooling output rows
        pr = zz.shape[-2]
        # number of pooling output cols
        pc = zz.shape[-1]
        ds0, ds1 = self.ds
        st0, st1 = self.st
        pad_h = self.padding[0]
        pad_w = self.padding[1]
        img_rows = x.shape[-2] + 2 * pad_h
        img_cols = x.shape[-1] + 2 * pad_w
        inc_pad = self.mode == 'average_inc_pad'
        sum_mode = self.mode == 'sum'
        # pad the image
        if self.padding != (0, 0):
            y = numpy.zeros(
                (x.shape[0], x.shape[1], img_rows, img_cols),
                dtype=x.dtype)
            y[:, :, pad_h:(img_rows - pad_h), pad_w:(img_cols - pad_w)] = x
        else:
            y = x
        gx = numpy.zeros_like(y)
        for n in xrange(x.shape[0]):
            for k in xrange(x.shape[1]):
                for r in xrange(pr):
                    # sum/inc_pad windows may start inside the padding;
                    # exc_pad windows are clipped to the unpadded region.
                    if sum_mode or inc_pad:
                        row_st = r * st0
                    else:
                        row_st = builtins.max(r * st0, self.padding[0])
                    row_end = builtins.min(row_st + ds0, img_rows)
                    for c in xrange(pc):
                        if sum_mode or inc_pad:
                            col_st = c * st1
                        else:
                            col_st = builtins.max(c * st1,
                                                  self.padding[1])
                        col_end = builtins.min(col_st + ds1, img_cols)
                        if sum_mode:
                            val = gz[n, k, r, c]
                        else:
                            # Spread the gradient evenly over the window.
                            val = gz[n, k, r, c] / ((row_end - row_st) *
                                                    (col_end - col_st))
                        gx[n, k, row_st:row_end, col_st:col_end] += val
        # unpad the image
        gx = gx[:, :, pad_h:(img_rows - pad_h), pad_w:(img_cols - pad_w)]
        gx_stg[0] = gx
    def grad(self, inp, grads):
        # The second-order gradient of average pooling is average pooling.
        # NOTE(review): `DownsampleFactorMax` is the op's pre-rename name and
        # may not exist in this module after the rename to `Pool` — confirm
        # an alias `DownsampleFactorMax = Pool` is defined, or use `Pool`.
        x, gz = inp
        ggx, = grads
        return [theano.tensor.zeros_like(x),
                DownsampleFactorMax(
                    self.ds, ignore_border=self.ignore_border,
                    st=self.st, padding=self.padding, mode=self.mode)(ggx)]
class DownsampleFactorMaxGradGrad(Op):
    """Second-order gradient of max pooling (gradient of MaxPoolGrad wrt gz).

    For each pooling window, copies the incoming gradient ``ggx`` from the
    input position(s) holding the maximum into the pooled output position.
    """
    __props__ = ('ds', 'ignore_border', 'st', 'padding', 'mode')
    def __init__(self, ds, ignore_border, st=None, padding=(0, 0), mode='max'):
        self.ds = tuple(ds)
        if not all([isinstance(d, int) for d in ds]):
            raise ValueError(
                "DownsampleFactorMax downsample parameters must be ints."
                " Got %s" % str(ds))
        if st is None:
            st = ds
        assert isinstance(st, (tuple, list))
        self.st = tuple(st)
        self.ignore_border = ignore_border
        self.padding = tuple(padding)
        if self.padding != (0, 0) and not ignore_border:
            raise NotImplementedError(
                'padding works only with ignore_border=True')
        if self.padding[0] >= self.ds[0] or self.padding[1] >= self.ds[1]:
            # NOTE(review): the message says "strides" but the check compares
            # padding against ds (the pooling window size) — wording is
            # misleading; consider fixing the message.
            raise NotImplementedError(
                'padding_h and padding_w must be smaller than strides')
        self.mode = mode
        # Only the max mode has a meaningful grad-of-grad here.
        assert self.mode == 'max'
    def make_node(self, x, maxout, gz):
        # make_node should only be called by the grad function of
        # MaxPoolGrad, so these asserts should not fail.
        x = tensor.as_tensor_variable(x)
        maxout = tensor.as_tensor_variable(maxout)
        gz = tensor.as_tensor_variable(gz)
        assert x.ndim == 4
        assert maxout.ndim == 4
        assert gz.ndim == 4
        return Apply(self, [x, maxout, gz], [x.type()])
    def perform(self, node, inp, out):
        """Python fallback: gather ggx at the argmax of each pool window."""
        x, maxout, ggx = inp
        z, = out
        if len(x.shape) != 4:
            raise NotImplementedError(
                'DownsampleFactorMaxGradGrad requires 4D input for now')
        if (z[0] is None) or (z[0].shape != x.shape):
            z[0] = numpy.zeros(x.shape, dtype=x.dtype)
        ggz = z[0]  # grad wrt maxout_grad has the same shape as maxout
        # number of pooling output rows
        pr = ggz.shape[-2]
        # number of pooling output cols
        pc = ggz.shape[-1]
        ds0, ds1 = self.ds
        st0, st1 = self.st
        pd0, pd1 = self.padding
        img_rows = x.shape[-2] + 2 * pd0
        img_cols = x.shape[-1] + 2 * pd1
        # pad the image and its gradients
        if self.padding != (0, 0):
            # Pad with a value strictly below the minimum so padded cells
            # can never be mistaken for the window maximum.
            y_padded = numpy.zeros(
                (x.shape[0], x.shape[1], img_rows, img_cols),
                dtype=x.dtype) + x.min() - 1
            y_padded[:, :, pd0:(img_rows - pd0), pd1:(img_cols - pd1)] = x
            ggx_padded = numpy.zeros(
                (x.shape[0], x.shape[1], img_rows, img_cols),
                dtype=x.dtype)
            ggx_padded[:, :, pd0:(img_rows - pd0), pd1:(img_cols - pd1)] = ggx
        else:
            y_padded = x
            ggx_padded = ggx
        for n in xrange(x.shape[0]):
            for k in xrange(x.shape[1]):
                for r in xrange(pr):
                    row_st = r * st0
                    row_end = builtins.min(row_st + ds0, img_rows)
                    for c in xrange(pc):
                        col_st = c * st1
                        col_end = builtins.min(col_st + ds1, img_cols)
                        for row_ind in xrange(row_st, row_end):
                            for col_ind in xrange(col_st, col_end):
                                # Copy ggx from the max position (last tie
                                # wins, matching the assignment below).
                                if (maxout[n, k, r, c] == y_padded[n, k, row_ind, col_ind]):
                                    ggz[n, k, r, c] = ggx_padded[n, k, row_ind, col_ind]
    def infer_shape(self, node, in_shapes):
        # Output has the shape of maxout (the pooled output).
        return [in_shapes[1]]
    def c_code(self, node, name, inp, out, sub):
        """Generate C code equivalent to :meth:`perform` (max mode only).

        NOTE(review): unlike perform, the C loop accumulates (z[0] += ggx)
        over all tying maxima rather than keeping the last one — confirm
        which tie-breaking behavior is intended.
        """
        if self.mode != 'max':
            raise theano.gof.utils.MethodNotDefined()
        x, maxout, ggx = inp
        z, = out  # the grad of grad
        fail = sub['fail']
        ignore_border = int(self.ignore_border)
        ds0, ds1 = self.ds
        st0, st1 = self.st
        pd0, pd1 = self.padding
        return """
        int z_typenum = PyArray_ObjectType((PyObject*)%(maxout)s, 0);
        int z_r, z_c;
        z_r = PyArray_DIMS(%(maxout)s)[2];
        z_c = PyArray_DIMS(%(maxout)s)[3];
        int r, c; // shape of the padded_input
        r = PyArray_DIMS(%(x)s)[2];
        c = PyArray_DIMS(%(x)s)[3];
        r += %(pd0)s * 2;
        c += %(pd1)s * 2;
        // allocating memory for output
        if ((!%(z)s)
          || !PyArray_ISCONTIGUOUS(%(z)s)
          || *PyArray_DIMS(%(z)s)!=4
          ||(PyArray_DIMS(%(z)s)[0] != PyArray_DIMS(%(maxout)s)[0])
          ||(PyArray_DIMS(%(z)s)[1] != PyArray_DIMS(%(maxout)s)[1])
          ||(PyArray_DIMS(%(z)s)[2] != PyArray_DIMS(%(maxout)s)[2])
          ||(PyArray_DIMS(%(z)s)[3] != PyArray_DIMS(%(maxout)s)[3])
          )
        {
          Py_XDECREF(%(z)s);
          %(z)s = (PyArrayObject*) PyArray_ZEROS(4, PyArray_DIMS(%(maxout)s), z_typenum,0);
        }
        else {
          PyArray_FILLWBYTE(%(z)s, 0);
        }
        dtype_%(maxout)s maximum; // temp var for maximum value in a region
        int r_st, r_end, c_st, c_end; // used to index into the input img x
        for(int b=0; b<PyArray_DIMS(%(x)s)[0]; b++){
          for(int k=0; k<PyArray_DIMS(%(x)s)[1]; k++){
            for(int i=0; i< z_r; i++){
              r_st = i * %(st0)s;
              r_end = r_st + %(ds0)s;
              // skip the padding
              r_st = r_st < %(pd0)s ? %(pd0)s : r_st;
              r_end = r_end > (r - %(pd0)s) ? r - %(pd0)s : r_end;
              // from padded_img space to img space
              r_st -= %(pd0)s;
              r_end -= %(pd0)s;
              for(int j=0; j<z_c; j++){
                c_st = j * %(st1)s;
                c_end = c_st + %(ds1)s;
                // skip the padding
                c_st = c_st < %(pd1)s ? %(pd1)s : c_st;
                c_end = c_end > (c - %(pd1)s) ? c - %(pd1)s : c_end;
                // from padding_img space into img space
                c_st -= %(pd1)s;
                c_end -= %(pd1)s;
                // the maximum value
                maximum = ((dtype_%(maxout)s*)(PyArray_GETPTR4(%(maxout)s,b,k,i,j)))[0];
                // z at this position
                dtype_%(z)s * z = ((dtype_%(z)s*)(PyArray_GETPTR4(%(z)s, b, k, i, j)));
                // go through the pooled region in the unpadded input
                for(int m=r_st; m<r_end; m++)
                {
                  for(int n=c_st; n<c_end; n++)
                  {
                    dtype_%(x)s a = ((dtype_%(x)s*)(PyArray_GETPTR4(%(x)s,b,k,m,n)))[0];
                    dtype_%(ggx)s * ggx = (
                      (dtype_%(ggx)s*)(PyArray_GETPTR4(%(ggx)s, b, k, m, n)));
                    if (a == maximum){
                      z[0] += ggx[0];
                    }
                  }
                }
              }
            }
          }
        }
        """ % locals()
    def c_code_cache_version(self):
        # Bump when the generated C code changes.
        return (0, 1)
@register_canonicalize('fast_compile')
@gof.local_optimizer([MaxPoolGrad])
def local_average_pool_grad(node):
    """Rewrite a MaxPoolGrad node with a non-max mode into AveragePoolGrad.

    Legacy graphs built with DownsampleFactorMaxGrad could carry sum/average
    modes on the max-grad op; this canonicalization moves them to the
    correct op class (dropping the unused maxout input, node.inputs[1]).
    """
    # To assure backward compatibility with
    # DownsampleFactorMaxGrad
    if (not isinstance(node.op, MaxPoolGrad) or node.op.mode not in
            ['sum', 'average_exc_pad', 'average_inc_pad']):
        return False
    return [AveragePoolGrad(ds=node.op.ds,
                            ignore_border=node.op.ignore_border,
                            st=node.op.st,
                            padding=node.op.padding,
                            mode=node.op.mode)(node.inputs[0],
                                               node.inputs[2])]
"""
Ops for downsampling images.
Planned:
Pool, DownsampleAvg, DownsampleSoftmax.
"""
from __future__ import print_function
# This file should move along with conv.py
from six.moves import xrange
import six.moves.builtins as builtins
import warnings
import numpy
import theano
from theano import gof, Op, tensor, Variable, Apply
from theano.tensor.opt import register_canonicalize
def max_pool2D(*args, **kwargs):
    """Deprecated alias: forwards all arguments to :func:`pool_2d`."""
    import sys
    # Warn on stderr so the notice is visible even when stdout is captured.
    print("DEPRECATION: max_pool2D renamed to pool_2d", file=sys.stderr)
    return pool_2d(*args, **kwargs)
def max_pool_2d_same_size(input, patch_size):
    """
    Takes as input a 4-D tensor. It sets all non maximum values
    of non-overlapping patches of size (patch_size[0],patch_size[1]) to zero,
    keeping only the maximum values. The output has the same dimensions as
    the input.
    Parameters
    ----------
    input : 4-D theano tensor of input images
        Input images. Max pooling will be done over the 2 last dimensions.
    patch_size : tuple of length 2
        Size of the patch (patch height, patch width).
        (2,2) will retain only one non-zero value per patch of 4 values.
    """
    # Pool with ignore_border=True, then route each pooled maximum back to
    # its original position via the max-pool gradient: feeding the pooled
    # values as the "gradient" reconstructs the input with non-maxima zeroed.
    pooled = Pool(patch_size, True)(input)
    return MaxPoolGrad(patch_size, True)(input, pooled, pooled)
def pool_2d(input, ds, ignore_border=None, st=None, padding=(0, 0),
            mode='max'):
    """
    Takes as input a N-D tensor, where N >= 2. It downscales the input image by
    the specified factor, by keeping only the maximum value of non-overlapping
    patches of size (ds[0],ds[1])
    Parameters
    ----------
    input : N-D theano tensor of input images
        Input images. Max pooling will be done over the 2 last dimensions.
    ds : tuple of length 2
        Factor by which to downscale (vertical ds, horizontal ds).
        (2,2) will halve the image in each dimension.
    ignore_border : bool (default None, will print a warning and set to False)
        When True, (5,5) input with ds=(2,2) will generate a (2,2) output.
        (3,3) otherwise.
    st : tuple of two ints
        Stride size, which is the number of shifts over rows/cols to get the
        next pool region. If st is None, it is considered equal to ds
        (no overlap on pooling regions).
    padding : tuple of two ints
        (pad_h, pad_w), pad zeros to extend beyond four borders
        of the images, pad_h is the size of the top and bottom margins,
        and pad_w is the size of the left and right margins.
    mode : {'max', 'sum', 'average_inc_pad', 'average_exc_pad'}
        Operation executed on each window. `max` and `sum` always exclude
        the padding in the computation. `average` gives you the choice to
        include or exclude it.

    Notes
    -----
    Inputs with more than 4 dimensions are reshaped to 4D, pooled, and
    reshaped back, so pooling always happens over the last two dimensions.
    """
    if input.ndim < 2:
        raise NotImplementedError('pool_2d requires a dimension >= 2')
    if ignore_border is None:
        warnings.warn(
            "pool_2d() will have the parameter ignore_border"
            " default value changed to True (currently"
            " False). To have consistent behavior with all Theano"
            " version, explicitly add the parameter ignore_border=True."
            " On the GPU, using ignore_border=False is needed to use CuDNN."
            " When using ignore_border=False and not using CuDNN, the only"
            " GPU combination supported is when"
            " `ds == st and padding == (0, 0) and mode == 'max'`."
            " Otherwise, the convolution will be executed on CPU.",
            stacklevel=2)
        ignore_border = False
    if input.ndim == 4:
        # Fast path: the Pool op natively handles 4D input.
        op = Pool(ds, ignore_border, st=st, padding=padding,
                  mode=mode)
        output = op(input)
        return output
    # extract image dimensions
    img_shape = input.shape[-2:]
    # count the number of "leading" dimensions, store as dmatrix
    batch_size = tensor.prod(input.shape[:-2])
    batch_size = tensor.shape_padright(batch_size, 1)
    # store as 4D tensor with shape: (batch_size,1,height,width)
    new_shape = tensor.cast(tensor.join(0, batch_size,
                                        tensor.as_tensor([1]),
                                        img_shape), 'int64')
    input_4D = tensor.reshape(input, new_shape, ndim=4)
    # downsample mini-batch of images
    op = Pool(ds, ignore_border, st=st, padding=padding,
              mode=mode)
    output = op(input_4D)
    # restore to original shape
    outshp = tensor.join(0, input.shape[:-2], output.shape[-2:])
    return tensor.reshape(output, outshp, ndim=input.ndim)
class Pool(Op):
"""
For N-dimensional tensors, consider that the last two dimensions span
images. This Op downsamples these images by taking the max, sum or average
over different patch.
The constructor takes the max, sum or average or different input patches.
Parameters
----------
ds : list or tuple of two ints
Downsample factor over rows and column.
ds indicates the pool region size.
ignore_border : bool
If ds doesn't divide imgshape, do we include an extra row/col of partial
downsampling (False) or ignore it (True).
st : list or tuple of two ints or None
Stride size, which is the number of shifts over rows/cols to get the
next pool region. If st is None, it is considered equal to ds
(no overlap on pooling regions).
padding: tuple of two ints
(pad_h, pad_w), pad zeros to extend beyond four borders of the images,
pad_h is the size of the top and bottom margins, and pad_w is the size
of the left and right margins.
mode : {'max', 'sum', 'average_inc_pad', 'average_exc_pad'}
('average_inc_pad' excludes the padding from the count,
'average_exc_pad' include it)
"""
__props__ = ('ds', 'ignore_border', 'st', 'padding', 'mode')
@staticmethod
def out_shape(imgshape, ds, ignore_border=False, st=None, padding=(0, 0)):
"""
Return the shape of the output from this op, for input of given
shape and flags.
Parameters
----------
imgshape : tuple, list, or similar of integer or scalar Theano variable
The shape of a tensor of images. The last two elements are
interpreted as the number of rows, and the number of cols.
ds : list or tuple of two ints
Downsample factor over rows and columns this parameter indicates
the size of the pooling region.
st : list or tuple of two ints
The stride size. This is the distance between the pooling regions.
If it's set to None, it equals ds.
ignore_border : bool
If ds doesn't divide imgshape, do we include an extra row/col of
partial downsampling (False) or ignore it (True).
padding : tuple of two ints
(pad_h, pad_w), pad zeros to extend beyond four borders
of the images, pad_h is the size of the top and bottom margins,
and pad_w is the size of the left and right margins.
Returns
-------
list
The shape of the output from this op, for input of given shape.
This will have the same length as imgshape, but with last two
elements reduced as per the downsampling & ignore_border flags.
"""
if len(imgshape) < 2:
raise TypeError('imgshape must have at least two elements '
'(rows, cols)')
if st is None:
st = ds
r, c = imgshape[-2:]
r += padding[0] * 2
c += padding[1] * 2
if ignore_border:
if ds[0] == st[0]:
nr = r // st[0]
else:
out_r = (r - ds[0]) // st[0] + 1
if isinstance(r, theano.Variable):
nr = tensor.maximum(out_r, 0)
else:
nr = numpy.maximum(out_r, 0)
if ds[1] == st[1]:
nc = c // st[1]
else:
out_c = (c - ds[1]) // st[1] + 1
if isinstance(c, theano.Variable):
nc = tensor.maximum(out_c, 0)
else:
nc = numpy.maximum(out_c, 0)
else:
if isinstance(r, theano.Variable):
nr = tensor.switch(tensor.ge(st[0], ds[0]),
(r - 1) // st[0] + 1,
tensor.maximum(0, (r - 1 - ds[0]) //
st[0] + 1) + 1)
elif st[0] >= ds[0]:
nr = (r - 1) // st[0] + 1
else:
nr = max(0, (r - 1 - ds[0]) // st[0] + 1) + 1
if isinstance(c, theano.Variable):
nc = tensor.switch(tensor.ge(st[1], ds[1]),
(c - 1) // st[1] + 1,
tensor.maximum(0, (c - 1 - ds[1]) //
st[1] + 1) + 1)
elif st[1] >= ds[1]:
nc = (c - 1) // st[1] + 1
else:
nc = max(0, (c - 1 - ds[1]) // st[1] + 1) + 1
rval = list(imgshape[:-2]) + [nr, nc]
return rval
def __init__(self, ds, ignore_border=False, st=None, padding=(0, 0),
mode='max'):
self.ds = tuple(ds)
if not all([isinstance(d, int) for d in ds]):
raise ValueError(
"Pool downsample parameters must be ints."
" Got %s" % str(ds))
if st is None:
st = ds
assert isinstance(st, (tuple, list))
self.st = tuple(st)
self.ignore_border = ignore_border
self.padding = tuple(padding)
if self.padding != (0, 0) and not ignore_border:
raise NotImplementedError(
'padding works only with ignore_border=True')
if self.padding[0] >= self.ds[0] or self.padding[1] >= self.ds[1]:
raise NotImplementedError(
'padding_h and padding_w must be smaller than strides')
if mode not in ['max', 'average_inc_pad', 'average_exc_pad', 'sum']:
raise ValueError(
"Pool mode parameter only support 'max', 'sum',"
" 'average_inc_pad' and 'average_exc_pad'. Got %s" % mode)
self.mode = mode
def make_node(self, x):
if x.type.ndim != 4:
raise TypeError()
# TODO: consider restricting the dtype?
x = tensor.as_tensor_variable(x)
# If the input shape are broadcastable we can have 0 in the output shape
broad = x.broadcastable[:2] + (False, False)
out = tensor.TensorType(x.dtype, broad)
return gof.Apply(self, [x], [out()])
def perform(self, node, inp, out):
x, = inp
z, = out
if len(x.shape) != 4:
raise NotImplementedError(
'Pool requires 4D input for now')
z_shape = self.out_shape(x.shape, self.ds, self.ignore_border, self.st,
self.padding)
if (z[0] is None) or (z[0].shape != z_shape):
z[0] = numpy.empty(z_shape, dtype=x.dtype)
zz = z[0]
# number of pooling output rows
pr = zz.shape[-2]
# number of pooling output cols
pc = zz.shape[-1]
ds0, ds1 = self.ds
st0, st1 = self.st
pad_h = self.padding[0]
pad_w = self.padding[1]
img_rows = x.shape[-2] + 2 * pad_h
img_cols = x.shape[-1] + 2 * pad_w
inc_pad = self.mode == 'average_inc_pad'
# pad the image
if self.padding != (0, 0):
y = numpy.zeros(
(x.shape[0], x.shape[1], img_rows, img_cols),
dtype=x.dtype)
y[:, :, pad_h:(img_rows - pad_h), pad_w:(img_cols - pad_w)] = x
else:
y = x
func = numpy.max
if self.mode == 'sum':
func = numpy.sum
elif self.mode != 'max':
func = numpy.average
for n in xrange(x.shape[0]):
for k in xrange(x.shape[1]):
for r in xrange(pr):
row_st = r * st0
row_end = builtins.min(row_st + ds0, img_rows)
if not inc_pad:
row_st = builtins.max(row_st, self.padding[0])
row_end = builtins.min(row_end, x.shape[-2] + pad_h)
for c in xrange(pc):
col_st = c * st1
col_end = builtins.min(col_st + ds1, img_cols)
if not inc_pad:
col_st = builtins.max(col_st, self.padding[1])
col_end = builtins.min(col_end,
x.shape[-1] + pad_w)
zz[n, k, r, c] = func(y[
n, k, row_st:row_end, col_st:col_end])
def infer_shape(self, node, in_shapes):
shp = self.out_shape(in_shapes[0], self.ds,
self.ignore_border, self.st, self.padding)
return [shp]
def grad(self, inp, grads):
x, = inp
gz, = grads
if self.mode == 'max':
maxout = self(x)
return [MaxPoolGrad(self.ds,
ignore_border=self.ignore_border,
st=self.st, padding=self.padding)(
x, maxout, gz)]
else:
return [AveragePoolGrad(self.ds,
ignore_border=self.ignore_border,
st=self.st, padding=self.padding,
mode=self.mode)(
x, gz)]
def c_headers(self):
return ['<algorithm>']
def c_code(self, node, name, inp, out, sub):
if self.mode not in ('max', 'sum', 'average_exc_pad', 'average_inc_pad'):
raise theano.gof.utils.MethodNotDefined()
x, = inp
z, = out
fail = sub['fail']
ignore_border = int(self.ignore_border)
ds0, ds1 = self.ds
st0, st1 = self.st
pd0, pd1 = self.padding
ccode = """
int typenum = PyArray_ObjectType((PyObject*)%(x)s, 0);
int z_r, z_c; // shape of the output
int r, c; // shape of the padded_input
if(PyArray_NDIM(%(x)s)!=4)
{
PyErr_SetString(PyExc_ValueError, "x must be a 4d ndarray");
%(fail)s;
}
r = PyArray_DIMS(%(x)s)[2];
c = PyArray_DIMS(%(x)s)[3];
r += %(pd0)s * 2;
c += %(pd1)s * 2;
if (%(pd0)s != 0 && %(pd1)s != 0 && !%(ignore_border)s)
{
PyErr_SetString(PyExc_ValueError,
"padding must be (0,0) when ignore border is False");
%(fail)s;
}
if (%(ignore_border)s)
{
// '/' in C is different from '/' in python
if (r - %(ds0)s < 0)
{
z_r = 0;
}
else
{
z_r = (r - %(ds0)s) / %(st0)s + 1;
}
if (c - %(ds1)s < 0)
{
z_c = 0;
}
else
{
z_c = (c - %(ds1)s) / %(st1)s + 1;
}
}
else
{
// decide how many rows the output has
if (%(st0)s >= %(ds0)s)
{
z_r = (r - 1) / %(st0)s + 1;
}
else
{
z_r = std::max(0, (r - 1 - %(ds0)s) / %(st0)s + 1) + 1;
}
// decide how many columns the output has
if (%(st1)s >= %(ds1)s)
{
z_c = (c - 1) / %(st1)s + 1;
}
else
{
z_c = std::max(0, (c - 1 - %(ds1)s) / %(st1)s + 1) + 1;
}
}
// memory allocation of z if necessary
if ((!%(z)s)
|| *PyArray_DIMS(%(z)s)!=4
||(PyArray_DIMS(%(z)s)[0] != PyArray_DIMS(%(x)s)[0])
||(PyArray_DIMS(%(z)s)[1] != PyArray_DIMS(%(x)s)[1])
||(PyArray_DIMS(%(z)s)[2] != z_r)
||(PyArray_DIMS(%(z)s)[3] != z_c)
)
{
if (%(z)s) Py_XDECREF(%(z)s);
npy_intp dims[4] = {0,0,0,0};
dims[0]=PyArray_DIMS(%(x)s)[0];
dims[1]=PyArray_DIMS(%(x)s)[1];
dims[2]=z_r;
dims[3]=z_c;
//TODO: zeros not necessary
%(z)s = (PyArrayObject*) PyArray_ZEROS(4, dims, typenum,0);
}
// used for indexing a pool region inside the input
int r_st, r_end, c_st, c_end;
dtype_%(x)s collector; // temp var for the value in a region
if (z_r && z_c)
{
for(int b=0; b<PyArray_DIMS(%(x)s)[0]; b++){
for(int k=0; k<PyArray_DIMS(%(x)s)[1]; k++){
for(int i=0; i< z_r; i++){
r_st = i * %(st0)s;
r_end = r_st + %(ds0)s;
// skip the padding
r_st = r_st < %(pd0)s ? %(pd0)s : r_st;
r_end = r_end > (r - %(pd0)s) ? r - %(pd0)s : r_end;
// from padded_img space to img space
r_st -= %(pd0)s;
r_end -= %(pd0)s;
// handle the case where no padding, ignore border is True
if (%(ignore_border)s)
{
r_end = r_end > r ? r : r_end;
}
for(int j=0; j<z_c; j++){
c_st = j * %(st1)s;
c_end = c_st + %(ds1)s;
// skip the padding
c_st = c_st < %(pd1)s ? %(pd1)s : c_st;
c_end = c_end > (c - %(pd1)s) ? c - %(pd1)s : c_end;
dtype_%(z)s * z = (
(dtype_%(z)s*)(PyArray_GETPTR4(%(z)s, b, k, i, j)));
// change coordinates from padding_img space into img space
c_st -= %(pd1)s;
c_end -= %(pd1)s;
// handle the case where no padding, ignore border is True
if (%(ignore_border)s)
{
c_end = c_end > c ? c : c_end;
}
"""
if self.mode == 'max':
ccode += """
// use the first element as the initial value of collector
collector = ((dtype_%(x)s*)(PyArray_GETPTR4(%(x)s,b,k,r_st,c_st)))[0];
// go through the pooled region in the unpadded input
for(int m=r_st; m<r_end; m++)
{
for(int n=c_st; n<c_end; n++)
{
dtype_%(x)s a = ((dtype_%(x)s*)(PyArray_GETPTR4(%(x)s,b,k,m,n)))[0];
collector = (a > collector) ? a : collector;
}
}
z[0] = collector;
"""
elif self.mode in ('sum', 'average_exc_pad', 'average_inc_pad'):
ccode += """
// initialize the sum at zero
collector = ((dtype_%(x)s)(0));
// go through the pooled region in the unpadded input
for(int m=r_st; m<r_end; m++)
{
for(int n=c_st; n<c_end; n++)
{
dtype_%(x)s a = ((dtype_%(x)s*)(PyArray_GETPTR4(%(x)s,b,k,m,n)))[0];
collector += a;
}
}
"""
if self.mode == "sum":
ccode += """
z[0] = collector;
"""
elif self.mode == 'average_inc_pad' and self.ignore_border:
ccode += """
z[0] = collector / (%(ds0)s * %(ds1)s);
"""
else:
ccode += """
z[0] = collector / ((r_end-r_st)*(c_end-c_st));
"""
ccode += """
}
}
}
}
}
"""
return ccode % locals()
def c_code_cache_version(self):
    """Version tag for the generated C code; bump whenever c_code changes."""
    version = (0, 6, 8, 3)
    return version
class PoolGrad(Op):
    """Base class for the gradients of the Pool op.

    Holds the pooling hyper-parameters (window size ``ds``, stride ``st``,
    ``padding``, ``mode``, ``ignore_border``) shared by MaxPoolGrad and
    AveragePoolGrad, and the common output-shape computation.
    """

    __props__ = ('ds', 'ignore_border', 'st', 'padding', 'mode')

    @staticmethod
    def out_shape(imgshape, ds, ignore_border=False, st=None, padding=(0, 0)):
        """Return the shape of the output from this op, for input of given
        shape and flags.

        :param imgshape: the shape of a tensor of images. The last two elements
            are interpreted as the number of rows, and the number of cols.
        :type imgshape: tuple, list, or similar of integer or
            scalar Theano variable.

        :param ds: downsample factor over rows and columns
            this parameter indicates the size of the pooling region
        :type ds: list or tuple of two ints

        :param st: the stride size. This is the distance between the pooling
            regions. If it's set to None, in which case it equals ds.
        :type st: list or tuple of two ints

        :param ignore_border: if ds doesn't divide imgshape, do we include an
            extra row/col of partial downsampling (False) or ignore it (True).
        :type ignore_border: bool

        :param padding: (pad_h, pad_w), pad zeros to extend beyond four borders
            of the images, pad_h is the size of the top and bottom margins,
            and pad_w is the size of the left and right margins.
        :type padding: tuple of two ints

        :rtype: list
        :returns: the shape of the output from this op, for input of given
            shape. This will have the same length as imgshape, but with last
            two elements reduced as per the downsampling & ignore_border flags.
        """
        if len(imgshape) < 2:
            raise TypeError('imgshape must have at least two elements '
                            '(rows, cols)')
        # Default stride: non-overlapping pooling regions.
        if st is None:
            st = ds
        r, c = imgshape[-2:]
        # Work in the padded image space.
        r += padding[0] * 2
        c += padding[1] * 2

        if ignore_border:
            # Only complete pooling windows produce an output element.
            out_r = (r - ds[0]) // st[0] + 1
            out_c = (c - ds[1]) // st[1] + 1
            # Each dimension may be a Python/numpy int or a symbolic
            # Theano variable; pick the matching maximum implementation.
            if isinstance(r, theano.Variable):
                nr = tensor.maximum(out_r, 0)
            else:
                nr = numpy.maximum(out_r, 0)
            if isinstance(c, theano.Variable):
                nc = tensor.maximum(out_c, 0)
            else:
                nc = numpy.maximum(out_c, 0)
        else:
            # A partial window at the bottom/right border adds one extra
            # output row/col when the stride is smaller than the window.
            if isinstance(r, theano.Variable):
                nr = tensor.switch(tensor.ge(st[0], ds[0]),
                                   (r - 1) // st[0] + 1,
                                   tensor.maximum(0, (r - 1 - ds[0]) //
                                                  st[0] + 1) + 1)
            elif st[0] >= ds[0]:
                nr = (r - 1) // st[0] + 1
            else:
                nr = max(0, (r - 1 - ds[0]) // st[0] + 1) + 1

            if isinstance(c, theano.Variable):
                nc = tensor.switch(tensor.ge(st[1], ds[1]),
                                   (c - 1) // st[1] + 1,
                                   tensor.maximum(0, (c - 1 - ds[1]) //
                                                  st[1] + 1) + 1)
            elif st[1] >= ds[1]:
                nc = (c - 1) // st[1] + 1
            else:
                nc = max(0, (c - 1 - ds[1]) // st[1] + 1) + 1

        rval = list(imgshape[:-2]) + [nr, nc]
        return rval

    def __init__(self, ds, ignore_border, st=None, padding=(0, 0), mode='max'):
        self.ds = tuple(ds)
        self.ignore_border = ignore_border
        if st is None:
            st = ds
        self.st = tuple(st)
        self.padding = tuple(padding)
        if mode not in ['max', 'sum', 'average_inc_pad', 'average_exc_pad']:
            raise ValueError(
                "Pool mode parameter only support 'max', 'sum',"
                " 'average_inc_pad' and 'average_exc_pad'. Got %s" % mode)
        self.mode = mode

    def infer_shape(self, node, in_shapes):
        # The gradient w.r.t. the pooling input has the input's shape.
        return [in_shapes[0]]
class MaxPoolGrad(PoolGrad):
    """Gradient of Pool in 'max' mode.

    Routes each output-gradient value back to the input position(s) that
    attained the maximum of their pooling region.
    """

    def __init__(self, ds, ignore_border, st=None, padding=(0, 0), mode='max'):
        PoolGrad.__init__(self, ds, ignore_border, st, padding, mode)

    def make_node(self, x, maxout, gz):
        # make_node should only be called by the grad function of
        # Pool, so these asserts should not fail.
        assert isinstance(x, Variable) and x.ndim == 4
        assert isinstance(maxout, Variable) and maxout.ndim == 4
        assert isinstance(gz, Variable) and gz.ndim == 4
        x = tensor.as_tensor_variable(x)
        maxout = tensor.as_tensor_variable(maxout)
        gz = tensor.as_tensor_variable(gz)
        return Apply(self, [x, maxout, gz], [x.type()])

    def perform(self, node, inp, out):
        """Python fallback: accumulate gz into each input cell equal to the
        region maximum (ties receive the gradient multiple times)."""
        assert self.mode == 'max'
        x, maxout, gz = inp
        gx_stg, = out
        # number of pooling output rows
        pr = maxout.shape[-2]
        # number of pooling output cols
        pc = maxout.shape[-1]
        ds0, ds1 = self.ds
        st0, st1 = self.st
        pad_h = self.padding[0]
        pad_w = self.padding[1]
        img_rows = x.shape[-2] + 2 * pad_h
        img_cols = x.shape[-1] + 2 * pad_w
        # pad the image with zeros so region bounds can be computed in the
        # padded coordinate space
        if self.padding != (0, 0):
            y = numpy.zeros(
                (x.shape[0], x.shape[1], img_rows, img_cols),
                dtype=x.dtype)
            y[:, :, pad_h:(img_rows - pad_h), pad_w:(img_cols - pad_w)] = x
        else:
            y = x
        gx = numpy.zeros_like(y)
        for n in xrange(x.shape[0]):
            for k in xrange(x.shape[1]):
                for r in xrange(pr):
                    # region bounds, clipped to skip the zero padding
                    row_st = builtins.max(r * st0, self.padding[0])
                    row_end = builtins.min(row_st + ds0, img_rows)
                    for c in xrange(pc):
                        col_st = builtins.max(c * st1, self.padding[1])
                        col_end = builtins.min(col_st + ds1, img_cols)
                        for row_ind in xrange(row_st, row_end):
                            for col_ind in xrange(col_st, col_end):
                                # every cell equal to the region max gets
                                # the full gradient
                                if (maxout[n, k, r, c] == y[n, k, row_ind, col_ind]):
                                    gx[n, k, row_ind, col_ind] += gz[n, k, r, c]
        # unpad the image
        gx = gx[:, :, pad_h:(img_rows - pad_h), pad_w:(img_cols - pad_w)]
        gx_stg[0] = gx

    def grad(self, inp, grads):
        # Second-order gradient: only the path through ggx is non-trivial;
        # x and maxout receive zero gradients.
        x, maxout, gz = inp
        ggx, = grads
        return [theano.tensor.zeros_like(x),
                theano.tensor.zeros_like(maxout),
                DownsampleFactorMaxGradGrad(
                    self.ds, ignore_border=self.ignore_border,
                    st=self.st, padding=self.padding)(x, maxout, ggx)]

    def c_code(self, node, name, inp, out, sub):
        """Emit the C implementation of perform (max mode only)."""
        assert self.mode == 'max'
        x, z, gz = inp
        gx, = out
        fail = sub['fail']
        ignore_border = int(self.ignore_border)
        ds0, ds1 = self.ds
        st0, st1 = self.st
        pd0, pd1 = self.padding
        return """
        // sanity checks
        int x_typenum = PyArray_ObjectType((PyObject*)%(x)s, 0);
        int z_typenum = PyArray_ObjectType((PyObject*)%(z)s, 0);
        int gz_typenum = PyArray_ObjectType((PyObject*)%(gz)s, 0);
        if ((x_typenum != z_typenum) || (x_typenum != gz_typenum))
        {
            PyErr_SetString(PyExc_ValueError, "input types must all match");
            %(fail)s;
        }
        if(PyArray_NDIM(%(x)s)!=4)
        {
            PyErr_SetString(PyExc_ValueError, "x must be a 4d ndarray");
            %(fail)s;
        }
        if(PyArray_NDIM(%(z)s)!=4)
        {
            PyErr_SetString(PyExc_ValueError, "z must be a 4d ndarray");
            %(fail)s;
        }
        if(PyArray_NDIM(%(gz)s)!=4)
        {
            PyErr_SetString(PyExc_ValueError, "gz must be a 4d ndarray");
            %(fail)s;
        }
        int z_r, z_c;
        z_r = PyArray_DIMS(%(z)s)[2];
        z_c = PyArray_DIMS(%(z)s)[3];
        int r, c; // shape of the padded_input
        r = PyArray_DIMS(%(x)s)[2];
        c = PyArray_DIMS(%(x)s)[3];
        r += %(pd0)s * 2;
        c += %(pd1)s * 2;
        // allocating memory for gx
        if ((!%(gx)s)
          || !PyArray_ISCONTIGUOUS(%(gx)s)
          || *PyArray_DIMS(%(gx)s)!=4
          ||(PyArray_DIMS(%(gx)s)[0] != PyArray_DIMS(%(x)s)[0])
          ||(PyArray_DIMS(%(gx)s)[1] != PyArray_DIMS(%(x)s)[1])
          ||(PyArray_DIMS(%(gx)s)[2] != PyArray_DIMS(%(x)s)[2])
          ||(PyArray_DIMS(%(gx)s)[3] != PyArray_DIMS(%(x)s)[3])
          )
        {
          Py_XDECREF(%(gx)s);
          %(gx)s = (PyArrayObject*) PyArray_ZEROS(4, PyArray_DIMS(%(x)s), x_typenum,0);
        }
        else {
          PyArray_FILLWBYTE(%(gx)s, 0);
        }
        int r_st, r_end, c_st, c_end; // used to index into the input img x
        dtype_%(z)s maximum; // temp var for maximum value in a region
        if (z_r && z_c)
        {
            for(int b=0; b<PyArray_DIMS(%(x)s)[0]; b++){
              for(int k=0; k<PyArray_DIMS(%(x)s)[1]; k++){
                for(int i=0; i< z_r; i++){
                  r_st = i * %(st0)s;
                  r_end = r_st + %(ds0)s;
                  // skip the padding
                  r_st = r_st < %(pd0)s ? %(pd0)s : r_st;
                  r_end = r_end > (r - %(pd0)s) ? r - %(pd0)s : r_end;
                  // from padded_img space to img space
                  r_st -= %(pd0)s;
                  r_end -= %(pd0)s;
                  for(int j=0; j<z_c; j++){
                    c_st = j * %(st1)s;
                    c_end = c_st + %(ds1)s;
                    // skip the padding
                    c_st = c_st < %(pd1)s ? %(pd1)s : c_st;
                    c_end = c_end > (c - %(pd1)s) ? c - %(pd1)s : c_end;
                    // change coordinates from padding_img space into img space
                    c_st -= %(pd1)s;
                    c_end -= %(pd1)s;
                    // the maximum value
                    maximum = ((dtype_%(z)s*)(PyArray_GETPTR4(%(z)s,b,k,i,j)))[0];
                    // the gradient corresponding to this maximum value in z
                    dtype_%(gz)s * gz = (
                          (dtype_%(gz)s*)(PyArray_GETPTR4(%(gz)s, b, k, i, j)));
                    // go through the pooled region in the unpadded input
                    for(int m=r_st; m<r_end; m++)
                    {
                      for(int n=c_st; n<c_end; n++)
                      {
                        dtype_%(x)s a = ((dtype_%(x)s*)(PyArray_GETPTR4(%(x)s,b,k,m,n)))[0];
                        dtype_%(gx)s * gx = (
                          (dtype_%(gx)s*)(PyArray_GETPTR4(%(gx)s, b, k, m, n)));
                        if (a == maximum){
                          gx[0] = gx[0] + gz[0];
                        }
                      }
                    }
                  }
                }
              }
            }
        }
        """ % locals()

    def c_code_cache_version(self):
        # Version tag for the generated C code; bump when c_code changes.
        return (0, 7)
# Backward-compatibility alias: this op was renamed from
# DownsampleFactorMaxGrad to MaxPoolGrad.
DownsampleFactorMaxGrad = MaxPoolGrad
class AveragePoolGrad(PoolGrad):
    """Gradient of Pool in 'sum', 'average_inc_pad' or 'average_exc_pad' mode.

    Spreads each output-gradient value uniformly over its pooling region
    (divided by the region size in the averaging modes).
    """

    def __init__(self, ds, ignore_border, st=None, padding=(0, 0), mode='average_inc_pad'):
        assert mode in ['sum', 'average_inc_pad', 'average_exc_pad']
        PoolGrad.__init__(self, ds, ignore_border, st, padding, mode)

    def make_node(self, x, gz):
        # make_node should only be called by the grad function of
        # Pool, so these asserts should not fail.
        assert isinstance(x, Variable) and x.ndim == 4
        assert isinstance(gz, Variable) and gz.ndim == 4
        x = tensor.as_tensor_variable(x)
        gz = tensor.as_tensor_variable(gz)
        return Apply(self, [x, gz], [x.type()])

    def perform(self, node, inp, out):
        # 'average_exc_pad' excludes the padding from the averaging
        # denominator, which is not implemented with non-zero padding.
        if self.mode == 'average_exc_pad' and self.padding != (0, 0):
            raise NotImplementedError()
        x, gz = inp
        gx_stg, = out
        z_shape = self.out_shape(x.shape, self.ds, self.ignore_border, self.st,
                                 self.padding)
        # This pooled-shape buffer is only used to read off the number of
        # pooling rows/cols below; gx_stg[0] is overwritten with gx at the
        # end of this method.
        if (gx_stg[0] is None) or (gx_stg[0].shape != z_shape):
            gx_stg[0] = numpy.empty(z_shape, dtype=x.dtype)
        zz = gx_stg[0]
        # number of pooling output rows
        pr = zz.shape[-2]
        # number of pooling output cols
        pc = zz.shape[-1]
        ds0, ds1 = self.ds
        st0, st1 = self.st
        pad_h = self.padding[0]
        pad_w = self.padding[1]
        img_rows = x.shape[-2] + 2 * pad_h
        img_cols = x.shape[-1] + 2 * pad_w
        inc_pad = self.mode == 'average_inc_pad'
        sum_mode = self.mode == 'sum'
        # pad the image
        if self.padding != (0, 0):
            y = numpy.zeros(
                (x.shape[0], x.shape[1], img_rows, img_cols),
                dtype=x.dtype)
            y[:, :, pad_h:(img_rows - pad_h), pad_w:(img_cols - pad_w)] = x
        else:
            y = x
        gx = numpy.zeros_like(y)
        for n in xrange(x.shape[0]):
            for k in xrange(x.shape[1]):
                for r in xrange(pr):
                    # In sum/average_inc_pad modes the region may extend
                    # into the padding; otherwise clip it to the image.
                    if sum_mode or inc_pad:
                        row_st = r * st0
                    else:
                        row_st = builtins.max(r * st0, self.padding[0])
                    row_end = builtins.min(row_st + ds0, img_rows)
                    for c in xrange(pc):
                        if sum_mode or inc_pad:
                            col_st = c * st1
                        else:
                            col_st = builtins.max(c * st1,
                                                  self.padding[1])
                        col_end = builtins.min(col_st + ds1, img_cols)
                        if sum_mode:
                            val = gz[n, k, r, c]
                        else:
                            # average: divide by the actual region size
                            val = gz[n, k, r, c] / ((row_end - row_st) *
                                                    (col_end - col_st))
                        gx[n, k, row_st:row_end, col_st:col_end] += val
        # unpad the image
        gx = gx[:, :, pad_h:(img_rows - pad_h), pad_w:(img_cols - pad_w)]
        gx_stg[0] = gx

    def grad(self, inp, grads):
        # The second-order gradient of average/sum pooling is pooling of
        # ggx itself with the same parameters; x gets a zero gradient.
        x, gz = inp
        ggx, = grads
        return [theano.tensor.zeros_like(x),
                Pool(self.ds, ignore_border=self.ignore_border,
                     st=self.st, padding=self.padding, mode=self.mode)(ggx)]
class DownsampleFactorMaxGradGrad(Op):
    """Gradient of MaxPoolGrad (second-order gradient of max pooling).

    For each pooling region, picks the ggx value at the position(s) where
    the input equals the stored maximum (maxout) and accumulates it into
    the corresponding output cell.  The output has the shape of maxout.
    """

    __props__ = ('ds', 'ignore_border', 'st', 'padding', 'mode')

    def __init__(self, ds, ignore_border, st=None, padding=(0, 0), mode='max'):
        """
        :param ds: (row, col) size of the pooling window.
        :param ignore_border: must allow padding only when True.
        :param st: (row, col) stride; defaults to ds (non-overlapping).
        :param padding: (pad_h, pad_w) zero padding on each border.
        :param mode: only 'max' is supported by this op.
        """
        self.ds = tuple(ds)
        if not all([isinstance(d, int) for d in ds]):
            raise ValueError(
                "Pool downsample parameters must be ints."
                " Got %s" % str(ds))
        if st is None:
            st = ds
        assert isinstance(st, (tuple, list))
        self.st = tuple(st)
        self.ignore_border = ignore_border
        self.padding = tuple(padding)
        if self.padding != (0, 0) and not ignore_border:
            raise NotImplementedError(
                'padding works only with ignore_border=True')
        # NOTE: this check compares the padding with ds (the window size),
        # although the message mentions strides.
        if self.padding[0] >= self.ds[0] or self.padding[1] >= self.ds[1]:
            raise NotImplementedError(
                'padding_h and padding_w must be smaller than strides')
        self.mode = mode
        assert self.mode == 'max'

    def make_node(self, x, maxout, gz):
        # make_node should only be called by the grad function of
        # MaxPoolGrad, so these asserts should not fail.
        x = tensor.as_tensor_variable(x)
        maxout = tensor.as_tensor_variable(maxout)
        gz = tensor.as_tensor_variable(gz)
        assert x.ndim == 4
        assert maxout.ndim == 4
        assert gz.ndim == 4
        return Apply(self, [x, maxout, gz], [x.type()])

    def perform(self, node, inp, out):
        x, maxout, ggx = inp
        z, = out
        if len(x.shape) != 4:
            raise NotImplementedError(
                'DownsampleFactorMaxGradGrad requires 4D input for now')
        # The output has the shape of maxout (the pooled output), not x.
        # Allocating with x.shape would make pr/pc below count the *input*
        # rows/cols and index maxout out of bounds.  This matches both
        # infer_shape (which returns in_shapes[1]) and the C code below
        # (which sizes z from PyArray_DIMS(maxout)).
        if (z[0] is None) or (z[0].shape != maxout.shape):
            z[0] = numpy.zeros(maxout.shape, dtype=x.dtype)
        ggz = z[0]  # grad wrt maxout_grad has the same shape as maxout
        # number of pooling output rows
        pr = ggz.shape[-2]
        # number of pooling output cols
        pc = ggz.shape[-1]
        ds0, ds1 = self.ds
        st0, st1 = self.st
        pd0, pd1 = self.padding
        img_rows = x.shape[-2] + 2 * pd0
        img_cols = x.shape[-1] + 2 * pd1
        # pad the image and its gradients; the image is padded with
        # (min - 1) so padding cells can never equal the region maximum
        if self.padding != (0, 0):
            y_padded = numpy.zeros(
                (x.shape[0], x.shape[1], img_rows, img_cols),
                dtype=x.dtype) + x.min() - 1
            y_padded[:, :, pd0:(img_rows - pd0), pd1:(img_cols - pd1)] = x
            ggx_padded = numpy.zeros(
                (x.shape[0], x.shape[1], img_rows, img_cols),
                dtype=x.dtype)
            ggx_padded[:, :, pd0:(img_rows - pd0), pd1:(img_cols - pd1)] = ggx
        else:
            y_padded = x
            ggx_padded = ggx
        for n in xrange(x.shape[0]):
            for k in xrange(x.shape[1]):
                for r in xrange(pr):
                    row_st = r * st0
                    row_end = builtins.min(row_st + ds0, img_rows)
                    for c in xrange(pc):
                        col_st = c * st1
                        col_end = builtins.min(col_st + ds1, img_cols)
                        for row_ind in xrange(row_st, row_end):
                            for col_ind in xrange(col_st, col_end):
                                # copy ggx where the input attains the max
                                if (maxout[n, k, r, c] == y_padded[n, k, row_ind, col_ind]):
                                    ggz[n, k, r, c] = ggx_padded[n, k, row_ind, col_ind]

    def infer_shape(self, node, in_shapes):
        # Output shape is maxout's shape (the pooled shape).
        return [in_shapes[1]]

    def c_code(self, node, name, inp, out, sub):
        """Emit the C implementation (max mode only)."""
        if self.mode != 'max':
            raise theano.gof.utils.MethodNotDefined()
        x, maxout, ggx = inp
        z, = out  # the grad of grad
        fail = sub['fail']
        ignore_border = int(self.ignore_border)
        ds0, ds1 = self.ds
        st0, st1 = self.st
        pd0, pd1 = self.padding
        return """
        int z_typenum = PyArray_ObjectType((PyObject*)%(maxout)s, 0);
        int z_r, z_c;
        z_r = PyArray_DIMS(%(maxout)s)[2];
        z_c = PyArray_DIMS(%(maxout)s)[3];
        int r, c; // shape of the padded_input
        r = PyArray_DIMS(%(x)s)[2];
        c = PyArray_DIMS(%(x)s)[3];
        r += %(pd0)s * 2;
        c += %(pd1)s * 2;
        // allocating memory for output
        if ((!%(z)s)
          || !PyArray_ISCONTIGUOUS(%(z)s)
          || *PyArray_DIMS(%(z)s)!=4
          ||(PyArray_DIMS(%(z)s)[0] != PyArray_DIMS(%(maxout)s)[0])
          ||(PyArray_DIMS(%(z)s)[1] != PyArray_DIMS(%(maxout)s)[1])
          ||(PyArray_DIMS(%(z)s)[2] != PyArray_DIMS(%(maxout)s)[2])
          ||(PyArray_DIMS(%(z)s)[3] != PyArray_DIMS(%(maxout)s)[3])
          )
        {
          Py_XDECREF(%(z)s);
          %(z)s = (PyArrayObject*) PyArray_ZEROS(4, PyArray_DIMS(%(maxout)s), z_typenum,0);
        }
        else {
          PyArray_FILLWBYTE(%(z)s, 0);
        }
        dtype_%(maxout)s maximum; // temp var for maximum value in a region
        int r_st, r_end, c_st, c_end; // used to index into the input img x
        for(int b=0; b<PyArray_DIMS(%(x)s)[0]; b++){
          for(int k=0; k<PyArray_DIMS(%(x)s)[1]; k++){
            for(int i=0; i< z_r; i++){
              r_st = i * %(st0)s;
              r_end = r_st + %(ds0)s;
              // skip the padding
              r_st = r_st < %(pd0)s ? %(pd0)s : r_st;
              r_end = r_end > (r - %(pd0)s) ? r - %(pd0)s : r_end;
              // from padded_img space to img space
              r_st -= %(pd0)s;
              r_end -= %(pd0)s;
              for(int j=0; j<z_c; j++){
                c_st = j * %(st1)s;
                c_end = c_st + %(ds1)s;
                // skip the padding
                c_st = c_st < %(pd1)s ? %(pd1)s : c_st;
                c_end = c_end > (c - %(pd1)s) ? c - %(pd1)s : c_end;
                // from padding_img space into img space
                c_st -= %(pd1)s;
                c_end -= %(pd1)s;
                // the maximum value
                maximum = ((dtype_%(maxout)s*)(PyArray_GETPTR4(%(maxout)s,b,k,i,j)))[0];
                // z at this position
                dtype_%(z)s * z = ((dtype_%(z)s*)(PyArray_GETPTR4(%(z)s, b, k, i, j)));
                // go through the pooled region in the unpadded input
                for(int m=r_st; m<r_end; m++)
                {
                  for(int n=c_st; n<c_end; n++)
                  {
                    dtype_%(x)s a = ((dtype_%(x)s*)(PyArray_GETPTR4(%(x)s,b,k,m,n)))[0];
                    dtype_%(ggx)s * ggx = (
                      (dtype_%(ggx)s*)(PyArray_GETPTR4(%(ggx)s, b, k, m, n)));
                    if (a == maximum){
                      z[0] += ggx[0];
                    }
                  }
                }
              }
            }
          }
        }
        """ % locals()

    def c_code_cache_version(self):
        # Version tag for the generated C code; bump when c_code changes.
        return (0, 1)
@register_canonicalize('fast_compile')
@gof.local_optimizer([MaxPoolGrad])
def local_average_pool_grad(node):
    """Rewrite a MaxPoolGrad node built with a sum/average mode into the
    equivalent AveragePoolGrad, for backward compatibility with graphs
    that used DownsampleFactorMaxGrad for those modes."""
    op = node.op
    if not isinstance(op, MaxPoolGrad):
        return False
    if op.mode not in ('sum', 'average_exc_pad', 'average_inc_pad'):
        return False
    x = node.inputs[0]
    gz = node.inputs[2]
    avg_grad = AveragePoolGrad(ds=op.ds,
                               ignore_border=op.ignore_border,
                               st=op.st,
                               padding=op.padding,
                               mode=op.mode)
    return [avg_grad(x, gz)]
from itertools import product from itertools import product
import unittest import unittest
import six.moves.builtins as builtins import six.moves.builtins as builtins
...@@ -7,11 +8,10 @@ import numpy ...@@ -7,11 +8,10 @@ import numpy
import theano import theano
import theano.tensor as tensor import theano.tensor as tensor
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
from theano.tensor.signal.downsample import (DownsampleFactorMax, max_pool_2d, from theano.tensor.signal.pool import (Pool, pool_2d,
MaxPoolGrad, AveragePoolGrad, MaxPoolGrad, AveragePoolGrad,
DownsampleFactorMaxGrad, DownsampleFactorMaxGrad,
DownsampleFactorMaxGradGrad, max_pool_2d_same_size)
max_pool_2d_same_size)
from theano import function from theano import function
...@@ -19,7 +19,7 @@ class TestDownsampleFactorMax(utt.InferShapeTester): ...@@ -19,7 +19,7 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
@staticmethod @staticmethod
def numpy_max_pool_2d(input, ds, ignore_border=False, mode='max'): def numpy_max_pool_2d(input, ds, ignore_border=False, mode='max'):
'''Helper function, implementing max_pool_2d in pure numpy''' '''Helper function, implementing pool_2d in pure numpy'''
if len(input.shape) < 2: if len(input.shape) < 2:
raise NotImplementedError('input should have at least 2 dim,' raise NotImplementedError('input should have at least 2 dim,'
' shape is %s' ' shape is %s'
...@@ -106,7 +106,7 @@ class TestDownsampleFactorMax(utt.InferShapeTester): ...@@ -106,7 +106,7 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
@staticmethod @staticmethod
def numpy_max_pool_2d_stride(input, ds, ignore_border=False, st=None, def numpy_max_pool_2d_stride(input, ds, ignore_border=False, st=None,
mode='max'): mode='max'):
'''Helper function, implementing max_pool_2d in pure numpy '''Helper function, implementing pool_2d in pure numpy
this function provides st input to indicate the stide size this function provides st input to indicate the stide size
for the pooling regions. if not indicated, st == sd.''' for the pooling regions. if not indicated, st == sd.'''
if len(input.shape) < 2: if len(input.shape) < 2:
...@@ -185,19 +185,19 @@ class TestDownsampleFactorMax(utt.InferShapeTester): ...@@ -185,19 +185,19 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
numpy_output_val = self.numpy_max_pool_2d(imval, maxpoolshp, numpy_output_val = self.numpy_max_pool_2d(imval, maxpoolshp,
ignore_border, ignore_border,
mode=mode) mode=mode)
output = max_pool_2d(images, maxpoolshp, ignore_border, output = pool_2d(images, maxpoolshp, ignore_border,
mode=mode) mode=mode)
f = function([images, ], [output, ]) f = function([images, ], [output, ])
output_val = f(imval) output_val = f(imval)
utt.assert_allclose(output_val, numpy_output_val) utt.assert_allclose(output_val, numpy_output_val)
# DownsampleFactorMax op # Pool op
maxpool_op = DownsampleFactorMax(maxpoolshp, maxpool_op = Pool(maxpoolshp,
ignore_border=ignore_border, ignore_border=ignore_border,
mode=mode)(images) mode=mode)(images)
output_shape = DownsampleFactorMax.out_shape(imval.shape, maxpoolshp, output_shape = Pool.out_shape(imval.shape, maxpoolshp,
ignore_border=ignore_border) ignore_border=ignore_border)
utt.assert_allclose(numpy.asarray(output_shape), numpy_output_val.shape) utt.assert_allclose(numpy.asarray(output_shape), numpy_output_val.shape)
f = function([images], maxpool_op) f = function([images], maxpool_op)
output_val = f(imval) output_val = f(imval)
...@@ -227,7 +227,7 @@ class TestDownsampleFactorMax(utt.InferShapeTester): ...@@ -227,7 +227,7 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
for stride in stridesizes: for stride in stridesizes:
outputshp = outputshps[indx % len(outputshps)] outputshp = outputshps[indx % len(outputshps)]
indx += 1 indx += 1
# DownsampleFactorMax op # Pool op
numpy_output_val = \ numpy_output_val = \
self.numpy_max_pool_2d_stride(imval, maxpoolshp, self.numpy_max_pool_2d_stride(imval, maxpoolshp,
ignore_border, stride, ignore_border, stride,
...@@ -236,9 +236,9 @@ class TestDownsampleFactorMax(utt.InferShapeTester): ...@@ -236,9 +236,9 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
"outshape is %s, calculated shape is %s" "outshape is %s, calculated shape is %s"
% (outputshp, numpy_output_val.shape)) % (outputshp, numpy_output_val.shape))
maxpool_op = \ maxpool_op = \
DownsampleFactorMax(maxpoolshp, Pool(maxpoolshp,
ignore_border=ignore_border, ignore_border=ignore_border,
st=stride, mode=mode)(images) st=stride, mode=mode)(images)
f = function([images], maxpool_op) f = function([images], maxpool_op)
output_val = f(imval) output_val = f(imval)
utt.assert_allclose(output_val, numpy_output_val) utt.assert_allclose(output_val, numpy_output_val)
...@@ -269,7 +269,7 @@ class TestDownsampleFactorMax(utt.InferShapeTester): ...@@ -269,7 +269,7 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
if not ignore_border: if not ignore_border:
indx_out += 1 indx_out += 1
outputshp = outputshps[indx_out] outputshp = outputshps[indx_out]
# DownsampleFactorMax op # Pool op
numpy_output_val = \ numpy_output_val = \
self.numpy_max_pool_2d_stride(imval, maxpoolshp, self.numpy_max_pool_2d_stride(imval, maxpoolshp,
ignore_border, stride, mode) ignore_border, stride, mode)
...@@ -277,9 +277,9 @@ class TestDownsampleFactorMax(utt.InferShapeTester): ...@@ -277,9 +277,9 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
"outshape is %s, calculated shape is %s" "outshape is %s, calculated shape is %s"
% (outputshp, numpy_output_val.shape)) % (outputshp, numpy_output_val.shape))
maxpool_op = \ maxpool_op = \
DownsampleFactorMax(maxpoolshp, Pool(maxpoolshp,
ignore_border=ignore_border, ignore_border=ignore_border,
st=stride, mode=mode)(images) st=stride, mode=mode)(images)
f = function([images], maxpool_op) f = function([images], maxpool_op)
output_val = f(imval) output_val = f(imval)
utt.assert_allclose(output_val, numpy_output_val) utt.assert_allclose(output_val, numpy_output_val)
...@@ -306,7 +306,7 @@ class TestDownsampleFactorMax(utt.InferShapeTester): ...@@ -306,7 +306,7 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
numpy_output_val = self.numpy_max_pool_2d_stride_padding( numpy_output_val = self.numpy_max_pool_2d_stride_padding(
imval, maxpoolsize, ignore_border, imval, maxpoolsize, ignore_border,
stridesize, paddingsize, mode) stridesize, paddingsize, mode)
maxpool_op = DownsampleFactorMax( maxpool_op = Pool(
maxpoolsize, maxpoolsize,
ignore_border=ignore_border, ignore_border=ignore_border,
st=stridesize, padding=paddingsize, mode=mode)(images) st=stridesize, padding=paddingsize, mode=mode)(images)
...@@ -331,7 +331,7 @@ class TestDownsampleFactorMax(utt.InferShapeTester): ...@@ -331,7 +331,7 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
paddingsize = paddingsizes[i] paddingsize = paddingsizes[i]
def mp(input): def mp(input):
return DownsampleFactorMax( return Pool(
maxpoolsize, ignore_border=True, maxpoolsize, ignore_border=True,
st=stridesize, st=stridesize,
padding=paddingsize, padding=paddingsize,
...@@ -352,9 +352,9 @@ class TestDownsampleFactorMax(utt.InferShapeTester): ...@@ -352,9 +352,9 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
'average_inc_pad', 'average_inc_pad',
'average_exc_pad']): 'average_exc_pad']):
def mp(input): def mp(input):
return DownsampleFactorMax(maxpoolshp, return Pool(maxpoolshp,
ignore_border=ignore_border, ignore_border=ignore_border,
mode=mode)(input) mode=mode)(input)
utt.verify_grad(mp, [imval], rng=rng) utt.verify_grad(mp, [imval], rng=rng)
def test_DownsampleFactorMax_grad_st(self): def test_DownsampleFactorMax_grad_st(self):
...@@ -372,9 +372,9 @@ class TestDownsampleFactorMax(utt.InferShapeTester): ...@@ -372,9 +372,9 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
'average_exc_pad'], 'average_exc_pad'],
stridesizes): stridesizes):
def mp(input): def mp(input):
return DownsampleFactorMax(maxpoolshp, return Pool(maxpoolshp,
ignore_border=ignore_border, ignore_border=ignore_border,
st=stride, mode=mode)(input) st=stride, mode=mode)(input)
utt.verify_grad(mp, [imval], rng=rng) utt.verify_grad(mp, [imval], rng=rng)
def test_DownsampleFactorMax_grad_st_extra(self): def test_DownsampleFactorMax_grad_st_extra(self):
...@@ -395,10 +395,10 @@ class TestDownsampleFactorMax(utt.InferShapeTester): ...@@ -395,10 +395,10 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
maxpoolshp = maxpoolshps[indx] maxpoolshp = maxpoolshps[indx]
for ignore_border in [True, False]: for ignore_border in [True, False]:
def mp(input): def mp(input):
return DownsampleFactorMax(maxpoolshp, return Pool(maxpoolshp,
ignore_border=ignore_border, ignore_border=ignore_border,
st=stride, st=stride,
mode=mode)(input) mode=mode)(input)
utt.verify_grad(mp, [imval], rng=rng) utt.verify_grad(mp, [imval], rng=rng)
def test_DownsampleFactorMaxGrad_grad(self): def test_DownsampleFactorMaxGrad_grad(self):
...@@ -412,12 +412,12 @@ class TestDownsampleFactorMax(utt.InferShapeTester): ...@@ -412,12 +412,12 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
# print 'maxpoolshp =', maxpoolshp # print 'maxpoolshp =', maxpoolshp
# print 'ignore_border =', ignore_border # print 'ignore_border =', ignore_border
# The shape of the gradient will be the shape of the output # The shape of the gradient will be the shape of the output
grad_shape = DownsampleFactorMax.out_shape( grad_shape = Pool.out_shape(
imval.shape, maxpoolshp, ignore_border=ignore_border) imval.shape, maxpoolshp, ignore_border=ignore_border)
grad_val = rng.rand(*grad_shape) * 10.0 grad_val = rng.rand(*grad_shape) * 10.0
def mp(input, grad): def mp(input, grad):
out = DownsampleFactorMax( out = Pool(
maxpoolshp, ignore_border=ignore_border)(input) maxpoolshp, ignore_border=ignore_border)(input)
grad_op = MaxPoolGrad( grad_op = MaxPoolGrad(
maxpoolshp, ignore_border=ignore_border) maxpoolshp, ignore_border=ignore_border)
...@@ -437,7 +437,7 @@ class TestDownsampleFactorMax(utt.InferShapeTester): ...@@ -437,7 +437,7 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
# print 'maxpoolshp =', maxpoolshp # print 'maxpoolshp =', maxpoolshp
# print 'ignore_border =', ignore_border # print 'ignore_border =', ignore_border
# The shape of the gradient will be the shape of the output # The shape of the gradient will be the shape of the output
grad_shape = DownsampleFactorMax.out_shape( grad_shape = Pool.out_shape(
imval.shape, avgpoolshp, ignore_border=ignore_border) imval.shape, avgpoolshp, ignore_border=ignore_border)
grad_val = rng.rand(*grad_shape) * 10.0 grad_val = rng.rand(*grad_shape) * 10.0
...@@ -459,13 +459,13 @@ class TestDownsampleFactorMax(utt.InferShapeTester): ...@@ -459,13 +459,13 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
for maxpoolshp in maxpoolshps: for maxpoolshp in maxpoolshps:
for ignore_border in [True, False]: for ignore_border in [True, False]:
for stride in stridesizes: for stride in stridesizes:
grad_shape = DownsampleFactorMax.out_shape( grad_shape = Pool.out_shape(
imval.shape, maxpoolshp, imval.shape, maxpoolshp,
ignore_border=ignore_border, st=stride) ignore_border=ignore_border, st=stride)
grad_val = rng.rand(*grad_shape) grad_val = rng.rand(*grad_shape)
def mp(input, grad): def mp(input, grad):
out = DownsampleFactorMax( out = Pool(
maxpoolshp, ignore_border=ignore_border, maxpoolshp, ignore_border=ignore_border,
st=stride)(input) st=stride)(input)
grad_op = MaxPoolGrad( grad_op = MaxPoolGrad(
...@@ -487,7 +487,7 @@ class TestDownsampleFactorMax(utt.InferShapeTester): ...@@ -487,7 +487,7 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
for ignore_border in [True, False]: for ignore_border in [True, False]:
for mode in ['sum', 'average_inc_pad', 'average_exc_pad']: for mode in ['sum', 'average_inc_pad', 'average_exc_pad']:
for stride in stridesizes: for stride in stridesizes:
grad_shape = DownsampleFactorMax.out_shape( grad_shape = Pool.out_shape(
imval.shape, avgpoolshp, imval.shape, avgpoolshp,
ignore_border=ignore_border, st=stride) ignore_border=ignore_border, st=stride)
grad_val = rng.rand(*grad_shape) grad_val = rng.rand(*grad_shape)
...@@ -516,13 +516,13 @@ class TestDownsampleFactorMax(utt.InferShapeTester): ...@@ -516,13 +516,13 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
stride = stridesizes[indx] stride = stridesizes[indx]
maxpoolshp = maxpoolshps[indx] maxpoolshp = maxpoolshps[indx]
for ignore_border in [True, False]: for ignore_border in [True, False]:
grad_shape = DownsampleFactorMax.out_shape( grad_shape = Pool.out_shape(
imval.shape, maxpoolshp, imval.shape, maxpoolshp,
ignore_border=ignore_border, st=stride) ignore_border=ignore_border, st=stride)
grad_val = rng.rand(*grad_shape) grad_val = rng.rand(*grad_shape)
def mp(input, grad): def mp(input, grad):
out = DownsampleFactorMax( out = Pool(
maxpoolshp, ignore_border=ignore_border, maxpoolshp, ignore_border=ignore_border,
st=stride)(input) st=stride)(input)
grad_op = MaxPoolGrad( grad_op = MaxPoolGrad(
...@@ -552,7 +552,7 @@ class TestDownsampleFactorMax(utt.InferShapeTester): ...@@ -552,7 +552,7 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
avgpoolshp = avgpoolshps[indx] avgpoolshp = avgpoolshps[indx]
for ignore_border in [True, False]: for ignore_border in [True, False]:
for mode in ['sum', 'average_inc_pad', 'average_exc_pad']: for mode in ['sum', 'average_inc_pad', 'average_exc_pad']:
grad_shape = DownsampleFactorMax.out_shape( grad_shape = Pool.out_shape(
imval.shape, avgpoolshp, imval.shape, avgpoolshp,
ignore_border=ignore_border, st=stride) ignore_border=ignore_border, st=stride)
grad_val = rng.rand(*grad_shape) grad_val = rng.rand(*grad_shape)
...@@ -582,14 +582,14 @@ class TestDownsampleFactorMax(utt.InferShapeTester): ...@@ -582,14 +582,14 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
stridesize = stridesizes[i] stridesize = stridesizes[i]
paddingsize = paddingsizes[i] paddingsize = paddingsizes[i]
grad_shape = DownsampleFactorMax.out_shape(imval.shape, grad_shape = Pool.out_shape(imval.shape,
maxpoolsize, st=stridesize, maxpoolsize, st=stridesize,
ignore_border=True, ignore_border=True,
padding=paddingsize) padding=paddingsize)
grad_val = rng.rand(*grad_shape) * 10.0 grad_val = rng.rand(*grad_shape) * 10.0
def mp(input, grad): def mp(input, grad):
out = DownsampleFactorMax( out = Pool(
maxpoolsize, ignore_border=True, maxpoolsize, ignore_border=True,
st=stridesize, st=stridesize,
padding=paddingsize, padding=paddingsize,
...@@ -615,9 +615,9 @@ class TestDownsampleFactorMax(utt.InferShapeTester): ...@@ -615,9 +615,9 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
# 'average_exc_pad' with non-zero padding is not implemented # 'average_exc_pad' with non-zero padding is not implemented
for mode in ['sum', 'average_inc_pad']: for mode in ['sum', 'average_inc_pad']:
grad_shape = DownsampleFactorMax.out_shape(imval.shape, grad_shape = Pool.out_shape(imval.shape,
avgpoolsize, st=stridesize, avgpoolsize, st=stridesize,
ignore_border=True, padding=paddingsize) ignore_border=True, padding=paddingsize)
grad_val = rng.rand(*grad_shape) * 10.0 grad_val = rng.rand(*grad_shape) * 10.0
def mp(input, grad): def mp(input, grad):
...@@ -633,7 +633,7 @@ class TestDownsampleFactorMax(utt.InferShapeTester): ...@@ -633,7 +633,7 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
x_vec = tensor.vector('x') x_vec = tensor.vector('x')
z = tensor.dot(x_vec.dimshuffle(0, 'x'), z = tensor.dot(x_vec.dimshuffle(0, 'x'),
x_vec.dimshuffle('x', 0)) x_vec.dimshuffle('x', 0))
y = max_pool_2d(input=z, ds=(2, 2), ignore_border=True) y = pool_2d(input=z, ds=(2, 2), ignore_border=True)
C = tensor.exp(tensor.sum(y)) C = tensor.exp(tensor.sum(y))
grad_hess = tensor.hessian(cost=C, wrt=x_vec) grad_hess = tensor.hessian(cost=C, wrt=x_vec)
...@@ -642,7 +642,7 @@ class TestDownsampleFactorMax(utt.InferShapeTester): ...@@ -642,7 +642,7 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
# The value has been manually computed from the theoretical gradient, # The value has been manually computed from the theoretical gradient,
# and confirmed by the implementation. # and confirmed by the implementation.
assert numpy.allclose(fn_hess( [1, 2]), [[0., 0.], [0., 982.7667]]) assert numpy.allclose(fn_hess([1, 2]), [[0., 0.], [0., 982.7667]])
def test_max_pool_2d_2D(self): def test_max_pool_2d_2D(self):
rng = numpy.random.RandomState(utt.fetch_seed()) rng = numpy.random.RandomState(utt.fetch_seed())
...@@ -660,16 +660,16 @@ class TestDownsampleFactorMax(utt.InferShapeTester): ...@@ -660,16 +660,16 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
numpy_output_val = self.numpy_max_pool_2d(imval, maxpoolshp, numpy_output_val = self.numpy_max_pool_2d(imval, maxpoolshp,
ignore_border, ignore_border,
mode=mode) mode=mode)
output = max_pool_2d(images, maxpoolshp, ignore_border, output = pool_2d(images, maxpoolshp, ignore_border,
mode=mode) mode=mode)
output_val = function([images], output)(imval) output_val = function([images], output)(imval)
assert numpy.all(output_val == numpy_output_val), ( assert numpy.all(output_val == numpy_output_val), (
"output_val is %s, numpy_output_val is %s" "output_val is %s, numpy_output_val is %s"
% (output_val, numpy_output_val)) % (output_val, numpy_output_val))
def mp(input): def mp(input):
return max_pool_2d(input, maxpoolshp, ignore_border, return pool_2d(input, maxpoolshp, ignore_border,
mode=mode) mode=mode)
utt.verify_grad(mp, [imval], rng=rng) utt.verify_grad(mp, [imval], rng=rng)
def test_max_pool_2d_2D_same_size(self): def test_max_pool_2d_2D_same_size(self):
...@@ -713,8 +713,8 @@ class TestDownsampleFactorMax(utt.InferShapeTester): ...@@ -713,8 +713,8 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
numpy_output_val = self.numpy_max_pool_2d(imval, maxpoolshp, numpy_output_val = self.numpy_max_pool_2d(imval, maxpoolshp,
ignore_border, ignore_border,
mode) mode)
output = max_pool_2d(images, maxpoolshp, ignore_border, output = pool_2d(images, maxpoolshp, ignore_border,
mode=mode) mode=mode)
output_val = function([images], output)(imval) output_val = function([images], output)(imval)
assert numpy.all(output_val == numpy_output_val), ( assert numpy.all(output_val == numpy_output_val), (
"output_val is %s, numpy_output_val is %s" "output_val is %s, numpy_output_val is %s"
...@@ -723,7 +723,7 @@ class TestDownsampleFactorMax(utt.InferShapeTester): ...@@ -723,7 +723,7 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
# removed as already tested in test_max_pool_2d_2D # removed as already tested in test_max_pool_2d_2D
# This make test in debug mode too slow. # This make test in debug mode too slow.
# def mp(input): # def mp(input):
# return max_pool_2d(input, maxpoolshp, ignore_border) # return pool_2d(input, maxpoolshp, ignore_border)
# utt.verify_grad(mp, [imval], rng=rng) # utt.verify_grad(mp, [imval], rng=rng)
def test_max_pool_2d_6D(self): def test_max_pool_2d_6D(self):
...@@ -742,15 +742,15 @@ class TestDownsampleFactorMax(utt.InferShapeTester): ...@@ -742,15 +742,15 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
numpy_output_val = self.numpy_max_pool_2d(imval, maxpoolshp, numpy_output_val = self.numpy_max_pool_2d(imval, maxpoolshp,
ignore_border, ignore_border,
mode=mode) mode=mode)
output = max_pool_2d(images, maxpoolshp, ignore_border, output = pool_2d(images, maxpoolshp, ignore_border,
mode=mode) mode=mode)
output_val = function([images], output)(imval) output_val = function([images], output)(imval)
assert numpy.all(output_val == numpy_output_val) assert numpy.all(output_val == numpy_output_val)
# removed as already tested in test_max_pool_2d_2D # removed as already tested in test_max_pool_2d_2D
# This make test in debug mode too slow. # This make test in debug mode too slow.
# def mp(input): # def mp(input):
# return max_pool_2d(input, maxpoolshp, ignore_border) # return pool_2d(input, maxpoolshp, ignore_border)
# utt.verify_grad(mp, [imval], rng=rng) # utt.verify_grad(mp, [imval], rng=rng)
def test_infer_shape(self): def test_infer_shape(self):
...@@ -782,12 +782,12 @@ class TestDownsampleFactorMax(utt.InferShapeTester): ...@@ -782,12 +782,12 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
for k, padding in enumerate([(0, 0), (1, 1), (1, 2)]): for k, padding in enumerate([(0, 0), (1, 1), (1, 2)]):
if out_shapes[k][i][j] is None: if out_shapes[k][i][j] is None:
continue continue
# checking shapes generated by DownsampleFactorMax # checking shapes generated by Pool
self._compile_and_check([image], self._compile_and_check([image],
[DownsampleFactorMax(maxpoolshp, [Pool(maxpoolshp,
ignore_border=ignore_border, ignore_border=ignore_border,
padding=padding)(image)], padding=padding)(image)],
[image_val], DownsampleFactorMax) [image_val], Pool)
# checking shapes generated by MaxPoolGrad # checking shapes generated by MaxPoolGrad
maxout_val = rng.rand(*out_shapes[k][i][j]) maxout_val = rng.rand(*out_shapes[k][i][j])
...@@ -806,10 +806,10 @@ class TestDownsampleFactorMax(utt.InferShapeTester): ...@@ -806,10 +806,10 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
image_val = rng.rand(4, 6, 1, 1) image_val = rng.rand(4, 6, 1, 1)
self._compile_and_check( self._compile_and_check(
[image], [image],
[DownsampleFactorMax((2, 2), [Pool((2, 2),
ignore_border=True, ignore_border=True,
padding=(0, 0))(image)], padding=(0, 0))(image)],
[image_val], DownsampleFactorMax) [image_val], Pool)
def test_opt_max_to_average(self): def test_opt_max_to_average(self):
im = theano.tensor.tensor4() im = theano.tensor.tensor4()
......
...@@ -79,7 +79,7 @@ whitelist_flake8 = [ ...@@ -79,7 +79,7 @@ whitelist_flake8 = [
"tensor/tests/test_blas_c.py", "tensor/tests/test_blas_c.py",
"tensor/tests/test_blas_scipy.py", "tensor/tests/test_blas_scipy.py",
"tensor/tests/test_mpi.py", "tensor/tests/test_mpi.py",
"tensor/signal/downsample.py", "tensor/signal/pool.py",
"tensor/signal/conv.py", "tensor/signal/conv.py",
"tensor/signal/tests/test_conv.py", "tensor/signal/tests/test_conv.py",
"tensor/signal/tests/test_downsample.py", "tensor/signal/tests/test_downsample.py",
......
...@@ -21,7 +21,7 @@ import numpy ...@@ -21,7 +21,7 @@ import numpy
from theano.gof import Op, Apply from theano.gof import Op, Apply
from theano.gradient import grad_undefined from theano.gradient import grad_undefined
from theano.tests.unittest_tools import SkipTest from theano.tests.unittest_tools import SkipTest
from theano.tensor.signal.downsample import DownsampleFactorMax from theano.tensor.signal.pool import Pool
from theano.tensor.nnet import conv from theano.tensor.nnet import conv
''' '''
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论