Commit a7900bd8 authored by Frédéric Bastien, committed by GitHub

Merge pull request #5198 from aam-at/gpu_pool

Gpuarray average and max pooling
......@@ -66,7 +66,7 @@ from theano.gof.opt import (
OpKeyOptimizer)
from theano.gof.optdb import \
DB, Query, \
DB, LocalGroupDB, Query, \
EquilibriumDB, SequenceDB, ProxyDB
from theano.gof.toolbox import \
......
......@@ -1384,6 +1384,41 @@ class LocalOptGroup(LocalOptimizer):
opt.add_requirements(fgraph)
class GraphToGPULocalOptGroup(LocalOptGroup):
    """Equivalent of LocalOptGroup for the GraphToGPU transfer pass.

    The main difference is the function signature of the local
    optimizers it dispatches to: they use the GraphToGPU signature
    ``(op, context_name, inputs, outputs)`` and not the normal
    LocalOptimizer ``(node)`` signature.

    apply_all_opts=True is not supported
    """
    def __init__(self, *optimizers, **kwargs):
        super(GraphToGPULocalOptGroup, self).__init__(*optimizers, **kwargs)
        # Trying every optimizer in sequence is not supported for this
        # signature; only first-match semantics are allowed.
        assert self.apply_all_opts is False

    def transform(self, op, context_name, inputs, outputs):
        # Try, in registration order, the optimizers tracking this op
        # (by op class, by op instance, then the catch-all trackers) and
        # return the first non-empty replacement.
        if len(self.opts) == 0:
            return
        fgraph = outputs[0].fgraph
        opts = self.track_map[type(op)] + self.track_map[op] + self.track_map[None]
        for opt in opts:
            opt_start = time.time()
            new_repl = opt.transform(op, context_name, inputs, outputs)
            opt_finish = time.time()
            if self.profile:
                # NOTE(review): accumulates opt_start - opt_finish, i.e. a
                # negative duration -- presumably matching the profiling
                # convention of the parent LocalOptGroup; confirm the sign
                # against how print_profile consumes time_opts.
                self.time_opts[opt] += opt_start - opt_finish
                self.process_count[opt] += 1
            if not new_repl:
                continue
            if self.profile:
                self.node_created[opt] += len(graph.ops(fgraph.variables, new_repl))
                self.applied_true[opt] += 1
            return new_repl
class OpSub(LocalOptimizer):
"""
......
......@@ -405,12 +405,14 @@ class LocalGroupDB(DB):
"""
def __init__(self, apply_all_opts=False, profile=False):
def __init__(self, apply_all_opts=False, profile=False,
local_opt=opt.LocalOptGroup):
super(LocalGroupDB, self).__init__()
self.failure_callback = None
self.apply_all_opts = apply_all_opts
self.profile = profile
self.__position__ = {}
self.local_opt = local_opt
def register(self, name, obj, *tags, **kwargs):
super(LocalGroupDB, self).register(name, obj, *tags)
......@@ -429,7 +431,7 @@ class LocalGroupDB(DB):
opts = list(super(LocalGroupDB, self).query(*tags, **kwtags))
opts.sort(key=lambda obj: (self.__position__[obj.name], obj.name))
ret = opt.LocalOptGroup(*opts,
ret = self.local_opt(*opts,
apply_all_opts=self.apply_all_opts,
profile=self.profile)
return ret
......
......@@ -1550,52 +1550,6 @@ class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM):
return [[1], [1], [0], [0], [0]] # no connection to height, width, depth
class GpuDownsampleFactorMaxGradGrad(CGpuKernelBase):
    """
    Implement the grad of downsample with max on the gpu.
    """
    __props__ = ('ignore_border', 'mode', 'ndim')

    def __init__(self, ignore_border, mode='max', ndim=2):
        self.ndim = ndim
        self.ignore_border = ignore_border
        self.mode = mode
        # The implementation lives in the pool_grad_grad.c kernel file.
        CGpuKernelBase.__init__(self, ['pool_grad_grad.c'],
                                'APPLY_SPECIFIC(pool_grad_grad)')
        # Only max pooling over 2 or 3 pooling dimensions is implemented
        # by the C kernel.
        assert self.mode == 'max'
        assert self.ndim in [2, 3]

    def c_headers(self):
        return ['gpuarray_api.h', 'gpuarray_helper.h', 'numpy_compat.h']

    def c_header_dirs(self):
        return [os.path.dirname(__file__), pygpu.get_include()]

    def make_node(self, inp, out, out_grad, ws, stride, pad):
        # inp/out are the pooling input and output; out_grad carries the
        # incoming gradient.  All three must live in the same GPU context
        # and share the same rank (4D for 2D pooling, 5D for 3D).
        ctx_name = infer_context_name(inp, out, out_grad)
        inp = as_gpuarray_variable(inp, ctx_name)
        assert (inp.ndim in [4, 5])
        out = as_gpuarray_variable(out, ctx_name)
        assert (out_grad.ndim in [4, 5])
        out_grad = as_gpuarray_variable(out_grad, ctx_name)
        assert(out.ndim in [4, 5])
        assert (out_grad.ndim == inp.ndim)
        assert (inp.ndim == out.ndim)
        # ws/stride/pad are 1D vectors of per-dimension window size,
        # stride and padding.
        ws = as_tensor_variable(ws)
        stride = as_tensor_variable(stride)
        pad = as_tensor_variable(pad)
        assert ws.type.ndim == stride.type.ndim and ws.type.ndim == pad.type.ndim
        assert ws.type.ndim == 1
        return Apply(self, [inp, out, out_grad, ws, stride, pad], [inp.type()])

    def get_params(self, node):
        # The kernel runs in the GPU context of the first input.
        return node.inputs[0].type.context
@inplace_allocempty(GpuGemv, 0)
def local_inplace_gpuagemv(node, inputs):
    """Replace a GpuGemv node by its in-place variant (output 0 reuses input 0)."""
    replacement = gpugemv_inplace(*inputs)
    return [replacement]
......
......@@ -38,7 +38,7 @@ from .elemwise import GpuElemwise
# These don't exist in gpuarray
# GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad
from .nnet import GpuSoftmax
from .opt import (gpu_seqopt, register_opt,
from .opt import (gpu_seqopt, register_opt, pool_db, pool_db2,
op_lifter, register_opt2)
from .opt_util import alpha_merge, output_merge, inplace_allocempty, pad_dims, unpad_dims
......@@ -2736,9 +2736,6 @@ def local_dnn_convi_output_merge(node, *inputs):
return [gpu_dnn_conv_gradI(algo=node.op.algo)(*inputs)]
@register_opt('cudnn', 'fast_compile')
@op_lifter([Pool])
@register_opt2([Pool], 'fast_compile', 'cudnn')
def local_gpua_pool_dnn_alternative(op, ctx_name, inputs, outputs):
if not dnn_available(ctx_name):
raise_no_cudnn()
......@@ -2758,11 +2755,16 @@ def local_gpua_pool_dnn_alternative(op, ctx_name, inputs, outputs):
img_padded = pad_dims(img, 2, nd)
ret_padded = dnn_pool(img_padded, ws, stride=stride, pad=pad, mode=mode)
return unpad_dims(ret_padded, img, 2, nd)
# Register the cuDNN pooling lifter in both pooling databases: pool_db
# holds node-signature lifters, pool_db2 holds GraphToGPU-signature
# optimizers.  position=0 sorts it before the plain gpuarray fallback
# registered at position=1, so cuDNN is tried first.
pool_db.register("local_gpua_pool_dnn_alternative",
                 op_lifter([Pool])(local_gpua_pool_dnn_alternative),
                 'gpuarray', 'fast_compile', 'fast_run', 'cudnn',
                 position=0)
pool_db2.register("local_gpua_pool_dnn_alternative",
                  local_optimizer([Pool])(local_gpua_pool_dnn_alternative),
                  'gpuarray', 'fast_compile', 'fast_run', 'cudnn',
                  position=0)
@register_opt('cudnn', 'fast_compile')
@op_lifter([MaxPoolGrad])
@register_opt2([MaxPoolGrad], 'fast_compile', 'cudnn')
def local_gpua_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
if not dnn_available(ctx_name):
raise_no_cudnn()
......@@ -2797,11 +2799,16 @@ def local_gpua_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
stride,
pad)
return unpad_dims(ret_padded, inp, 2, nd)
# cuDNN max-pool gradient: registered at position=0 so it is preferred
# over the gpuarray implementation (position=1) in both databases.
pool_db.register("local_gpua_pool_dnn_grad_stride",
                 op_lifter([MaxPoolGrad])(local_gpua_pool_dnn_grad_stride),
                 'gpuarray', 'fast_compile', 'fast_run', 'cudnn',
                 position=0)
pool_db2.register("local_gpua_pool_dnn_grad_stride",
                  local_optimizer([MaxPoolGrad])(local_gpua_pool_dnn_grad_stride),
                  'gpuarray', 'fast_compile', 'fast_run', 'cudnn',
                  position=0)
@register_opt('cudnn', 'fast_compile')
@op_lifter([AveragePoolGrad])
@register_opt2([AveragePoolGrad], 'fast_compile', 'cudnn')
def local_gpua_avg_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
if not dnn_available(ctx_name):
raise_no_cudnn()
......@@ -2832,6 +2839,14 @@ def local_gpua_avg_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
stride,
pad)
return unpad_dims(ret_padded, inp, 2, nd)
# cuDNN average-pool gradient: registered at position=0 so it is
# preferred over the gpuarray implementation (position=1).
pool_db.register("local_gpua_avg_pool_dnn_grad_stride",
                 op_lifter([AveragePoolGrad])(local_gpua_avg_pool_dnn_grad_stride),
                 'gpuarray', 'fast_compile', 'fast_run', 'cudnn',
                 position=0)
pool_db2.register("local_gpua_avg_pool_dnn_grad_stride",
                  local_optimizer([AveragePoolGrad])(local_gpua_avg_pool_dnn_grad_stride),
                  'gpuarray', 'fast_compile', 'fast_run', 'cudnn',
                  position=0)
@register_opt('cudnn', 'fast_compile')
......
......@@ -13,6 +13,7 @@ from theano import tensor, scalar, gof, config
from theano.compile import optdb
from theano.compile.ops import shape_i
from theano.gof import (local_optimizer, EquilibriumDB, TopoOptimizer,
LocalGroupDB,
SequenceDB, Optimizer, DB, toolbox, graph)
from theano.ifelse import IfElse
from theano.misc.ordered_set import OrderedSet
......@@ -47,7 +48,8 @@ from .blas import (gpu_dot22, GpuGemm, GpuGer, GpuGemmBatch,
gpugemmbatch_no_inplace,
gpugemv_no_inplace, gpugemv_inplace,
GpuCorrMM, GpuCorrMM_gradInputs, GpuCorrMM_gradWeights,
GpuCorr3dMM, GpuCorr3dMM_gradInputs, GpuCorr3dMM_gradWeights,
GpuCorr3dMM, GpuCorr3dMM_gradInputs, GpuCorr3dMM_gradWeights)
from .pool import (GpuPool, GpuMaxPoolGrad, GpuAveragePoolGrad,
GpuDownsampleFactorMaxGradGrad)
from .blocksparse import (GpuSparseBlockGemv, GpuSparseBlockOuter,
gpu_sparse_block_outer,
......@@ -129,6 +131,9 @@ def register_opt2(tracks, *tags, **kwargs):
'''
def f(local_opt):
name = (kwargs and kwargs.pop('name')) or local_opt.__name__
if isinstance(local_opt, theano.gof.DB):
opt = local_opt
else:
opt = theano.gof.local_optimizer(tracks)(local_opt)
gpu_optimizer2.register(name, opt, 'fast_run', 'gpuarray', *tags)
return local_opt
......@@ -1594,6 +1599,99 @@ def local_gpua_lift_abstractconv_graph(op, context_name, inputs, outputs):
return [op(*inps)]
def local_gpu_pool(op, ctx_name, inputs, outputs):
    """Lift a CPU Pool op to GpuPool.

    Only 2D and 3D pooling are supported.  Inputs with more than two
    non-pooling dimensions are reshaped to 4D/5D around the GPU op and
    restored afterwards.
    """
    assert op.__props__ == ('ignore_border', 'mode', 'ndim')
    img, ws, stride, pad = inputs
    nd = op.ndim
    if nd not in (2, 3):
        # Unsupported pooling rank: leave the node for other optimizers.
        return
    img = gpu_contiguous(as_gpuarray_variable(img, ctx_name))
    gpu_op = GpuPool(op.ignore_border, op.mode, op.ndim)
    if img.ndim != nd + 2:
        # reshape to 4D or 5D with 2 non-pooling dimensions
        img_padded = pad_dims(img, 2, nd)
        return unpad_dims(gpu_op(img_padded, ws, stride, pad), img, 2, nd)
    return gpu_op(img, ws, stride, pad)
# Pooling-specific optimizer databases.  pool_db collects node-signature
# lifters (default LocalOptGroup); pool_db2 collects GraphToGPU-signature
# optimizers (GraphToGPULocalOptGroup).  The gpuarray fallbacks below use
# position=1 so that cuDNN variants can register ahead of them at
# position=0.
pool_db = LocalGroupDB()
pool_db2 = LocalGroupDB(local_opt=theano.gof.opt.GraphToGPULocalOptGroup)
pool_db2.__name__ = "pool_db2"
lifter = op_lifter([pool.Pool])(local_gpu_pool)
pool_db.register("local_gpu_pool", lifter,
                 'gpuarray', 'fast_compile', 'fast_run',
                 position=1)
pool_db2.register("local_gpu_pool",
                  local_optimizer([pool.Pool])(local_gpu_pool),
                  'gpuarray', 'fast_compile', 'fast_run',
                  position=1)
# Hook the pooling databases into the global GPU optimization passes.
register_opt('fast_compile', name='pool_db')(pool_db)
register_opt2([pool.Pool], 'fast_compile', name='pool_db2')(pool_db2)
def local_gpu_max_pool_grad(op, ctx_name, inputs, outputs):
    """Lift a CPU MaxPoolGrad op to GpuMaxPoolGrad.

    Only 2D and 3D pooling are supported.  Inputs with more than two
    non-pooling dimensions are reshaped to 4D/5D around the GPU op and
    restored afterwards.
    """
    assert op.__props__ == ('ignore_border', 'mode', 'ndim')
    img, pooled, pooled_grad, ws, stride, pad = inputs
    nd = op.ndim
    if nd not in (2, 3):
        # Unsupported pooling rank: leave the node for other optimizers.
        return
    img = gpu_contiguous(as_gpuarray_variable(img, ctx_name))
    pooled = gpu_contiguous(as_gpuarray_variable(pooled, ctx_name))
    pooled_grad = gpu_contiguous(as_gpuarray_variable(pooled_grad, ctx_name))
    gpu_op = GpuMaxPoolGrad(op.ignore_border, op.mode, op.ndim)
    if img.ndim != nd + 2:
        # reshape to 4D or 5D with 2 non-pooling dimensions
        ret = gpu_op(pad_dims(img, 2, nd),
                     pad_dims(pooled, 2, nd),
                     pad_dims(pooled_grad, 2, nd),
                     ws, stride, pad)
        return unpad_dims(ret, img, 2, nd)
    return gpu_op(img, pooled, pooled_grad, ws, stride, pad)
# gpuarray max-pool gradient fallback, at position=1 (after cuDNN).
lifter = op_lifter([pool.MaxPoolGrad])(local_gpu_max_pool_grad)
pool_db.register("local_gpu_max_pool_grad", lifter,
                 'gpuarray', 'fast_compile', 'fast_run',
                 position=1)
pool_db2.register("local_gpu_max_pool_grad",
                  local_optimizer([pool.MaxPoolGrad])(local_gpu_max_pool_grad),
                  'gpuarray', 'fast_compile', 'fast_run',
                  position=1)
def local_gpu_average_pool_grad(op, ctx_name, inputs, outputs):
    """Lift a CPU AveragePoolGrad op to GpuAveragePoolGrad.

    Only 2D and 3D pooling are supported.  Inputs with more than two
    non-pooling dimensions are reshaped to 4D/5D around the GPU op and
    restored afterwards.
    """
    assert op.__props__ == ('ignore_border', 'mode', 'ndim')
    img, pooled_grad, ws, stride, pad = inputs
    nd = op.ndim
    if nd not in (2, 3):
        # Unsupported pooling rank: leave the node for other optimizers.
        return
    img = gpu_contiguous(as_gpuarray_variable(img, ctx_name))
    pooled_grad = gpu_contiguous(as_gpuarray_variable(pooled_grad, ctx_name))
    gpu_op = GpuAveragePoolGrad(op.ignore_border, op.mode, op.ndim)
    if img.ndim != nd + 2:
        # reshape to 4D or 5D with 2 non-pooling dimensions
        ret = gpu_op(pad_dims(img, 2, nd),
                     pad_dims(pooled_grad, 2, nd),
                     ws, stride, pad)
        return unpad_dims(ret, img, 2, nd)
    return gpu_op(img, pooled_grad, ws, stride, pad)
# gpuarray average-pool gradient fallback, at position=1 (after cuDNN).
lifter = op_lifter([pool.AveragePoolGrad])(local_gpu_average_pool_grad)
pool_db.register("local_gpu_average_pool_grad", lifter,
                 'gpuarray', 'fast_compile', 'fast_run',
                 position=1)
pool_db2.register("local_gpu_average_pool_grad",
                  local_optimizer([pool.AveragePoolGrad])(local_gpu_average_pool_grad),
                  'gpuarray', 'fast_compile', 'fast_run',
                  position=1)
@register_opt()
@op_lifter([pool.DownsampleFactorMaxGradGrad])
@register_opt2([pool.DownsampleFactorMaxGradGrad])
......
Diff collapsed.
Diff collapsed.
#section kernels
#kernel ave_pool2d_grad_kernel : size, size, size, size, size, size, size, *, *, size, size, size, size, size, size, size, size, * :
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
// Backward pass of 2D average/sum pooling.  One thread per input element:
// it accumulates into gx[index] the contribution of every pooled output
// cell whose window covers that element.
KERNEL void ave_pool2d_grad_kernel(const ga_size nthreads,
                                   const ga_size num, const ga_size channels, const ga_size height,
                                   const ga_size width, const ga_size pooled_height, const ga_size pooled_width,
                                   GLOBAL_MEM const DTYPE_i0 *x, GLOBAL_MEM const DTYPE_i1 *gz,
                                   const ga_size kernel_h, const ga_size kernel_w, const ga_size stride_h, const ga_size stride_w,
                                   const ga_size pad_h, const ga_size pad_w, const ga_bool inc_pad, const ga_bool sum_mode,
                                   GLOBAL_MEM DTYPE_o0 *gx)
{
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < nthreads; index += LDIM_0 * GDIM_0) {
    // decompose the flat index into (n, c, h, w)
    const ga_size w = index % width;
    const ga_size h = (index / width) % height;
    const ga_size c = (index / width / height) % channels;
    const ga_size n = (index / width / height / channels);
    // range of pooled cells whose window includes (h, w)
    const ga_size phstart = (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1;
    const ga_size phend = min((h + pad_h) / stride_h + 1, pooled_height);
    const ga_size pwstart = (w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1;
    const ga_size pwend = min((w + pad_w) / stride_w + 1, pooled_width);
    const ga_size offset = (n*channels + c) * pooled_height * pooled_width;
    const DTYPE_i1* gz_slice = gz + offset;
    DTYPE_o0 collector = 0;
    for (ga_size ph=phstart; ph < phend; ++ph) {
      for (ga_size pw=pwstart; pw < pwend; ++pw) {
        if (sum_mode) {
          // BUG FIX: read from gz_slice (the (n, c) slice) instead of gz,
          // which always addressed batch 0 / channel 0.
          collector += gz_slice[ph*pooled_width + pw];
        } else {
          // figure out the pooling window size to average over
          const ga_size hstart = ph * stride_h - pad_h;
          const ga_size wstart = pw * stride_w - pad_w;
          const ga_size hend = min(hstart + kernel_h, height + pad_h);
          const ga_size wend = min(wstart + kernel_w, width + pad_w);
          const ga_size pool_size = (hend - hstart) * (wend - wstart);
          collector += gz_slice[ph*pooled_width + pw] / pool_size;
        }
      }
    }
    gx[index] = collector;
  }
}
#kernel ave_pool3d_grad_kernel : size, size, size, size, size, size, size, size, size, *, *, size, size, size, size, size, size, size, size, size, size, size, * :
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
// Backward pass of 3D average/sum pooling.  One thread per input element:
// it accumulates into gx[index] the contribution of every pooled output
// cell whose window covers that element.
KERNEL void ave_pool3d_grad_kernel(const ga_size nthreads,
                                   const ga_size num, const ga_size channels, const ga_size depth,
                                   const ga_size height, const ga_size width, const ga_size pooled_depth,
                                   const ga_size pooled_height, const ga_size pooled_width,
                                   GLOBAL_MEM const DTYPE_i0 *x, GLOBAL_MEM const DTYPE_i1 *gz,
                                   const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w,
                                   const ga_size stride_d, const ga_size stride_h, const ga_size stride_w,
                                   const ga_size pad_d, const ga_size pad_h, const ga_size pad_w,
                                   const ga_bool inc_pad, const ga_bool sum_mode, GLOBAL_MEM DTYPE_o0 *gx)
{
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < nthreads; index += LDIM_0 * GDIM_0) {
    // decompose the flat index into (n, c, d, h, w)
    const ga_size w = index % width;
    const ga_size h = (index / width) % height;
    const ga_size d = (index / width / height) % depth;
    const ga_size c = (index / width / height / depth) % channels;
    const ga_size n = (index / width / height / depth / channels);
    // range of pooled cells whose window includes (d, h, w)
    const ga_size pdstart = (d + pad_d < kernel_d) ? 0 : (d + pad_d - kernel_d) / stride_d + 1;
    const ga_size pdend = min((d + pad_d) / stride_d + 1, pooled_depth);
    const ga_size phstart = (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1;
    const ga_size phend = min((h + pad_h) / stride_h + 1, pooled_height);
    const ga_size pwstart = (w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1;
    const ga_size pwend = min((w + pad_w) / stride_w + 1, pooled_width);
    const ga_size offset = (n*channels + c) * pooled_depth * pooled_height * pooled_width;
    const DTYPE_i1* gz_slice = gz + offset;
    DTYPE_o0 collector = 0;
    for (ga_size pd=pdstart; pd < pdend; ++pd) {
      for (ga_size ph=phstart; ph < phend; ++ph) {
        for (ga_size pw=pwstart; pw < pwend; ++pw) {
          // BUG FIX: both branches previously read gz[ph*pooled_width + pw],
          // ignoring the (n, c) offset AND the depth coordinate pd.  Index
          // the (n, c) slice at (pd, ph, pw) like max_pool3d_grad_kernel.
          if (sum_mode) {
            collector += gz_slice[(pd * pooled_height + ph) * pooled_width + pw];
          } else {
            // figure out the pooling window size to average over
            const ga_size dstart = pd * stride_d - pad_d;
            const ga_size hstart = ph * stride_h - pad_h;
            const ga_size wstart = pw * stride_w - pad_w;
            const ga_size dend = min(dstart + kernel_d, depth + pad_d);
            const ga_size hend = min(hstart + kernel_h, height + pad_h);
            const ga_size wend = min(wstart + kernel_w, width + pad_w);
            const ga_size pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
            collector += gz_slice[(pd * pooled_height + ph) * pooled_width + pw] / pool_size;
          }
        }
      }
    }
    gx[index] = collector;
  }
}
#section support_code_struct
// Host-side driver for GpuAveragePoolGrad: validates inputs, allocates
// gx with the shape of x, and launches the 2D or 3D gradient kernel.
// Returns 0 on success, 1 on error (with a Python exception set).
int APPLY_SPECIFIC(ave_pool_grad)(PyGpuArrayObject *x,
                                  PyGpuArrayObject *gz,
                                  PyArrayObject *ws,
                                  PyArrayObject *stride,
                                  PyArrayObject *pad,
                                  PyGpuArrayObject **gx,
                                  PyGpuContextObject *ctx) {
  if (!GpuArray_IS_C_CONTIGUOUS(&x->ga)
      || !GpuArray_IS_C_CONTIGUOUS(&gz->ga))
    {
      // BUG FIX: error messages previously named "GpuMaxPoolGrad"
      // (copy-paste from the max-pool driver).
      PyErr_Format(PyExc_ValueError,
                   "GpuAveragePoolGrad: requires data to be C-contiguous");
      return 1;
    }
  size_t ndims = PyArray_DIM(ws, 0);
  if (PyGpuArray_NDIM(x) != ndims + 2
      || PyGpuArray_NDIM(gz) != ndims + 2)
    {
      PyErr_SetString(PyExc_ValueError, "GpuAveragePoolGrad: rank error");
      return 1;
    }
  if (theano_prep_output(gx, PyGpuArray_NDIM(x), PyGpuArray_DIMS(x),
                         x->ga.typecode, GA_C_ORDER, ctx) != 0)
    {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuAveragePoolGrad: failed to allocate memory");
      return 1;
    }
  {
    // scope for running kernel
    size_t w[3];
    size_t s[3];
    size_t p[3];
    // copy window size / stride / padding from the numpy vectors
    for(int i = 0; i < ndims; i++) {
      w[i] = *((npy_intp*)PyArray_GETPTR1(ws, i));
      s[i] = *((npy_intp*)PyArray_GETPTR1(stride, i));
      p[i] = *((npy_intp*)PyArray_GETPTR1(pad, i));
    }
    int err;
    const size_t* z_dims = PyGpuArray_DIMS(gz);
    const size_t* x_dims = PyGpuArray_DIMS(x);
    if (ndims == 2) {
      // one thread per element of the input gradient gx
      size_t num_kernels = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3];
      err = ave_pool2d_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
                                         x_dims[0], x_dims[1], x_dims[2], x_dims[3],
                                         z_dims[2], z_dims[3],
                                         x->ga.data, gz->ga.data,
                                         w[0], w[1], s[0], s[1], p[0], p[1],
                                         INC_PAD, SUM_MODE, (*gx)->ga.data);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuAveragePoolGrad: ave_pool2d_grad_kernel %s.",
                     GpuKernel_error(&k_ave_pool2d_grad_kernel, err));
        return 1;
      }
    } else if (ndims == 3) {
      size_t num_kernels = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3] * x_dims[4];
      err = ave_pool3d_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
                                         x_dims[0], x_dims[1], x_dims[2], x_dims[3], x_dims[4],
                                         z_dims[2], z_dims[3], z_dims[4],
                                         x->ga.data, gz->ga.data,
                                         w[0], w[1], w[2], s[0], s[1], s[2],
                                         p[0], p[1], p[2], INC_PAD, SUM_MODE,
                                         (*gx)->ga.data);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuAveragePoolGrad: ave_pool3d_grad_kernel %s.",
                     GpuKernel_error(&k_ave_pool3d_grad_kernel, err));
        return 1;
      }
    }
  }
  return 0;
}
......@@ -18,11 +18,11 @@ KERNEL void max_pool2d_grad_grad_kernel(const ga_size nthreads,
const ga_size c = (index / pooled_width / pooled_height) % channels;
const ga_size n = (index / pooled_width / pooled_height / channels);
ga_int hstart = static_cast<ga_int>(ph*stride_h) - static_cast<ga_int>(pad_h);
hstart = max(hstart, 0);
const ga_size hend = min(hstart + kernel_h, height);
ga_int wstart = static_cast<ga_int>(pw*stride_w) - static_cast<ga_int>(pad_w);
wstart = max(wstart, 0);
const ga_size wend = min(wstart + kernel_w, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
const ga_size offset = (n*channels + c) * height * width;
......@@ -63,14 +63,14 @@ KERNEL void max_pool3d_grad_grad_kernel(const ga_size nthreads,
const ga_size c = (index / pooled_width / pooled_height / pooled_depth) % channels;
const ga_size n = (index / pooled_width / pooled_height / pooled_depth / channels);
ga_int dstart = static_cast<ga_int>(pd*stride_d) - static_cast<ga_int>(pad_d);
dstart = max(dstart, 0);
const ga_size dend = min(dstart + kernel_d, depth);
ga_int hstart = static_cast<ga_int>(ph*stride_h) - static_cast<ga_int>(pad_h);
hstart = max(hstart, 0);
const ga_size hend = min(hstart + kernel_h, height);
ga_int wstart = static_cast<ga_int>(pw*stride_w) - static_cast<ga_int>(pad_w);
wstart = max(wstart, 0);
const ga_size wend = min(wstart + kernel_w, width);
dstart = max(dstart, 0);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
const ga_size offset = (n*channels + c) * depth * height * width;
......@@ -137,24 +137,13 @@ int APPLY_SPECIFIC(pool_grad_grad)(PyGpuArrayObject *x,
p[i] = *((npy_intp*)PyArray_GETPTR1(pad, i));
}
size_t max_threads_dim;
int err;
const size_t* z_dims = PyGpuArray_DIMS(z);
const size_t* x_dims = PyGpuArray_DIMS(x);
// Get the max threads per blocks
err = gpucontext_property(ctx->ctx, GA_CTX_PROP_MAXLSIZE0, &max_threads_dim);
if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims");
return 1;
}
size_t threads_per_block = max_threads_dim;
if (ndims == 2) {
size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3];
size_t n_blocks = (num_kernels + threads_per_block - 1) / threads_per_block;
err = max_pool2d_grad_grad_kernel_call(1, &n_blocks, &threads_per_block, 0,
num_kernels,
err = max_pool2d_grad_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
z_dims[0], z_dims[1], z_dims[2], z_dims[3],
x_dims[2], x_dims[3],
x->ga.data, z->ga.data, gx->ga.data,
......@@ -169,9 +158,7 @@ int APPLY_SPECIFIC(pool_grad_grad)(PyGpuArrayObject *x,
}
else if (ndims == 3) {
size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3] * z_dims[4];
size_t n_blocks = (num_kernels + threads_per_block - 1) / threads_per_block;
err = max_pool3d_grad_grad_kernel_call(1, &n_blocks, &threads_per_block, 0,
num_kernels,
err = max_pool3d_grad_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4],
x_dims[2], x_dims[3], x_dims[4],
x->ga.data, z->ga.data, gx->ga.data,
......
#section kernels
#kernel max_pool2d_grad_kernel : size, size, size, size, size, size, size, *, *, *, size, size, size, size, size, size, * :
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
// Backward pass of 2D max pooling.  One thread per input element: the
// incoming gradient of a pooled cell is routed to the input element(s)
// whose value equals the pooled maximum.
KERNEL void max_pool2d_grad_kernel(const ga_size nthreads,
                                   const ga_size num, const ga_size channels, const ga_size height,
                                   const ga_size width, const ga_size pooled_height, const ga_size pooled_width,
                                   GLOBAL_MEM const DTYPE_i0 *x, GLOBAL_MEM const DTYPE_i1 *z, GLOBAL_MEM const DTYPE_i2 *gz,
                                   const ga_size kernel_h, const ga_size kernel_w, const ga_size stride_h, const ga_size stride_w,
                                   const ga_size pad_h, const ga_size pad_w, GLOBAL_MEM DTYPE_o0 *gx)
{
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < nthreads; index += LDIM_0 * GDIM_0) {
    // decompose the flat index into (n, c, h, w)
    const ga_size w = index % width;
    const ga_size h = (index / width) % height;
    const ga_size c = (index / width / height) % channels;
    const ga_size n = (index / width / height / channels);
    // range of pooled cells whose window includes (h, w)
    const ga_size phstart = (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1;
    const ga_size phend = min((h + pad_h) / stride_h + 1, pooled_height);
    const ga_size pwstart = (w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1;
    const ga_size pwend = min((w + pad_w) / stride_w + 1, pooled_width);
    // start of the (n, c) slice of the pooled output / its gradient
    const ga_size offset = (n*channels + c) * pooled_height * pooled_width;
    const DTYPE_i1* z_slice = z + offset;
    const DTYPE_i2* gz_slice = gz + offset;
    DTYPE_o0 gradient = 0;
    for (ga_size ph=phstart; ph < phend; ++ph) {
      for (ga_size pw=pwstart; pw < pwend; ++pw) {
        // this input element was the max of that window: take the gradient
        if (x[index] == z_slice[ph * pooled_width + pw]) {
          gradient += gz_slice[ph * pooled_width + pw];
        }
      }
    }
    gx[index] = gradient;
  }
}
#kernel max_pool3d_grad_kernel : size, size, size, size, size, size, size, size, size, *, *, *, size, size, size, size, size, size, size, size, size, * :
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
// Backward pass of 3D max pooling.  One thread per input element: the
// incoming gradient of a pooled cell is routed to the input element(s)
// whose value equals the pooled maximum.
KERNEL void max_pool3d_grad_kernel(const ga_size nthreads,
                                   const ga_size num, const ga_size channels, const ga_size depth,
                                   const ga_size height, const ga_size width, const ga_size pooled_depth,
                                   const ga_size pooled_height, const ga_size pooled_width,
                                   GLOBAL_MEM const DTYPE_i0 *x, GLOBAL_MEM const DTYPE_i1 *z, GLOBAL_MEM const DTYPE_i2 *gz,
                                   const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w,
                                   const ga_size stride_d, const ga_size stride_h, const ga_size stride_w,
                                   const ga_size pad_d, const ga_size pad_h, const ga_size pad_w,
                                   GLOBAL_MEM DTYPE_o0 *gx)
{
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < nthreads; index += LDIM_0 * GDIM_0) {
    // decompose the flat index into (n, c, d, h, w)
    const ga_size w = index % width;
    const ga_size h = (index / width) % height;
    const ga_size d = (index / width / height) % depth;
    const ga_size c = (index / width / height / depth) % channels;
    const ga_size n = (index / width / height / depth / channels);
    // range of pooled cells whose window includes (d, h, w)
    const ga_size pdstart = (d + pad_d < kernel_d) ? 0 : (d + pad_d - kernel_d) / stride_d + 1;
    const ga_size pdend = min((d + pad_d) / stride_d + 1, pooled_depth);
    const ga_size phstart = (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1;
    const ga_size phend = min((h + pad_h) / stride_h + 1, pooled_height);
    const ga_size pwstart = (w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1;
    const ga_size pwend = min((w + pad_w) / stride_w + 1, pooled_width);
    // start of the (n, c) slice of the pooled output / its gradient
    const ga_size offset = (n*channels + c) * pooled_depth * pooled_height * pooled_width;
    const DTYPE_i1* z_slice = z + offset;
    const DTYPE_i2* gz_slice = gz + offset;
    DTYPE_o0 gradient = 0;
    for (ga_size pd=pdstart; pd < pdend; ++pd) {
      for (ga_size ph=phstart; ph < phend; ++ph) {
        for (ga_size pw=pwstart; pw < pwend; ++pw) {
          // this input element was the max of that window: take the gradient
          if (x[index] == z_slice[(pd * pooled_height + ph) * pooled_width + pw]) {
            gradient += gz_slice[(pd * pooled_height + ph) * pooled_width + pw];
          }
        }
      }
    }
    gx[index] = gradient;
  }
}
#section support_code_struct
// Host-side driver for GpuMaxPoolGrad: validates inputs, allocates gx
// with the shape of x, and launches the 2D or 3D gradient kernel.
// Returns 0 on success, 1 on error (with a Python exception set).
int APPLY_SPECIFIC(max_pool_grad)(PyGpuArrayObject *x,
                                  PyGpuArrayObject *z,
                                  PyGpuArrayObject *gz,
                                  PyArrayObject *ws,
                                  PyArrayObject *stride,
                                  PyArrayObject *pad,
                                  PyGpuArrayObject **gx,
                                  PyGpuContextObject *ctx) {
  if (!GpuArray_IS_C_CONTIGUOUS(&x->ga)
      || !GpuArray_IS_C_CONTIGUOUS(&z->ga)
      || !GpuArray_IS_C_CONTIGUOUS(&gz->ga))
    {
      PyErr_Format(PyExc_ValueError,
                   "GpuMaxPoolGrad: requires data to be C-contiguous");
      return 1;
    }
  size_t ndims = PyArray_DIM(ws, 0);
  if (PyGpuArray_NDIM(x) != ndims + 2
      || PyGpuArray_NDIM(z) != ndims + 2
      || PyGpuArray_NDIM(gz) != ndims + 2)
    {
      PyErr_SetString(PyExc_ValueError, "GpuMaxPoolGrad: rank error");
      return 1;
    }
  if (theano_prep_output(gx, PyGpuArray_NDIM(x), PyGpuArray_DIMS(x),
                         x->ga.typecode, GA_C_ORDER, ctx) != 0)
    {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMaxPoolGrad: failed to allocate memory");
      return 1;
    }
  {
    // scope for running kernel
    size_t w[3];
    size_t s[3];
    size_t p[3];
    // copy window size / stride / padding from the numpy vectors
    for(int i = 0; i < ndims; i++) {
      w[i] = *((npy_intp*)PyArray_GETPTR1(ws, i));
      s[i] = *((npy_intp*)PyArray_GETPTR1(stride, i));
      p[i] = *((npy_intp*)PyArray_GETPTR1(pad, i));
    }
    int err;
    const size_t* z_dims = PyGpuArray_DIMS(z);
    const size_t* x_dims = PyGpuArray_DIMS(x);
    if (ndims == 2) {
      // one thread per element of the input gradient gx
      size_t num_kernels = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3];
      err = max_pool2d_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
                                         x_dims[0], x_dims[1], x_dims[2], x_dims[3],
                                         z_dims[2], z_dims[3],
                                         x->ga.data, z->ga.data, gz->ga.data,
                                         w[0], w[1], s[0], s[1], p[0], p[1],
                                         (*gx)->ga.data);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuMaxPoolGrad: max_pool2d_grad_kernel %s.",
                     GpuKernel_error(&k_max_pool2d_grad_kernel, err));
        return 1;
      }
    } else if (ndims == 3) {
      size_t num_kernels = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3] * x_dims[4];
      err = max_pool3d_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
                                         x_dims[0], x_dims[1], x_dims[2], x_dims[3], x_dims[4],
                                         z_dims[2], z_dims[3], z_dims[4],
                                         x->ga.data, z->ga.data, gz->ga.data,
                                         w[0], w[1], w[2], s[0], s[1], s[2],
                                         p[0], p[1], p[2], (*gx)->ga.data);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuMaxPoolGrad: max_pool3d_grad_kernel %s.",
                     GpuKernel_error(&k_max_pool3d_grad_kernel, err));
        return 1;
      }
    }
  }
  return 0;
}
......@@ -2,26 +2,21 @@ from __future__ import absolute_import, print_function, division
from unittest import TestCase
from nose.plugins.skip import SkipTest
import itertools
import copy
import numpy
import theano
from theano import gradient
from theano import tensor
from theano.tests import unittest_tools as utt
from theano.tensor.blas import gemv_inplace, gemm_inplace, _dot22, batched_dot
from theano.tensor.tests.test_blas import TestGer, BaseGemv
from theano.tensor.signal.pool import Pool, DownsampleFactorMaxGradGrad
from .. import gpuarray_shared_constructor
from .config import mode_with_gpu, mode_without_gpu
from .config import mode_with_gpu
from .test_basic_ops import makeTester, rand
from ..blas import (gpugemv_inplace, gpugemv_no_inplace,
gpugemm_inplace, gpugemmbatch_no_inplace,
gpuger_inplace, gpuger_no_inplace,
GpuGer, gpu_dot22, GpuDownsampleFactorMaxGradGrad)
GpuGer, gpu_dot22)
GpuGemvTester = makeTester(
......@@ -133,129 +128,3 @@ GpuDot22Tester = makeTester(
# test9=[rand(0, 0), rand(0, 0)],
)
)
def test_max_pool2d_grad_grad():
    """Check GpuDownsampleFactorMaxGradGrad against the CPU
    DownsampleFactorMaxGradGrad over many shapes, window sizes,
    strides and the two (ignore_border, pad) combinations tested.
    """
    shps = [(1, 12),
            (1, 1, 12),
            (1, 1, 1, 12),
            (1, 1, 2, 2),
            (1, 1, 1, 1),
            (1, 1, 4, 4),
            (1, 1, 10, 11),
            (1, 2, 2, 2),
            (3, 5, 4, 4),
            (25, 1, 7, 7),
            (1, 1, 12, 12),
            (1, 1, 2, 14),
            (1, 1, 12, 14),
            (1, 1, 14, 14),
            (1, 1, 16, 16),
            (1, 1, 18, 18),
            (1, 1, 24, 24),
            (1, 6, 24, 24),
            (10, 1, 24, 24),
            (10, 6, 24, 24),
            (30, 6, 12, 12),
            (30, 2, 24, 24),
            (30, 6, 24, 24),
            (10, 10, 10, 11),
            (1, 1, 10, 1025),
            (1, 1, 10, 1023),
            (1, 1, 1025, 10),
            (1, 1, 1023, 10), ]
    # Shuffle deterministically so runs cover shapes in varied order.
    numpy.random.RandomState(utt.fetch_seed()).shuffle(shps)
    test_ds = (2, 2), (3, 2), (1, 1)
    test_st = (2, 2), (3, 2), (1, 1)
    for shp in shps:
        for ds, st in itertools.product(test_ds, test_st):
            if ds[0] > shp[-2] or ds[1] > shp[-1]:
                continue
            # Only two combinations: (ignore_border=True, pad=(1, 1)) and
            # (ignore_border=False, pad=(0, 0)).
            for ignore_border, pad in zip((True, False), [(1, 1), (0, 0)]):
                if pad[0] >= ds[0] or pad[1] >= ds[1]:
                    continue
                # print('test_downsample', shp, ds, st, pad, ignore_border)
                ds_op = Pool(ndim=len(ds), ignore_border=ignore_border)
                a = theano.shared(rand(*shp), 'a')
                # Lop of the pooling gradient: exercises the grad-grad op.
                ggf = gradient.Lop(tensor.grad((ds_op(
                    tensor.as_tensor_variable(a), ds, st, pad)**2).sum(), a), a, a)
                ref_mode = copy.copy(mode_without_gpu)
                ref_mode.check_py_code = False
                gpu_mode = copy.copy(mode_with_gpu)
                gpu_mode.check_py_code = False
                gg = theano.function([], ggf, mode=gpu_mode)
                gg2 = theano.function([], ggf, mode=ref_mode)
                # The GPU graph must actually use the GPU op, and the
                # reference graph the CPU op.
                assert any([
                    isinstance(node.op, GpuDownsampleFactorMaxGradGrad)
                    for node in gg.maker.fgraph.toposort()
                ])
                assert any([
                    isinstance(node.op, DownsampleFactorMaxGradGrad)
                    for node in gg2.maker.fgraph.toposort()
                ])
                assert numpy.allclose(gg(), gg2()), (shp, ds, st,
                                                     ignore_border)
def test_max_pool3d_grad_grad():
    """Same as test_max_pool2d_grad_grad but for 3D (5D-tensor) pooling."""
    shps = [(1, 1, 12),
            (1, 1, 1, 1, 1),
            (1, 1, 1, 1, 1025),
            (1, 1, 2, 2, 2),
            (1, 1, 7, 7, 7),
            (1, 1, 9, 10, 11),
            (1, 6, 18, 18, 18),
            (1, 1, 6, 24, 24),
            (1, 10, 1, 24, 24),
            (1, 10, 6, 24, 24),
            (1, 30, 6, 12, 12),
            (1, 30, 2, 24, 24),
            (1, 30, 6, 24, 24),
            (1, 10, 10, 10, 11),
            (1, 1, 10, 10, 1025),
            (1, 1, 10, 10, 1023),
            (1, 1, 10, 1025, 10),
            (1, 1, 10, 1023, 10), ]
    # Shuffle deterministically so runs cover shapes in varied order.
    numpy.random.RandomState(utt.fetch_seed()).shuffle(shps)
    test_ds = (2, 2, 2), (3, 2, 3), (1, 1, 1)
    test_st = (2, 2, 2), (2, 3, 2), (1, 1, 1)
    for shp in shps:
        for ds, st in itertools.product(test_ds, test_st):
            if ds[0] > shp[-3] or ds[1] > shp[-2] or ds[2] > shp[-1]:
                continue
            # Only two combinations: (ignore_border=True, pad=(1, 1, 1))
            # and (ignore_border=False, pad=(0, 0, 0)).
            for ignore_border, pad in zip((True, False), [(1, 1, 1), (0, 0, 0)]):
                if pad[0] >= ds[0] or pad[1] >= ds[1] or pad[2] >= ds[2]:
                    continue
                # print('test_downsample', shp, ds, st, pad, ignore_border)
                ds_op = Pool(ndim=len(ds), ignore_border=ignore_border)
                a = theano.shared(rand(*shp), 'a')
                # Lop of the pooling gradient: exercises the grad-grad op.
                ggf = gradient.Lop(tensor.grad((ds_op(
                    tensor.as_tensor_variable(a), ds, st, pad)**2).sum(), a), a, a)
                ref_mode = copy.copy(mode_without_gpu)
                ref_mode.check_py_code = False
                gpu_mode = copy.copy(mode_with_gpu)
                gpu_mode.check_py_code = False
                gg = theano.function([], ggf, mode=gpu_mode)
                gg2 = theano.function([], ggf, mode=ref_mode)
                # The GPU graph must actually use the GPU op, and the
                # reference graph the CPU op.
                assert any([
                    isinstance(node.op, GpuDownsampleFactorMaxGradGrad)
                    for node in gg.maker.fgraph.toposort()
                ])
                assert any([
                    isinstance(node.op, DownsampleFactorMaxGradGrad)
                    for node in gg2.maker.fgraph.toposort()
                ])
                assert numpy.allclose(gg(), gg2()), (shp, ds, st,
                                                     ignore_border)
from __future__ import absolute_import, print_function, division
import unittest
import copy
import itertools
import numpy
import theano
from theano import gradient
from theano import tensor
from theano.tensor.signal.pool import (Pool, MaxPoolGrad, AveragePoolGrad,
DownsampleFactorMaxGradGrad)
from theano.tests import unittest_tools as utt
from .config import mode_with_gpu, mode_without_gpu
from .test_basic_ops import rand
from ..pool import (GpuPool, GpuMaxPoolGrad, GpuAveragePoolGrad,
GpuDownsampleFactorMaxGradGrad)
class TestPool(unittest.TestCase):
    """Parameter-validation tests for the GpuPool op."""

    def test_pool_py_interface(self):
        """Invalid ws/pad combinations must raise when the op is applied."""
        x = theano.shared(rand(2, 2, 2, 2), 'a')
        x = tensor.as_tensor_variable(x)
        with self.assertRaises(ValueError):
            # pad >= ws is rejected even with ignore_border=True
            pool_op = GpuPool(ignore_border=True, ndim=2)
            pool_op(x, [2, 2], pad=[3, 3])
        with self.assertRaises(ValueError):
            # non-zero pad is rejected when ignore_border=False
            pool_op = GpuPool(ignore_border=False, ndim=2)
            pool_op(x, [2, 2], pad=[1, 1])

    def test_pool_c_interface(self):
        """Same validation with a symbolic pad, checked via the C code path."""
        run_mode = copy.copy(mode_with_gpu).excluding("cudnn")
        run_mode.check_py_code = False
        x = theano.shared(rand(2, 2, 2, 2), 'a')
        x = tensor.as_tensor_variable(x)
        with self.assertRaises(ValueError):
            # non-zero pad is rejected when ignore_border=False
            pool_op = GpuPool(ignore_border=False, ndim=2)
            sym_pad = tensor.as_tensor_variable([1, 1])
            fn = theano.function([], pool_op(x, [2, 2], pad=sym_pad),
                                 mode=run_mode)
            fn()
def test_pool2d():
    """Compare the 2D gpuarray pooling ops with the CPU reference ops.

    For every shape/mode/window/stride/padding combination this checks
    the forward pass (GpuPool vs. Pool), the gradient (GpuMaxPoolGrad or
    GpuAveragePoolGrad vs. their CPU counterparts) and, for max pooling
    only, the gradient of the gradient (GpuDownsampleFactorMaxGradGrad
    vs. DownsampleFactorMaxGradGrad).
    """
    def has_op(fn, op_cls):
        # True when the compiled function's graph contains op_cls.
        return any(isinstance(node.op, op_cls)
                   for node in fn.maker.fgraph.toposort())

    shps = [(1, 12),
            (1, 1, 12),
            (1, 1, 1, 12),
            (1, 1, 2, 2),
            (1, 1, 1, 1),
            (1, 1, 4, 4),
            (1, 1, 10, 11),
            (1, 2, 2, 2),
            (3, 5, 4, 4),
            (25, 1, 7, 7),
            (1, 1, 12, 12),
            (1, 1, 2, 14),
            (1, 1, 12, 14),
            (1, 1, 14, 14),
            (1, 1, 16, 16),
            (1, 1, 18, 18),
            (1, 1, 24, 24),
            (1, 6, 24, 24),
            (10, 1, 24, 24),
            (10, 6, 24, 24),
            (30, 6, 12, 12),
            (30, 2, 24, 24),
            (30, 6, 24, 24),
            (10, 10, 10, 11),
            (1, 1, 10, 1025),
            (1, 1, 10, 1023),
            (1, 1, 1025, 10),
            (1, 1, 1023, 10),
            (3, 2, 16, 16, 16),
            (3, 2, 6, 6, 6, 5),
            (3, 2, 6, 6, 6, 5, 7), ]
    # Seeded shuffle: test order varies per seed but stays reproducible.
    numpy.random.RandomState(utt.fetch_seed()).shuffle(shps)
    test_ws = (2, 2), (3, 2), (1, 1)
    test_st = (2, 2), (3, 2), (1, 1)
    test_mode = ['max', 'sum', 'average_inc_pad', 'average_exc_pad']
    ref_mode = copy.copy(mode_without_gpu)
    ref_mode.check_py_code = False
    gpu_mode = copy.copy(mode_with_gpu).excluding("cudnn")
    gpu_mode.check_py_code = False
    for shp in shps:
        for mode, ws, st in itertools.product(test_mode, test_ws, test_st):
            # Windows larger than the pooled trailing dims are invalid.
            if ws[0] > shp[-2] or ws[1] > shp[-1]:
                continue
            for ignore_border, pad in zip((True, False), [(1, 1), (0, 0)]):
                # Padding must stay strictly below the window size.
                if pad[0] >= ws[0] or pad[1] >= ws[1]:
                    continue
                # 'average_exc_pad' does not support non-zero padding.
                if mode == 'average_exc_pad' and (pad[0] > 0 or pad[1] > 0):
                    continue
                # print('test_pool2d', shp, ws, st, pad, mode, ignore_border)
                ds_op = Pool(ndim=len(ws), mode=mode,
                             ignore_border=ignore_border)
                a = theano.shared(rand(*shp), 'a')
                a_pooled = ds_op(tensor.as_tensor_variable(a), ws, st, pad)

                # Forward pass.
                fwd_gpu = theano.function([], a_pooled, mode=gpu_mode)
                fwd_ref = theano.function([], a_pooled, mode=ref_mode)
                assert has_op(fwd_gpu, GpuPool)
                assert has_op(fwd_ref, Pool)
                assert numpy.allclose(fwd_gpu(), fwd_ref()), (
                    shp, ws, st, pad, mode, ignore_border)

                # Gradient.
                a_pooled_grad = tensor.grad(a_pooled.sum(), a)
                grad_gpu = theano.function([], a_pooled_grad, mode=gpu_mode)
                grad_ref = theano.function([], a_pooled_grad, mode=ref_mode)
                if mode == 'max':
                    grad_op_gpu, grad_op_ref = GpuMaxPoolGrad, MaxPoolGrad
                else:
                    grad_op_gpu, grad_op_ref = (GpuAveragePoolGrad,
                                                AveragePoolGrad)
                assert has_op(grad_gpu, grad_op_gpu)
                assert has_op(grad_ref, grad_op_ref)
                assert numpy.allclose(grad_gpu(), grad_ref()), (
                    shp, ws, st, pad, mode, ignore_border)

                # Grad-of-grad has a dedicated op only for max pooling;
                # for average pooling it is just the average pooling grad.
                if mode != 'max':
                    continue
                ggf = gradient.Lop(tensor.grad((a_pooled**2).sum(), a), a, a)
                gg_gpu = theano.function([], ggf, mode=gpu_mode)
                gg_ref = theano.function([], ggf, mode=ref_mode)
                assert has_op(gg_gpu, GpuDownsampleFactorMaxGradGrad)
                assert has_op(gg_ref, DownsampleFactorMaxGradGrad)
                assert numpy.allclose(gg_gpu(), gg_ref()), (
                    shp, ws, st, pad, mode, ignore_border)
def test_pool3d():
    """Compare the 3D gpuarray pooling ops with the CPU reference ops.

    Same structure as ``test_pool2d`` but with 3D pooling windows:
    forward pass, gradient and (for max pooling) gradient-of-gradient
    are each compiled with and without the GPU and compared.
    """
    def has_op(fn, op_cls):
        # True when the compiled function's graph contains op_cls.
        return any(isinstance(node.op, op_cls)
                   for node in fn.maker.fgraph.toposort())

    shps = [(1, 1, 12),
            (1, 1, 1, 1, 1),
            (1, 1, 1, 1, 1025),
            (1, 1, 2, 2, 2),
            (1, 1, 7, 7, 7),
            (1, 1, 9, 10, 11),
            (1, 6, 18, 18, 18),
            (1, 1, 6, 24, 24),
            (1, 10, 1, 24, 24),
            (1, 10, 6, 24, 24),
            (1, 30, 6, 12, 12),
            (1, 30, 2, 24, 24),
            (1, 30, 6, 24, 24),
            (1, 10, 10, 10, 11),
            (1, 1, 10, 10, 1025),
            (1, 1, 10, 10, 1023),
            (1, 1, 10, 1025, 10),
            (1, 1, 10, 1023, 10),
            (3, 2, 6, 6, 6, 5),
            (3, 2, 6, 6, 6, 5, 7), ]
    # Seeded shuffle: test order varies per seed but stays reproducible.
    numpy.random.RandomState(utt.fetch_seed()).shuffle(shps)
    test_ws = (2, 2, 2), (3, 2, 3), (1, 1, 1)
    test_st = (2, 2, 2), (2, 3, 2), (1, 1, 1)
    test_mode = ['max', 'sum', 'average_inc_pad', 'average_exc_pad']
    ref_mode = copy.copy(mode_without_gpu)
    ref_mode.check_py_code = False
    gpu_mode = copy.copy(mode_with_gpu).excluding("cudnn")
    gpu_mode.check_py_code = False
    for shp in shps:
        for mode, ws, st in itertools.product(test_mode, test_ws, test_st):
            # Windows larger than the pooled trailing dims are invalid.
            if ws[0] > shp[-3] or ws[1] > shp[-2] or ws[2] > shp[-1]:
                continue
            for ignore_border, pad in zip((True, False), [(1, 1, 1), (0, 0, 0)]):
                # Padding must stay strictly below the window size.
                if pad[0] >= ws[0] or pad[1] >= ws[1] or pad[2] >= ws[2]:
                    continue
                # 'average_exc_pad' does not support non-zero padding.
                if mode == 'average_exc_pad' and (pad[0] > 0 or
                                                  pad[1] > 0 or
                                                  pad[2] > 0):
                    continue
                # print('test_pool3d', shp, ws, st, pad, mode, ignore_border)
                ds_op = Pool(ndim=len(ws), mode=mode,
                             ignore_border=ignore_border)
                a = theano.shared(rand(*shp), 'a')
                a_pooled = ds_op(tensor.as_tensor_variable(a), ws, st, pad)

                # Forward pass.
                fwd_gpu = theano.function([], a_pooled, mode=gpu_mode)
                fwd_ref = theano.function([], a_pooled, mode=ref_mode)
                assert has_op(fwd_gpu, GpuPool)
                assert has_op(fwd_ref, Pool)
                assert numpy.allclose(fwd_gpu(), fwd_ref()), (
                    shp, ws, st, pad, mode, ignore_border)

                # Gradient.
                a_pooled_grad = tensor.grad(a_pooled.sum(), a)
                grad_gpu = theano.function([], a_pooled_grad, mode=gpu_mode)
                grad_ref = theano.function([], a_pooled_grad, mode=ref_mode)
                if mode == 'max':
                    grad_op_gpu, grad_op_ref = GpuMaxPoolGrad, MaxPoolGrad
                else:
                    grad_op_gpu, grad_op_ref = (GpuAveragePoolGrad,
                                                AveragePoolGrad)
                assert has_op(grad_gpu, grad_op_gpu)
                assert has_op(grad_ref, grad_op_ref)
                assert numpy.allclose(grad_gpu(), grad_ref()), (
                    shp, ws, st, pad, mode, ignore_border)

                # Grad-of-grad has a dedicated op only for max pooling;
                # for average pooling it is just the average pooling grad.
                if mode != 'max':
                    continue
                ggf = gradient.Lop(tensor.grad((a_pooled**2).sum(), a), a, a)
                gg_gpu = theano.function([], ggf, mode=gpu_mode)
                gg_ref = theano.function([], ggf, mode=ref_mode)
                assert has_op(gg_gpu, GpuDownsampleFactorMaxGradGrad)
                assert has_op(gg_ref, DownsampleFactorMaxGradGrad)
                assert numpy.allclose(gg_gpu(), gg_ref()), (
                    shp, ws, st, pad, mode, ignore_border)
# NOTE: stray web-page footer text (non-code residue from the HTML export
# of the code-review page) was removed here; it was not part of the module.