提交 9411992c authored 作者: Alexander Matyasko's avatar Alexander Matyasko

Add Max3dPoolGradGrad and padding

Add Max3dPoolGradGrad, which accepts 5d input and pools over the last 3 dimensions. Also add a padding parameter; make_node now accepts tensor variables, which removes the necessity to check for constant args. Local optimizations are updated accordingly and now work for both 2d and 3d pooling.
上级 a43a8425
...@@ -1542,14 +1542,16 @@ class GpuDownsampleFactorMaxGradGrad(CGpuKernelBase): ...@@ -1542,14 +1542,16 @@ class GpuDownsampleFactorMaxGradGrad(CGpuKernelBase):
Implement the grad of downsample with max on the gpu. Implement the grad of downsample with max on the gpu.
""" """
__props__ = ('ds', 'st', 'ignore_border') __props__ = ('ignore_border', 'mode', 'ndim')
def __init__(self, ds, st=None, ignore_border=False): def __init__(self, ignore_border, mode='max', ndim=2):
self.ds = tuple(ds) self.ndim = ndim
self.st = self.ds if st is None else tuple(st)
self.ignore_border = ignore_border self.ignore_border = ignore_border
self.mode = mode
CGpuKernelBase.__init__(self, ['pool_grad_grad.c'], CGpuKernelBase.__init__(self, ['pool_grad_grad.c'],
'APPLY_SPECIFIC(pool_grad_grad)') 'APPLY_SPECIFIC(pool_grad_grad)')
assert self.mode == 'max'
assert self.ndim in [2, 3]
def c_headers(self): def c_headers(self):
return ['gpuarray/types.h', 'gpuarray/array.h', 'gpuarray/kernel.h', return ['gpuarray/types.h', 'gpuarray/array.h', 'gpuarray/kernel.h',
...@@ -1559,31 +1561,29 @@ class GpuDownsampleFactorMaxGradGrad(CGpuKernelBase): ...@@ -1559,31 +1561,29 @@ class GpuDownsampleFactorMaxGradGrad(CGpuKernelBase):
def c_header_dirs(self): def c_header_dirs(self):
return [os.path.dirname(__file__), pygpu.get_include()] return [os.path.dirname(__file__), pygpu.get_include()]
def make_node(self, x, z, gx): def make_node(self, inp, out, out_grad, ws, stride, pad):
ctx_name = infer_context_name(x, z, gx) ctx_name = infer_context_name(inp, out, out_grad)
x = as_gpuarray_variable(x, ctx_name) inp = as_gpuarray_variable(inp, ctx_name)
z = as_gpuarray_variable(z, ctx_name) assert (inp.ndim in [4, 5])
gx = as_gpuarray_variable(gx, ctx_name) out = as_gpuarray_variable(out, ctx_name)
assert (out_grad.ndim in [4, 5])
out_grad = as_gpuarray_variable(out_grad, ctx_name)
assert(out.ndim in [4, 5])
assert (out_grad.ndim == inp.ndim)
assert (inp.ndim == out.ndim)
if x.type.ndim != 4: ws = as_tensor_variable(ws)
raise TypeError('x must be 4D tensor') stride = as_tensor_variable(stride)
if z.type.ndim != 4: pad = as_tensor_variable(pad)
raise TypeError('z must be 4D tensor') assert ws.type.ndim == stride.type.ndim and ws.type.ndim == pad.type.ndim
if gx.type.ndim != 4: assert ws.type.ndim == 1
raise TypeError('gx must be 4D tensor')
return Apply(self, [x, z, gx], [x.type()]) return Apply(self, [inp, out, out_grad, ws, stride, pad], [inp.type()])
def get_params(self, node): def get_params(self, node):
return node.inputs[0].type.context return node.inputs[0].type.context
def get_op_params(self):
    """Return the compile-time #define pairs describing the pooling geometry.

    DS0/DS1 are the pooling window sizes, ST0/ST1 the strides, and
    IGNORE_BORDER is the border-handling flag coerced to 0/1 for C.
    """
    ws_h, ws_w = self.ds
    st_h, st_w = self.st
    return [
        ('DS0', ws_h),
        ('DS1', ws_w),
        ('ST0', st_h),
        ('ST1', st_w),
        ('IGNORE_BORDER', int(self.ignore_border)),
    ]
@inplace_allocempty(GpuGemv, 0) @inplace_allocempty(GpuGemv, 0)
def local_inplace_gpuagemv(node, inputs): def local_inplace_gpuagemv(node, inputs):
......
...@@ -1595,28 +1595,26 @@ def local_gpua_lift_abstractconv_graph(op, context_name, inputs, outputs): ...@@ -1595,28 +1595,26 @@ def local_gpua_lift_abstractconv_graph(op, context_name, inputs, outputs):
@op_lifter([pool.DownsampleFactorMaxGradGrad]) @op_lifter([pool.DownsampleFactorMaxGradGrad])
@register_opt2([pool.DownsampleFactorMaxGradGrad]) @register_opt2([pool.DownsampleFactorMaxGradGrad])
def local_gpu_downsample_factor_max_grad_grad(op, ctx_name, inputs, outputs): def local_gpu_downsample_factor_max_grad_grad(op, ctx_name, inputs, outputs):
from theano.sandbox.cuda.opt import _check_constant_args_pool
assert op.__props__ == ('ignore_border', 'mode', 'ndim') assert op.__props__ == ('ignore_border', 'mode', 'ndim')
inp, out, out_grad, ws, st, pad = inputs inp, out, out_grad, ws, stride, pad = inputs
nd = op.ndim nd = op.ndim
ret = _check_constant_args_pool(nd, ws, st, pad, op) if nd not in (2, 3):
if ret is None:
return
ws, st, pad = ret
if nd != 2 or max(pad) != 0 or op.mode != 'max':
return return
inp = gpu_contiguous(as_gpuarray_variable(inp, ctx_name)) inp = gpu_contiguous(as_gpuarray_variable(inp, ctx_name))
out = gpu_contiguous(as_gpuarray_variable(out, ctx_name)) out = gpu_contiguous(as_gpuarray_variable(out, ctx_name))
out_grad = gpu_contiguous(as_gpuarray_variable(out_grad, ctx_name)) out_grad = gpu_contiguous(as_gpuarray_variable(out_grad, ctx_name))
op = GpuDownsampleFactorMaxGradGrad(ws, st, op.ignore_border)
op = GpuDownsampleFactorMaxGradGrad(op.ignore_border, op.mode, op.ndim)
if inp.ndim == nd + 2: if inp.ndim == nd + 2:
return op(inp, out, out_grad) return op(inp, out, out_grad, ws, stride, pad)
else: else:
inp_4D = pad_dims(inp, 2, 2) # reshape to 4D or 5D with 2 non-pooling dimensions
out_4D = pad_dims(out, 2, 2) inp_padded = pad_dims(inp, 2, nd)
out_grad_4D = pad_dims(out_grad, 2, 2) out_padded = pad_dims(out, 2, nd)
output_4D = op(inp_4D, out_4D, out_grad_4D) out_grad_padded = pad_dims(out_grad, 2, nd)
return unpad_dims(output_4D, inp, 2, 2) ret_padded = op(inp_padded, out_padded, out_grad_padded,
ws, stride, pad)
return unpad_dims(ret_padded, inp, 2, nd)
@register_opt("low_memory") @register_opt("low_memory")
......
#section kernels #section kernels
#kernel pool_grad_grad_kernel : size, size, size, size, size, size, size, *, *, *, size, size, size, size, * : #kernel max_pool2d_grad_grad_kernel : size, size, size, size, size, size, size, *, *, *, size, size, size, size, size, size, * :
KERNEL void pool_grad_grad_kernel(const ga_size nthreads, KERNEL void max_pool2d_grad_grad_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size pooled_height, const ga_size num, const ga_size channels, const ga_size pooled_height,
const ga_size pooled_width, const ga_size height, const ga_size width, const ga_size pooled_width, const ga_size height, const ga_size width,
GLOBAL_MEM const DTYPE_i0 *x, GLOBAL_MEM const DTYPE_i1 *z, GLOBAL_MEM const DTYPE_i2 *gx, GLOBAL_MEM const DTYPE_i0 *x, GLOBAL_MEM const DTYPE_i1 *z, GLOBAL_MEM const DTYPE_i2 *gx,
const ga_size kernel_h, const ga_size kernel_w, const ga_size stride_h, const ga_size stride_w, const ga_size kernel_h, const ga_size kernel_w, const ga_size stride_h, const ga_size stride_w,
const ga_size pad_h, const ga_size pad_w,
GLOBAL_MEM DTYPE_o0 *gz) GLOBAL_MEM DTYPE_o0 *gz)
{ {
// grid stride looping // grid stride looping
...@@ -16,9 +17,11 @@ KERNEL void pool_grad_grad_kernel(const ga_size nthreads, ...@@ -16,9 +17,11 @@ KERNEL void pool_grad_grad_kernel(const ga_size nthreads,
const ga_size ph = (index / pooled_width) % pooled_height; const ga_size ph = (index / pooled_width) % pooled_height;
const ga_size c = (index / pooled_width / pooled_height) % channels; const ga_size c = (index / pooled_width / pooled_height) % channels;
const ga_size n = (index / pooled_width / pooled_height / channels); const ga_size n = (index / pooled_width / pooled_height / channels);
const ga_size hstart = ph*stride_h; ga_int hstart = static_cast<ga_int>(ph*stride_h) - static_cast<ga_int>(pad_h);
hstart = max(hstart, 0);
const ga_size hend = min(hstart + kernel_h, height); const ga_size hend = min(hstart + kernel_h, height);
const ga_size wstart = pw*stride_w; ga_int wstart = static_cast<ga_int>(pw*stride_w) - static_cast<ga_int>(pad_w);
wstart = max(wstart, 0);
const ga_size wend = min(wstart + kernel_w, width); const ga_size wend = min(wstart + kernel_w, width);
const ga_size offset = (n*channels + c) * height * width; const ga_size offset = (n*channels + c) * height * width;
...@@ -38,44 +41,102 @@ KERNEL void pool_grad_grad_kernel(const ga_size nthreads, ...@@ -38,44 +41,102 @@ KERNEL void pool_grad_grad_kernel(const ga_size nthreads,
gz[index] = gradient; gz[index] = gradient;
} }
} }
#kernel max_pool3d_grad_grad_kernel : size, size, size, size, size, size, size, size, size, *, *, *, size, size, size, size, size, size, size, size, size, * :
/*
 * Grad-of-grad for 3D max pooling: one thread per element of the pooled
 * output gz (5D layout: num, channels, depth, height, width).  Each thread
 * scans its pooling window in the input x and sums the entries of gx at
 * every position whose value equals the pooled maximum z[index].
 */
KERNEL void max_pool3d_grad_grad_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size pooled_depth,
const ga_size pooled_height, const ga_size pooled_width,
const ga_size depth, const ga_size height, const ga_size width,
GLOBAL_MEM const DTYPE_i0 *x, GLOBAL_MEM const DTYPE_i1 *z, GLOBAL_MEM const DTYPE_i2 *gx,
const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w,
const ga_size stride_d, const ga_size stride_h, const ga_size stride_w,
const ga_size pad_d, const ga_size pad_h, const ga_size pad_w,
GLOBAL_MEM DTYPE_o0 *gz)
{
// grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < nthreads; index += LDIM_0 * GDIM_0) {
// Decompose the flat index into pooled coordinates (n, c, pd, ph, pw).
const ga_size pw = index % pooled_width;
const ga_size ph = (index / pooled_width) % pooled_height;
const ga_size pd = (index / pooled_width / pooled_height) % pooled_depth;
const ga_size c = (index / pooled_width / pooled_height / pooled_depth) % channels;
const ga_size n = (index / pooled_width / pooled_height / pooled_depth / channels);
// Window start may be negative because of padding; signed math, clamped to 0.
ga_int dstart = static_cast<ga_int>(pd*stride_d) - static_cast<ga_int>(pad_d);
dstart = max(dstart, 0);
// NOTE(review): the window end is computed from the already-clamped start,
// so a padded window at the low border spans kernel_d rows starting at 0
// instead of ending at (unclamped start + kernel_d) -- confirm this agrees
// with the forward pooling kernel's window arithmetic.
const ga_size dend = min(dstart + kernel_d, depth);
ga_int hstart = static_cast<ga_int>(ph*stride_h) - static_cast<ga_int>(pad_h);
hstart = max(hstart, 0);
const ga_size hend = min(hstart + kernel_h, height);
ga_int wstart = static_cast<ga_int>(pw*stride_w) - static_cast<ga_int>(pad_w);
wstart = max(wstart, 0);
const ga_size wend = min(wstart + kernel_w, width);
// Offset of this (n, c) volume in the flattened x/gx buffers.
const ga_size offset = (n*channels + c) * depth * height * width;
const DTYPE_i0* x_slice = x + offset;
const DTYPE_i2* gx_slice = gx + offset;
// Accumulate gx wherever the input equals the pooled max for this cell.
DTYPE_o0 gradient = 0;
for (ga_size d=dstart; d < dend; ++d) {
for (ga_size h=hstart; h < hend; ++h) {
for (ga_size w=wstart; w < wend; ++w) {
// maximum in the region
if (z[index] == x_slice[(d * height + h) * width + w]) {
gradient += gx_slice[(d * height + h)* width + w];
}
}
}
}
gz[index] = gradient;
}
}
#section support_code_struct #section support_code_struct
int APPLY_SPECIFIC(pool_grad_grad)(PyGpuArrayObject *x, int APPLY_SPECIFIC(pool_grad_grad)(PyGpuArrayObject *x,
PyGpuArrayObject *z, PyGpuArrayObject *z,
PyGpuArrayObject *gx, PyGpuArrayObject *gx,
PyArrayObject *ws,
PyArrayObject *stride,
PyArrayObject *pad,
PyGpuArrayObject **gz, PyGpuArrayObject **gz,
PyGpuContextObject *ctx) { PyGpuContextObject *ctx) {
if (PyGpuArray_NDIM(x) != 4 if (!GpuArray_IS_C_CONTIGUOUS(&x->ga)
|| PyGpuArray_NDIM(z) != 4 || !GpuArray_IS_C_CONTIGUOUS(&z->ga)
|| PyGpuArray_NDIM(gx) != 4) || !GpuArray_IS_C_CONTIGUOUS(&gx->ga))
{ {
PyErr_SetString(PyExc_ValueError, "GpuDownsampleFactorMaxGradGrad: rank error"); PyErr_Format(PyExc_ValueError,
"GpuPoolingGradGrad: requires data to be C-contiguous");
return 1; return 1;
} }
if (NULL == *gz || theano_size_check(*gz, 4, PyGpuArray_DIMS(z), z->ga.typecode)) size_t ndims = PyArray_DIM(ws, 0);
if (PyGpuArray_NDIM(x) != ndims + 2
|| PyGpuArray_NDIM(z) != ndims + 2
|| PyGpuArray_NDIM(gx) != ndims + 2)
{ {
Py_XDECREF(*gz); PyErr_SetString(PyExc_ValueError, "GpuPoolingGradGrad: rank error");
*gz = pygpu_zeros(4, PyGpuArray_DIMS(z), return 1;
z->ga.typecode, GA_C_ORDER,
ctx, Py_None);
if (NULL == *gz)
{
PyErr_SetString(PyExc_RuntimeError,
"GpuDownsampleFactorMaxGradGrad: failed to allocate memory");
return 1;
}
} }
if (!GpuArray_IS_C_CONTIGUOUS(&x->ga) if (theano_prep_output(gz, PyGpuArray_NDIM(z), PyGpuArray_DIMS(z),
|| !GpuArray_IS_C_CONTIGUOUS(&z->ga) z->ga.typecode, GA_C_ORDER, ctx) != 0)
|| !GpuArray_IS_C_CONTIGUOUS(&gx->ga)
|| !GpuArray_IS_C_CONTIGUOUS(&(*gz)->ga))
{ {
PyErr_Format(PyExc_ValueError, PyErr_SetString(PyExc_RuntimeError,
"GpuDownsampleFactorMaxGradGrad: requires data to be C-contiguous"); "GpuPoolingGradGrad: failed to allocate memory");
return 1; return 1;
} }
{ {
// scope for running kernel // scope for running kernel
size_t w[3];
size_t s[3];
size_t p[3];
for(int i = 0; i < ndims; i++) {
w[i] = *((npy_intp*)PyArray_GETPTR1(ws, i));
s[i] = *((npy_intp*)PyArray_GETPTR1(stride, i));
p[i] = *((npy_intp*)PyArray_GETPTR1(pad, i));
}
size_t max_threads_dim; size_t max_threads_dim;
int err; int err;
const size_t* z_dims = PyGpuArray_DIMS(z); const size_t* z_dims = PyGpuArray_DIMS(z);
...@@ -87,22 +148,41 @@ int APPLY_SPECIFIC(pool_grad_grad)(PyGpuArrayObject *x, ...@@ -87,22 +148,41 @@ int APPLY_SPECIFIC(pool_grad_grad)(PyGpuArrayObject *x,
PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims"); PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims");
return 1; return 1;
} }
size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3];
size_t threads_per_block = max_threads_dim; size_t threads_per_block = max_threads_dim;
size_t n_blocks = (num_kernels + threads_per_block - 1) / threads_per_block;
err = pool_grad_grad_kernel_call(1, &n_blocks, &threads_per_block, 0, if (ndims == 2) {
num_kernels, size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3];
z_dims[0], z_dims[1], z_dims[2], z_dims[3], size_t n_blocks = (num_kernels + threads_per_block - 1) / threads_per_block;
x_dims[2], x_dims[3], err = max_pool2d_grad_grad_kernel_call(1, &n_blocks, &threads_per_block, 0,
x->ga.data, z->ga.data, gx->ga.data, num_kernels,
DS0, DS1, ST0, ST1, z_dims[0], z_dims[1], z_dims[2], z_dims[3],
(*gz)->ga.data); x_dims[2], x_dims[3],
if (err != GA_NO_ERROR) { x->ga.data, z->ga.data, gx->ga.data,
PyErr_Format(PyExc_RuntimeError, w[0], w[1], s[0], s[1], p[0], p[1],
"GpuDownsampleFactorMaxGradGrad: %s.", (*gz)->ga.data);
GpuKernel_error(&k_pool_grad_grad_kernel, err)); if (err != GA_NO_ERROR) {
return 1; PyErr_Format(PyExc_RuntimeError,
"GpuPoolingGradGrad: max_pool2d_grad_grad_kernel %s.",
GpuKernel_error(&k_max_pool2d_grad_grad_kernel, err));
return 1;
}
}
else if (ndims == 3) {
size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3] * z_dims[4];
size_t n_blocks = (num_kernels + threads_per_block - 1) / threads_per_block;
err = max_pool3d_grad_grad_kernel_call(1, &n_blocks, &threads_per_block, 0,
num_kernels,
z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4],
x_dims[2], x_dims[3], x_dims[4],
x->ga.data, z->ga.data, gx->ga.data,
w[0], w[1], w[2], s[0], s[1], s[2], p[0], p[1], p[2],
(*gz)->ga.data);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuPoolingGradGrad: max_pool3d_grad_grad_kernel %s.",
GpuKernel_error(&k_max_pool3d_grad_grad_kernel, err));
return 1;
}
} }
} }
return 0; return 0;
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论