提交 9866ce20 authored 作者: Alexander Matyasko's avatar Alexander Matyasko

Implements gpu max pooling rop for 2d and 3d inputs

It reuses the GPU max pooling scheme: instead of returning the maximum of each pooling region, we collect the eval-point value at the index corresponding to the maximum of the input.
上级 c38f534f
...@@ -112,6 +112,22 @@ class GpuPool(CGpuKernelBase): ...@@ -112,6 +112,22 @@ class GpuPool(CGpuKernelBase):
def connection_pattern(self, node): def connection_pattern(self, node):
return [[1], [0], [0], [0]] return [[1], [0], [0], [0]]
def R_op(self, inputs, eval_points):
    """R-operator of the pooling op.

    For average/sum pooling the op is linear in its input, so the R-op is
    simply the pooling op applied to the eval point. For max pooling, a
    dedicated op collects the eval-point values at the argmax locations
    of the original input.
    """
    # R_op can receive None as eval_points: it means there is no
    # differentiable path through that input. If that implies some
    # outputs cannot be computed, return None for those.
    # NOTE: this check must come before any use of eval_points[0],
    # otherwise the non-max branch would build a node from None.
    if eval_points[0] is None:
        return [None]
    if self.mode != 'max':
        # R-op for average or sum is simply pooling evaluated at the
        # eval point (linearity of the op).
        eval_inputs = [eval_points[0]] + inputs[1:]
        return [self(*eval_inputs)]
    x, ws, stride, pad = inputs
    rop = GpuMaxPoolRop(ignore_border=self.ignore_border)
    return [rop(x, eval_points[0], ws, stride=stride, pad=pad)]
class GpuMaxPoolGrad(CGpuKernelBase): class GpuMaxPoolGrad(CGpuKernelBase):
""" """
...@@ -334,3 +350,72 @@ class GpuDownsampleFactorMaxGradGrad(CGpuKernelBase): ...@@ -334,3 +350,72 @@ class GpuDownsampleFactorMaxGradGrad(CGpuKernelBase):
def connection_pattern(self, node): def connection_pattern(self, node):
return [[1], [1], [1], [0], [0], [0]] return [[1], [1], [1], [0], [0], [0]]
class GpuMaxPoolRop(CGpuKernelBase):
    """
    Implements the R-operator for the downsample (max pooling) operation.

    It reuses the max pooling scheme: instead of emitting the maximum of
    each pooling region of the input, the kernel emits the eval-point
    value located at the index of that maximum.
    """
    __props__ = ('ignore_border', 'mode', 'ndim')

    def __init__(self, ignore_border, mode='max', ndim=2):
        self.ndim = ndim
        self.ignore_border = ignore_border
        self.mode = mode
        CGpuKernelBase.__init__(self, ['pool_max_rop.c'],
                                'APPLY_SPECIFIC(max_pool_rop)')
        # Only max pooling over 2d or 3d regions is implemented.
        assert mode == 'max'
        assert ndim in [2, 3]

    def c_headers(self):
        return ['gpuarray_api.h', 'gpuarray_helper.h', 'numpy_compat.h']

    def c_header_dirs(self):
        return [os.path.dirname(__file__), pygpu.get_include()]

    def make_node(self, inp, eval_point, ws, stride=None, pad=None):
        """Build the Apply node.

        Parameters
        ----------
        inp : gpu tensor of rank ndim + 2
            Input of the pooling op.
        eval_point : gpu tensor of rank ndim + 2
            Point at which to evaluate the R-operator; same rank as inp.
        ws : 1d int tensor
            Pooling window shape, one entry per pooled dimension.
        stride : 1d int tensor, optional
            Strides; defaults to ws (non-overlapping pooling).
        pad : 1d int tensor, optional
            Padding; defaults to zeros. Non-zero padding requires
            ignore_border=True and pad[i] < ws[i].
        """
        ctx_name = infer_context_name(inp)
        nd = self.ndim
        inp = as_gpuarray_variable(inp, ctx_name)
        assert (inp.ndim == nd + 2)
        eval_point = as_gpuarray_variable(eval_point, ctx_name)
        assert (eval_point.ndim == nd + 2)

        if stride is None:
            stride = ws
        if pad is None:
            pad = (0,) * nd
        elif isinstance(pad, (tuple, list)):
            if max(pad) != 0 and not self.ignore_border:
                raise ValueError('Padding works only with ignore_border=True')
            if isinstance(ws, (tuple, list)):
                if any(pad[i] >= ws[i] for i in range(nd)):
                    # The comparison is against the window shape, so the
                    # message must mention the window shape, not the stride.
                    raise ValueError('Padding must be smaller than window shape')

        ws = as_tensor_variable(ws)
        stride = as_tensor_variable(stride)
        pad = as_tensor_variable(pad)
        assert ws.ndim == stride.ndim and ws.ndim == pad.ndim
        assert ws.ndim == 1
        if not ws.dtype.startswith('int'):
            raise TypeError('Window shape parameters must be ints.')
        if not stride.dtype.startswith('int'):
            raise TypeError('Stride parameters must be ints.')
        if not pad.dtype.startswith('int'):
            raise TypeError('Padding parameters must be ints.')
        return Apply(self, [inp, eval_point, ws, stride, pad],
                     [eval_point.type()])

    def get_params(self, node):
        # The GPU context of the first input parameterizes the kernel call.
        return node.inputs[0].type.context

    def get_op_params(self):
        # Compile-time constant consumed by pool_max_rop.c.
        ignore_border = int(self.ignore_border)
        return [('IGNORE_BORDER', ignore_border)]

    def infer_shape(self, node, in_shapes):
        # The output has the regular pooled shape of the input.
        ws, stride, pad = [node.inputs[2], node.inputs[3], node.inputs[4]]
        shp = Pool.out_shape(in_shapes[0], ws, self.ignore_border, stride,
                             pad, self.ndim)
        return [shp]
#section kernels
#kernel max_pool2d_rop_kernel : size, size, size, size, size, size, size, *, *, size, size, size, size, size, size, * :
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
// R-op for 2d max pooling: for every output position, find the maximum of
// the pooling window in `x` and write the value of `ex` (the eval point)
// at that same index into `z`. One thread handles one output element.
KERNEL void max_pool2d_rop_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size pooled_height,
const ga_size pooled_width, const ga_size height, const ga_size width,
GLOBAL_MEM const DTYPE_i0 *x, GLOBAL_MEM const DTYPE_i1 *ex,
const ga_size kernel_h, const ga_size kernel_w,
const ga_size stride_h, const ga_size stride_w,
const ga_size pad_h, const ga_size pad_w,
GLOBAL_MEM DTYPE_o0 *z)
{
// grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < nthreads;
index += LDIM_0 * GDIM_0) {
// decompose the flat output index into (n, c, ph, pw)
const ga_size pw = index % pooled_width;
const ga_size ph = (index / pooled_width) % pooled_height;
const ga_size c = (index / pooled_width / pooled_height) % channels;
const ga_size n = (index / pooled_width / pooled_height / channels);
// window bounds in input coordinates; the start can go negative when
// padding is used, hence the signed intermediate
ga_int hstart = static_cast<ga_int>(ph*stride_h) - static_cast<ga_int>(pad_h);
const ga_size hend = min(hstart + kernel_h, height);
ga_int wstart = static_cast<ga_int>(pw*stride_w) - static_cast<ga_int>(pad_w);
const ga_size wend = min(wstart + kernel_w, width);
// clip the window start to the actual input
hstart = max(hstart, 0);
wstart = max(wstart, 0);
// base offset of this (n, c) feature map in x/ex
const ga_size offset = (n*channels + c) * height * width;
const DTYPE_i0* x_slice = x + offset;
const DTYPE_i1* ex_slice = ex + offset;
// track the running max of x and the eval-point value at its index;
// initialized from the first in-bounds element of the window
DTYPE_o0 maxval = x_slice[hstart*width + wstart];
DTYPE_o0 collector = ex_slice[hstart*width + wstart];
for (ga_size h=hstart; h < hend; ++h) {
for (ga_size w=wstart; w < wend; ++w) {
// maximum in the region
if (x_slice[h*width + w] > maxval) {
maxval = x_slice[h*width + w];
collector = ex_slice[h*width + w];
}
}
}
z[index] = collector;
}
}
#kernel max_pool3d_rop_kernel : size, size, size, size, size, size, size, size, size, *, *, size, size, size, size, size, size, size, size, size, * :
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
// R-op for 3d max pooling: same scheme as the 2d kernel with an extra
// depth dimension — emit the value of `ex` at the argmax location of `x`
// within each pooling window. One thread handles one output element.
KERNEL void max_pool3d_rop_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size pooled_depth,
const ga_size pooled_height, const ga_size pooled_width,
const ga_size depth, const ga_size height, const ga_size width,
GLOBAL_MEM const DTYPE_i0 *x, GLOBAL_MEM const DTYPE_i1 *ex,
const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w,
const ga_size stride_d, const ga_size stride_h, const ga_size stride_w,
const ga_size pad_d, const ga_size pad_h, const ga_size pad_w,
GLOBAL_MEM DTYPE_o0 *z)
{
// grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < nthreads;
index += LDIM_0 * GDIM_0) {
// decompose the flat output index into (n, c, pd, ph, pw)
const ga_size pw = index % pooled_width;
const ga_size ph = (index / pooled_width) % pooled_height;
const ga_size pd = (index / pooled_width / pooled_height) % pooled_depth;
const ga_size c = (index / pooled_width / pooled_height / pooled_depth) % channels;
const ga_size n = (index / pooled_width / pooled_height / pooled_depth / channels);
// window bounds; starts can go negative with padding (signed intermediates)
ga_int dstart = static_cast<ga_int>(pd*stride_d) - static_cast<ga_int>(pad_d);
const ga_size dend = min(dstart + kernel_d, depth);
ga_int hstart = static_cast<ga_int>(ph*stride_h) - static_cast<ga_int>(pad_h);
const ga_size hend = min(hstart + kernel_h, height);
ga_int wstart = static_cast<ga_int>(pw*stride_w) - static_cast<ga_int>(pad_w);
const ga_size wend = min(wstart + kernel_w, width);
// clip the window start to the actual input
dstart = max(dstart, 0);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
// base offset of this (n, c) volume in x/ex
const ga_size offset = (n*channels + c) * depth * height * width;
const DTYPE_i0* x_slice = x + offset;
const DTYPE_i1* ex_slice = ex + offset;
// running max of x and the eval-point value at its index, seeded with
// the first in-bounds element of the window
DTYPE_o0 maxval = x_slice[(dstart*height + hstart)*width + wstart];
DTYPE_o0 collector = ex_slice[(dstart*height + hstart)*width + wstart];
for (ga_size d=dstart; d < dend; ++d) {
for (ga_size h=hstart; h < hend; ++h) {
for (ga_size w=wstart; w < wend; ++w) {
// maximum in the region
if (x_slice[(d*height + h)*width + w] > maxval) {
maxval = x_slice[(d*height + h)*width + w];
collector = ex_slice[(d*height + h)*width + w];
}
}
}
}
z[index] = collector;
}
}
#section support_code
// output shape for a given input padded shape, window shape and stride
// With IGNORE_BORDER, only complete windows count; otherwise a partial
// window at the end contributes one extra output element (the st > ws
// case handles windows that are skipped entirely by a large stride).
#define OUTPUT_DIMS(in_dim, ws, st) \
(IGNORE_BORDER ? (in_dim - ws)/st + 1 : \
(st > ws ? (in_dim - 1)/st + 1 : \
std::max<size_t>(0, (in_dim - 1 - ws + st)/st) + 1))
#section support_code_struct
// R-op for GPU max pooling: for each pooling window of `x`, write the value
// of the eval point `ex` found at the window's argmax location into `z`.
//   x:  input, (batch, channels, [depth,] height, width), C-contiguous
//   ex: eval point, same rank as x
//   ws/stride/pad: 1d npy_intp arrays, one entry per pooled dimension
//   z:  output, allocated here with the pooled output shape
// Returns 0 on success, 1 on failure with a Python exception set.
int APPLY_SPECIFIC(max_pool_rop)(PyGpuArrayObject *x,
                                 PyGpuArrayObject *ex,
                                 PyArrayObject *ws,
                                 PyArrayObject *stride,
                                 PyArrayObject *pad,
                                 PyGpuArrayObject **z,
                                 PyGpuContextObject *ctx) {
  // the kernels index with raw flat offsets, so both inputs must be
  // C-contiguous
  if (!GpuArray_IS_C_CONTIGUOUS(&x->ga) || !GpuArray_IS_C_CONTIGUOUS(&ex->ga))
    {
      PyErr_Format(PyExc_ValueError,
                   "GpuMaxPoolRop: requires data to be C-contiguous");
      return 1;
    }
  size_t ndims = PyArray_DIM(ws, 0);
  if (PyGpuArray_NDIM(x) != ndims + 2 || PyGpuArray_NDIM(ex) != ndims + 2)
    {
      PyErr_SetString(PyExc_ValueError, "GpuMaxPoolRop: rank error");
      return 1;
    }
  // prepare output
  const size_t* x_dims = PyGpuArray_DIMS(x);
  size_t z_dims[5]; // avoid warning if use 2 + nd
  size_t w[3];
  size_t s[3];
  size_t p[3]; z_dims[0] = x_dims[0]; z_dims[1] = x_dims[1];
  int nonzero_padding = 0;
  // size_t index avoids a signed/unsigned comparison with ndims
  for (size_t i = 0; i < ndims; i++) {
    w[i] = *((npy_intp*)PyArray_GETPTR1(ws, i));
    s[i] = *((npy_intp*)PyArray_GETPTR1(stride, i));
    p[i] = *((npy_intp*)PyArray_GETPTR1(pad, i));
    z_dims[2 + i] = OUTPUT_DIMS(x_dims[2 + i] + 2*p[i], w[i], s[i]);
    if (p[i] > 0) {
      nonzero_padding = 1;
    }
  }
  if (!IGNORE_BORDER && nonzero_padding) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMaxPoolRop: padding works only with ignore_border=True");
    return 1;
  }
  // the output takes the dtype of the eval point
  if (theano_prep_output(z, PyGpuArray_NDIM(ex), z_dims,
                         ex->ga.typecode, GA_C_ORDER, ctx) != 0)
    {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMaxPoolRop: failed to allocate memory");
      return 1;
    }
  {
    // scope for running kernel
    int err;
    if (ndims == 2) {
      // one thread per output element
      size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3];
      err = max_pool2d_rop_kernel_scall(1, &num_kernels, 0, num_kernels,
                                        z_dims[0], z_dims[1], z_dims[2], z_dims[3],
                                        x_dims[2], x_dims[3],
                                        x->ga.data, ex->ga.data,
                                        w[0], w[1], s[0], s[1], p[0], p[1],
                                        (*z)->ga.data);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuMaxPoolRop: max_pool2d_rop_kernel %s.",
                     GpuKernel_error(&k_max_pool2d_rop_kernel, err));
        return 1;
      }
    }
    else if (ndims == 3) {
      // one thread per output element
      size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3] * z_dims[4];
      err = max_pool3d_rop_kernel_scall(1, &num_kernels, 0, num_kernels,
                                        z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4],
                                        x_dims[2], x_dims[3], x_dims[4],
                                        x->ga.data, ex->ga.data,
                                        w[0], w[1], w[2], s[0], s[1], s[2],
                                        p[0], p[1], p[2], (*z)->ga.data);
      if (err != GA_NO_ERROR) {
        // fix: report the 3d kernel's error, not the 2d kernel's
        // (was GpuKernel_error(&k_max_pool2d_rop_kernel, err))
        PyErr_Format(PyExc_RuntimeError,
                     "GpuMaxPoolRop: max_pool3d_rop_kernel %s.",
                     GpuKernel_error(&k_max_pool3d_rop_kernel, err));
        return 1;
      }
    }
  }
  return 0;
}
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论