Merge pull request #2696 from yaoli/maxpool_c_code

c code of maxpool

Merge pull request #2696 from yaoli/maxpool_c_code
fb3b1fcf · Pascal Lamblin · 2b2cd647 · 75981448 · fb3b1fcf · fb3b1fcf
--- a/theano/tensor/signal/downsample.py
+++ b/theano/tensor/signal/downsample.py
@@ -287,77 +287,149 @@ class DownsampleFactorMax(Op):
                                        st=self.st, padding=self.padding)(
                                            x, maxout, gz)]

+    def c_headers(self):
+        return ['<algorithm>']
+
    def c_code(self, node, name, inp, out, sub):
-        # No implementation is currently for the case where
-        # the stride size and the pooling size are different.
-        # An exception is raised for such a case.
-        if self.ds != self.st or self.padding != (0, 0):
-            raise theano.gof.utils.MethodNotDefined()
        x, = inp
        z, = out
        fail = sub['fail']
        ignore_border = int(self.ignore_border)
        ds0, ds1 = self.ds
+        st0, st1 = self.st
+        pd0, pd1 = self.padding
        return """
        int typenum = PyArray_ObjectType((PyObject*)%(x)s, 0);
-        int x_shp0_usable;
-        int x_shp1_usable;
-        int z_shp0, z_shp1;
+        int z_r, z_c; // shape of the output
+        int r, c; // shape of the padded_input
        if(PyArray_NDIM(%(x)s)!=4)
        {
            PyErr_SetString(PyExc_ValueError, "x must be a 4d ndarray");
            %(fail)s;
        }
-        z_shp0 = PyArray_DIMS(%(x)s)[2] / %(ds0)s;
-        z_shp1 = PyArray_DIMS(%(x)s)[3] / %(ds1)s;
+        r = PyArray_DIMS(%(x)s)[2];
+        c = PyArray_DIMS(%(x)s)[3];
+        r += %(pd0)s * 2;
+        c += %(pd1)s * 2;
+
+        if (%(pd0)s != 0 && %(pd1)s != 0 && !%(ignore_border)s)
+            {
+              PyErr_SetString(PyExc_ValueError,
+                "padding must be (0,0) when ignore border is False");
+              %(fail)s;
+            }
        if (%(ignore_border)s)
        {
-            x_shp0_usable = z_shp0 * %(ds0)s;
-            x_shp1_usable = z_shp1 * %(ds1)s;
+            
+            // '/' in C is different from '/' in python
+            if (r - %(ds0)s < 0)
+            {
+              z_r = 0;
+            }
+            else
+            {
+              z_r = (r - %(ds0)s) / %(st0)s + 1;
+            }
+            if (c - %(ds1)s < 0)
+            {
+              z_c = 0;
+            }
+            else
+            {
+              z_c = (c - %(ds1)s) / %(st1)s + 1;
+            }
        }
        else
        {
-            z_shp0 += (PyArray_DIMS(%(x)s)[2] %% %(ds0)s) ? 1 : 0;
-            z_shp1 += (PyArray_DIMS(%(x)s)[3] %% %(ds1)s) ? 1 : 0;
-            x_shp0_usable = PyArray_DIMS(%(x)s)[2];
-            x_shp1_usable = PyArray_DIMS(%(x)s)[3];
+            // decide how many rows the output has
+            if (%(st0)s >= %(ds0)s)
+            {
+                z_r = (r - 1) / %(st0)s + 1;
+            }
+            else
+            {
+                z_r = std::max(0, (r - 1 - %(ds0)s) / %(st0)s + 1) + 1;
+            }
+            // decide how many columns the output has
+            if (%(st1)s >= %(ds1)s)
+            {
+                z_c = (c - 1) / %(st1)s + 1;
+            }
+            else
+            {
+                z_c = std::max(0, (c - 1 - %(ds1)s) / %(st1)s + 1) + 1;
+            }
        }
+        // memory allocation of z if necessary
        if ((!%(z)s)
          || *PyArray_DIMS(%(z)s)!=4
          ||(PyArray_DIMS(%(z)s)[0] != PyArray_DIMS(%(x)s)[0])
          ||(PyArray_DIMS(%(z)s)[1] != PyArray_DIMS(%(x)s)[1])
-          ||(PyArray_DIMS(%(z)s)[2] != z_shp0)
-          ||(PyArray_DIMS(%(z)s)[3] != z_shp1)
+          ||(PyArray_DIMS(%(z)s)[2] != z_r)
+          ||(PyArray_DIMS(%(z)s)[3] != z_c)
          )
        {
          if (%(z)s) Py_XDECREF(%(z)s);
          npy_intp dims[4] = {0,0,0,0};
          dims[0]=PyArray_DIMS(%(x)s)[0];
          dims[1]=PyArray_DIMS(%(x)s)[1];
-          dims[2]=z_shp0;
-          dims[3]=z_shp1;
+          dims[2]=z_r;
+          dims[3]=z_c;
          //TODO: zeros not necessary
          %(z)s = (PyArrayObject*) PyArray_ZEROS(4, dims, typenum,0);
        }

-        if (z_shp0 && z_shp1)
+        // used for indexing a pool region inside the input
+        int r_st, r_end, c_st, c_end;
+        dtype_%(x)s maximum; // temp var for maximum value in a region
+        if (z_r && z_c)
        {
-            for(int b=0;b<PyArray_DIMS(%(x)s)[0];b++){
-              for(int k=0;k<PyArray_DIMS(%(x)s)[1];k++){
-                int mini_i = 0;
-                int zi = 0;
-                for(int i=0;i< x_shp0_usable; i++){
-                  int mini_j = 0;
-                  int zj = 0;
-                  for(int j=0; j<x_shp1_usable; j++){
-                    dtype_%(x)s a = ((dtype_%(x)s*)(PyArray_GETPTR4(%(x)s,b,k,i,j)))[0];
-                    dtype_%(z)s * __restrict__ z = ((dtype_%(z)s*)(PyArray_GETPTR4(%(z)s,b,k,zi,zj)));
-                    z[0] = (((mini_j|mini_i) == 0) || z[0] < a) ? a : z[0];
-                    mini_j = ((mini_j + 1) == %(ds1)s) ? 0 : mini_j+1;
-                    zj += (mini_j == 0);
+            for(int b=0; b<PyArray_DIMS(%(x)s)[0]; b++){
+              for(int k=0; k<PyArray_DIMS(%(x)s)[1]; k++){
+                for(int i=0; i< z_r; i++){
+                  r_st = i * %(st0)s;
+                  r_end = r_st + %(ds0)s;
+                  // skip the padding
+                  r_st = r_st < %(pd0)s ? %(pd0)s : r_st;
+                  r_end = r_end > (r - %(pd0)s) ? r - %(pd0)s : r_end;
+                  // from padded_img space to img space
+                  r_st -= %(pd0)s;
+                  r_end -= %(pd0)s;
+
+                  // handle the case where no padding, ignore border is True
+                  if (%(ignore_border)s)
+                  {
+                    r_end = r_end > r ? r : r_end;
+                  }
+                  for(int j=0; j<z_c; j++){
+                    c_st = j * %(st1)s;
+                    c_end = c_st + %(ds1)s;
+                    // skip the padding
+                    c_st = c_st < %(pd1)s ? %(pd1)s : c_st;
+                    c_end = c_end > (c - %(pd1)s) ? c - %(pd1)s : c_end;
+                    dtype_%(z)s * z = (
+                          (dtype_%(z)s*)(PyArray_GETPTR4(%(z)s, b, k, i, j)));
+                    // change coordinates from padding_img space into img space
+                    c_st -= %(pd1)s;
+                    c_end -= %(pd1)s;
+                    // handle the case where no padding, ignore border is True
+                    if (%(ignore_border)s)
+                    {
+                      c_end = c_end > c ? c : c_end;
+                    }
+                    // use the first element as the initial value of maximum
+                    maximum = ((dtype_%(x)s*)(PyArray_GETPTR4(%(x)s,b,k,r_st,c_st)))[0];
+                    // go through the pooled region in the unpadded input
+                    for(int m=r_st; m<r_end; m++)
+                    {
+                      for(int n=c_st; n<c_end; n++)
+                      {
+                        dtype_%(x)s a = ((dtype_%(x)s*)(PyArray_GETPTR4(%(x)s,b,k,m,n)))[0];
+                        maximum = (a > maximum) ? a : maximum;
+                      }
+                    }
+                    z[0] = maximum;
                  }
-                  mini_i = ((mini_i + 1) == %(ds0)s) ? 0 : mini_i+1;
-                  zi += (mini_i == 0);
                }
              }
            }
@@ -365,7 +437,7 @@ class DownsampleFactorMax(Op):
        """ % locals()

    def c_code_cache_version(self):
-        return (0, 2)
+        return (0, 6)


 class DownsampleFactorMaxGrad(Op):
@@ -409,7 +481,6 @@ class DownsampleFactorMaxGrad(Op):
        pad_w = self.padding[1]
        img_rows = x.shape[-2] + 2 * pad_h
        img_cols = x.shape[-1] + 2 * pad_w
-
        # pad the image
        if self.padding != (0, 0):
            fill = x.min()-1

--- a/theano/tensor/signal/tests/test_downsample.py
+++ b/theano/tensor/signal/tests/test_downsample.py
 import unittest
 import __builtin__
 import numpy
+import theano
 import theano.tensor as tensor
 from theano.tests import unittest_tools as utt
 from theano.tensor.signal.downsample import (DownsampleFactorMax, max_pool_2d,
@@ -467,11 +468,11 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
        test_input_array = numpy.array([[[
            [1., 2., 3., 4.],
            [5., 6., 7., 8.]
-        ]]])
+        ]]]).astype(theano.config.floatX)
        test_answer_array = numpy.array([[[
            [0., 0., 0., 0.],
            [0., 6., 0., 8.]
-        ]]])
+        ]]]).astype(theano.config.floatX)
        input = tensor.tensor4(name='input')
        patch_size = (2, 2)
        op = max_pool_2d_same_size(input, patch_size)