提交 f1bc93a6 authored 作者: Frederic Bastien's avatar Frederic Bastien

Port the fix to the new back-end.

上级 23bca43d
......@@ -52,9 +52,6 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
return 1;
}
if (c_set_tensorNd(img, APPLY_SPECIFIC(input)) != 0)
return 1;
cudnnPoolingMode_t mode;
int w[3];
int p[3];
......@@ -71,12 +68,6 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
s[i] = *((npy_intp*)PyArray_GETPTR1(stride, i));
}
err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), MODE_FLAG, CUDNN_PROPAGATE_NAN, ndims, w, p, s);
if (err != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_RuntimeError, "could not set op descriptor %s", cudnnGetErrorString(err));
  /* Abort on failure: falling through here would leave a Python exception
     pending while calling cuDNN with an unconfigured pooling descriptor. */
  return 1;
}
dims[0] = PyGpuArray_DIM(img, 0);
dims[1] = PyGpuArray_DIM(img, 1);
dims[2] = (PyGpuArray_DIM(img, 2) + (p[0]*2) - w[0]) / s[0] + 1;
......@@ -88,9 +79,23 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
GA_C_ORDER, c) != 0)
return 1;
// if input batch is empty, we return the empty output without calling cuDNN
// (which will fail on zero batch size).
if (PyGpuArray_DIM(*out, 0) == 0)
return 0;
if (c_set_tensorNd(img, APPLY_SPECIFIC(input)) != 0)
return 1;
if (c_set_tensorNd(*out, APPLY_SPECIFIC(output)) != 0)
return 1;
err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), MODE_FLAG, CUDNN_PROPAGATE_NAN, ndims, w, p, s);
if (err != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_RuntimeError, "could not set op descriptor %s", cudnnGetErrorString(err));
  /* Abort on failure: falling through here would leave a Python exception
     pending while calling cuDNN with an unconfigured pooling descriptor. */
  return 1;
}
{
const float alphaf = 1;
const float betaf = 0;
......
......@@ -83,6 +83,17 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
return 1;
}
if (theano_prep_output(inp_grad, PyGpuArray_NDIM(inp),
PyGpuArray_DIMS(inp), inp->ga.typecode,
GA_C_ORDER, c) != 0) {
return 1;
}
// if input batch is empty, we return the empty output without calling cuDNN
// (which will fail on zero batch size).
if (PyGpuArray_DIM(*inp_grad, 0) == 0)
return 0;
if (c_set_tensorNd(inp, APPLY_SPECIFIC(input)) != 0)
return 1;
if (c_set_tensorNd(out_grad, APPLY_SPECIFIC(output_grad)) != 0)
......@@ -90,12 +101,6 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
if (c_set_tensorNd(out, APPLY_SPECIFIC(output)) != 0)
return 1;
if (theano_prep_output(inp_grad, PyGpuArray_NDIM(inp),
PyGpuArray_DIMS(inp), inp->ga.typecode,
GA_C_ORDER, c) != 0) {
return 1;
}
int w[3];
int p[3];
int s[3];
......
......@@ -502,6 +502,22 @@ def test_pooling_opt_arbitrary_dimensions():
utt.assert_allclose(res_gpu[1], res_cpu[1])
def test_pooling_empty_batch():
    # Regression test for zero-sized batches: the pooling op must
    # short-circuit and return an empty output instead of calling cuDNN,
    # which rejects a batch dimension of 0.
    batch_shape = (0, 5, 6, 8)
    images = T.ftensor4('img')
    pooled = dnn.dnn_pool(images, (2, 2), (2, 2))

    forward = theano.function([images], pooled, mode=mode_with_gpu)
    result = forward(np.random.rand(*batch_shape).astype('float32'))
    assert result.shape == (0, 5, 3, 4)

    # The gradient path must also tolerate the empty batch; there is no
    # meaningful value to check beyond shape — it should simply not crash.
    gradient = T.grad(T.sum(pooled), wrt=images)
    backward = theano.function([images], gradient, mode=mode_with_gpu)
    result = backward(np.random.rand(*batch_shape).astype('float32'))
    assert result.shape == (0, 5, 6, 8)
def test_dnn_tag():
"""
Test that if cudnn isn't avail we crash and that if it is avail, we use it.
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论