提交 d57d2f4c authored 作者: notoraptor's avatar notoraptor

Suggest a fix (and tests) for GpuDnnReduction when the axes to reduce all have size 1.

上级 d395439a
...@@ -97,6 +97,49 @@ int APPLY_SPECIFIC(dnn_redux)(PyGpuArrayObject *input, ...@@ -97,6 +97,49 @@ int APPLY_SPECIFIC(dnn_redux)(PyGpuArrayObject *input,
PyErr_Format(PyExc_RuntimeError, "GpuArray_reshape_inplace: %s", GpuArray_error(&(*output)->ga, err)); PyErr_Format(PyExc_RuntimeError, "GpuArray_reshape_inplace: %s", GpuArray_error(&(*output)->ga, err));
return 1; return 1;
} }
if (rsz == 1 && cudnnGetVersion() <= 7004) {
  /* All axes to reduce have size 1 (rsz == 1), so the reduction degenerates
   * to a copy of the input values. cuDNN versions up to 7004 do not support
   * this case, so for the ops whose result differs from a plain copy we
   * post-process the already-copied output with a GpuElemwise kernel. */
  switch (params->red_op) {
  /* Nothing to do for the following cases: sum, product, min, max and
   * average of a single element are all the element itself, which the
   * output already contains. */
  case CUDNN_REDUCE_TENSOR_ADD: break;
  case CUDNN_REDUCE_TENSOR_MUL: break;
  case CUDNN_REDUCE_TENSOR_MIN: break;
  case CUDNN_REDUCE_TENSOR_MAX: break;
  case CUDNN_REDUCE_TENSOR_AVG: break;
  /* Work to do for the following cases.
     AMAX  (maximum of absolute values)     => apply abs(output)
     NORM1 (sum of absolute values)         => apply abs(output)
     NORM2 (square root of sum of squares)  => sqrt(output^2) => abs(output)
     So we must apply abs(output) for all three cases; intentional
     fallthrough into the shared abs kernel below. */
  case CUDNN_REDUCE_TENSOR_AMAX:
  case CUDNN_REDUCE_TENSOR_NORM1:
  case CUDNN_REDUCE_TENSOR_NORM2:
  {
    /* Build a one-argument in-place elementwise kernel: out = |out|.
     * The argument both reads and writes the output buffer. */
    gpuelemwise_arg arg;
    arg.name = "out";
    arg.typecode = (*output)->ga.typecode;
    arg.flags = GE_READ | GE_WRITE;
    /* NOTE(review): `p` comes from the enclosing function (presumably the
     * output dimension count) — confirm against the surrounding code.
     * GE_CONVERT_F16 lets the kernel handle float16 storage. */
    GpuElemwise* elemwise = GpuElemwise_new(c->ctx, "", "out = (out < 0 ? -out : out)", 1, &arg, p, GE_CONVERT_F16);
    if (!elemwise) {
      PyErr_SetString(PyExc_RuntimeError, "Unable to create GpuElemwise for output.");
      return 1;
    }
    void* args[1] = { (void*)&(*output)->ga };
    /* Local `err` intentionally shadows the enclosing function's `err`;
     * it only carries the GpuElemwise status. */
    int err = GpuElemwise_call(elemwise, args, 0);
    GpuElemwise_free(elemwise);
    if (err != GA_NO_ERROR) {
      PyErr_SetString(PyExc_RuntimeError, "Unable to call GpuElemwise on output.");
      return 1;
    };
  }
  break;
  default: break;
  }
}
if (indices != NULL) { if (indices != NULL) {
// All indices will be 0 since the size of the reduced area is 1. // All indices will be 0 since the size of the reduced area is 1.
err = GpuArray_memset(&(*indices)->ga, 0); err = GpuArray_memset(&(*indices)->ga, 0);
......
...@@ -1611,6 +1611,32 @@ def test_dnn_reduction_absmax(): ...@@ -1611,6 +1611,32 @@ def test_dnn_reduction_absmax():
utt.assert_allclose(np.max(np.abs(M_val), axis=axis), f(M_val)) utt.assert_allclose(np.max(np.abs(M_val), axis=axis), f(M_val))
def test_dnn_reduction_axis_size_one():
    """Check GpuDnnReduction when every reduced axis has size 1.

    Exercises the cuDNN workaround for size-1 reductions with four
    reduction flavours -- plain sum, sum of squares, sum of absolute
    values (NORM1) and max of absolute values (AMAX) -- for each
    position of the size-1 axis and for float16/32/64.
    """
    # GpuDnnReduction requires cuDNN >= 6000; skip otherwise.
    if not dnn.dnn_available(test_ctx_name) or dnn.version(raises=False) < 6000:
        raise SkipTest(dnn.dnn_available.msg)
    for dtype in ('float16', 'float32', 'float64'):
        for shape, axis in [[(1, 2, 3), 0], [(2, 1, 3), 1], [(2, 3, 1), 2]]:
            x = theano.tensor.tensor3(dtype=dtype)
            # `plain_sum` rather than `sum`: avoid shadowing the builtin.
            plain_sum = x.sum(axis=axis)
            sum_squares = (x ** 2).sum(axis=axis)
            sum_abs = abs(x).sum(axis=axis)
            absmax = abs(x).max(axis=axis)
            f1 = theano.function([x], plain_sum, mode=mode_with_gpu)
            f2 = theano.function([x], sum_squares, mode=mode_with_gpu)
            f3 = theano.function([x], sum_abs, mode=mode_with_gpu)
            f4 = theano.function([x], absmax, mode=mode_with_gpu)
            for fn in (f1, f2, f3, f4):
                # Every compiled graph must actually go through the
                # cuDNN reduction op, or the test would be vacuous.
                assert any(isinstance(node.op, dnn.GpuDnnReduction)
                           for node in fn.maker.fgraph.apply_nodes)
            # Strictly negative inputs so the abs-based reductions
            # (NORM1/AMAX) produce results that differ from the plain ones.
            xval = np.random.uniform(-10, -1, size=shape).astype(dtype)
            # Reducing a size-1 axis is just dropping it.
            xval_reshaped = xval.reshape(shape[:axis] + shape[(axis + 1):])
            test_val = abs(xval_reshaped)
            utt.assert_allclose(xval_reshaped, f1(xval))
            utt.assert_allclose(test_val ** 2, f2(xval))
            utt.assert_allclose(test_val, f3(xval))
            utt.assert_allclose(test_val, f4(xval))
def dnn_reduction_strides(shp, shuffle, slice): def dnn_reduction_strides(shp, shuffle, slice):
utt.fetch_seed() utt.fetch_seed()
inp = GpuArrayType('float32', (False,) * len(shp), inp = GpuArrayType('float32', (False,) * len(shp),
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论