testgroup / pytensor · Commits

Commit 342d4d11 (Unverified)
Authored Nov 21, 2017 by Frédéric Bastien; committed via GitHub on Nov 21, 2017

Merge pull request #6517 from notoraptor/workaround-cudnn-redux-with-axes-size-one

Suggest a fix (and tests) for GpuDnnReduction when the axes to reduce have size 1.

Parents: 7650d093, bc1b615b

Showing 3 changed files with 100 additions and 4 deletions (+100 -4)
theano/gpuarray/c_code/dnn_redux.c    +51 -1
theano/gpuarray/opt.py                 +2 -3
theano/gpuarray/tests/test_dnn.py     +47 -0
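For context on the case being worked around: a reduction whose reduced axes all have extent 1 leaves every value unchanged and only drops those axes, yet cuDNN (up to 7004, per the comment added in dnn_redux.c below) does not support this case for some reduction operators. A minimal NumPy sketch of the situation (illustrative only, not part of the commit):

    import numpy as np

    # Reducing over axes of size 1 does not combine any values;
    # the result is just the input with those axes dropped.
    x = np.random.uniform(-10, -1, size=(1, 5, 1)).astype('float32')
    reduced = x.sum(axis=(0, 2))          # shape (5,)
    assert np.allclose(reduced, x.reshape(5))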
theano/gpuarray/c_code/dnn_redux.c

@@ -3,7 +3,8 @@
cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
cudnnReduceTensorDescriptor_t APPLY_SPECIFIC(red);
GpuElemwise *elemwise;
gpuelemwise_arg arg;

#section init_code_struct
@@ -28,12 +29,18 @@ if ((APPLY_SPECIFIC(err) = cudnnCreateReduceTensorDescriptor(&APPLY_SPECIFIC(red
  FAIL;
}

elemwise = NULL;

#section cleanup_code_struct

if (APPLY_SPECIFIC(input) != NULL) {
  cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input));
}
if (APPLY_SPECIFIC(output) != NULL) {
  cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output));
}
if (APPLY_SPECIFIC(red) != NULL) {
  cudnnDestroyReduceTensorDescriptor(APPLY_SPECIFIC(red));
}
if (elemwise) {
  GpuElemwise_free(elemwise);
  elemwise = NULL;
}

#section support_code_struct
@@ -97,6 +104,49 @@ int APPLY_SPECIFIC(dnn_redux)(PyGpuArrayObject *input,
    PyErr_Format(PyExc_RuntimeError, "GpuArray_reshape_inplace: %s",
                 GpuArray_error(&(*output)->ga, err));
    return 1;
  }

  if (rsz == 1) {
    /* We must reduce some dimensions which have all size 1.
     * cuDNN (up to 7004) does not support this case. Let's use GpuElemwise. */
    switch (params->red_op) {
    // Nothing to do for following cases.
    case CUDNN_REDUCE_TENSOR_ADD:
      break;
    case CUDNN_REDUCE_TENSOR_MUL:
      break;
    case CUDNN_REDUCE_TENSOR_MIN:
      break;
    case CUDNN_REDUCE_TENSOR_MAX:
      break;
    case CUDNN_REDUCE_TENSOR_AVG:
      break;
    /* Work to do for following cases.
       AMAX (maximum on absolute values) => apply abs(output)
       NORM1 (addition of absolute values) => apply abs(output)
       NORM2 (square root of sum of squares) => sqroot(output^2) => abs(output)
       So, we must apply abs(output) for all following cases.
    */
    case CUDNN_REDUCE_TENSOR_AMAX:
    case CUDNN_REDUCE_TENSOR_NORM1:
    case CUDNN_REDUCE_TENSOR_NORM2: {
      if (elemwise == NULL) {
        arg.name = "out";
        arg.typecode = (*output)->ga.typecode;
        arg.flags = GE_READ | GE_WRITE;
        elemwise = GpuElemwise_new(c->ctx, "", "out = (out < 0 ? -out : out)",
                                   1, &arg, p, GE_CONVERT_F16);
        if (!elemwise) {
          PyErr_SetString(PyExc_RuntimeError,
                          "Unable to create GpuElemwise for output.");
          return 1;
        }
      }
      void *args[1] = {(void *)&(*output)->ga};
      int err = GpuElemwise_call(elemwise, args, 0);
      if (err != GA_NO_ERROR) {
        PyErr_SetString(PyExc_RuntimeError,
                        "Unable to call GpuElemwise on output.");
        return 1;
      }
    }
      break;
    default:
      break;
    }
  }

  if (indices != NULL) {
    // All indices will be 0 since the size of the reduced area is 1.
    err = GpuArray_memset(&(*indices)->ga, 0);
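The switch above relies on a small mathematical fact: once the reduced extent is 1, ADD, MUL, MIN, MAX and AVG are all the identity on each element, while AMAX, NORM1 and NORM2 each degenerate to the absolute value, which is why a single GpuElemwise kernel computing abs(out) is enough to patch up the result. A NumPy sketch of that equivalence (illustrative only, not part of the commit):

    import numpy as np

    x = np.random.uniform(-10, -1, size=(4, 1)).astype('float32')

    # With a reduced extent of 1, the abs-based reductions all equal abs(x) ...
    assert np.allclose(np.max(np.abs(x), axis=1), np.abs(x[:, 0]))        # AMAX
    assert np.allclose(np.sum(np.abs(x), axis=1), np.abs(x[:, 0]))        # NORM1
    assert np.allclose(np.sqrt(np.sum(x ** 2, axis=1)), np.abs(x[:, 0]))  # NORM2

    # ... while ADD (and MUL, MIN, MAX, AVG) is simply the identity.
    assert np.allclose(np.sum(x, axis=1), x[:, 0])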
theano/gpuarray/opt.py

@@ -1258,9 +1258,8 @@ def local_gpua_careduce(op, context_name, inputs, outputs):
            greduce = op2(
                op.scalar_op, axis=new_axis, reduce_mask=new_mask,
                dtype=getattr(op, 'dtype', outputs[0].dtype),
                acc_dtype=getattr(op, 'acc_dtype', None))
                dtype=odtype,
                acc_dtype=adtype)
            with inherit_stack_trace(outputs):
                reshaped_x = x.reshape(tensor.stack(new_in_shp))
                gpu_reshaped_x = as_gpuarray_variable(reshaped_x, context_name)
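The opt.py hunk also shows the getattr-with-default pattern used to read dtype and acc_dtype from the reduction op: when the op does not define the attribute, the rewrite falls back to the output's dtype (or to None). A tiny sketch of that pattern (the FakeOp class is a hypothetical stand-in, not part of the commit):

    class FakeOp(object):
        # Deliberately defines neither 'dtype' nor 'acc_dtype'.
        pass

    op = FakeOp()
    output_dtype = 'float32'

    # getattr with a default: use the op's attribute when present,
    # otherwise fall back, as local_gpua_careduce does above.
    dtype = getattr(op, 'dtype', output_dtype)
    acc_dtype = getattr(op, 'acc_dtype', None)
    assert dtype == 'float32' and acc_dtype is None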
theano/gpuarray/tests/test_dnn.py

@@ -1611,6 +1611,53 @@ def test_dnn_reduction_absmax():
    utt.assert_allclose(np.max(np.abs(M_val), axis=axis), f(M_val))


def test_dnn_reduction_axis_size_one():
    if not dnn.dnn_available(test_ctx_name) or dnn.version(raises=False) < 6000:
        raise SkipTest(dnn.dnn_available.msg)
    for dtype in ('float16', 'float32', 'float64'):
        for shape, axis in [[(1, 2, 3), 0],
                            [(2, 1, 3), 1],
                            [(2, 3, 1), 2],
                            [(1, 5, 1), (0, 2)],
                            [(4, 1, 6, 1), (1, 3)]]:
            x = theano.tensor.TensorType(dtype=dtype,
                                         broadcastable=[False] * len(shape))()
            sum = x.sum(axis=axis)
            sum_squares = (x ** 2).sum(axis=axis)
            sum_abs = abs(x).sum(axis=axis)
            absmax = abs(x).max(axis=axis)
            cpu_f = theano.function([x], [sum, sum_squares, sum_abs, absmax],
                                    mode=mode_without_gpu)
            f1 = theano.function([x], sum, mode=mode_with_gpu)
            f2 = theano.function([x], sum_squares, mode=mode_with_gpu)
            f3 = theano.function([x], sum_abs, mode=mode_with_gpu)
            f4 = theano.function([x], absmax, mode=mode_with_gpu)
            for fn, red_op in ((f1, 'add'), (f2, 'norm2'),
                               (f3, 'norm1'), (f4, 'absmax')):
                assert any(isinstance(node.op, dnn.GpuDnnReduction) and
                           node.op.red_op == red_op
                           for node in fn.maker.fgraph.apply_nodes)
            xval = np.random.uniform(-10, -1, size=shape).astype(dtype)
            if isinstance(axis, int):
                xval_reshaped = xval.reshape(shape[:axis] + shape[(axis + 1):])
            else:
                xval_reshaped = xval.reshape([n for i, n in enumerate(shape)
                                              if i not in axis])
            test_val = abs(xval_reshaped)
            val_sum, val_sum_squares, val_sum_abs, val_absmax = f1(xval), f2(xval), f3(xval), f4(xval)
            cpu_val_sum, cpu_val_sum_squares, cpu_val_sum_abs, cpu_val_absmax = cpu_f(xval)
            utt.assert_allclose(cpu_val_sum, val_sum)
            utt.assert_allclose(cpu_val_sum_squares, val_sum_squares)
            utt.assert_allclose(cpu_val_sum_abs, val_sum_abs)
            utt.assert_allclose(cpu_val_absmax, val_absmax)
            utt.assert_allclose(xval_reshaped, val_sum)
            utt.assert_allclose(test_val ** 2, val_sum_squares)
            utt.assert_allclose(test_val, val_sum_abs)
            utt.assert_allclose(test_val, val_absmax)


def dnn_reduction_strides(shp, shuffle, slice):
    utt.fetch_seed()
    inp = GpuArrayType('float32', (False,) * len(shp),
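The final block of assertions in the new test encodes the same size-1 property that the GPU path must preserve: with every reduced axis of extent 1, each reduction is just a reshape of the input, possibly wrapped in abs or a square. A standalone NumPy sketch of those reference values (illustrative only, mirroring one (shape, axis) case from the test):

    import numpy as np

    shape, axis = (1, 5, 1), (0, 2)
    xval = np.random.uniform(-10, -1, size=shape).astype('float32')

    # With size-1 reduction axes, reducing is equivalent to reshaping away
    # those axes (plus abs/square for the abs-based reductions).
    xval_reshaped = xval.reshape([n for i, n in enumerate(shape) if i not in axis])
    test_val = abs(xval_reshaped)

    assert np.allclose(xval.sum(axis=axis), xval_reshaped)          # sum
    assert np.allclose((xval ** 2).sum(axis=axis), test_val ** 2)   # sum of squares
    assert np.allclose(abs(xval).sum(axis=axis), test_val)          # sum of abs
    assert np.allclose(abs(xval).max(axis=axis), test_val)          # abs-max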