Merge pull request #6045 from notoraptor/fix-5985-6020

Work around erroneous dnn_conv results with algo small an large batches

Merge pull request #6045 from notoraptor/fix-5985-6020
02e127c1 · Frédéric Bastien · GitHub · d9263f62 · 1cfc2491 · 02e127c1
--- a/theano/gpuarray/dnn_fwd.c
+++ b/theano/gpuarray/dnn_fwd.c
@@ -163,12 +163,20 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
    }
  }
-  /* These two algos are not supported for 3d conv */
+  /* Only these algos are supported for 3d conv with cuDNN >= V5.1. */
  if (PyGpuArray_NDIM(input) == 5 &&
-      (algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM ||
+      !(algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM ||
-       algo == CUDNN_CONVOLUTION_FWD_ALGO_GEMM))
+        algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM ||
+        algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING))
    algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
+  // Algo `small` does not work for a batch size > 2^16, with cuDNN >= V5.1.
+  // Issue should be resolved for cuDNN > V6.0.
+  if (cudnnGetVersion() < 6100 &&
+      algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM &&
+      PyGpuArray_DIM(input, 0) > 65536)
+      algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
  // The FFT implementation does not support strides, 1x1 filters or inputs
  // with a spatial dimension larger than 1024. The tiled-FFT implementation
  // does not support strides.

--- a/theano/gpuarray/tests/test_dnn.py
+++ b/theano/gpuarray/tests/test_dnn.py
@@ -1067,6 +1067,63 @@ def get_conv3d_test_cases():
    return itt
+def run_conv_small_batched_vs_multicall(inputs_shape, filters_shape, batch_sub):
+    # Function to check issue #5985 (see tests below): https://github.com/Theano/Theano/issues/5985
+    # Error occurs with algorithm `small` (CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM)
+    algo = 'small'
+    batch_size = inputs_shape[0]
+    utt.seed_rng()
+    inputs_val = np.random.random(inputs_shape).astype('float32')
+    filters_val = np.random.random(filters_shape).astype('float32')
+    # Scale down the input values to prevent very large absolute errors
+    # due to float rounding
+    inputs_val /= 10
+    filters_val /= 10
+    inputs = theano.shared(inputs_val)
+    filters = theano.shared(filters_val)
+    if len(inputs_shape) == 5:
+        dnn_func = dnn.dnn_conv3d
+    else:
+        dnn_func = dnn.dnn_conv
+    conv = dnn_func(img=inputs, kerns=filters, algo=algo)
+    # Just compute first and last outputs, to reduce execution time.
+    sub_conv_top = dnn_func(img=inputs[:batch_sub], kerns=filters, algo=algo)
+    sub_conv_bottom = dnn_func(img=inputs[(batch_size - batch_sub):], kerns=filters, algo=algo)
+    f = theano.function([], [conv, sub_conv_top, sub_conv_bottom], mode=mode_with_gpu)
+    res_all, res_batch_top, res_batch_bottom = f()
+    for i in range(batch_sub):
+        # Check first ouputs.
+        utt.assert_allclose(res_batch_top[i], res_all[i])
+        # Then check last outputs.
+        p = batch_size - batch_sub + i
+        # It seems there is a limit batch size of 65536  with algorithm `small`.
+        checked_limit = 2**16
+        if p >= checked_limit:
+            # It seems results are repeated in the entire conv.
+            # It should not happen.
+            if np.allclose(res_all[p % checked_limit], res_all[p]):
+                print('\nconv[%d] == conv[%d] == %s' % (p % checked_limit, p, res_all[p]))
+        utt.assert_allclose(res_batch_bottom[i], res_all[p])
+def test_batched_conv_small():
+    # OK
+    yield (run_conv_small_batched_vs_multicall, (65536, 2, 2, 2), (1, 2, 2, 2), 5)
+    # Should fail with cuDNN < V6020, but there's currently a workaround in `dnn_fwd.c` for that case.
+    yield (run_conv_small_batched_vs_multicall, (65537, 2, 2, 2), (1, 2, 2, 2), 5)
+def test_batched_conv3d_small():
+    # OK
+    yield (run_conv_small_batched_vs_multicall, (65536, 2, 2, 2, 2), (1, 2, 2, 2, 2), 5)
+    # Should fail with cuDNN < V6020, but there's currently a workaround in `dnn_fwd.c` for that case.
+    yield (run_conv_small_batched_vs_multicall, (65537, 2, 2, 2, 2), (1, 2, 2, 2, 2), 5)
 def test_conv3d_fwd():
    if not dnn.dnn_available(test_ctx_name):