提交 02e127c1 authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #6045 from notoraptor/fix-5985-6020

Work around erroneous dnn_conv results with algo small and large batches
...@@ -163,12 +163,20 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -163,12 +163,20 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
} }
} }
/* These two algos are not supported for 3d conv */ /* Only these algos are supported for 3d conv with cuDNN >= V5.1. */
if (PyGpuArray_NDIM(input) == 5 && if (PyGpuArray_NDIM(input) == 5 &&
(algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM || !(algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM ||
algo == CUDNN_CONVOLUTION_FWD_ALGO_GEMM)) algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM ||
algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING))
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
// Algo `small` does not work for a batch size > 2^16, with cuDNN >= V5.1.
// Issue should be resolved for cuDNN > V6.0.
if (cudnnGetVersion() < 6100 &&
algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM &&
PyGpuArray_DIM(input, 0) > 65536)
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
// The FFT implementation does not support strides, 1x1 filters or inputs // The FFT implementation does not support strides, 1x1 filters or inputs
// with a spatial dimension larger than 1024. The tiled-FFT implementation // with a spatial dimension larger than 1024. The tiled-FFT implementation
// does not support strides. // does not support strides.
......
...@@ -1067,6 +1067,63 @@ def get_conv3d_test_cases(): ...@@ -1067,6 +1067,63 @@ def get_conv3d_test_cases():
return itt return itt
def run_conv_small_batched_vs_multicall(inputs_shape, filters_shape, batch_sub):
    # Regression helper for issue #5985 (used by the tests below):
    # https://github.com/Theano/Theano/issues/5985
    # The problem shows up with algorithm `small`
    # (CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM).
    #
    # One convolution is computed over the whole batch, and two more over
    # only the first and the last `batch_sub` inputs. The corresponding rows
    # of the full result must match the two partial results.
    algo = 'small'
    # Batch size above which algorithm `small` seemed to misbehave.
    checked_limit = 2**16
    batch_size = inputs_shape[0]
    tail_start = batch_size - batch_sub

    utt.seed_rng()
    inputs_val = np.random.random(inputs_shape).astype('float32')
    filters_val = np.random.random(filters_shape).astype('float32')
    # Shrink the values so that float rounding does not produce
    # very large absolute errors.
    inputs_val /= 10
    filters_val /= 10
    inputs = theano.shared(inputs_val)
    filters = theano.shared(filters_val)

    # 5-d inputs go through the 3d convolution, anything else through 2d.
    dnn_func = dnn.dnn_conv3d if len(inputs_shape) == 5 else dnn.dnn_conv

    conv = dnn_func(img=inputs, kerns=filters, algo=algo)
    # Only the head and the tail of the batch are recomputed separately,
    # to keep the execution time reasonable.
    sub_conv_top = dnn_func(img=inputs[:batch_sub], kerns=filters, algo=algo)
    sub_conv_bottom = dnn_func(img=inputs[tail_start:], kerns=filters, algo=algo)
    f = theano.function([], [conv, sub_conv_top, sub_conv_bottom], mode=mode_with_gpu)
    res_all, res_batch_top, res_batch_bottom = f()

    for i in range(batch_sub):
        # Check first outputs.
        utt.assert_allclose(res_batch_top[i], res_all[i])
        # Then check last outputs.
        p = tail_start + i
        # Past the limit, results seemed to be repeated across the whole
        # convolution output, which should not happen. Report it if the
        # output at `p` equals the output at `p` modulo the limit.
        if p >= checked_limit and np.allclose(res_all[p % checked_limit], res_all[p]):
            print('\nconv[%d] == conv[%d] == %s' % (p % checked_limit, p, res_all[p]))
        utt.assert_allclose(res_batch_bottom[i], res_all[p])
def test_batched_conv_small():
    # Generator test for 2d convolution with algorithm `small`.
    # Batch size 65536 is expected to be OK.
    # Batch size 65537 should fail on affected cuDNN versions (the original
    # note says cuDNN < V6020), but there's currently a workaround in
    # `dnn_fwd.c` for that case.
    filters_shape = (1, 2, 2, 2)
    batch_sub = 5
    for batch_size in (65536, 65537):
        inputs_shape = (batch_size, 2, 2, 2)
        yield (run_conv_small_batched_vs_multicall, inputs_shape, filters_shape, batch_sub)
def test_batched_conv3d_small():
    # Generator test for 3d convolution with algorithm `small`.
    # Batch size 65536 is expected to be OK.
    # Batch size 65537 should fail on affected cuDNN versions (the original
    # note says cuDNN < V6020), but there's currently a workaround in
    # `dnn_fwd.c` for that case.
    filters_shape = (1, 2, 2, 2, 2)
    batch_sub = 5
    for batch_size in (65536, 65537):
        inputs_shape = (batch_size, 2, 2, 2, 2)
        yield (run_conv_small_batched_vs_multicall, inputs_shape, filters_shape, batch_sub)
def test_conv3d_fwd(): def test_conv3d_fwd():
if not dnn.dnn_available(test_ctx_name): if not dnn.dnn_available(test_ctx_name):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论