提交 d57d2f4c authored 作者: notoraptor's avatar notoraptor

Suggest a fix (and tests) for GpuDnnReduction when the axes to reduce all have size 1.

上级 d395439a
...@@ -97,6 +97,49 @@ int APPLY_SPECIFIC(dnn_redux)(PyGpuArrayObject *input, ...@@ -97,6 +97,49 @@ int APPLY_SPECIFIC(dnn_redux)(PyGpuArrayObject *input,
PyErr_Format(PyExc_RuntimeError, "GpuArray_reshape_inplace: %s", GpuArray_error(&(*output)->ga, err)); PyErr_Format(PyExc_RuntimeError, "GpuArray_reshape_inplace: %s", GpuArray_error(&(*output)->ga, err));
return 1; return 1;
} }
if (rsz == 1 && cudnnGetVersion() <= 7004) {
  /* All axes to reduce have size 1 (rsz == 1), so the reduction degenerates
   * to a copy of the input values. cuDNN versions up to 7004 do not support
   * this case, so for the ops whose result differs from a plain copy we
   * post-process the already-copied output with a GpuElemwise kernel. */
  switch (params->red_op) {
  /* Nothing to do for the following cases: sum, product, min, max and
   * average of a single element are all the element itself, which the
   * output already contains. */
  case CUDNN_REDUCE_TENSOR_ADD: break;
  case CUDNN_REDUCE_TENSOR_MUL: break;
  case CUDNN_REDUCE_TENSOR_MIN: break;
  case CUDNN_REDUCE_TENSOR_MAX: break;
  case CUDNN_REDUCE_TENSOR_AVG: break;
  /* Work to do for the following cases.
     AMAX  (maximum of absolute values)     => apply abs(output)
     NORM1 (sum of absolute values)         => apply abs(output)
     NORM2 (square root of sum of squares)  => sqrt(output^2) => abs(output)
     So we must apply abs(output) for all three cases; intentional
     fallthrough into the shared abs kernel below. */
  case CUDNN_REDUCE_TENSOR_AMAX:
  case CUDNN_REDUCE_TENSOR_NORM1:
  case CUDNN_REDUCE_TENSOR_NORM2:
  {
    /* Build a one-argument in-place elementwise kernel: out = |out|.
     * The argument both reads and writes the output buffer. */
    gpuelemwise_arg arg;
    arg.name = "out";
    arg.typecode = (*output)->ga.typecode;
    arg.flags = GE_READ | GE_WRITE;
    /* NOTE(review): `p` comes from the enclosing function (presumably the
     * output dimension count) — confirm against the surrounding code.
     * GE_CONVERT_F16 lets the kernel handle float16 storage. */
    GpuElemwise* elemwise = GpuElemwise_new(c->ctx, "", "out = (out < 0 ? -out : out)", 1, &arg, p, GE_CONVERT_F16);
    if (!elemwise) {
      PyErr_SetString(PyExc_RuntimeError, "Unable to create GpuElemwise for output.");
      return 1;
    }
    void* args[1] = { (void*)&(*output)->ga };
    /* Local `err` intentionally shadows the enclosing function's `err`;
     * it only carries the GpuElemwise status. */
    int err = GpuElemwise_call(elemwise, args, 0);
    GpuElemwise_free(elemwise);
    if (err != GA_NO_ERROR) {
      PyErr_SetString(PyExc_RuntimeError, "Unable to call GpuElemwise on output.");
      return 1;
    };
  }
  break;
  default: break;
  }
}
if (indices != NULL) { if (indices != NULL) {
// All indices will be 0 since the size of the reduced area is 1. // All indices will be 0 since the size of the reduced area is 1.
err = GpuArray_memset(&(*indices)->ga, 0); err = GpuArray_memset(&(*indices)->ga, 0);
......
...@@ -1611,6 +1611,32 @@ def test_dnn_reduction_absmax(): ...@@ -1611,6 +1611,32 @@ def test_dnn_reduction_absmax():
utt.assert_allclose(np.max(np.abs(M_val), axis=axis), f(M_val)) utt.assert_allclose(np.max(np.abs(M_val), axis=axis), f(M_val))
def test_dnn_reduction_axis_size_one():
    """Check GpuDnnReduction when every reduced axis has size 1.

    Exercises the cuDNN workaround for size-1 reductions with four
    reduction flavours -- plain sum, sum of squares, sum of absolute
    values (NORM1) and max of absolute values (AMAX) -- for each
    position of the size-1 axis and for float16/32/64.
    """
    # GpuDnnReduction requires cuDNN >= 6000; skip otherwise.
    if not dnn.dnn_available(test_ctx_name) or dnn.version(raises=False) < 6000:
        raise SkipTest(dnn.dnn_available.msg)
    for dtype in ('float16', 'float32', 'float64'):
        for shape, axis in [[(1, 2, 3), 0], [(2, 1, 3), 1], [(2, 3, 1), 2]]:
            x = theano.tensor.tensor3(dtype=dtype)
            # `plain_sum` rather than `sum`: avoid shadowing the builtin.
            plain_sum = x.sum(axis=axis)
            sum_squares = (x ** 2).sum(axis=axis)
            sum_abs = abs(x).sum(axis=axis)
            absmax = abs(x).max(axis=axis)
            f1 = theano.function([x], plain_sum, mode=mode_with_gpu)
            f2 = theano.function([x], sum_squares, mode=mode_with_gpu)
            f3 = theano.function([x], sum_abs, mode=mode_with_gpu)
            f4 = theano.function([x], absmax, mode=mode_with_gpu)
            for fn in (f1, f2, f3, f4):
                # Every compiled graph must actually go through the
                # cuDNN reduction op, or the test would be vacuous.
                assert any(isinstance(node.op, dnn.GpuDnnReduction)
                           for node in fn.maker.fgraph.apply_nodes)
            # Strictly negative inputs so the abs-based reductions
            # (NORM1/AMAX) produce results that differ from the plain ones.
            xval = np.random.uniform(-10, -1, size=shape).astype(dtype)
            # Reducing a size-1 axis is just dropping it.
            xval_reshaped = xval.reshape(shape[:axis] + shape[(axis + 1):])
            test_val = abs(xval_reshaped)
            utt.assert_allclose(xval_reshaped, f1(xval))
            utt.assert_allclose(test_val ** 2, f2(xval))
            utt.assert_allclose(test_val, f3(xval))
            utt.assert_allclose(test_val, f4(xval))
def dnn_reduction_strides(shp, shuffle, slice): def dnn_reduction_strides(shp, shuffle, slice):
utt.fetch_seed() utt.fetch_seed()
inp = GpuArrayType('float32', (False,) * len(shp), inp = GpuArrayType('float32', (False,) * len(shp),
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论