Unverified commit 342d4d11, authored by Frédéric Bastien, committed via GitHub

Merge pull request #6517 from notoraptor/workaround-cudnn-redux-with-axes-size-one

Suggest a fix (and tests) for gpudnnreduction when axes to reduce have size 1.
...@@ -3,7 +3,8 @@ ...@@ -3,7 +3,8 @@
cudnnTensorDescriptor_t APPLY_SPECIFIC(input); cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output); cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
cudnnReduceTensorDescriptor_t APPLY_SPECIFIC(red); cudnnReduceTensorDescriptor_t APPLY_SPECIFIC(red);
/* Fallback kernel used when cuDNN cannot perform the reduction itself
 * (reduced axes all of size 1); lazily created, freed in cleanup. */
GpuElemwise* elemwise;
/* Argument descriptor for the single in/out buffer of `elemwise`. */
gpuelemwise_arg arg;
#section init_code_struct #section init_code_struct
...@@ -28,12 +29,18 @@ if ((APPLY_SPECIFIC(err) = cudnnCreateReduceTensorDescriptor(&APPLY_SPECIFIC(red ...@@ -28,12 +29,18 @@ if ((APPLY_SPECIFIC(err) = cudnnCreateReduceTensorDescriptor(&APPLY_SPECIFIC(red
FAIL; FAIL;
} }
elemwise = NULL;
#section cleanup_code_struct #section cleanup_code_struct
if (APPLY_SPECIFIC(input) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input)); } if (APPLY_SPECIFIC(input) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input)); }
if (APPLY_SPECIFIC(output) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output)); } if (APPLY_SPECIFIC(output) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output)); }
if (APPLY_SPECIFIC(red) != NULL) { cudnnDestroyReduceTensorDescriptor(APPLY_SPECIFIC(red)); } if (APPLY_SPECIFIC(red) != NULL) { cudnnDestroyReduceTensorDescriptor(APPLY_SPECIFIC(red)); }
/* Release the lazily-created GpuElemwise fallback, if it was ever built. */
if (elemwise) {
  GpuElemwise_free(elemwise);
  elemwise = NULL;
}
#section support_code_struct #section support_code_struct
...@@ -97,6 +104,49 @@ int APPLY_SPECIFIC(dnn_redux)(PyGpuArrayObject *input, ...@@ -97,6 +104,49 @@ int APPLY_SPECIFIC(dnn_redux)(PyGpuArrayObject *input,
PyErr_Format(PyExc_RuntimeError, "GpuArray_reshape_inplace: %s", GpuArray_error(&(*output)->ga, err)); PyErr_Format(PyExc_RuntimeError, "GpuArray_reshape_inplace: %s", GpuArray_error(&(*output)->ga, err));
return 1; return 1;
} }
/* Workaround for reductions whose reduced area has total size 1:
 * cuDNN (up to 7004) does not support this case, so at this point the
 * output simply holds a copy of the input values.  For most reduction
 * modes that copy is already the correct answer; the absolute-value
 * based modes still need abs() applied to the output, which is done
 * here with a lazily-built GpuElemwise kernel. */
if (rsz == 1) {
  /* We must reduce some dimensions which have all size 1.
   * cuDNN (up to 7004) does not support this case. Let's use GpuElemwise. */
  switch (params->red_op) {
  // Nothing to do for following cases: the copied values are the result.
  case CUDNN_REDUCE_TENSOR_ADD: break;
  case CUDNN_REDUCE_TENSOR_MUL: break;
  case CUDNN_REDUCE_TENSOR_MIN: break;
  case CUDNN_REDUCE_TENSOR_MAX: break;
  case CUDNN_REDUCE_TENSOR_AVG: break;
  /* Work to do for following cases.
     AMAX (maximum on absolute values) => apply abs(output)
     NORM1 (addition of absolute values) => apply abs(output)
     NORM2 (square root of sum of squares) => sqroot(output^2) => abs(output)
     So, we must apply abs(output) for all following cases.
  */
  case CUDNN_REDUCE_TENSOR_AMAX:
  case CUDNN_REDUCE_TENSOR_NORM1:
  case CUDNN_REDUCE_TENSOR_NORM2:
  {
    /* Build the abs() kernel once and reuse it on later calls. */
    if (elemwise == NULL) {
      arg.name = "out";
      arg.typecode = (*output)->ga.typecode;
      /* The buffer is both read and overwritten in place. */
      arg.flags = GE_READ | GE_WRITE;
      elemwise = GpuElemwise_new(c->ctx, "", "out = (out < 0 ? -out : out)", 1, &arg, p, GE_CONVERT_F16);
      if (!elemwise) {
        PyErr_SetString(PyExc_RuntimeError, "Unable to create GpuElemwise for output.");
        return 1;
      }
    }
    void* args[1] = { (void*)&(*output)->ga };
    int err = GpuElemwise_call(elemwise, args, 0);
    if (err != GA_NO_ERROR) {
      PyErr_SetString(PyExc_RuntimeError, "Unable to call GpuElemwise on output.");
      return 1;
    };
  }
  break;
  default: break;
  }
}
if (indices != NULL) { if (indices != NULL) {
// All indices will be 0 since the size of the reduced area is 1. // All indices will be 0 since the size of the reduced area is 1.
err = GpuArray_memset(&(*indices)->ga, 0); err = GpuArray_memset(&(*indices)->ga, 0);
......
...@@ -1258,9 +1258,8 @@ def local_gpua_careduce(op, context_name, inputs, outputs): ...@@ -1258,9 +1258,8 @@ def local_gpua_careduce(op, context_name, inputs, outputs):
greduce = op2( greduce = op2(
op.scalar_op, op.scalar_op,
axis=new_axis, reduce_mask=new_mask, axis=new_axis, reduce_mask=new_mask,
dtype=getattr(op, 'dtype', outputs[0].dtype), dtype=odtype,
acc_dtype=getattr(op, 'acc_dtype', None)) acc_dtype=adtype)
with inherit_stack_trace(outputs): with inherit_stack_trace(outputs):
reshaped_x = x.reshape(tensor.stack(new_in_shp)) reshaped_x = x.reshape(tensor.stack(new_in_shp))
gpu_reshaped_x = as_gpuarray_variable(reshaped_x, context_name) gpu_reshaped_x = as_gpuarray_variable(reshaped_x, context_name)
......
...@@ -1611,6 +1611,53 @@ def test_dnn_reduction_absmax(): ...@@ -1611,6 +1611,53 @@ def test_dnn_reduction_absmax():
utt.assert_allclose(np.max(np.abs(M_val), axis=axis), f(M_val)) utt.assert_allclose(np.max(np.abs(M_val), axis=axis), f(M_val))
def test_dnn_reduction_axis_size_one():
    """Check GPU reductions over axes whose length is 1.

    cuDNN cannot reduce over size-1 axes directly, so GpuDnnReduction
    uses a workaround; this test exercises add/norm1/norm2/absmax
    against the CPU results for several shapes, axes and dtypes.
    """
    if not dnn.dnn_available(test_ctx_name) or dnn.version(raises=False) < 6000:
        raise SkipTest(dnn.dnn_available.msg)
    cases = [[(1, 2, 3), 0],
             [(2, 1, 3), 1],
             [(2, 3, 1), 2],
             [(1, 5, 1), (0, 2)],
             [(4, 1, 6, 1), (1, 3)]]
    for dtype in ('float16', 'float32', 'float64'):
        for shape, axis in cases:
            x = theano.tensor.TensorType(
                dtype=dtype, broadcastable=[False] * len(shape))()
            # The four reduction flavours mapped onto cuDNN ops.
            total = x.sum(axis=axis)
            total_sq = (x**2).sum(axis=axis)
            total_abs = abs(x).sum(axis=axis)
            max_abs = abs(x).max(axis=axis)
            cpu_f = theano.function([x], [total, total_sq, total_abs, max_abs],
                                    mode=mode_without_gpu)
            f1 = theano.function([x], total, mode=mode_with_gpu)
            f2 = theano.function([x], total_sq, mode=mode_with_gpu)
            f3 = theano.function([x], total_abs, mode=mode_with_gpu)
            f4 = theano.function([x], max_abs, mode=mode_with_gpu)
            # Each GPU graph must actually contain the expected reduction op.
            for fn, red_op in ((f1, 'add'), (f2, 'norm2'),
                               (f3, 'norm1'), (f4, 'absmax')):
                assert any(isinstance(node.op, dnn.GpuDnnReduction) and
                           node.op.red_op == red_op
                           for node in fn.maker.fgraph.apply_nodes)
            # Strictly negative samples so abs() visibly matters.
            xval = np.random.uniform(-10, -1, size=shape).astype(dtype)
            # Drop the reduced (size-1) axes to get the expected shape.
            if isinstance(axis, int):
                squeezed = xval.reshape(shape[:axis] + shape[(axis + 1):])
            else:
                squeezed = xval.reshape(
                    [n for i, n in enumerate(shape) if i not in axis])
            abs_vals = abs(squeezed)
            val_sum, val_sum_squares, val_sum_abs, val_absmax = (
                f1(xval), f2(xval), f3(xval), f4(xval))
            cpu_sum, cpu_sum_squares, cpu_sum_abs, cpu_absmax = cpu_f(xval)
            # GPU results match CPU results ...
            utt.assert_allclose(cpu_sum, val_sum)
            utt.assert_allclose(cpu_sum_squares, val_sum_squares)
            utt.assert_allclose(cpu_sum_abs, val_sum_abs)
            utt.assert_allclose(cpu_absmax, val_absmax)
            # ... and both match values computed directly with NumPy:
            # a reduction over size-1 axes is the (squeezed) input itself.
            utt.assert_allclose(squeezed, val_sum)
            utt.assert_allclose(abs_vals**2, val_sum_squares)
            utt.assert_allclose(abs_vals, val_sum_abs)
            utt.assert_allclose(abs_vals, val_absmax)
def dnn_reduction_strides(shp, shuffle, slice): def dnn_reduction_strides(shp, shuffle, slice):
utt.fetch_seed() utt.fetch_seed()
inp = GpuArrayType('float32', (False,) * len(shp), inp = GpuArrayType('float32', (False,) * len(shp),
......
Markdown is supported
0%
You are about to add 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to comment