Commit c2c28793, authored by notoraptor

Add optimizations for SUM(ABS(X)).

Parent commit: 53bd748f
...@@ -3755,13 +3755,18 @@ def local_dnn_reduction(node): ...@@ -3755,13 +3755,18 @@ def local_dnn_reduction(node):
scal = node.op.scalar_op.name scal = node.op.scalar_op.name
post = _identity post = _identity
if (isinstance(node.op.pre_scalar_op, theano.scalar.basic.Sqr) and if node.op.pre_scalar_op is not None:
isinstance(node.op.scalar_op, theano.scalar.basic.Add)): # Might want to handle absmax, avg, and other cases for (norm1, norm2) here
scal = 'norm2' if isinstance(node.op.scalar_op, theano.scalar.basic.Add):
post = _square if isinstance(node.op.pre_scalar_op, theano.scalar.basic.Sqr):
elif node.op.pre_scalar_op is not None: scal = 'norm2'
# Might want to handle absmax, avg, norm1, and other cases of norm2 here post = _square
return elif isinstance(node.op.pre_scalar_op, theano.scalar.basic.Abs):
scal = 'norm1'
else:
return
else:
return
if not cudnn.cudnnReduceTensorOp_t.has_alias(scal): if not cudnn.cudnnReduceTensorOp_t.has_alias(scal):
return return
......
...@@ -2396,6 +2396,9 @@ def local_gpu_max_pool_rop(op, ctx_name, inputs, outputs): ...@@ -2396,6 +2396,9 @@ def local_gpu_max_pool_rop(op, ctx_name, inputs, outputs):
def local_gpu_elemwise_careduce(node): def local_gpu_elemwise_careduce(node):
""" """
Merge some GpuCAReduceCuda and GPUElemwise. Merge some GpuCAReduceCuda and GPUElemwise.
Currently merged:
- SUM(X^2)
- SUM(ABS(X))
""" """
if (isinstance(node.op, GpuCAReduceCuda) and if (isinstance(node.op, GpuCAReduceCuda) and
...@@ -2406,10 +2409,11 @@ def local_gpu_elemwise_careduce(node): ...@@ -2406,10 +2409,11 @@ def local_gpu_elemwise_careduce(node):
# automatically add more case, as some like trigonometic # automatically add more case, as some like trigonometic
# operation with some reduction pattern will probably results # operation with some reduction pattern will probably results
# in slow down. # in slow down.
isinstance(node.inputs[0].owner.op.scalar_op, scalar.basic.Sqr)): isinstance(node.inputs[0].owner.op.scalar_op, (scalar.basic.Sqr,
scalar.basic.Abs))):
inp = node.inputs[0].owner.inputs[0] inp = node.inputs[0].owner.inputs[0]
props = node.op._props_dict() props = node.op._props_dict()
props["pre_scalar_op"] = scalar.basic.sqr props["pre_scalar_op"] = node.inputs[0].owner.op.scalar_op
with inherit_stack_trace(node.outputs): with inherit_stack_trace(node.outputs):
out = GpuCAReduceCuda(**props)(inp) out = GpuCAReduceCuda(**props)(inp)
return [out] return [out]
......
...@@ -1583,6 +1583,20 @@ def test_dnn_reduction_sum_squares(): ...@@ -1583,6 +1583,20 @@ def test_dnn_reduction_sum_squares():
utt.assert_allclose((M_val**2).sum(axis=axis), f(M_val)) utt.assert_allclose((M_val**2).sum(axis=axis), f(M_val))
def test_dnn_reduction_sum_abs():
    """Check that sum(|x|) is lowered to a single cuDNN 'norm1' reduction.

    Covers the full reduction and both axis-wise reductions of a matrix,
    and verifies numerical agreement with NumPy.
    """
    # The norm1 reduce op requires cuDNN v6 or later.
    if not dnn.dnn_available(test_ctx_name) or dnn.version(raises=False) < 6000:
        raise SkipTest(dnn.dnn_available.msg)
    m = T.matrix()
    for ax in (None, 0, 1):
        fn = theano.function([m], abs(m).sum(axis=ax), mode=mode_with_gpu)
        # The compiled graph must contain a GpuDnnReduction node using norm1.
        norm1_nodes = [
            node for node in fn.maker.fgraph.apply_nodes
            if isinstance(node.op, dnn.GpuDnnReduction)
            and node.op.red_op == 'norm1'
        ]
        assert norm1_nodes
        data = np.random.random((4, 5)).astype(theano.config.floatX)
        utt.assert_allclose(np.abs(data).sum(axis=ax), fn(data))
def dnn_reduction_strides(shp, shuffle, slice): def dnn_reduction_strides(shp, shuffle, slice):
utt.fetch_seed() utt.fetch_seed()
inp = GpuArrayType('float32', (False,) * len(shp), inp = GpuArrayType('float32', (False,) * len(shp),
......
...@@ -362,6 +362,7 @@ def test_pdbbreakpoint_op(): ...@@ -362,6 +362,7 @@ def test_pdbbreakpoint_op():
def test_local_gpu_elemwise_careduce(): def test_local_gpu_elemwise_careduce():
mode_with_gpu_no_cudnn = mode_with_gpu.excluding('cudnn') mode_with_gpu_no_cudnn = mode_with_gpu.excluding('cudnn')
x = theano.tensor.matrix() x = theano.tensor.matrix()
o = (x * x).sum() o = (x * x).sum()
f = theano.function([x], o, mode=mode_with_gpu_no_cudnn) f = theano.function([x], o, mode=mode_with_gpu_no_cudnn)
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
...@@ -381,6 +382,26 @@ def test_local_gpu_elemwise_careduce(): ...@@ -381,6 +382,26 @@ def test_local_gpu_elemwise_careduce():
assert _check_stack_trace(f) assert _check_stack_trace(f)
utt.assert_allclose(f(data), (data * data).sum(axis=1)) utt.assert_allclose(f(data), (data * data).sum(axis=1))
#
o = abs(x).sum()
f = theano.function([x], o, mode=mode_with_gpu_no_cudnn)
topo = f.maker.fgraph.toposort()
assert len(topo) == 3
assert isinstance(topo[1].op, GpuCAReduceCuda)
assert topo[1].op.pre_scalar_op == theano.scalar.abs_
assert _check_stack_trace(f)
data = np.random.rand(3, 4).astype(theano.config.floatX)
utt.assert_allclose(f(data), np.abs(data).sum())
o = abs(x).sum(axis=1)
f = theano.function([x], o, mode=mode_with_gpu_no_cudnn)
topo = f.maker.fgraph.toposort()
assert len(topo) == 3
assert isinstance(topo[1].op, GpuCAReduceCuda)
assert topo[1].op.pre_scalar_op == theano.scalar.abs_
assert _check_stack_trace(f)
utt.assert_allclose(f(data), np.abs(data).sum(axis=1))
def test_local_lift_dot22scalar(): def test_local_lift_dot22scalar():
x = tensor.matrix() x = tensor.matrix()
......
Markdown is supported
0%
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Register or sign in to comment