Commit 184216ae authored by abergeron, committed by GitHub

Merge pull request #6496 from notoraptor/optimize-sum-squares-to-cudnn-2

Optimize SUM(x^2), SUM(ABS(X)) and MAX(ABS(X)) to cuDNN reduction.
...@@ -2313,7 +2313,7 @@ class _RNNSplitParams(DnnBase): ...@@ -2313,7 +2313,7 @@ class _RNNSplitParams(DnnBase):
assert(dims[2] == 1); assert(dims[2] == 1);
assert(dims[1] == 1); assert(dims[1] == 1);
%(b)s = pygpu_view(%(w)s, Py_None); %(b)s = pygpu_view(%(w)s, Py_None);
%(b)s->ga.offset = off; %(b)s->ga.offset += off;
%(b)s->ga.dimensions[0] = dims[0]; %(b)s->ga.dimensions[0] = dims[0];
GpuArray_fix_flags(&%(b)s->ga); GpuArray_fix_flags(&%(b)s->ga);
bshp = dims[0]; bshp = dims[0];
...@@ -2343,7 +2343,7 @@ class _RNNSplitParams(DnnBase): ...@@ -2343,7 +2343,7 @@ class _RNNSplitParams(DnnBase):
assert(dims[2] == 1); assert(dims[2] == 1);
// We assume that the typecode matches // We assume that the typecode matches
%(m)s = pygpu_reshape(%(w)s, 2, nshp, GA_F_ORDER, 1, -1); %(m)s = pygpu_reshape(%(w)s, 2, nshp, GA_F_ORDER, 1, -1);
%(m)s->ga.offset = off; %(m)s->ga.offset += off;
assert(dims[0] %% bshp == 0); assert(dims[0] %% bshp == 0);
%(m)s->ga.dimensions[0] = dims[0] / bshp; %(m)s->ga.dimensions[0] = dims[0] / bshp;
%(m)s->ga.dimensions[1] = bshp; %(m)s->ga.dimensions[1] = bshp;
...@@ -2362,7 +2362,7 @@ class _RNNSplitParams(DnnBase): ...@@ -2362,7 +2362,7 @@ class _RNNSplitParams(DnnBase):
return code return code
def c_code_cache_version(self): def c_code_cache_version(self):
return (3, version()) return (4, version())
def _split_rnn_params(w, desc, layer, input_size, dtype, rnn_mode): def _split_rnn_params(w, desc, layer, input_size, dtype, rnn_mode):
...@@ -3746,19 +3746,41 @@ def local_dnn_reduction(node): ...@@ -3746,19 +3746,41 @@ def local_dnn_reduction(node):
node.op.acc_dtype == 'float64'): node.op.acc_dtype == 'float64'):
return return
def _identity(a):
return a
def _square(a):
return GpuElemwise(theano.scalar.basic.sqr)(a)
scal = node.op.scalar_op.name
post = _identity
if node.op.pre_scalar_op is not None: if node.op.pre_scalar_op is not None:
# Might want to handle absmax, avg, norm1, norm2 here # Might want to handle absmax, avg, and other cases for (norm1, norm2) here
return if isinstance(node.op.scalar_op, theano.scalar.basic.Add):
if isinstance(node.op.pre_scalar_op, theano.scalar.basic.Sqr):
scal = 'norm2'
post = _square
elif isinstance(node.op.pre_scalar_op, theano.scalar.basic.Abs):
scal = 'norm1'
else:
return
elif (isinstance(node.op.scalar_op, theano.scalar.basic.Maximum) and
isinstance(node.op.pre_scalar_op, theano.scalar.basic.Abs)):
scal = 'absmax'
else:
return
if not cudnn.cudnnReduceTensorOp_t.has_alias(node.op.scalar_op.name): if not cudnn.cudnnReduceTensorOp_t.has_alias(scal):
return return
with inherit_stack_trace(node.outputs): with inherit_stack_trace(node.outputs):
return (GpuDnnReduction(node.op.scalar_op.name, ret = GpuDnnReduction(scal,
node.op.axis, node.op.axis,
node.op.acc_dtype, node.op.acc_dtype,
node.op.dtype, node.op.dtype,
False)(node.inputs[0]),) False)(node.inputs[0])
return [post(ret)]
@register_opt('cudnn') @register_opt('cudnn')
......
...@@ -1207,7 +1207,7 @@ def local_gpua_careduce(op, context_name, inputs, outputs): ...@@ -1207,7 +1207,7 @@ def local_gpua_careduce(op, context_name, inputs, outputs):
return False return False
x, = inputs x, = inputs
idtype = x.dtype idtype = x.dtype
adtype = getattr(op, 'acc_dtype', None) adtype = getattr(op, 'acc_dtype', idtype)
odtype = getattr(op, 'dtype', outputs[0].dtype) odtype = getattr(op, 'dtype', outputs[0].dtype)
# Force accumulator to float32 for float32 inputs since tree # Force accumulator to float32 for float32 inputs since tree
...@@ -2396,6 +2396,9 @@ def local_gpu_max_pool_rop(op, ctx_name, inputs, outputs): ...@@ -2396,6 +2396,9 @@ def local_gpu_max_pool_rop(op, ctx_name, inputs, outputs):
def local_gpu_elemwise_careduce(node): def local_gpu_elemwise_careduce(node):
""" """
Merge some GpuCAReduceCuda and GPUElemwise. Merge some GpuCAReduceCuda and GPUElemwise.
Currently merged:
- SUM(X^2)
- SUM(ABS(X))
""" """
if (isinstance(node.op, GpuCAReduceCuda) and if (isinstance(node.op, GpuCAReduceCuda) and
...@@ -2406,10 +2409,11 @@ def local_gpu_elemwise_careduce(node): ...@@ -2406,10 +2409,11 @@ def local_gpu_elemwise_careduce(node):
# automatically add more case, as some like trigonometic # automatically add more case, as some like trigonometic
# operation with some reduction pattern will probably results # operation with some reduction pattern will probably results
# in slow down. # in slow down.
isinstance(node.inputs[0].owner.op.scalar_op, scalar.basic.Sqr)): isinstance(node.inputs[0].owner.op.scalar_op, (scalar.basic.Sqr,
scalar.basic.Abs))):
inp = node.inputs[0].owner.inputs[0] inp = node.inputs[0].owner.inputs[0]
props = node.op._props_dict() props = node.op._props_dict()
props["pre_scalar_op"] = scalar.basic.sqr props["pre_scalar_op"] = node.inputs[0].owner.op.scalar_op
with inherit_stack_trace(node.outputs): with inherit_stack_trace(node.outputs):
out = GpuCAReduceCuda(**props)(inp) out = GpuCAReduceCuda(**props)(inp)
return [out] return [out]
......
...@@ -1569,6 +1569,48 @@ def test_dnn_reduction_opt(): ...@@ -1569,6 +1569,48 @@ def test_dnn_reduction_opt():
yield dnn_reduction, 2, idtype, adtype, odtype yield dnn_reduction, 2, idtype, adtype, odtype
def test_dnn_reduction_sum_squares():
    """Check that sum(x**2) is rewritten into a cuDNN norm2 reduction."""
    # The norm2 tensor reduction requires cuDNN v6 or later.
    if not dnn.dnn_available(test_ctx_name) or dnn.version(raises=False) < 6000:
        raise SkipTest(dnn.dnn_available.msg)
    m = T.matrix()
    for axis in (None, 0, 1):
        fn = theano.function([m], (m ** 2).sum(axis=axis), mode=mode_with_gpu)
        # The optimized graph must contain a GpuDnnReduction with red_op 'norm2'.
        nodes = fn.maker.fgraph.apply_nodes
        assert any(isinstance(n.op, dnn.GpuDnnReduction) and
                   n.op.red_op == 'norm2' for n in nodes)
        m_val = np.random.random((4, 5)).astype(theano.config.floatX)
        utt.assert_allclose((m_val ** 2).sum(axis=axis), fn(m_val))
def test_dnn_reduction_sum_abs():
    """Check that sum(abs(x)) is rewritten into a cuDNN norm1 reduction.

    Fix: the test data was previously drawn from [0, 1), so ``abs()`` was a
    no-op and the numeric check could not tell a norm1 reduction apart from
    a plain sum.  The data is now shifted to include negative values.
    """
    # The norm1 tensor reduction requires cuDNN v6 or later.
    if not dnn.dnn_available(test_ctx_name) or dnn.version(raises=False) < 6000:
        raise SkipTest(dnn.dnn_available.msg)
    M = T.matrix()
    for axis in (None, 0, 1):
        out = abs(M).sum(axis=axis)
        f = theano.function([M], out, mode=mode_with_gpu)
        assert any(isinstance(node.op, dnn.GpuDnnReduction) and
                   node.op.red_op == 'norm1'
                   for node in f.maker.fgraph.apply_nodes)
        # Draw from [-0.5, 0.5) so that abs() actually changes values.
        M_val = (np.random.random((4, 5)) - 0.5).astype(theano.config.floatX)
        utt.assert_allclose(np.abs(M_val).sum(axis=axis), f(M_val))
def test_dnn_reduction_absmax():
    """Check that max(abs(x)) is rewritten into a cuDNN absmax reduction.

    Fix: the test data was previously drawn from [0, 1), so ``abs()`` was a
    no-op and the numeric check could not tell an absmax reduction apart
    from a plain max.  The data is now shifted to include negative values.
    """
    # The absmax tensor reduction requires cuDNN v6 or later.
    if not dnn.dnn_available(test_ctx_name) or dnn.version(raises=False) < 6000:
        raise SkipTest(dnn.dnn_available.msg)
    M = T.matrix()
    for axis in (None, 0, 1):
        out = abs(M).max(axis=axis)
        f = theano.function([M], out, mode=mode_with_gpu)
        assert any(isinstance(node.op, dnn.GpuDnnReduction) and
                   node.op.red_op == 'absmax'
                   for node in f.maker.fgraph.apply_nodes)
        # Draw from [-0.5, 0.5) so that abs() actually changes values.
        M_val = (np.random.random((4, 5)) - 0.5).astype(theano.config.floatX)
        utt.assert_allclose(np.max(np.abs(M_val), axis=axis), f(M_val))
def dnn_reduction_strides(shp, shuffle, slice): def dnn_reduction_strides(shp, shuffle, slice):
utt.fetch_seed() utt.fetch_seed()
inp = GpuArrayType('float32', (False,) * len(shp), inp = GpuArrayType('float32', (False,) * len(shp),
......
...@@ -360,23 +360,31 @@ def test_pdbbreakpoint_op(): ...@@ -360,23 +360,31 @@ def test_pdbbreakpoint_op():
def test_local_gpu_elemwise_careduce():
    """Check that GpuElemwise(sqr/abs) feeding a GpuCAReduceCuda is merged
    into a single GpuCAReduceCuda with the matching ``pre_scalar_op``.

    cuDNN is excluded from the mode so the GpuCAReduceCuda path (and not
    the GpuDnnReduction optimization) is exercised.

    Fix: the test data was previously drawn from [0, 1), so ``abs()`` was a
    no-op and the numeric checks for the abs cases were vacuous.  The data
    is now shifted to include negative values.
    """
    mode_with_gpu_no_cudnn = mode_with_gpu.excluding('cudnn')
    x = theano.tensor.matrix()

    def fn_sum_square(x, axis):
        # SUM(X^2)
        return (x * x).sum(axis=axis)

    def fn_sum_abs(x, axis):
        # SUM(ABS(X))
        return abs(x).sum(axis=axis)

    def fn_max_abs(x, axis):
        # MAX(ABS(X))
        return abs(x).max(axis=axis)

    for fn, pre_scalar_op in ((fn_sum_square, theano.scalar.sqr),
                              (fn_sum_abs, theano.scalar.abs_),
                              (fn_max_abs, theano.scalar.abs_)):
        for axis in (None, 0, 1):
            o = fn(x, axis)
            f = theano.function([x], o, mode=mode_with_gpu_no_cudnn)
            topo = f.maker.fgraph.toposort()
            # Transfer in, fused reduction, transfer out.
            assert len(topo) == 3
            assert isinstance(topo[1].op, GpuCAReduceCuda)
            assert topo[1].op.pre_scalar_op == pre_scalar_op
            assert _check_stack_trace(f)
            # Draw from [-0.5, 0.5) so that abs() actually changes values.
            data = (np.random.rand(3, 4) - 0.5).astype(theano.config.floatX)
            utt.assert_allclose(fn(data, axis), f(data))
def test_local_lift_dot22scalar(): def test_local_lift_dot22scalar():
......
...@@ -20,6 +20,7 @@ from theano.tensor.elemwise import (CAReduce, Elemwise, DimShuffle, ...@@ -20,6 +20,7 @@ from theano.tensor.elemwise import (CAReduce, Elemwise, DimShuffle,
Prod, ProdWithoutZeros) Prod, ProdWithoutZeros)
from theano.tests import unittest_tools from theano.tests import unittest_tools
from theano.tests.unittest_tools import attr from theano.tests.unittest_tools import attr
import theano.tests.unittest_tools as utt
def FunctionGraph(i, o): def FunctionGraph(i, o):
...@@ -482,8 +483,7 @@ class test_CAReduce(unittest_tools.InferShapeTester): ...@@ -482,8 +483,7 @@ class test_CAReduce(unittest_tools.InferShapeTester):
try: try:
f_xv = f(xv) f_xv = f(xv)
self.assertTrue((f_xv.shape == zv.shape), (f_xv, zv)) self.assertTrue((f_xv.shape == zv.shape), (f_xv, zv))
self.assertTrue(np.allclose(f_xv, zv), utt.assert_allclose(zv, f_xv)
(f_xv, zv, xsh, tosum))
except NotImplementedError: except NotImplementedError:
# GpuCAReduce don't implement all cases when size is 0 # GpuCAReduce don't implement all cases when size is 0
assert xv.size == 0 assert xv.size == 0
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论