提交 3b18fef4 authored 作者: Frederic Bastien's avatar Frederic Bastien

Added an optimization phase, specialize_device, and moved the new POW optimization into that phase.

上级 b91efd13
...@@ -137,6 +137,8 @@ optdb.register('stabilize', gof.EquilibriumDB(), # replace unstable s ...@@ -137,6 +137,8 @@ optdb.register('stabilize', gof.EquilibriumDB(), # replace unstable s
1.5, 'fast_run') 1.5, 'fast_run')
optdb.register('specialize', gof.EquilibriumDB(), # misc special cases for speed optdb.register('specialize', gof.EquilibriumDB(), # misc special cases for speed
2, 'fast_run') 2, 'fast_run')
optdb.register('specialize_device', gof.EquilibriumDB(), # special cases for speed that depend on the target device (cpu vs gpu)
        48.6, 'fast_run')# priority 48.6: must run after the gpu transfer optimizations, which sit at 48.5
optdb.register('merge2', gof.MergeOptimizer(), # especially constant merge optdb.register('merge2', gof.MergeOptimizer(), # especially constant merge
49, 'fast_run') 49, 'fast_run')
optdb.register('add_destroy_handler', AddDestroyHandler(), optdb.register('add_destroy_handler', AddDestroyHandler(),
......
...@@ -144,6 +144,11 @@ def register_specialize(lopt, *tags, **kwargs): ...@@ -144,6 +144,11 @@ def register_specialize(lopt, *tags, **kwargs):
compile.optdb['specialize'].register(name, lopt, 'fast_run', *tags) compile.optdb['specialize'].register(name, lopt, 'fast_run', *tags)
return lopt return lopt
def register_specialize_device(lopt, *tags, **kwargs):
    """Register a local optimizer in the 'specialize_device' phase.

    The optimizer is registered under the 'fast_run' tag plus any extra
    *tags*.  An explicit registration name may be supplied via the
    ``name`` keyword; otherwise the optimizer's ``__name__`` is used.
    Returns `lopt` so this can be used as a decorator.
    """
    # A truthy 'name' kwarg wins; fall back to the optimizer's own name.
    if kwargs:
        opt_name = kwargs.pop('name') or lopt.__name__
    else:
        opt_name = lopt.__name__
    compile.optdb['specialize_device'].register(opt_name, lopt, 'fast_run', *tags)
    return lopt
def register_stabilize(lopt, *tags, **kwargs): def register_stabilize(lopt, *tags, **kwargs):
name = (kwargs and kwargs.pop('name')) or lopt.__name__ name = (kwargs and kwargs.pop('name')) or lopt.__name__
compile.optdb['stabilize'].register(name, lopt, 'fast_run', *tags) compile.optdb['stabilize'].register(name, lopt, 'fast_run', *tags)
...@@ -1829,9 +1834,31 @@ def local_pow_specialize(node): ...@@ -1829,9 +1834,31 @@ def local_pow_specialize(node):
rval = [T.inv(xsym)] rval = [T.inv(xsym)]
if N.all(y == -2): if N.all(y == -2):
rval = [T.inv(T.sqr(xsym))] rval = [T.inv(T.sqr(xsym))]
if rval:
rval[0] = T.cast(rval[0], odtype)
assert rval[0].type == node.outputs[0].type, (rval, node.outputs)
return rval
else:
return False
register_specialize(local_pow_specialize)
# Optimize all integral powers in [-RANGE, RANGE] @register_specialize_device
if config.experimental.pow and rval is None and abs(y)==int(abs(y)) and abs(y) <= 512:# 512 is too small for the cpu and too big for some gpu! @gof.local_optimizer([T.pow])
def local_pow_specialize_device(node):
"""
This optimization is not the same on all device. We do it only on cpu here.
"""
if node.op == T.pow:
#the idea here is that we have pow(x, y)
odtype = node.outputs[0].dtype
xsym = node.inputs[0]
ysym = node.inputs[1]
y = local_mul_canonizer.get_constant(ysym)
if (y is not None) \
and encompasses_broadcastable(xsym.type.broadcastable, ysym.type.broadcastable):
rval = None
# 512 is too small for the cpu and too big for some gpu!
if abs(y)==int(abs(y)) and abs(y) <= 512:
pow2 = [xsym] pow2 = [xsym]
pow2_scal = [theano.scalar.Scalar(xsym.dtype)()] pow2_scal = [theano.scalar.Scalar(xsym.dtype)()]
y_to_do = abs(y) y_to_do = abs(y)
...@@ -1861,14 +1888,7 @@ def local_pow_specialize(node): ...@@ -1861,14 +1888,7 @@ def local_pow_specialize(node):
rval[0] = T.cast(rval[0], odtype) rval[0] = T.cast(rval[0], odtype)
assert rval[0].type == node.outputs[0].type, (rval, node.outputs) assert rval[0].type == node.outputs[0].type, (rval, node.outputs)
return rval return rval
else:
return False
register_specialize(local_pow_specialize)
theano.configparser.AddConfigVar('experimental.pow',
"Transform a pow to a constant integer to a graph of mul. Fast on cpu, but more work needed for gpu.",
theano.configparser.BoolParam(False),
)
@gof.local_optimizer([T.mul]) @gof.local_optimizer([T.mul])
def local_mul_specialize(node): def local_mul_specialize(node):
"""Remove special-case constants from mul arguments """Remove special-case constants from mul arguments
......
...@@ -1259,6 +1259,7 @@ def test_local_pow_specialize(): ...@@ -1259,6 +1259,7 @@ def test_local_pow_specialize():
v = T.vector() v = T.vector()
val = numpy.arange(10,dtype=theano.config.floatX) val = numpy.arange(10,dtype=theano.config.floatX)
val_no0 = numpy.arange(1,10,dtype=theano.config.floatX) val_no0 = numpy.arange(1,10,dtype=theano.config.floatX)
f = function([v], v**0, mode=mode) f = function([v], v**0, mode=mode)
nodes = [node.op for node in f.maker.env.toposort()] nodes = [node.op for node in f.maker.env.toposort()]
assert nodes == [Shape_i(0), T.alloc] assert nodes == [Shape_i(0), T.alloc]
...@@ -1300,33 +1301,44 @@ def test_local_pow_specialize(): ...@@ -1300,33 +1301,44 @@ def test_local_pow_specialize():
# assert nodes == [T.sqrt,T.inv]#Why this don't work? # assert nodes == [T.sqrt,T.inv]#Why this don't work?
assert numpy.allclose(f(val_no0),val_no0**(-.5)) assert numpy.allclose(f(val_no0),val_no0**(-.5))
def test_local_pow_specialize_device():
    """Check the device-dependent pow specialization on the CPU.

    On the CPU, integer powers up to +/-512 are rewritten into a fused
    Composite of multiplications (plus a final Inv for negative
    exponents) by local_pow_specialize_device.  Fusion and gpu
    optimizations are excluded so the expected ops are visible in the
    compiled graph.
    """
    # test that on cpu we use the more aggressive optimization
    mode = theano.config.mode
    if mode == 'FAST_COMPILE':
        mode = 'FAST_RUN'
    mode = compile.mode.get_mode(mode)
    mode = mode.excluding('fusion').excluding('gpu')

    v = T.vector()
    val = numpy.arange(10, dtype=theano.config.floatX)
    # values without 0, for the negative-exponent cases (avoid 1/0)
    val_no0 = numpy.arange(1, 10, dtype=theano.config.floatX)

    # v**15 -> a single Composite elemwise of multiplications
    f = function([v], v ** (15), mode=mode)
    nodes = [node.op for node in f.maker.env.toposort()]
    assert len(nodes) == 1
    assert isinstance(nodes[0].scalar_op, theano.scalar.Composite)
    assert numpy.allclose(f(val), val ** 15)

    # v**-15 -> Composite followed by an Inv
    f = function([v], v ** (-15), mode=mode)
    nodes = [node.op for node in f.maker.env.toposort()]
    assert len(nodes) == 2
    assert isinstance(nodes[0].scalar_op, theano.scalar.Composite)
    assert isinstance(nodes[-1].scalar_op, theano.scalar.basic.Inv)
    assert numpy.allclose(f(val_no0), val_no0 ** (-15))

    # v**16 -> a single Composite (pure squaring chain)
    f = function([v], v ** (16), mode=mode)
    nodes = [node.op for node in f.maker.env.toposort()]
    assert len(nodes) == 1
    assert isinstance(nodes[0].scalar_op, theano.scalar.Composite)
    assert numpy.allclose(f(val), val ** 16)

    # v**-16 -> Composite followed by an Inv
    f = function([v], v ** (-16), mode=mode)
    nodes = [node.op for node in f.maker.env.toposort()]
    assert len(nodes) == 2
    assert isinstance(nodes[0].scalar_op, theano.scalar.Composite)
    assert isinstance(nodes[-1].scalar_op, theano.scalar.basic.Inv)
    assert numpy.allclose(f(val_no0), val_no0 ** (-16))
class T_Rebroadcast(unittest.TestCase): class T_Rebroadcast(unittest.TestCase):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论