Commit 0cc1268b authored by James Bergstra

merge

...@@ -137,6 +137,8 @@ optdb.register('stabilize', gof.EquilibriumDB(), # replace unstable s ...@@ -137,6 +137,8 @@ optdb.register('stabilize', gof.EquilibriumDB(), # replace unstable s
1.5, 'fast_run') 1.5, 'fast_run')
optdb.register('specialize', gof.EquilibriumDB(), # misc special cases for speed optdb.register('specialize', gof.EquilibriumDB(), # misc special cases for speed
2, 'fast_run') 2, 'fast_run')
optdb.register('specialize_device', gof.EquilibriumDB(), # device-dependent special cases for speed (e.g. cpu-only rewrites)
48.6, 'fast_run')  # priority 48.6: must run after the GPU transfer optimizations registered at 48.5
optdb.register('merge2', gof.MergeOptimizer(), # especially constant merge optdb.register('merge2', gof.MergeOptimizer(), # especially constant merge
49, 'fast_run') 49, 'fast_run')
optdb.register('add_destroy_handler', AddDestroyHandler(), optdb.register('add_destroy_handler', AddDestroyHandler(),
......
...@@ -4,18 +4,22 @@ from theano import shared, function ...@@ -4,18 +4,22 @@ from theano import shared, function
import theano.tensor as T import theano.tensor as T
from neighbours import images2neibs, neibs2images from neighbours import images2neibs, neibs2images
# The tests in this module rely on graph optimizations actually being
# applied; FAST_COMPILE skips them, so upgrade to FAST_RUN in that case.
mode = 'FAST_RUN' if theano.config.mode == "FAST_COMPILE" else theano.config.mode
def neibs_test(): def neibs_test():
shape = (100,40,18,18) shape = (100,40,18,18)
images = shared(arange(prod(shape), dtype='float32').reshape(shape)) images = shared(arange(prod(shape), dtype='float32').reshape(shape))
neib_shape = T.as_tensor_variable((2,2))#(array((2,2), dtype='float32')) neib_shape = T.as_tensor_variable((2,2))#(array((2,2), dtype='float32'))
f = function([], images2neibs(images, neib_shape)) f = function([], images2neibs(images, neib_shape), mode=mode)
#print images.value #print images.value
neibs = f() neibs = f()
#print neibs #print neibs
g = function([], neibs2images(neibs, neib_shape, images.shape)) g = function([], neibs2images(neibs, neib_shape, images.shape), mode=mode)
#print g() #print g()
assert allclose(images.value,g()) assert allclose(images.value,g())
......
...@@ -499,11 +499,15 @@ def basic_multinomialtest(f, steps, target_pvals, prefix="", mean_rtol=0.04): ...@@ -499,11 +499,15 @@ def basic_multinomialtest(f, steps, target_pvals, prefix="", mean_rtol=0.04):
def test_multinomial(): def test_multinomial():
steps = 100 steps = 100
mode_ = mode
if mode == 'FAST_COMPILE':
mode_ = 'FAST_RUN'
if mode in ['DEBUG_MODE','FAST_COMPILE']: if mode in ['DEBUG_MODE','FAST_COMPILE']:
sample_size = (49,5) sample_size = (49,5)
else: else:
sample_size = (450,6) sample_size = (450,6)
mode_ = theano.compile.mode.get_mode(mode_)
print '' print ''
print 'ON CPU:' print 'ON CPU:'
...@@ -511,7 +515,7 @@ def test_multinomial(): ...@@ -511,7 +515,7 @@ def test_multinomial():
pvals = numpy.apply_along_axis(lambda row : row/numpy.sum(row), 1, pvals) pvals = numpy.apply_along_axis(lambda row : row/numpy.sum(row), 1, pvals)
R = MRG_RandomStreams(234, use_cuda=False) R = MRG_RandomStreams(234, use_cuda=False)
m = R.multinomial(pvals=pvals, dtype=config.floatX) m = R.multinomial(pvals=pvals, dtype=config.floatX)
f = theano.function([], m, mode=mode) f = theano.function([], m, mode=mode_)
theano.printing.debugprint(f) theano.printing.debugprint(f)
basic_multinomialtest(f, steps, pvals, prefix='mrg ') basic_multinomialtest(f, steps, pvals, prefix='mrg ')
...@@ -526,7 +530,7 @@ def test_multinomial(): ...@@ -526,7 +530,7 @@ def test_multinomial():
assert n.dtype == 'float32' #well, it's really that this test w GPU doesn't make sense otw assert n.dtype == 'float32' #well, it's really that this test w GPU doesn't make sense otw
f = theano.function([], theano.Out( f = theano.function([], theano.Out(
theano.sandbox.cuda.basic_ops.gpu_from_host(n), theano.sandbox.cuda.basic_ops.gpu_from_host(n),
borrow=True), mode=mode_with_gpu) borrow=True), mode=mode_.including('gpu'))
theano.printing.debugprint(f) theano.printing.debugprint(f)
sys.stdout.flush() sys.stdout.flush()
......
...@@ -144,6 +144,11 @@ def register_specialize(lopt, *tags, **kwargs): ...@@ -144,6 +144,11 @@ def register_specialize(lopt, *tags, **kwargs):
compile.optdb['specialize'].register(name, lopt, 'fast_run', *tags) compile.optdb['specialize'].register(name, lopt, 'fast_run', *tags)
return lopt return lopt
def register_specialize_device(lopt, *tags, **kwargs):
    """Register `lopt` in the 'specialize_device' optimization database.

    Device-dependent counterpart of `register_specialize`: optimizations
    registered here are placed in the 'specialize_device' entry of
    `compile.optdb`, which runs after the GPU-related passes.

    :param lopt: the local optimizer to register; its ``__name__`` is the
        registration name unless overridden.
    :param tags: extra tags forwarded to ``optdb.register``.
    :param name: optional keyword argument overriding the registration name.
    :return: `lopt`, so this function can be used as a decorator.
    """
    # pop with a default so that a kwargs dict without a 'name' key does
    # not raise KeyError (the ``kwargs and kwargs.pop('name')`` idiom
    # used elsewhere crashes in that case).
    name = kwargs.pop('name', None) or lopt.__name__
    compile.optdb['specialize_device'].register(name, lopt, 'fast_run', *tags)
    return lopt
def register_stabilize(lopt, *tags, **kwargs): def register_stabilize(lopt, *tags, **kwargs):
name = (kwargs and kwargs.pop('name')) or lopt.__name__ name = (kwargs and kwargs.pop('name')) or lopt.__name__
compile.optdb['stabilize'].register(name, lopt, 'fast_run', *tags) compile.optdb['stabilize'].register(name, lopt, 'fast_run', *tags)
...@@ -1609,6 +1614,8 @@ def local_sum_div_dimshuffle(node): ...@@ -1609,6 +1614,8 @@ def local_sum_div_dimshuffle(node):
if isinstance(node.op, T.Sum): if isinstance(node.op, T.Sum):
axis = node.op.axis axis = node.op.axis
if axis is None:
axis = range(node.inputs[0].ndim)
#print 'axis =', axis #print 'axis =', axis
thing_summed = node.inputs[0] thing_summed = node.inputs[0]
dimshuffled = None dimshuffled = None
...@@ -1827,9 +1834,31 @@ def local_pow_specialize(node): ...@@ -1827,9 +1834,31 @@ def local_pow_specialize(node):
rval = [T.inv(xsym)] rval = [T.inv(xsym)]
if N.all(y == -2): if N.all(y == -2):
rval = [T.inv(T.sqr(xsym))] rval = [T.inv(T.sqr(xsym))]
if rval:
rval[0] = T.cast(rval[0], odtype)
assert rval[0].type == node.outputs[0].type, (rval, node.outputs)
return rval
else:
return False
register_specialize(local_pow_specialize)
# Optimize all integral powers in [-RANGE, RANGE] @register_specialize_device
if config.experimental.pow and rval is None and abs(y)==int(abs(y)) and abs(y) <= 512:# 512 is too small for the cpu and too big for some gpu! @gof.local_optimizer([T.pow])
def local_pow_specialize_device(node):
"""
This optimization is not the same on all device. We do it only on cpu here.
"""
if node.op == T.pow:
#the idea here is that we have pow(x, y)
odtype = node.outputs[0].dtype
xsym = node.inputs[0]
ysym = node.inputs[1]
y = local_mul_canonizer.get_constant(ysym)
if (y is not None) \
and encompasses_broadcastable(xsym.type.broadcastable, ysym.type.broadcastable):
rval = None
# 512 is too small for the cpu and too big for some gpu!
if abs(y)==int(abs(y)) and abs(y) <= 512:
pow2 = [xsym] pow2 = [xsym]
pow2_scal = [theano.scalar.Scalar(xsym.dtype)()] pow2_scal = [theano.scalar.Scalar(xsym.dtype)()]
y_to_do = abs(y) y_to_do = abs(y)
...@@ -1859,14 +1888,7 @@ def local_pow_specialize(node): ...@@ -1859,14 +1888,7 @@ def local_pow_specialize(node):
rval[0] = T.cast(rval[0], odtype) rval[0] = T.cast(rval[0], odtype)
assert rval[0].type == node.outputs[0].type, (rval, node.outputs) assert rval[0].type == node.outputs[0].type, (rval, node.outputs)
return rval return rval
else:
return False
register_specialize(local_pow_specialize)
theano.configparser.AddConfigVar('experimental.pow',
"Transform a pow to a constant integer to a graph of mul. Fast on cpu, but more work needed for gpu.",
theano.configparser.BoolParam(False),
)
@gof.local_optimizer([T.mul]) @gof.local_optimizer([T.mul])
def local_mul_specialize(node): def local_mul_specialize(node):
"""Remove special-case constants from mul arguments """Remove special-case constants from mul arguments
......
...@@ -14,7 +14,6 @@ from theano.gof import Env ...@@ -14,7 +14,6 @@ from theano.gof import Env
from theano.tensor.elemwise import DimShuffle from theano.tensor.elemwise import DimShuffle
from theano import pprint, shared from theano import pprint, shared
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
import scalar as scal
from theano import function, compile from theano import function, compile
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
...@@ -1260,6 +1259,7 @@ def test_local_pow_specialize(): ...@@ -1260,6 +1259,7 @@ def test_local_pow_specialize():
v = T.vector() v = T.vector()
val = numpy.arange(10,dtype=theano.config.floatX) val = numpy.arange(10,dtype=theano.config.floatX)
val_no0 = numpy.arange(1,10,dtype=theano.config.floatX) val_no0 = numpy.arange(1,10,dtype=theano.config.floatX)
f = function([v], v**0, mode=mode) f = function([v], v**0, mode=mode)
nodes = [node.op for node in f.maker.env.toposort()] nodes = [node.op for node in f.maker.env.toposort()]
assert nodes == [Shape_i(0), T.alloc] assert nodes == [Shape_i(0), T.alloc]
...@@ -1301,33 +1301,44 @@ def test_local_pow_specialize(): ...@@ -1301,33 +1301,44 @@ def test_local_pow_specialize():
# assert nodes == [T.sqrt,T.inv]#Why this don't work? # assert nodes == [T.sqrt,T.inv]#Why this don't work?
assert numpy.allclose(f(val_no0),val_no0**(-.5)) assert numpy.allclose(f(val_no0),val_no0**(-.5))
if config.experimental.pow: def test_local_pow_specialize_device():
print "Test experimental.pow=True"
f = function([v], v**(15), mode=mode) # test that on cpu we use more agressive optimization
nodes = [node.op for node in f.maker.env.toposort()]
assert len(nodes)==1 mode = theano.config.mode
assert isinstance(nodes[0].scalar_op,theano.scalar.Composite) if mode == 'FAST_COMPILE':
assert numpy.allclose(f(val),val**15) mode = 'FAST_RUN'
mode = compile.mode.get_mode(mode)
f = function([v], v**(-15), mode=mode) mode = mode.excluding('fusion').excluding('gpu')
nodes = [node.op for node in f.maker.env.toposort()]
assert len(nodes)==2 v = T.vector()
assert isinstance(nodes[0].scalar_op,theano.scalar.Composite) val = numpy.arange(10,dtype=theano.config.floatX)
assert isinstance(nodes[-1].scalar_op,theano.scalar.basic.Inv) val_no0 = numpy.arange(1,10,dtype=theano.config.floatX)
assert numpy.allclose(f(val_no0),val_no0**(-15)) f = function([v], v**(15), mode=mode)
nodes = [node.op for node in f.maker.env.toposort()]
f = function([v], v**(16), mode=mode) assert len(nodes)==1
nodes = [node.op for node in f.maker.env.toposort()] assert isinstance(nodes[0].scalar_op,theano.scalar.Composite)
assert len(nodes) == 1 assert numpy.allclose(f(val),val**15)
assert isinstance(nodes[0].scalar_op,theano.scalar.Composite)
assert numpy.allclose(f(val),val**16) f = function([v], v**(-15), mode=mode)
nodes = [node.op for node in f.maker.env.toposort()]
f = function([v], v**(-16), mode=mode) assert len(nodes)==2
nodes = [node.op for node in f.maker.env.toposort()] assert isinstance(nodes[0].scalar_op,theano.scalar.Composite)
assert len(nodes) == 2 assert isinstance(nodes[-1].scalar_op,theano.scalar.basic.Inv)
assert isinstance(nodes[0].scalar_op,theano.scalar.Composite) assert numpy.allclose(f(val_no0),val_no0**(-15))
assert isinstance(nodes[-1].scalar_op,theano.scalar.basic.Inv)
assert numpy.allclose(f(val_no0),val_no0**(-16)) f = function([v], v**(16), mode=mode)
nodes = [node.op for node in f.maker.env.toposort()]
assert len(nodes) == 1
assert isinstance(nodes[0].scalar_op,theano.scalar.Composite)
assert numpy.allclose(f(val),val**16)
f = function([v], v**(-16), mode=mode)
nodes = [node.op for node in f.maker.env.toposort()]
assert len(nodes) == 2
assert isinstance(nodes[0].scalar_op,theano.scalar.Composite)
assert isinstance(nodes[-1].scalar_op,theano.scalar.basic.Inv)
assert numpy.allclose(f(val_no0),val_no0**(-16))
class T_Rebroadcast(unittest.TestCase): class T_Rebroadcast(unittest.TestCase):
......
Markdown formatting is supported
0%
You are adding 0 people to this discussion. Please proceed with caution.
Finish editing this comment first!
Register or sign in to comment