Commit 0cc1268b authored by James Bergstra

merge

...@@ -137,6 +137,8 @@ optdb.register('stabilize', gof.EquilibriumDB(), # replace unstable s ...@@ -137,6 +137,8 @@ optdb.register('stabilize', gof.EquilibriumDB(), # replace unstable s
1.5, 'fast_run') 1.5, 'fast_run')
optdb.register('specialize', gof.EquilibriumDB(), # misc special cases for speed optdb.register('specialize', gof.EquilibriumDB(), # misc special cases for speed
2, 'fast_run') 2, 'fast_run')
optdb.register('specialize_device', gof.EquilibriumDB(), # device-dependent special cases for speed (e.g. cpu-only rewrites)
48.6, 'fast_run')  # priority 48.6: must run after the GPU transfer optimizations registered at 48.5
optdb.register('merge2', gof.MergeOptimizer(), # especially constant merge optdb.register('merge2', gof.MergeOptimizer(), # especially constant merge
49, 'fast_run') 49, 'fast_run')
optdb.register('add_destroy_handler', AddDestroyHandler(), optdb.register('add_destroy_handler', AddDestroyHandler(),
......
...@@ -4,18 +4,22 @@ from theano import shared, function ...@@ -4,18 +4,22 @@ from theano import shared, function
import theano.tensor as T import theano.tensor as T
from neighbours import images2neibs, neibs2images from neighbours import images2neibs, neibs2images
# The tests in this module rely on graph optimizations actually being
# applied; FAST_COMPILE skips them, so upgrade to FAST_RUN in that case.
mode = 'FAST_RUN' if theano.config.mode == "FAST_COMPILE" else theano.config.mode
def neibs_test(): def neibs_test():
shape = (100,40,18,18) shape = (100,40,18,18)
images = shared(arange(prod(shape), dtype='float32').reshape(shape)) images = shared(arange(prod(shape), dtype='float32').reshape(shape))
neib_shape = T.as_tensor_variable((2,2))#(array((2,2), dtype='float32')) neib_shape = T.as_tensor_variable((2,2))#(array((2,2), dtype='float32'))
f = function([], images2neibs(images, neib_shape)) f = function([], images2neibs(images, neib_shape), mode=mode)
#print images.value #print images.value
neibs = f() neibs = f()
#print neibs #print neibs
g = function([], neibs2images(neibs, neib_shape, images.shape)) g = function([], neibs2images(neibs, neib_shape, images.shape), mode=mode)
#print g() #print g()
assert allclose(images.value,g()) assert allclose(images.value,g())
......
...@@ -499,11 +499,15 @@ def basic_multinomialtest(f, steps, target_pvals, prefix="", mean_rtol=0.04): ...@@ -499,11 +499,15 @@ def basic_multinomialtest(f, steps, target_pvals, prefix="", mean_rtol=0.04):
def test_multinomial(): def test_multinomial():
steps = 100 steps = 100
mode_ = mode
if mode == 'FAST_COMPILE':
mode_ = 'FAST_RUN'
if mode in ['DEBUG_MODE','FAST_COMPILE']: if mode in ['DEBUG_MODE','FAST_COMPILE']:
sample_size = (49,5) sample_size = (49,5)
else: else:
sample_size = (450,6) sample_size = (450,6)
mode_ = theano.compile.mode.get_mode(mode_)
print '' print ''
print 'ON CPU:' print 'ON CPU:'
...@@ -511,7 +515,7 @@ def test_multinomial(): ...@@ -511,7 +515,7 @@ def test_multinomial():
pvals = numpy.apply_along_axis(lambda row : row/numpy.sum(row), 1, pvals) pvals = numpy.apply_along_axis(lambda row : row/numpy.sum(row), 1, pvals)
R = MRG_RandomStreams(234, use_cuda=False) R = MRG_RandomStreams(234, use_cuda=False)
m = R.multinomial(pvals=pvals, dtype=config.floatX) m = R.multinomial(pvals=pvals, dtype=config.floatX)
f = theano.function([], m, mode=mode) f = theano.function([], m, mode=mode_)
theano.printing.debugprint(f) theano.printing.debugprint(f)
basic_multinomialtest(f, steps, pvals, prefix='mrg ') basic_multinomialtest(f, steps, pvals, prefix='mrg ')
...@@ -526,7 +530,7 @@ def test_multinomial(): ...@@ -526,7 +530,7 @@ def test_multinomial():
assert n.dtype == 'float32' #well, it's really that this test w GPU doesn't make sense otw assert n.dtype == 'float32' #well, it's really that this test w GPU doesn't make sense otw
f = theano.function([], theano.Out( f = theano.function([], theano.Out(
theano.sandbox.cuda.basic_ops.gpu_from_host(n), theano.sandbox.cuda.basic_ops.gpu_from_host(n),
borrow=True), mode=mode_with_gpu) borrow=True), mode=mode_.including('gpu'))
theano.printing.debugprint(f) theano.printing.debugprint(f)
sys.stdout.flush() sys.stdout.flush()
......
...@@ -144,6 +144,11 @@ def register_specialize(lopt, *tags, **kwargs): ...@@ -144,6 +144,11 @@ def register_specialize(lopt, *tags, **kwargs):
compile.optdb['specialize'].register(name, lopt, 'fast_run', *tags) compile.optdb['specialize'].register(name, lopt, 'fast_run', *tags)
return lopt return lopt
def register_specialize_device(lopt, *tags, **kwargs):
    """Register `lopt` in the 'specialize_device' optimization database.

    Device-dependent counterpart of `register_specialize`: optimizations
    registered here are placed in the 'specialize_device' entry of
    `compile.optdb`, which runs after the GPU-related passes.

    :param lopt: the local optimizer to register; its ``__name__`` is the
        registration name unless overridden.
    :param tags: extra tags forwarded to ``optdb.register``.
    :param name: optional keyword argument overriding the registration name.
    :return: `lopt`, so this function can be used as a decorator.
    """
    # pop with a default so that a kwargs dict without a 'name' key does
    # not raise KeyError (the ``kwargs and kwargs.pop('name')`` idiom
    # used elsewhere crashes in that case).
    name = kwargs.pop('name', None) or lopt.__name__
    compile.optdb['specialize_device'].register(name, lopt, 'fast_run', *tags)
    return lopt
def register_stabilize(lopt, *tags, **kwargs): def register_stabilize(lopt, *tags, **kwargs):
name = (kwargs and kwargs.pop('name')) or lopt.__name__ name = (kwargs and kwargs.pop('name')) or lopt.__name__
compile.optdb['stabilize'].register(name, lopt, 'fast_run', *tags) compile.optdb['stabilize'].register(name, lopt, 'fast_run', *tags)
...@@ -1609,6 +1614,8 @@ def local_sum_div_dimshuffle(node): ...@@ -1609,6 +1614,8 @@ def local_sum_div_dimshuffle(node):
if isinstance(node.op, T.Sum): if isinstance(node.op, T.Sum):
axis = node.op.axis axis = node.op.axis
if axis is None:
axis = range(node.inputs[0].ndim)
#print 'axis =', axis #print 'axis =', axis
thing_summed = node.inputs[0] thing_summed = node.inputs[0]
dimshuffled = None dimshuffled = None
...@@ -1827,9 +1834,31 @@ def local_pow_specialize(node): ...@@ -1827,9 +1834,31 @@ def local_pow_specialize(node):
rval = [T.inv(xsym)] rval = [T.inv(xsym)]
if N.all(y == -2): if N.all(y == -2):
rval = [T.inv(T.sqr(xsym))] rval = [T.inv(T.sqr(xsym))]
if rval:
rval[0] = T.cast(rval[0], odtype)
assert rval[0].type == node.outputs[0].type, (rval, node.outputs)
return rval
else:
return False
register_specialize(local_pow_specialize)
# Optimize all integral powers in [-RANGE, RANGE] @register_specialize_device
if config.experimental.pow and rval is None and abs(y)==int(abs(y)) and abs(y) <= 512:# 512 is too small for the cpu and too big for some gpu! @gof.local_optimizer([T.pow])
def local_pow_specialize_device(node):
"""
This optimization is not the same on all device. We do it only on cpu here.
"""
if node.op == T.pow:
#the idea here is that we have pow(x, y)
odtype = node.outputs[0].dtype
xsym = node.inputs[0]
ysym = node.inputs[1]
y = local_mul_canonizer.get_constant(ysym)
if (y is not None) \
and encompasses_broadcastable(xsym.type.broadcastable, ysym.type.broadcastable):
rval = None
# 512 is too small for the cpu and too big for some gpu!
if abs(y)==int(abs(y)) and abs(y) <= 512:
pow2 = [xsym] pow2 = [xsym]
pow2_scal = [theano.scalar.Scalar(xsym.dtype)()] pow2_scal = [theano.scalar.Scalar(xsym.dtype)()]
y_to_do = abs(y) y_to_do = abs(y)
...@@ -1859,14 +1888,7 @@ def local_pow_specialize(node): ...@@ -1859,14 +1888,7 @@ def local_pow_specialize(node):
rval[0] = T.cast(rval[0], odtype) rval[0] = T.cast(rval[0], odtype)
assert rval[0].type == node.outputs[0].type, (rval, node.outputs) assert rval[0].type == node.outputs[0].type, (rval, node.outputs)
return rval return rval
else:
return False
register_specialize(local_pow_specialize)
theano.configparser.AddConfigVar('experimental.pow',
"Transform a pow to a constant integer to a graph of mul. Fast on cpu, but more work needed for gpu.",
theano.configparser.BoolParam(False),
)
@gof.local_optimizer([T.mul]) @gof.local_optimizer([T.mul])
def local_mul_specialize(node): def local_mul_specialize(node):
"""Remove special-case constants from mul arguments """Remove special-case constants from mul arguments
......
...@@ -14,7 +14,6 @@ from theano.gof import Env ...@@ -14,7 +14,6 @@ from theano.gof import Env
from theano.tensor.elemwise import DimShuffle from theano.tensor.elemwise import DimShuffle
from theano import pprint, shared from theano import pprint, shared
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
import scalar as scal
from theano import function, compile from theano import function, compile
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
...@@ -1260,6 +1259,7 @@ def test_local_pow_specialize(): ...@@ -1260,6 +1259,7 @@ def test_local_pow_specialize():
v = T.vector() v = T.vector()
val = numpy.arange(10,dtype=theano.config.floatX) val = numpy.arange(10,dtype=theano.config.floatX)
val_no0 = numpy.arange(1,10,dtype=theano.config.floatX) val_no0 = numpy.arange(1,10,dtype=theano.config.floatX)
f = function([v], v**0, mode=mode) f = function([v], v**0, mode=mode)
nodes = [node.op for node in f.maker.env.toposort()] nodes = [node.op for node in f.maker.env.toposort()]
assert nodes == [Shape_i(0), T.alloc] assert nodes == [Shape_i(0), T.alloc]
...@@ -1301,33 +1301,44 @@ def test_local_pow_specialize(): ...@@ -1301,33 +1301,44 @@ def test_local_pow_specialize():
# assert nodes == [T.sqrt,T.inv]#Why this don't work? # assert nodes == [T.sqrt,T.inv]#Why this don't work?
assert numpy.allclose(f(val_no0),val_no0**(-.5)) assert numpy.allclose(f(val_no0),val_no0**(-.5))
if config.experimental.pow: def test_local_pow_specialize_device():
print "Test experimental.pow=True"
f = function([v], v**(15), mode=mode) # test that on cpu we use more agressive optimization
nodes = [node.op for node in f.maker.env.toposort()]
assert len(nodes)==1 mode = theano.config.mode
assert isinstance(nodes[0].scalar_op,theano.scalar.Composite) if mode == 'FAST_COMPILE':
assert numpy.allclose(f(val),val**15) mode = 'FAST_RUN'
mode = compile.mode.get_mode(mode)
f = function([v], v**(-15), mode=mode) mode = mode.excluding('fusion').excluding('gpu')
nodes = [node.op for node in f.maker.env.toposort()]
assert len(nodes)==2 v = T.vector()
assert isinstance(nodes[0].scalar_op,theano.scalar.Composite) val = numpy.arange(10,dtype=theano.config.floatX)
assert isinstance(nodes[-1].scalar_op,theano.scalar.basic.Inv) val_no0 = numpy.arange(1,10,dtype=theano.config.floatX)
assert numpy.allclose(f(val_no0),val_no0**(-15)) f = function([v], v**(15), mode=mode)
nodes = [node.op for node in f.maker.env.toposort()]
f = function([v], v**(16), mode=mode) assert len(nodes)==1
nodes = [node.op for node in f.maker.env.toposort()] assert isinstance(nodes[0].scalar_op,theano.scalar.Composite)
assert len(nodes) == 1 assert numpy.allclose(f(val),val**15)
assert isinstance(nodes[0].scalar_op,theano.scalar.Composite)
assert numpy.allclose(f(val),val**16) f = function([v], v**(-15), mode=mode)
nodes = [node.op for node in f.maker.env.toposort()]
f = function([v], v**(-16), mode=mode) assert len(nodes)==2
nodes = [node.op for node in f.maker.env.toposort()] assert isinstance(nodes[0].scalar_op,theano.scalar.Composite)
assert len(nodes) == 2 assert isinstance(nodes[-1].scalar_op,theano.scalar.basic.Inv)
assert isinstance(nodes[0].scalar_op,theano.scalar.Composite) assert numpy.allclose(f(val_no0),val_no0**(-15))
assert isinstance(nodes[-1].scalar_op,theano.scalar.basic.Inv)
assert numpy.allclose(f(val_no0),val_no0**(-16)) f = function([v], v**(16), mode=mode)
nodes = [node.op for node in f.maker.env.toposort()]
assert len(nodes) == 1
assert isinstance(nodes[0].scalar_op,theano.scalar.Composite)
assert numpy.allclose(f(val),val**16)
f = function([v], v**(-16), mode=mode)
nodes = [node.op for node in f.maker.env.toposort()]
assert len(nodes) == 2
assert isinstance(nodes[0].scalar_op,theano.scalar.Composite)
assert isinstance(nodes[-1].scalar_op,theano.scalar.basic.Inv)
assert numpy.allclose(f(val_no0),val_no0**(-16))
class T_Rebroadcast(unittest.TestCase): class T_Rebroadcast(unittest.TestCase):
......
Markdown formatting is supported
0%
You are adding 0 people to this discussion. Please proceed with caution.
Finish editing this comment first!
Register or sign in to comment