Added an optimization phase, specialize_device, and put the new POW optimization into that phase.

3b18fef4 · Frederic Bastien · b91efd13 · 3b18fef4 · 3b18fef4 · 3b18fef4
--- a/theano/compile/mode.py
+++ b/theano/compile/mode.py
@@ -137,6 +137,8 @@ optdb.register('stabilize', gof.EquilibriumDB(),            # replace unstable s
        1.5, 'fast_run')          
 optdb.register('specialize', gof.EquilibriumDB(),           # misc special cases for speed
        2, 'fast_run')
+optdb.register('specialize_device', gof.EquilibriumDB(),           # misc special cases for speed that are dependent on the device.
+        48.6, 'fast_run')#must be after gpu stuff at 48.5
 optdb.register('merge2', gof.MergeOptimizer(),              # especially constant merge
        49, 'fast_run')
 optdb.register('add_destroy_handler', AddDestroyHandler(), 

--- a/theano/tensor/opt.py
+++ b/theano/tensor/opt.py
@@ -144,6 +144,11 @@ def register_specialize(lopt, *tags, **kwargs):
    compile.optdb['specialize'].register(name, lopt, 'fast_run', *tags)
    return lopt
+def register_specialize_device(lopt, *tags, **kwargs):
+    name = (kwargs and kwargs.pop('name')) or lopt.__name__
+    compile.optdb['specialize_device'].register(name, lopt, 'fast_run', *tags)
+    return lopt
 def register_stabilize(lopt, *tags, **kwargs):
    name = (kwargs and kwargs.pop('name')) or lopt.__name__
    compile.optdb['stabilize'].register(name, lopt, 'fast_run', *tags)
@@ -1829,9 +1834,31 @@ def local_pow_specialize(node):
                rval = [T.inv(xsym)]
            if N.all(y == -2):
                rval = [T.inv(T.sqr(xsym))]
+            if rval:
+                rval[0] = T.cast(rval[0], odtype)
+                assert rval[0].type == node.outputs[0].type, (rval, node.outputs)
+                return rval
+    else:
+        return False
+register_specialize(local_pow_specialize)
-            # Optimize all integral powers in [-RANGE, RANGE]
+@register_specialize_device
-            if config.experimental.pow and rval is None and abs(y)==int(abs(y)) and abs(y) <= 512:# 512 is too small for the cpu and too big for some gpu!
+@gof.local_optimizer([T.pow])
+def local_pow_specialize_device(node):
+    """
+    This optimization is not the same on all devices. We do it only on the CPU here.
+    """
+    if node.op == T.pow:
+        #the idea here is that we have pow(x, y)
+        odtype = node.outputs[0].dtype
+        xsym = node.inputs[0]
+        ysym = node.inputs[1]
+        y = local_mul_canonizer.get_constant(ysym)
+        if (y is not None) \
+                and encompasses_broadcastable(xsym.type.broadcastable, ysym.type.broadcastable):
+            rval = None
+            # 512 is too small for the CPU and too big for some GPUs!
+            if abs(y)==int(abs(y)) and abs(y) <= 512:
                pow2 = [xsym]
                pow2_scal = [theano.scalar.Scalar(xsym.dtype)()]
                y_to_do = abs(y)
@@ -1861,13 +1888,6 @@ def local_pow_specialize(node):
                rval[0] = T.cast(rval[0], odtype)
                assert rval[0].type == node.outputs[0].type, (rval, node.outputs)
                return rval
-    else:
-        return False
-register_specialize(local_pow_specialize)
-theano.configparser.AddConfigVar('experimental.pow',
-        "Transform a pow to a constant integer to a graph of mul. Fast on cpu, but more work needed for gpu.",
-        theano.configparser.BoolParam(False),
-        )
 @gof.local_optimizer([T.mul])
 def local_mul_specialize(node):

--- a/theano/tensor/tests/test_opt.py
+++ b/theano/tensor/tests/test_opt.py
@@ -1259,6 +1259,7 @@ def test_local_pow_specialize():
    v = T.vector()
    val = numpy.arange(10,dtype=theano.config.floatX)
    val_no0 = numpy.arange(1,10,dtype=theano.config.floatX)
    f = function([v], v**0, mode=mode)
    nodes = [node.op for node in f.maker.env.toposort()]
    assert nodes == [Shape_i(0), T.alloc]
@@ -1300,8 +1301,19 @@ def test_local_pow_specialize():
 #    assert nodes == [T.sqrt,T.inv]#Why this don't work?
    assert numpy.allclose(f(val_no0),val_no0**(-.5))
-    if config.experimental.pow:
+def test_local_pow_specialize_device():
-        print "Test experimental.pow=True"
+    # test that on the CPU we use a more aggressive optimization
+    mode = theano.config.mode
+    if mode == 'FAST_COMPILE':
+       mode = 'FAST_RUN'
+    mode = compile.mode.get_mode(mode)
+    mode = mode.excluding('fusion').excluding('gpu')
+    v = T.vector()
+    val = numpy.arange(10,dtype=theano.config.floatX)
+    val_no0 = numpy.arange(1,10,dtype=theano.config.floatX)
    f = function([v], v**(15), mode=mode)
    nodes = [node.op for node in f.maker.env.toposort()]
    assert len(nodes)==1