merge

9df496ab · Michael I Mandel · c7ca47f5 · c621473e · 9df496ab · 9df496ab
--- a/benchmark/numexpr/gen_graph.py
+++ b/benchmark/numexpr/gen_graph.py
+import numpy as np
+import numexpr as ne
+import timeit
+import theano
+import theano.tensor as T
+import pylab
+import matplotlib.pyplot  as pyplot
+def timeit_2vector_theano(init, nb_element=1e6, nb_repeat=3, nb_call=int(1e2), expr="a**2 + b**2 + 2*a*b"):
+    t3 = timeit.Timer("tf(av,bv)",
+                      """
+import theano
+import theano.tensor as T
+import numexpr as ne
+from theano.tensor import exp
+%(init)s
+av=a
+bv=b
+a=T.dvector()
+b=T.dvector()
+tf= theano.function([a,b],%(expr)s)
+"""%locals()
+)
+    ret=t3.repeat(nb_repeat,nb_call)
+    return np.asarray(ret)
+def timeit_2vector(nb_element=1e6, nb_repeat=3, nb_call=int(1e2), expr="a**2 + b**2 + 2*a*b", do_unalign=False, do_amd=True):
+    print
+    print "timeit_2vector(nb_element=%(nb_element)s,nb_repeat=%(nb_repeat)s,nb_call=%(nb_call)s, expr=%(expr)s, do_unalign=%(do_unalign)s)"%locals()
+    if do_unalign:
+        init = "import numpy as np; a = np.empty(%(nb_element)s, dtype='b1,f8')['f1'];b = np.empty(%(nb_element)s, dtype='b1,f8')['f1'];a[:] = np.arange(len(a));b[:] = np.arange(len(b));"%locals()
+    else:
+        init = "import numpy as np; a = np.arange(%(nb_element)s);b = np.arange(%(nb_element)s)"%locals()
+    t1 = timeit.Timer("%(expr)s"%locals(),"from numpy import exp; %(init)s"%locals())
+    ret1=t1.repeat(nb_repeat,nb_call)
+    ret1=np.asarray(ret1)
+    print "NumPy time",ret1, ret1.min()
+    t2 = timeit.Timer("""ne.evaluate("%(expr)s")"""%locals(),
+                      "import numexpr as ne; %(init)s"%locals())
+    ret2=t2.repeat(nb_repeat,nb_call)
+    ret2=np.asarray(ret2)
+    print "Numexpr time",ret2, ret2.min()
+    theano.config.lib.amdlibm = False
+    ret3 = timeit_2vector_theano(init, nb_element,nb_repeat,nb_call,expr)
+    print "Theano time",ret3, ret3.min()
+    if do_amd:
+        theano.config.lib.amdlibm = True
+        ret4 = timeit_2vector_theano(init, nb_element,nb_repeat,nb_call,expr)
+        print "Theano time(with amdlibm)",ret3, ret3.min()
+    print "Numexpr vs NumPy",ret1.min()/ret2.min()
+    print "Theano vs NumPy",ret1.min()/ret3.min()
+    print "Theano vs Numexpr",ret2.min()/ret3.min()
+    if do_amd:
+        print "Theano(amdlibm) vs NumPy",ret1.min()/ret4.min()
+        print "Theano(amdlibm) vs Numexpr",ret2.min()/ret4.min()
+        return ret1,ret2,ret3,ret4
+    return ret1,ret2,ret3
+def exec_timeit_2vector(expr, nb_call_scal=1, fname=None, do_unalign=False, do_amd=True):
+    time=[]
+    #exp = [(1,100000),(1e1,100000),(1e2,100000),(1e3,100000), (5e3,50000),
+    exp = [(1e3,100000),(5e3,50000), \
+           (1e4,10000),(5e4,5000),(1e5,2000),(1e6,200),(1e7,10)
+           ]
+    for nb_e, nb_c in exp:
+        time.append(timeit_2vector(nb_element=nb_e, nb_repeat=3, nb_call=nb_c*nb_call_scal, expr=expr, do_amd=do_amd))
+    if do_unalign:
+        time_unalign=[]
+        for nb_e, nb_c in exp:
+            time_unalign.append(timeit_2vector(nb_element=nb_e, nb_repeat=3, nb_call=nb_c*nb_call_scal, expr=expr, do_unalign=True, do_amd=do_amd))
+    print time
+    num_speedup = np.asarray([t[0].min()/t[1].min() for t in time],"float32")
+    print "Numexpr vs NumPy",num_speedup,num_speedup.min(),num_speedup.max()
+    theano_speedup = np.asarray([t[0].min()/t[2].min() for t in time],"float32")
+    print "Theano vs NumPy",theano_speedup,theano_speedup.min(),theano_speedup.max()
+    theano_num_speedup = np.asarray([t[1].min()/t[2].min() for t in time],"float32")
+    print "Theano vs Numexpr",theano_num_speedup,theano_num_speedup.min(),theano_num_speedup.max()
+    if do_amd:
+        theano_speedup2 = np.asarray([t[0].min()/t[3].min() for t in time],"float32")
+        print "Theano vs NumPy",theano_speedup,theano_speedup.min(),theano_speedup.max()
+        theano_num_speedup2 = np.asarray([t[1].min()/t[3].min() for t in time],"float32")
+        print "Theano vs Numexpr",theano_num_speedup,theano_num_speedup.min(),theano_num_speedup.max()
+    nb_calls=[e[0] for e in exp]
+    for cmp in range(1,len(time[0])):
+        speedup = np.asarray([t[0].min()/t[cmp].min() for t in time],"float32")
+        pylab.semilogx(nb_calls, speedup, linewidth=1.0)
+    if do_unalign:
+        for cmp in range(1,len(time[0])):
+            speedup = np.asarray([t[0].min()/t[cmp].min() for t in time_unalign],"float32")
+            pylab.semilogx(nb_calls, speedup, linewidth=1.0)
+    pylab.axhline(y=1, linewidth=1.0, color='black')
+    pylab.xlabel('Dimension of real valued vectors a and b')
+    pylab.ylabel('Speed up vs NumPy')
+    if do_unalign and do_amd:
+        pylab.legend(("Numexpr","Theano","Theano(amdlibm)", "Numexpr(unalign)",
+                      "Theano(unalign)","Theano(amdlibm,unalign)"),loc='upper left')
+    elif do_unalign and not do_amd:
+        pylab.legend(("Numexpr","Theano","Numexpr(unalign)",
+                      "Theano(unalign)",),loc='upper left')
+    elif not do_unalign and do_amd:
+        pylab.legend(("Numexpr","Theano","Theano(amdlibm)"),loc='upper left')
+    else:
+        pylab.legend(("Numexpr","Theano"),loc='upper left')
+    pylab.grid(True)
+    if fname:
+        pylab.savefig(fname)
+        pylab.clf()
+    else:
+        pylab.show()
+def execs_timeit_2vector(exprs, fname=None):
+    """
+    exprs is a list of list of expr to evaluate
+    The first level of list is put into different graph section in the same graph.
+    The second level is the expression to put in each section
+    """
+    #exp = [(1,100000),(1e1,100000),(1e2,100000),(1e3,100000), (5e3,50000),
+    exp = [(1e3,100000),(5e3,50000), \
+           (1e4,10000),(5e4,5000),(1e5,2000),(1e6,200),(1e7,10)
+           ]
+#TO TEST UNCOMMENT THIS LINE
+    #exp = [(1,1000),(1e1,1000),(1e2,1000),]
+    times=[]
+    str_expr=[]
+    for g_exprs in exprs:
+        for expr in g_exprs:
+            nb_call_scal=1
+            if isinstance(expr,tuple):
+                nb_call_scal=expr[1]
+                expr = expr[0]
+            str_expr.append(expr)
+            time=[]
+            for nb_e, nb_c in exp:
+                time.append(timeit_2vector(nb_element=nb_e, nb_repeat=3, nb_call=nb_c*nb_call_scal, expr=expr, do_amd=False))
+            times.append(time)
+    nb_calls=[e[0] for e in exp]
+    legends=[]
+    colors=['b','r','g','c', 'm', 'y']
+    assert len(colors)>=len(times)
+    fig = pylab.figure()
+    for idx,(time,expr) in enumerate(zip(times,str_expr)):
+        pylab.subplot(220+idx+1)
+        pylab.subplots_adjust(wspace=0.25, hspace=0.25)
+        #legend=[]
+        #plot = fig.add_subplot(1,len(exprs),idx)
+        speedup = [t[0].min()/t[1].min() for t in time]
+        pylab.semilogx(nb_calls, speedup, linewidth=1.0, linestyle = '--', color='r')
+        speedup = [t[0].min()/t[2].min() for t in time]
+        pylab.semilogx(nb_calls, speedup, linewidth=1.0, color = 'b')
+        pylab.grid(True)
+        if (idx == 2) or (idx == 3):
+            pylab.xlabel('Dimension of vectors a and b')
+        if (idx == 0) or (idx == 2):
+            pylab.ylabel('Speed up vs NumPy')
+        pylab.axhline(y=1, linewidth=1.0, color='black')
+        pylab.xlim(1e3,1e7)
+        pylab.xticks([1e3,1e5,1e7],['1e3','1e5','1e7'])
+        pylab.title(expr)
+        #for time,expr,color in zip(times,str_expr,colors):
+        #    speedup = [t[0].min()/t[1].min() for t in time]
+        #    plot.semilogx(nb_calls, speedup, linewidth=1.0, linestyle='--', color=color)
+        #    speedup = [t[0].min()/t[2].min() for t in time]
+        #    plot.semilogx(nb_calls, speedup, linewidth=1.0, color=color)
+            #legend += ["Numexpr "+expr,"Theano "+expr]
+    #pylab.title('Speed up Numexpr and Theano vs NumPy')
+    #pylab.grid(True)
+    #pylab.xlabel('Nb element')
+    #pylab.ylabel('Speed up vs NumPy')
+    #pylab.legend(legend,loc='upper left')
+#    fig.legend(legend,loc='upper left')
+    if fname:
+        fig.savefig(fname)
+        pylab.clf()
+    else:
+        pylab.show()
+execs_timeit_2vector([
+        ["a**2 + b**2 + 2*a*b",
+         "2*a + 3*b",
+         "a+1",],
+        [("2*a + b**10",.2)]
+#"2*a + b*b*b*b*b*b*b*b*b*b",
+#("2*a + exp(b)",.3),
+],fname="multiple_graph.pdf"
+)
+###
+### This case is the one gived on numexpr web site(http://code.google.com/p/numexpr/) as of 16 June 2010
+### a**2 + b**2 + 2*a*b
+#exec_timeit_2vector("a**2 + b**2 + 2*a*b",fname="speedup_numexpr_mulpow2vec.png", do_amd=False)
+###
+### This case is the one gived on numexpr web site(http://code.google.com/p/numexpr/wiki/Overview) as of 16 June 2010
+### 2*a + 3*b
+#exec_timeit_2vector("2*a + 3*b",fname="speedup_numexpr_mul2vec.png", do_amd=False)
+###
+### This case is the one gived on numexpr web site(http://code.google.com/p/numexpr/wiki/Overview) as of 16 June 2010
+### 2*a + b**10
+#exec_timeit_2vector("2*a + b**10",.2,fname="speedup_numexpr_mulpow2vec_simple.png")
+#exec_timeit_2vector("2*a + b*b*b*b*b*b*b*b*b*b",fname="speedup_numexpr_mulpow2vec_simpleV2.png", do_amd=False)
+###
+### We try to see if the pow optimized speed is available for exp too.
+### 2*a + exp(b)
+#exec_timeit_2vector("2*a + exp(b)",.3,fname="speedup_numexpr_mulexp2vec.png")
+###
+### The simplest case where we should show the overhead at its maximum effect
+### a+1
+#exec_timeit_2vector("a+1",fname="speedup_numexpr_add1vec.png")
+#exec_timeit_2vector("a+1",.2,fname="speedup_numexpr_add1vec_unalign.png",do_unalign=True, do_amd=False)
+#exec_timeit_2vector("2*a + b**10",.1,fname="speedup_numexpr_mulpow2vec_simple_unalign.png",do_unalign=True)
--- a/benchmark/numexpr/multiple_graph.png
+++ b/benchmark/numexpr/multiple_graph.png
--- a/theano/sandbox/test_neighbours.py
+++ b/theano/sandbox/test_neighbours.py
@@ -6,17 +6,18 @@ from neighbours import images2neibs, neibs2images
 def neibs_test():
-    images = shared(arange(2*2*4*4, dtype='float32').reshape(2,2,4,4))
+    shape = (100,40,18,18)
+    images = shared(arange(prod(shape), dtype='float32').reshape(shape))
    neib_shape = T.as_tensor_variable((2,2))#(array((2,2), dtype='float32'))
    f = function([], images2neibs(images, neib_shape))
-    print images.value
+    #print images.value
    neibs = f()
-    print neibs
+    #print neibs
    g = function([], neibs2images(neibs, neib_shape, images.shape))
-    print g()
+    #print g()
    assert allclose(images.value,g())
 neibs_test()
--- a/theano/tensor/opt.py
+++ b/theano/tensor/opt.py
@@ -1828,6 +1828,33 @@ def local_pow_specialize(node):
            if N.all(y == -2):
                rval = [T.inv(T.sqr(xsym))]
+            # Optimize all integral powers in [-RANGE, RANGE]
+            if config.experimental.pow and rval is None and abs(y)==int(abs(y)) and abs(y) <= 512:# 512 is too small for the cpu and too big for some gpu!
+                pow2 = [xsym]
+                pow2_scal = [theano.scalar.Scalar(xsym.dtype)()]
+                y_to_do = abs(y)
+                for i in range(int(numpy.log2(y_to_do))):
+                    pow2.append(T.sqr(pow2[i]))
+                    pow2_scal.append(theano.scalar.sqr(pow2_scal[i]))
+                rval1 = None
+                rval1_scal = None
+                while y_to_do>0:
+                    log_to_do = int(numpy.log2(y_to_do))                    
+                    if rval1:
+                        rval1 *= pow2[log_to_do]
+                        rval1_scal *= pow2_scal[log_to_do]
+                    else: 
+                        rval1 = pow2[log_to_do]
+                        rval1_scal = pow2_scal[log_to_do]
+                    y_to_do -= 2**log_to_do
+                if abs(y)>2:
+                    #We fuse all the pow together here to make compilation faster
+                    rval1 = Elemwise(theano.scalar.Composite([pow2_scal[0]],[rval1_scal])).make_node(xsym)
+                if y<0:
+                    rval = [T.inv(rval1)]
+                else:
+                    rval = [rval1]
            if rval:
                rval[0] = T.cast(rval[0], odtype)
                assert rval[0].type == node.outputs[0].type, (rval, node.outputs)
@@ -1835,6 +1862,10 @@ def local_pow_specialize(node):
    else:
        return False
 register_specialize(local_pow_specialize)
+theano.configparser.AddConfigVar('experimental.pow',
+        "Transform a pow to a constant integer to a graph of mul. Fast on cpu, but more work needed for gpu.",
+        theano.configparser.BoolParam(False),
+        )
 @gof.local_optimizer([T.mul])
 def local_mul_specialize(node):

--- a/theano/tensor/tests/test_opt.py
+++ b/theano/tensor/tests/test_opt.py
 ## PENDING REWRITE OF tensor_opt.py
+import time
 import numpy
 import theano
@@ -13,7 +14,7 @@ from theano.gof import Env
 from theano.tensor.elemwise import DimShuffle
 from theano import pprint, shared
 from theano.tests import unittest_tools as utt
-#import scalar_opt
+import scalar as scal
 from theano import function, compile
 from nose.plugins.skip import SkipTest
@@ -680,7 +681,6 @@ class test_fusion(unittest.TestCase):
            ]
        if slice:
            cases = cases[slice]
-        import time
        times=numpy.zeros(len(cases))
        fail1=[]
        fail2=[]
@@ -1216,6 +1216,119 @@ def test_local_mul_specialize():
    assert nodes == [T.mul]
+def speed_local_pow_specialize_range():
+    val = numpy.random.rand(1e7)
+    v = T.vector()
+    mode = compile.mode.get_default_mode()
+    mode_without_pow_opt = mode.excluding('local_pow_specialize')
+    for i in range(500,513):
+        f1 = function([v], v**i, mode=mode)
+        f2 = function([v], v**i, mode=mode_without_pow_opt)
+        assert len(f1.maker.env.toposort())==1
+        t1=time.time()
+        f1(val)
+        t2=time.time()
+        f2(val)
+        t3=time.time()
+        print i,t2-t1,t3-t2,t2-t1<t3-t2
+        if not t2-t1<t3-t2:
+            print "WARNING WE ARE SLOWER"
+    for i in range(-3,-1500,-1):
+        f1 = function([v], v**i, mode=mode)
+        f2 = function([v], v**i, mode=mode_without_pow_opt)
+        assert len(f1.maker.env.toposort())==1
+        t1=time.time()
+        f1(val)
+        t2=time.time()
+        f2(val)
+        t3=time.time()
+        print i,t2-t1,t3-t2,t2-t1<t3-t2
+        if not t2-t1<t3-t2:
+            print "WARNING WE ARE SLOWER"
+def test_local_pow_specialize():
+    # test a few cases to make sure that the basics are covered
+    # 
+    mode = theano.config.mode
+    if mode == 'FAST_COMPILE':
+       mode = 'FAST_RUN'
+    mode = compile.mode.get_mode(mode)
+    mode = mode.excluding('fusion')
+    v = T.vector()
+    val = numpy.arange(10,dtype=theano.config.floatX)
+    val_no0 = numpy.arange(1,10,dtype=theano.config.floatX)
+    f = function([v], v**0, mode=mode)
+    nodes = [node.op for node in f.maker.env.toposort()]
+    assert nodes == [Shape_i(0), T.alloc]
+    assert numpy.allclose(f(val),val**0)
+    f = function([v], v**1, mode=mode)
+    nodes = [node.op for node in f.maker.env.toposort()]
+    assert nodes == []
+    assert numpy.allclose(f(val),val**1)
+    f = function([v], v**(-1), mode=mode)
+    nodes = [node.op for node in f.maker.env.toposort()]
+    assert nodes == [T.inv]
+    assert numpy.allclose(f(val_no0),val_no0**(-1))
+    f = function([v], v**2, mode=mode)
+    nodes = [node.op for node in f.maker.env.toposort()]
+    assert nodes == [T.sqr]
+    assert numpy.allclose(f(val),val**2)
+    f = function([v], v**(-2), mode=mode)
+    nodes = [node.op for node in f.maker.env.toposort()]
+    assert len(nodes)==2
+    assert nodes[0] == T.sqr
+    assert isinstance(nodes[1].scalar_op,theano.scalar.basic.Inv)
+#    assert nodes == [T.sqr,T.inv]#Why this don't work?
+    assert numpy.allclose(f(val_no0),val_no0**(-2))
+    f = function([v], v**(.5), mode=mode)
+    nodes = [node.op for node in f.maker.env.toposort()]
+    assert nodes == [T.sqrt]
+    assert numpy.allclose(f(val),val**(.5))
+    f = function([v], v**(-.5), mode=mode)
+    nodes = [node.op for node in f.maker.env.toposort()]
+    assert len(nodes)==2
+    assert nodes[0] == T.sqrt
+    assert isinstance(nodes[1].scalar_op,theano.scalar.basic.Inv)
+#    assert nodes == [T.sqrt,T.inv]#Why this don't work?
+    assert numpy.allclose(f(val_no0),val_no0**(-.5))
+    if config.experimental.pow:
+        print "Test experimental.pow=True"
+        f = function([v], v**(15), mode=mode)
+        nodes = [node.op for node in f.maker.env.toposort()]
+        assert len(nodes)==1
+        assert isinstance(nodes[0].scalar_op,theano.scalar.Composite)
+        assert numpy.allclose(f(val),val**15)
+        f = function([v], v**(-15), mode=mode)
+        nodes = [node.op for node in f.maker.env.toposort()]
+        assert len(nodes)==2
+        assert isinstance(nodes[0].scalar_op,theano.scalar.Composite)
+        assert isinstance(nodes[-1].scalar_op,theano.scalar.basic.Inv)
+        assert numpy.allclose(f(val_no0),val_no0**(-15))
+        f = function([v], v**(16), mode=mode)
+        nodes = [node.op for node in f.maker.env.toposort()]
+        assert len(nodes) == 1
+        assert isinstance(nodes[0].scalar_op,theano.scalar.Composite)
+        assert numpy.allclose(f(val),val**16)
+        f = function([v], v**(-16), mode=mode)
+        nodes = [node.op for node in f.maker.env.toposort()]
+        assert len(nodes) == 2
+        assert isinstance(nodes[0].scalar_op,theano.scalar.Composite)
+        assert isinstance(nodes[-1].scalar_op,theano.scalar.basic.Inv)
+        assert numpy.allclose(f(val_no0),val_no0**(-16))
 class T_Rebroadcast(unittest.TestCase):
    def test_local_useless_rebroadcast(self):