提交 9df496ab authored 作者: Michael I Mandel's avatar Michael I Mandel

merge

import numpy as np
import numexpr as ne
import timeit
import theano
import theano.tensor as T
import pylab
import matplotlib.pyplot as pyplot
def timeit_2vector_theano(init, nb_element=1e6, nb_repeat=3, nb_call=int(1e2), expr="a**2 + b**2 + 2*a*b"):
t3 = timeit.Timer("tf(av,bv)",
"""
import theano
import theano.tensor as T
import numexpr as ne
from theano.tensor import exp
%(init)s
av=a
bv=b
a=T.dvector()
b=T.dvector()
tf= theano.function([a,b],%(expr)s)
"""%locals()
)
ret=t3.repeat(nb_repeat,nb_call)
return np.asarray(ret)
def timeit_2vector(nb_element=1e6, nb_repeat=3, nb_call=int(1e2), expr="a**2 + b**2 + 2*a*b", do_unalign=False, do_amd=True):
print
print "timeit_2vector(nb_element=%(nb_element)s,nb_repeat=%(nb_repeat)s,nb_call=%(nb_call)s, expr=%(expr)s, do_unalign=%(do_unalign)s)"%locals()
if do_unalign:
init = "import numpy as np; a = np.empty(%(nb_element)s, dtype='b1,f8')['f1'];b = np.empty(%(nb_element)s, dtype='b1,f8')['f1'];a[:] = np.arange(len(a));b[:] = np.arange(len(b));"%locals()
else:
init = "import numpy as np; a = np.arange(%(nb_element)s);b = np.arange(%(nb_element)s)"%locals()
t1 = timeit.Timer("%(expr)s"%locals(),"from numpy import exp; %(init)s"%locals())
ret1=t1.repeat(nb_repeat,nb_call)
ret1=np.asarray(ret1)
print "NumPy time",ret1, ret1.min()
t2 = timeit.Timer("""ne.evaluate("%(expr)s")"""%locals(),
"import numexpr as ne; %(init)s"%locals())
ret2=t2.repeat(nb_repeat,nb_call)
ret2=np.asarray(ret2)
print "Numexpr time",ret2, ret2.min()
theano.config.lib.amdlibm = False
ret3 = timeit_2vector_theano(init, nb_element,nb_repeat,nb_call,expr)
print "Theano time",ret3, ret3.min()
if do_amd:
theano.config.lib.amdlibm = True
ret4 = timeit_2vector_theano(init, nb_element,nb_repeat,nb_call,expr)
print "Theano time(with amdlibm)",ret3, ret3.min()
print "Numexpr vs NumPy",ret1.min()/ret2.min()
print "Theano vs NumPy",ret1.min()/ret3.min()
print "Theano vs Numexpr",ret2.min()/ret3.min()
if do_amd:
print "Theano(amdlibm) vs NumPy",ret1.min()/ret4.min()
print "Theano(amdlibm) vs Numexpr",ret2.min()/ret4.min()
return ret1,ret2,ret3,ret4
return ret1,ret2,ret3
def exec_timeit_2vector(expr, nb_call_scal=1, fname=None, do_unalign=False, do_amd=True):
time=[]
#exp = [(1,100000),(1e1,100000),(1e2,100000),(1e3,100000), (5e3,50000),
exp = [(1e3,100000),(5e3,50000), \
(1e4,10000),(5e4,5000),(1e5,2000),(1e6,200),(1e7,10)
]
for nb_e, nb_c in exp:
time.append(timeit_2vector(nb_element=nb_e, nb_repeat=3, nb_call=nb_c*nb_call_scal, expr=expr, do_amd=do_amd))
if do_unalign:
time_unalign=[]
for nb_e, nb_c in exp:
time_unalign.append(timeit_2vector(nb_element=nb_e, nb_repeat=3, nb_call=nb_c*nb_call_scal, expr=expr, do_unalign=True, do_amd=do_amd))
print time
num_speedup = np.asarray([t[0].min()/t[1].min() for t in time],"float32")
print "Numexpr vs NumPy",num_speedup,num_speedup.min(),num_speedup.max()
theano_speedup = np.asarray([t[0].min()/t[2].min() for t in time],"float32")
print "Theano vs NumPy",theano_speedup,theano_speedup.min(),theano_speedup.max()
theano_num_speedup = np.asarray([t[1].min()/t[2].min() for t in time],"float32")
print "Theano vs Numexpr",theano_num_speedup,theano_num_speedup.min(),theano_num_speedup.max()
if do_amd:
theano_speedup2 = np.asarray([t[0].min()/t[3].min() for t in time],"float32")
print "Theano vs NumPy",theano_speedup,theano_speedup.min(),theano_speedup.max()
theano_num_speedup2 = np.asarray([t[1].min()/t[3].min() for t in time],"float32")
print "Theano vs Numexpr",theano_num_speedup,theano_num_speedup.min(),theano_num_speedup.max()
nb_calls=[e[0] for e in exp]
for cmp in range(1,len(time[0])):
speedup = np.asarray([t[0].min()/t[cmp].min() for t in time],"float32")
pylab.semilogx(nb_calls, speedup, linewidth=1.0)
if do_unalign:
for cmp in range(1,len(time[0])):
speedup = np.asarray([t[0].min()/t[cmp].min() for t in time_unalign],"float32")
pylab.semilogx(nb_calls, speedup, linewidth=1.0)
pylab.axhline(y=1, linewidth=1.0, color='black')
pylab.xlabel('Dimension of real valued vectors a and b')
pylab.ylabel('Speed up vs NumPy')
if do_unalign and do_amd:
pylab.legend(("Numexpr","Theano","Theano(amdlibm)", "Numexpr(unalign)",
"Theano(unalign)","Theano(amdlibm,unalign)"),loc='upper left')
elif do_unalign and not do_amd:
pylab.legend(("Numexpr","Theano","Numexpr(unalign)",
"Theano(unalign)",),loc='upper left')
elif not do_unalign and do_amd:
pylab.legend(("Numexpr","Theano","Theano(amdlibm)"),loc='upper left')
else:
pylab.legend(("Numexpr","Theano"),loc='upper left')
pylab.grid(True)
if fname:
pylab.savefig(fname)
pylab.clf()
else:
pylab.show()
def execs_timeit_2vector(exprs, fname=None):
"""
exprs is a list of list of expr to evaluate
The first level of list is put into different graph section in the same graph.
The second level is the expression to put in each section
"""
#exp = [(1,100000),(1e1,100000),(1e2,100000),(1e3,100000), (5e3,50000),
exp = [(1e3,100000),(5e3,50000), \
(1e4,10000),(5e4,5000),(1e5,2000),(1e6,200),(1e7,10)
]
#TO TEST UNCOMMENT THIS LINE
#exp = [(1,1000),(1e1,1000),(1e2,1000),]
times=[]
str_expr=[]
for g_exprs in exprs:
for expr in g_exprs:
nb_call_scal=1
if isinstance(expr,tuple):
nb_call_scal=expr[1]
expr = expr[0]
str_expr.append(expr)
time=[]
for nb_e, nb_c in exp:
time.append(timeit_2vector(nb_element=nb_e, nb_repeat=3, nb_call=nb_c*nb_call_scal, expr=expr, do_amd=False))
times.append(time)
nb_calls=[e[0] for e in exp]
legends=[]
colors=['b','r','g','c', 'm', 'y']
assert len(colors)>=len(times)
fig = pylab.figure()
for idx,(time,expr) in enumerate(zip(times,str_expr)):
pylab.subplot(220+idx+1)
pylab.subplots_adjust(wspace=0.25, hspace=0.25)
#legend=[]
#plot = fig.add_subplot(1,len(exprs),idx)
speedup = [t[0].min()/t[1].min() for t in time]
pylab.semilogx(nb_calls, speedup, linewidth=1.0, linestyle = '--', color='r')
speedup = [t[0].min()/t[2].min() for t in time]
pylab.semilogx(nb_calls, speedup, linewidth=1.0, color = 'b')
pylab.grid(True)
if (idx == 2) or (idx == 3):
pylab.xlabel('Dimension of vectors a and b')
if (idx == 0) or (idx == 2):
pylab.ylabel('Speed up vs NumPy')
pylab.axhline(y=1, linewidth=1.0, color='black')
pylab.xlim(1e3,1e7)
pylab.xticks([1e3,1e5,1e7],['1e3','1e5','1e7'])
pylab.title(expr)
#for time,expr,color in zip(times,str_expr,colors):
# speedup = [t[0].min()/t[1].min() for t in time]
# plot.semilogx(nb_calls, speedup, linewidth=1.0, linestyle='--', color=color)
# speedup = [t[0].min()/t[2].min() for t in time]
# plot.semilogx(nb_calls, speedup, linewidth=1.0, color=color)
#legend += ["Numexpr "+expr,"Theano "+expr]
#pylab.title('Speed up Numexpr and Theano vs NumPy')
#pylab.grid(True)
#pylab.xlabel('Nb element')
#pylab.ylabel('Speed up vs NumPy')
#pylab.legend(legend,loc='upper left')
# fig.legend(legend,loc='upper left')
if fname:
fig.savefig(fname)
pylab.clf()
else:
pylab.show()
execs_timeit_2vector([
["a**2 + b**2 + 2*a*b",
"2*a + 3*b",
"a+1",],
[("2*a + b**10",.2)]
#"2*a + b*b*b*b*b*b*b*b*b*b",
#("2*a + exp(b)",.3),
],fname="multiple_graph.pdf"
)
###
### This case is the one gived on numexpr web site(http://code.google.com/p/numexpr/) as of 16 June 2010
### a**2 + b**2 + 2*a*b
#exec_timeit_2vector("a**2 + b**2 + 2*a*b",fname="speedup_numexpr_mulpow2vec.png", do_amd=False)
###
### This case is the one gived on numexpr web site(http://code.google.com/p/numexpr/wiki/Overview) as of 16 June 2010
### 2*a + 3*b
#exec_timeit_2vector("2*a + 3*b",fname="speedup_numexpr_mul2vec.png", do_amd=False)
###
### This case is the one gived on numexpr web site(http://code.google.com/p/numexpr/wiki/Overview) as of 16 June 2010
### 2*a + b**10
#exec_timeit_2vector("2*a + b**10",.2,fname="speedup_numexpr_mulpow2vec_simple.png")
#exec_timeit_2vector("2*a + b*b*b*b*b*b*b*b*b*b",fname="speedup_numexpr_mulpow2vec_simpleV2.png", do_amd=False)
###
### We try to see if the pow optimized speed is available for exp too.
### 2*a + exp(b)
#exec_timeit_2vector("2*a + exp(b)",.3,fname="speedup_numexpr_mulexp2vec.png")
###
### The simplest case where we should show the overhead at its maximum effect
### a+1
#exec_timeit_2vector("a+1",fname="speedup_numexpr_add1vec.png")
#exec_timeit_2vector("a+1",.2,fname="speedup_numexpr_add1vec_unalign.png",do_unalign=True, do_amd=False)
#exec_timeit_2vector("2*a + b**10",.1,fname="speedup_numexpr_mulpow2vec_simple_unalign.png",do_unalign=True)
...@@ -6,17 +6,18 @@ from neighbours import images2neibs, neibs2images ...@@ -6,17 +6,18 @@ from neighbours import images2neibs, neibs2images
def neibs_test(): def neibs_test():
images = shared(arange(2*2*4*4, dtype='float32').reshape(2,2,4,4)) shape = (100,40,18,18)
images = shared(arange(prod(shape), dtype='float32').reshape(shape))
neib_shape = T.as_tensor_variable((2,2))#(array((2,2), dtype='float32')) neib_shape = T.as_tensor_variable((2,2))#(array((2,2), dtype='float32'))
f = function([], images2neibs(images, neib_shape)) f = function([], images2neibs(images, neib_shape))
print images.value #print images.value
neibs = f() neibs = f()
print neibs #print neibs
g = function([], neibs2images(neibs, neib_shape, images.shape)) g = function([], neibs2images(neibs, neib_shape, images.shape))
print g() #print g()
assert allclose(images.value,g()) assert allclose(images.value,g())
neibs_test() neibs_test()
...@@ -1828,6 +1828,33 @@ def local_pow_specialize(node): ...@@ -1828,6 +1828,33 @@ def local_pow_specialize(node):
if N.all(y == -2): if N.all(y == -2):
rval = [T.inv(T.sqr(xsym))] rval = [T.inv(T.sqr(xsym))]
# Optimize all integral powers in [-RANGE, RANGE]
if config.experimental.pow and rval is None and abs(y)==int(abs(y)) and abs(y) <= 512:# 512 is too small for the cpu and too big for some gpu!
pow2 = [xsym]
pow2_scal = [theano.scalar.Scalar(xsym.dtype)()]
y_to_do = abs(y)
for i in range(int(numpy.log2(y_to_do))):
pow2.append(T.sqr(pow2[i]))
pow2_scal.append(theano.scalar.sqr(pow2_scal[i]))
rval1 = None
rval1_scal = None
while y_to_do>0:
log_to_do = int(numpy.log2(y_to_do))
if rval1:
rval1 *= pow2[log_to_do]
rval1_scal *= pow2_scal[log_to_do]
else:
rval1 = pow2[log_to_do]
rval1_scal = pow2_scal[log_to_do]
y_to_do -= 2**log_to_do
if abs(y)>2:
#We fuse all the pow together here to make compilation faster
rval1 = Elemwise(theano.scalar.Composite([pow2_scal[0]],[rval1_scal])).make_node(xsym)
if y<0:
rval = [T.inv(rval1)]
else:
rval = [rval1]
if rval: if rval:
rval[0] = T.cast(rval[0], odtype) rval[0] = T.cast(rval[0], odtype)
assert rval[0].type == node.outputs[0].type, (rval, node.outputs) assert rval[0].type == node.outputs[0].type, (rval, node.outputs)
...@@ -1835,6 +1862,10 @@ def local_pow_specialize(node): ...@@ -1835,6 +1862,10 @@ def local_pow_specialize(node):
else: else:
return False return False
register_specialize(local_pow_specialize) register_specialize(local_pow_specialize)
theano.configparser.AddConfigVar('experimental.pow',
"Transform a pow to a constant integer to a graph of mul. Fast on cpu, but more work needed for gpu.",
theano.configparser.BoolParam(False),
)
@gof.local_optimizer([T.mul]) @gof.local_optimizer([T.mul])
def local_mul_specialize(node): def local_mul_specialize(node):
......
## PENDING REWRITE OF tensor_opt.py ## PENDING REWRITE OF tensor_opt.py
import time
import numpy import numpy
import theano import theano
...@@ -13,7 +14,7 @@ from theano.gof import Env ...@@ -13,7 +14,7 @@ from theano.gof import Env
from theano.tensor.elemwise import DimShuffle from theano.tensor.elemwise import DimShuffle
from theano import pprint, shared from theano import pprint, shared
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
#import scalar_opt import scalar as scal
from theano import function, compile from theano import function, compile
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
...@@ -680,7 +681,6 @@ class test_fusion(unittest.TestCase): ...@@ -680,7 +681,6 @@ class test_fusion(unittest.TestCase):
] ]
if slice: if slice:
cases = cases[slice] cases = cases[slice]
import time
times=numpy.zeros(len(cases)) times=numpy.zeros(len(cases))
fail1=[] fail1=[]
fail2=[] fail2=[]
...@@ -1216,6 +1216,119 @@ def test_local_mul_specialize(): ...@@ -1216,6 +1216,119 @@ def test_local_mul_specialize():
assert nodes == [T.mul] assert nodes == [T.mul]
def speed_local_pow_specialize_range():
val = numpy.random.rand(1e7)
v = T.vector()
mode = compile.mode.get_default_mode()
mode_without_pow_opt = mode.excluding('local_pow_specialize')
for i in range(500,513):
f1 = function([v], v**i, mode=mode)
f2 = function([v], v**i, mode=mode_without_pow_opt)
assert len(f1.maker.env.toposort())==1
t1=time.time()
f1(val)
t2=time.time()
f2(val)
t3=time.time()
print i,t2-t1,t3-t2,t2-t1<t3-t2
if not t2-t1<t3-t2:
print "WARNING WE ARE SLOWER"
for i in range(-3,-1500,-1):
f1 = function([v], v**i, mode=mode)
f2 = function([v], v**i, mode=mode_without_pow_opt)
assert len(f1.maker.env.toposort())==1
t1=time.time()
f1(val)
t2=time.time()
f2(val)
t3=time.time()
print i,t2-t1,t3-t2,t2-t1<t3-t2
if not t2-t1<t3-t2:
print "WARNING WE ARE SLOWER"
def test_local_pow_specialize():
# test a few cases to make sure that the basics are covered
#
mode = theano.config.mode
if mode == 'FAST_COMPILE':
mode = 'FAST_RUN'
mode = compile.mode.get_mode(mode)
mode = mode.excluding('fusion')
v = T.vector()
val = numpy.arange(10,dtype=theano.config.floatX)
val_no0 = numpy.arange(1,10,dtype=theano.config.floatX)
f = function([v], v**0, mode=mode)
nodes = [node.op for node in f.maker.env.toposort()]
assert nodes == [Shape_i(0), T.alloc]
assert numpy.allclose(f(val),val**0)
f = function([v], v**1, mode=mode)
nodes = [node.op for node in f.maker.env.toposort()]
assert nodes == []
assert numpy.allclose(f(val),val**1)
f = function([v], v**(-1), mode=mode)
nodes = [node.op for node in f.maker.env.toposort()]
assert nodes == [T.inv]
assert numpy.allclose(f(val_no0),val_no0**(-1))
f = function([v], v**2, mode=mode)
nodes = [node.op for node in f.maker.env.toposort()]
assert nodes == [T.sqr]
assert numpy.allclose(f(val),val**2)
f = function([v], v**(-2), mode=mode)
nodes = [node.op for node in f.maker.env.toposort()]
assert len(nodes)==2
assert nodes[0] == T.sqr
assert isinstance(nodes[1].scalar_op,theano.scalar.basic.Inv)
# assert nodes == [T.sqr,T.inv]#Why this don't work?
assert numpy.allclose(f(val_no0),val_no0**(-2))
f = function([v], v**(.5), mode=mode)
nodes = [node.op for node in f.maker.env.toposort()]
assert nodes == [T.sqrt]
assert numpy.allclose(f(val),val**(.5))
f = function([v], v**(-.5), mode=mode)
nodes = [node.op for node in f.maker.env.toposort()]
assert len(nodes)==2
assert nodes[0] == T.sqrt
assert isinstance(nodes[1].scalar_op,theano.scalar.basic.Inv)
# assert nodes == [T.sqrt,T.inv]#Why this don't work?
assert numpy.allclose(f(val_no0),val_no0**(-.5))
if config.experimental.pow:
print "Test experimental.pow=True"
f = function([v], v**(15), mode=mode)
nodes = [node.op for node in f.maker.env.toposort()]
assert len(nodes)==1
assert isinstance(nodes[0].scalar_op,theano.scalar.Composite)
assert numpy.allclose(f(val),val**15)
f = function([v], v**(-15), mode=mode)
nodes = [node.op for node in f.maker.env.toposort()]
assert len(nodes)==2
assert isinstance(nodes[0].scalar_op,theano.scalar.Composite)
assert isinstance(nodes[-1].scalar_op,theano.scalar.basic.Inv)
assert numpy.allclose(f(val_no0),val_no0**(-15))
f = function([v], v**(16), mode=mode)
nodes = [node.op for node in f.maker.env.toposort()]
assert len(nodes) == 1
assert isinstance(nodes[0].scalar_op,theano.scalar.Composite)
assert numpy.allclose(f(val),val**16)
f = function([v], v**(-16), mode=mode)
nodes = [node.op for node in f.maker.env.toposort()]
assert len(nodes) == 2
assert isinstance(nodes[0].scalar_op,theano.scalar.Composite)
assert isinstance(nodes[-1].scalar_op,theano.scalar.basic.Inv)
assert numpy.allclose(f(val_no0),val_no0**(-16))
class T_Rebroadcast(unittest.TestCase): class T_Rebroadcast(unittest.TestCase):
def test_local_useless_rebroadcast(self): def test_local_useless_rebroadcast(self):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论