提交 9dd4459a authored 作者: Frederic Bastien's avatar Frederic Bastien

added Skipped test for the elemwise fusion that is disabled.

上级 160eacf8
......@@ -790,7 +790,201 @@ def test_const_type_in_mul_canonizer():
f2(ival, wval, visbval, hidbval, betaval, aval),
f1(ival, wval, visbval, hidbval, betaval, aval))
from theano.compile.sandbox.pfunc import pfunc
from theano.compile.sandbox.sharedvalue import shared
import theano_cuda_ndarray as tcn
import theano
class test_fusion(unittest.TestCase):
def do(self, mode, shared_fn, shp, gpu=False, nb_repeat=1, assert_len_topo=True, slice=None):
"""
param shared_fn: if None, will use compile.function
verify that the elemwise fusion work
Test with and without DimShuffle
"""
#TODO: disable the canonizer?
def my_init(shp,dtype, num=0):
#ret = numpy.asarray(numpy.random.rand(*shp),dtype=dtype)
ret = numpy.zeros(shp, dtype=dtype)+num
return ret
fw, fx, fy, fz = fmatrices('wxyz')
dw, dx, dy, dz = dmatrices('wxyz')
fv = fvector('r').dimshuffle('x',0)
dv = dvector('s').dimshuffle('x',0)
fwv = my_init(shp,'float32',1)
fxv = my_init(shp,'float32',2)
fyv = my_init(shp,'float32',3)
fzv = my_init(shp,'float32',4)
fvv = numpy.asarray(numpy.random.rand(shp[0]),dtype='float32').reshape(1,shp[0])
dwv = my_init(shp,'float64',5)
# dxv = my_init(shp,'float64',6)
# dyv = my_init(shp,'float64',7)
# dzv = my_init(shp,'float64',8)
# dvv = numpy.asarray(numpy.random.rand(shp[0]),dtype='float64').reshape(1,shp[0])
fwx=fw+fx
cases = [
(fx+fy+fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+fzv,'float32'),
(fx*fy*fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv*fyv*fzv,'float32'),
(fx+fy*fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv*fzv,'float32'),
(fx*fy+fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv*fyv+fzv,'float32'),
(fw+fx+fy+fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'),
((fw+fx)+(fy+fz),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'),
(((fw+fx)+fy)+fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'),
((fw+(fx+fy))+fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'),
((fw+(fx+fy)+fz),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'),
(fw+(fx+(fy+fz)),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'),
((fw+fx)+(fy+fz),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'),
(fw*fx*fy*fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv*fxv*fyv*fzv,'float32'),
(fw+fx*fy*fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv*fyv*fzv,'float32'),
(fx+fy*fz*fx,(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv*fzv*fxv,'float32'),
(fx*fy+fz+fy,(fx,fy,fz),(fxv,fyv,fzv),1,fxv*fyv+fzv+fyv,'float32'),
(fx*fy*fz*fw+fx+fy+fz+fw,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fxv*fyv*fzv*fwv+fxv+fyv+fzv+fwv,'float32'),
#test with constant
((fw+fx)+(fy+fz)+2,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'),
(((fw+fx)+2+fy)+fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'),
((fw+(fx+2+fy))+fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'),
((fw+(fx+fy)+2+fz),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'),
(fw+(fx+(fy+fz)+2),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'),
(2+(fw+fx)+(fy+fz),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'),
#mix float32 and float64
(2+(dw+fx)+(fy+fz),(dw,fx,fy,fz),(dwv,fxv,fyv,fzv),1,dwv+fxv+fyv+fzv+2,'float64'),
(2+(fw+dw)+(fy+fz),(fw,dw,fy,fz),(fwv,dwv,fyv,fzv),1,fwv+dwv+fyv+fzv+2,'float64'),
(2+(fw+fx)+(dw+fz),(fw,fx,dw,fz),(fwv,fxv,dwv,fzv),1,fwv+fxv+dwv+fzv+2,'float64'),
(2+(fw+fx)+(fy+dw),(fw,fx,fy,dw),(fwv,fxv,fyv,dwv),1,fwv+fxv+fyv+dwv+2,'float64'),
#test when their is other op then elemwise.
#the good output for the next test.
# (Pdb) p f.maker.env.toposort()
#[Elemwise{add,no_inplace}(w, x), Sum(Elemwise{add,no_inplace}.0), InplaceDimShuffle{x,x}(Sum.0), Elemwise{Composite{_impls=[<function <lambda> at 0x2c5c8c0>], nin=4, _c_code={
#npy_float32 V%(id)s_tmp1;
#V%(id)s_tmp1 = %(i2)s + %(i3)s;
#npy_float32 V%(id)s_tmp2;
#V%(id)s_tmp2 = %(i0)s + %(i1)s;
#%(o0)s = V%(id)s_tmp2 + V%(id)s_tmp1;
#}
#, nout=1, env=[add(add(<float32>, <float32>), add(<float32>, <float32>))]}}(InplaceDimShuffle{x,x}.0, Elemwise{add,no_inplace}.0, y, z)]
((fwx.sum())+(fwx)+(fy+fz),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),4,(fwv+fxv).sum()+fwv+fxv+fyv+fzv,'float32'),
#26 elem before this line
#test other elemwise op
(fx+fy+cos(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.cos(fzv),'float32'),
(fx+fy+cosh(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.cosh(fzv),'float32'),
(fx+fy+abs(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.absolute(fzv),'float32'),
(fx+fy+theano.tensor.log(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.log(fzv),'float32'),
(fx+fy+theano.tensor.log2(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.log2(fzv),'float32'),
(fx+fy+theano.tensor.log10(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.log10(fzv),'float32'),
(fx+fy**fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv**fzv,'float32'),#pow
(fx+fy+theano.tensor.exp(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.exp(fzv),'float32'),
(fx-fy-fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv-fzv,'float32'),
(fx-(fy/fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv/fzv),'float32'),
# (fx-(fy%fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv%fzv),'float32'),#TODO: c_code not implemented for %
(fx-(fy>fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv>fzv),'float32'),
(fx-(fy>=fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv>=fzv),'float32'),
(fx-(fy<fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv<fzv),'float32'),
(fx-(fy<=fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv<=fzv),'float32'),
# (fx-(fy==fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv==fzv),'float32'),#TODO: bugged
(fx-(fy!=fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv!=fzv),'float32'),
(fx-fy+tan(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+numpy.tan(fzv),'float32'),
(fx-fy+tanh(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+numpy.tanh(fzv),'float32'),
(fx-fy+sin(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+numpy.sin(fzv),'float32'),
(fx-fy+sinh(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+numpy.sinh(fzv),'float32'),
(fx-fy+theano.tensor.sqr(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+(fzv*fzv),'float32'),
(fx-fy+theano.tensor.sqrt(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+numpy.sqrt(fzv),'float32'),
(fx-fy+theano.tensor.inv(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+(1/fzv),'float32'),
(fx-fy+theano.tensor.neg(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+(-fzv),'float32'),
# (fx-fy+theano.tensor.iround(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+numpy.round(fzv),'float32'),#TODO: trouble with the output type. To my understanding, numpy and c round fct return the same type as the input. Why we don't do this?
#TODO: BIT OP only with ints, xor, or, and, invert
# (fx-theano.tensor.or_(fy,fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fy|fz),'float32'),
# (fx-theano.tensor.xor(fy,fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fy^fz),'float32'),
]
if slice:
cases = cases[slice]
import time
times=numpy.zeros(len(cases))
for id, [g, sym_inputs, val_inputs, nb_elemwise, answer, out_dtype] in enumerate(cases):
print "new cases", id
if shared_fn == None:
f = compile.function(list(sym_inputs), g,mode=mode)
#pre-call to have the data in cache if it fit to don't penalise the first iteration
# if id==0:
# out=f(*val_inputs)
t0=time.time()
for x in range(nb_repeat):
out=f(*val_inputs)
t1=time.time()
nb_repeat=1
else:
out=shared_fn(numpy.zeros(shp, dtype=out_dtype),'out')
f = pfunc(sym_inputs,[],updates=[(out,out+g)],mode=mode)
#pre-call to have the data in cache if it fit to don't penalise the first iteration
# if id==0:
# f(*val_inputs)
t0=time.time()
for x in range(nb_repeat):
f(*val_inputs)
t1=time.time()
out=out.value
# if id==0:
# nb_repeat+=1
times[id]=t1-t0
assert numpy.allclose(out,answer*nb_repeat,atol=1e-6 if out_dtype=='float32' else 1e-8)
topo=f.maker.env.toposort()
if gpu:
topo = [x for x in topo if not isinstance(x.op,tcn.basic_ops.GpuFromHost)]
gpu_ = [x for x in topo if isinstance(x.op,tcn.basic_ops.GpuFromHost)]
assert len(gpu_)==len(sym_inputs)
if assert_len_topo:
assert(len(topo)==nb_elemwise)
assert(out_dtype==out.dtype)
print "Executed",len(cases),"cases"
return times
def test_elemwise_fusion(self):
raise SkipTest("Current implementation of test_fusion is not enabled. So we skip the corresponding test")
shp=(5,5)
#we need the optimisation enabled, debug do this.
mode=compile.mode.predefined_modes['FAST_COMPILE']
mode=compile.mode.predefined_modes['FAST_RUN']
mode=compile.mode.predefined_modes['DEBUG_MODE']
self.do(mode, shared, shp)
def gpu_fusion(self):
shp=(5,5)
#we need the optimisation enabled, debug do this.
mode=compile.mode.predefined_modes['FAST_COMPILE']
mode=compile.mode.predefined_modes['FAST_RUN']
mode=compile.mode.predefined_modes['DEBUG_MODE']
self.do(mode, tcn.shared_constructor, shp, gpu=True)
def speed_fusion(self):
import copy
shp=(3000,3000)
#mode1=copy.copy(compile.mode.predefined_modes['FAST_RUN'])
mode1=compile.Mode(gof.CLinker(), copy.copy(compile.mode.OPT_FAST_RUN))
#TODO:clinker is much faster... but use to much memory
#Possible cause: as their is do deletion of intermediate value when we don't keep the fct.
#More plausible cause: we keep a link to the output data?
#Follow up. Clinker do the same... second cause?
mode2=compile.Mode(gof.CLinker(), copy.copy(compile.mode.OPT_FAST_RUN))
# mode2=copy.copy(compile.mode.predefined_modes['FAST_RUN'])
mode2._optimizer=mode2._optimizer.excluding('local_elemwise_fusion')
# mode2=compile.Mode(gof.OpWiseCLinker(allow_gc=True), compile.mode.OPT_FAST_COMPILE)
s=slice(0,30)
nb_repeat=10
times1=self.do(mode1, shared, shp, gpu=False, nb_repeat=nb_repeat, assert_len_topo=False,slice=s)
times2=self.do(mode2, shared, shp, gpu=False, nb_repeat=nb_repeat, assert_len_topo=False,slice=s)
print "times1 FAST_RUN",times1, times1.min(), times1.max(), times1.sum()
print "times2 FAST_RUN with CLinker without local_elemwise_fusion optimisation"
print times2, times2.min(), times2.max(), times2.sum()
d=times2/times1
# d.sort()
print "times2/times1",d,d.min(), d.max(), d.mean(), d.std()
if __name__ == '__main__':
unittest.main()
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论