added Skipped test for the elemwise fusion that is disabled.

9dd4459a · Frederic Bastien · 160eacf8 · 9dd4459a
--- a/theano/tensor/tests/test_opt.py
+++ b/theano/tensor/tests/test_opt.py
@@ -790,7 +790,201 @@ def test_const_type_in_mul_canonizer():
        f2(ival, wval, visbval, hidbval, betaval, aval),
        f1(ival, wval, visbval, hidbval, betaval, aval))
    
+from theano.compile.sandbox.pfunc import pfunc
+from theano.compile.sandbox.sharedvalue import shared
+import theano_cuda_ndarray as tcn
+import theano

+class test_fusion(unittest.TestCase):
+
+    def do(self, mode, shared_fn, shp, gpu=False, nb_repeat=1, assert_len_topo=True, slice=None):
+        """
+        param shared_fn: if None, will use compile.function
+        verify that the elemwise fusion work
+        Test with and without DimShuffle
+        """
+        #TODO: disable the canonizer?
+        def my_init(shp,dtype, num=0):
+            #ret = numpy.asarray(numpy.random.rand(*shp),dtype=dtype)
+            ret = numpy.zeros(shp, dtype=dtype)+num
+            return ret
+        fw, fx, fy, fz = fmatrices('wxyz')
+        dw, dx, dy, dz = dmatrices('wxyz')
+        fv = fvector('r').dimshuffle('x',0)
+        dv = dvector('s').dimshuffle('x',0)
+        fwv = my_init(shp,'float32',1)
+        fxv = my_init(shp,'float32',2)
+        fyv = my_init(shp,'float32',3)
+        fzv = my_init(shp,'float32',4)
+        fvv = numpy.asarray(numpy.random.rand(shp[0]),dtype='float32').reshape(1,shp[0])
+        dwv = my_init(shp,'float64',5)
+#        dxv = my_init(shp,'float64',6)
+#        dyv = my_init(shp,'float64',7)
+#        dzv = my_init(shp,'float64',8)
+#        dvv = numpy.asarray(numpy.random.rand(shp[0]),dtype='float64').reshape(1,shp[0])
+        fwx=fw+fx
+        cases = [
+            (fx+fy+fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+fzv,'float32'),
+            (fx*fy*fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv*fyv*fzv,'float32'),
+            (fx+fy*fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv*fzv,'float32'),
+            (fx*fy+fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv*fyv+fzv,'float32'),
+            (fw+fx+fy+fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'),
+            ((fw+fx)+(fy+fz),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'),
+            (((fw+fx)+fy)+fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'),
+            ((fw+(fx+fy))+fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'),
+            ((fw+(fx+fy)+fz),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'),
+            (fw+(fx+(fy+fz)),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'),
+            ((fw+fx)+(fy+fz),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'),
+            (fw*fx*fy*fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv*fxv*fyv*fzv,'float32'),
+            (fw+fx*fy*fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv*fyv*fzv,'float32'),
+            (fx+fy*fz*fx,(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv*fzv*fxv,'float32'),
+            (fx*fy+fz+fy,(fx,fy,fz),(fxv,fyv,fzv),1,fxv*fyv+fzv+fyv,'float32'),
+            (fx*fy*fz*fw+fx+fy+fz+fw,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fxv*fyv*fzv*fwv+fxv+fyv+fzv+fwv,'float32'),
+            #test with constant
+            ((fw+fx)+(fy+fz)+2,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'),
+            (((fw+fx)+2+fy)+fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'),
+            ((fw+(fx+2+fy))+fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'),
+            ((fw+(fx+fy)+2+fz),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'),
+            (fw+(fx+(fy+fz)+2),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'),
+            (2+(fw+fx)+(fy+fz),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'),
+            #mix float32 and float64
+            (2+(dw+fx)+(fy+fz),(dw,fx,fy,fz),(dwv,fxv,fyv,fzv),1,dwv+fxv+fyv+fzv+2,'float64'),
+            (2+(fw+dw)+(fy+fz),(fw,dw,fy,fz),(fwv,dwv,fyv,fzv),1,fwv+dwv+fyv+fzv+2,'float64'),
+            (2+(fw+fx)+(dw+fz),(fw,fx,dw,fz),(fwv,fxv,dwv,fzv),1,fwv+fxv+dwv+fzv+2,'float64'),
+            (2+(fw+fx)+(fy+dw),(fw,fx,fy,dw),(fwv,fxv,fyv,dwv),1,fwv+fxv+fyv+dwv+2,'float64'),
+            #test when their is other op then elemwise.
+            #the good output for the next test.
+#            (Pdb) p f.maker.env.toposort()
+#[Elemwise{add,no_inplace}(w, x), Sum(Elemwise{add,no_inplace}.0), InplaceDimShuffle{x,x}(Sum.0), Elemwise{Composite{_impls=[<function <lambda> at 0x2c5c8c0>], nin=4, _c_code={
+#npy_float32 V%(id)s_tmp1;
+#V%(id)s_tmp1 = %(i2)s + %(i3)s;
+#npy_float32 V%(id)s_tmp2;
+#V%(id)s_tmp2 = %(i0)s + %(i1)s;
+#%(o0)s = V%(id)s_tmp2 + V%(id)s_tmp1;
+#}
+#, nout=1, env=[add(add(<float32>, <float32>), add(<float32>, <float32>))]}}(InplaceDimShuffle{x,x}.0, Elemwise{add,no_inplace}.0, y, z)]
+            ((fwx.sum())+(fwx)+(fy+fz),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),4,(fwv+fxv).sum()+fwv+fxv+fyv+fzv,'float32'),
+            #26 elem before this line
+            #test other elemwise op
+            (fx+fy+cos(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.cos(fzv),'float32'),
+            (fx+fy+cosh(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.cosh(fzv),'float32'),
+            (fx+fy+abs(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.absolute(fzv),'float32'),
+            (fx+fy+theano.tensor.log(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.log(fzv),'float32'),
+            (fx+fy+theano.tensor.log2(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.log2(fzv),'float32'),
+            (fx+fy+theano.tensor.log10(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.log10(fzv),'float32'),
+            (fx+fy**fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv**fzv,'float32'),#pow
+            (fx+fy+theano.tensor.exp(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.exp(fzv),'float32'),
+            (fx-fy-fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv-fzv,'float32'),
+            (fx-(fy/fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv/fzv),'float32'),
+#            (fx-(fy%fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv%fzv),'float32'),#TODO: c_code not implemented for %
+            (fx-(fy>fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv>fzv),'float32'),
+            (fx-(fy>=fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv>=fzv),'float32'),
+            (fx-(fy<fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv<fzv),'float32'),
+            (fx-(fy<=fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv<=fzv),'float32'),
+#            (fx-(fy==fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv==fzv),'float32'),#TODO: bugged
+            (fx-(fy!=fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv!=fzv),'float32'),
+            (fx-fy+tan(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+numpy.tan(fzv),'float32'),
+            (fx-fy+tanh(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+numpy.tanh(fzv),'float32'),
+            (fx-fy+sin(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+numpy.sin(fzv),'float32'),
+            (fx-fy+sinh(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+numpy.sinh(fzv),'float32'),
+            (fx-fy+theano.tensor.sqr(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+(fzv*fzv),'float32'),
+            (fx-fy+theano.tensor.sqrt(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+numpy.sqrt(fzv),'float32'),
+            (fx-fy+theano.tensor.inv(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+(1/fzv),'float32'),
+            (fx-fy+theano.tensor.neg(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+(-fzv),'float32'),
+#            (fx-fy+theano.tensor.iround(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+numpy.round(fzv),'float32'),#TODO: trouble with the output type. To my understanding, numpy and c round fct return the same type as the input. Why we don't do this?
+
+            #TODO: BIT OP only with ints, xor, or, and, invert
+#            (fx-theano.tensor.or_(fy,fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fy|fz),'float32'),
+#            (fx-theano.tensor.xor(fy,fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fy^fz),'float32'),
+
+            ]
+        if slice:
+            cases = cases[slice]
+        import time
+        times=numpy.zeros(len(cases))
+        for id, [g, sym_inputs, val_inputs, nb_elemwise, answer, out_dtype] in enumerate(cases):
+            print "new cases", id
+
+            if shared_fn == None:
+                f = compile.function(list(sym_inputs), g,mode=mode)
+                #pre-call to have the data in cache if it fit to don't penalise the first iteration
+#                if id==0:
+#                    out=f(*val_inputs)
+                t0=time.time()
+                for x in range(nb_repeat):
+                    out=f(*val_inputs)
+                t1=time.time()
+                nb_repeat=1
+            else:
+                out=shared_fn(numpy.zeros(shp, dtype=out_dtype),'out')
+                f = pfunc(sym_inputs,[],updates=[(out,out+g)],mode=mode)
+                #pre-call to have the data in cache if it fit to don't penalise the first iteration
+#                if id==0:
+#                    f(*val_inputs)
+                t0=time.time()
+                for x in range(nb_repeat):
+                    f(*val_inputs)
+                t1=time.time()
+                out=out.value
+#                if id==0:
+#                    nb_repeat+=1
+
+            times[id]=t1-t0
+            assert numpy.allclose(out,answer*nb_repeat,atol=1e-6 if out_dtype=='float32' else 1e-8)
+            topo=f.maker.env.toposort()
+            if gpu:
+                topo = [x for x in topo if not isinstance(x.op,tcn.basic_ops.GpuFromHost)]
+                gpu_ = [x for x in topo if isinstance(x.op,tcn.basic_ops.GpuFromHost)]
+                assert len(gpu_)==len(sym_inputs)
+            if assert_len_topo:
+                assert(len(topo)==nb_elemwise)
+            assert(out_dtype==out.dtype)
+        print "Executed",len(cases),"cases"
+        return times
+    
+    def test_elemwise_fusion(self):
+        raise SkipTest("Current implementation of test_fusion is not enabled. So we skip the corresponding test")
+        shp=(5,5)
+        #we need the optimisation enabled, debug do this.
+        mode=compile.mode.predefined_modes['FAST_COMPILE']
+        mode=compile.mode.predefined_modes['FAST_RUN']
+        mode=compile.mode.predefined_modes['DEBUG_MODE']
+
+        self.do(mode, shared, shp)
+
+    def gpu_fusion(self):
+        shp=(5,5)
+        #we need the optimisation enabled, debug do this.
+        mode=compile.mode.predefined_modes['FAST_COMPILE']
+        mode=compile.mode.predefined_modes['FAST_RUN']
+        mode=compile.mode.predefined_modes['DEBUG_MODE']
+
+        self.do(mode, tcn.shared_constructor, shp, gpu=True)
+
+    def speed_fusion(self):
+        import copy
+        shp=(3000,3000)
+        #mode1=copy.copy(compile.mode.predefined_modes['FAST_RUN'])
+        mode1=compile.Mode(gof.CLinker(), copy.copy(compile.mode.OPT_FAST_RUN))
+        #TODO:clinker is much faster... but use to much memory
+        #Possible cause: as their is do deletion of intermediate value when we don't keep the fct.
+        #More plausible cause: we keep a link to the output data?
+        #Follow up. Clinker do the same... second cause?
+        mode2=compile.Mode(gof.CLinker(), copy.copy(compile.mode.OPT_FAST_RUN))
+#        mode2=copy.copy(compile.mode.predefined_modes['FAST_RUN'])
+        mode2._optimizer=mode2._optimizer.excluding('local_elemwise_fusion')
+#        mode2=compile.Mode(gof.OpWiseCLinker(allow_gc=True), compile.mode.OPT_FAST_COMPILE)
+
+        s=slice(0,30)
+        nb_repeat=10
+        times1=self.do(mode1, shared, shp, gpu=False, nb_repeat=nb_repeat, assert_len_topo=False,slice=s)
+        times2=self.do(mode2, shared, shp, gpu=False, nb_repeat=nb_repeat, assert_len_topo=False,slice=s)
+        print "times1 FAST_RUN",times1, times1.min(), times1.max(), times1.sum()
+        print "times2 FAST_RUN with CLinker without local_elemwise_fusion optimisation"
+        print times2, times2.min(), times2.max(), times2.sum()
+        d=times2/times1
+#        d.sort()
+        print "times2/times1",d,d.min(), d.max(), d.mean(), d.std()

 if __name__ == '__main__':
    unittest.main()