ConvOp automatically selects the fastest C code algorithm when none is specified.

The timing was done on maggie, so it could differ on other computers.

ConvOp automatically selects the fastest C code algorithm when none is specified.
8b9676e3 · Frederic Bastien · 94c8bc56 · 8b9676e3 · 8b9676e3
--- a/theano/tensor/nnet/conv.py
+++ b/theano/tensor/nnet/conv.py
@@ -197,6 +197,69 @@ class ConvOp(Op):
            'imshp_logical', 'kshp_logical', 'kshp_logical_top_aligned']
    """These attributes uniquely identify the behaviour of this op for given inputs"""

+#The values of speed_unroll_batch_kern, speed_unroll_patch_noshape and speed_unroll_patch_shape
+#have been calculated on maggie36 when there was only 1 session logged on and only this was running.
+#It is an Intel(R) Xeon(R) CPU E5430 @ 2.66GHz. They were computed with theano/tensor/nnet/tests/speed_test_conv.py,
+#which took 5 minutes to run.
+#TODO: we should compute this table for each computer/os as this can change.
+#      I saw on one computer that the speed with the shape can be slower than without!
+#      Using the real shape and the same dtype could also help.
+
+#unroll_batch, unroll_kern, valid time, full time
+    speed_unroll_batch_kern=[(1, 1, 2.4661250114440918, 6.5472931861877441) ,
+(1, 2, 1.5869178771972656, 5.1499760150909424) ,
+(1, 3, 1.4270510673522949, 3.6593470573425293) ,
+(1, 4, 1.3373479843139648, 3.3451821804046631) ,
+(1, 5, 1.2818830013275146, 3.1444568634033203) ,
+(1, 6, 1.2521560192108154, 3.0256359577178955) ,
+(1, 10, 1.2134110927581787, 2.9174180030822754) ,
+(2, 1, 1.657214879989624, 4.5261678695678711) ,
+(2, 2, 1.2123160362243652, 2.9747390747070312) ,
+(2, 3, 1.0758891105651855, 2.5690360069274902) ,
+(2, 4, 1.0683329105377197, 2.4233770370483398) ,
+(2, 5, 1.0955719947814941, 2.3999948501586914) ,
+(2, 6, 1.5935721397399902, 2.6878271102905273) ,
+(2, 10, 1.8511250019073486, 3.2417428493499756) ,
+(3, 1, 1.5948119163513184, 3.631148099899292) ,
+(3, 2, 1.0761330127716064, 2.6011371612548828) ,
+(3, 3, 1.0551531314849854, 2.4200370311737061) ,
+(3, 4, 1.3930759429931641, 2.5211219787597656) ,
+(3, 5, 1.4330689907073975, 2.5704989433288574) ,
+(3, 6, 1.362138032913208, 2.5964410305023193) ,
+(3, 10, 1.6582000255584717, 2.9907989501953125) ,
+(4, 1, 1.4793620109558105, 3.3473429679870605) ,
+(4, 2, 1.0671560764312744, 2.4171769618988037) ,
+(4, 3, 1.2569692134857178, 2.2807950973510742) ,
+(4, 4, 1.3456289768218994, 2.6219108104705811) ,
+(4, 5, 1.4055080413818359, 2.4606490135192871) ,
+(4, 6, 1.372107982635498, 2.551663875579834) ,
+(4, 10, 1.599470853805542, 2.9172940254211426) ,
+(5, 1, 1.4115700721740723, 3.2077109813690186) ,
+(5, 2, 1.0635769367218018, 2.2648060321807861) ,
+(5, 3, 1.3842809200286865, 2.6135518550872803) ,
+(5, 4, 1.3470511436462402, 2.3852400779724121) ,
+(5, 5, 1.3539440631866455, 2.5245928764343262) ,
+(5, 6, 1.4037849903106689, 2.5985310077667236) ,
+(5, 10, 1.6120610237121582, 2.8127608299255371) ,
+(6, 1, 1.3623628616333008, 3.021122932434082) ,
+(6, 2, 1.1697649955749512, 2.6285450458526611) ,
+(6, 3, 1.2980999946594238, 2.4746189117431641) ,
+(6, 4, 1.3739941120147705, 2.5579929351806641) ,
+(6, 5, 1.3967819213867188, 2.5522029399871826) ,
+(6, 6, 1.4279270172119141, 2.6127138137817383) ,
+(6, 10, 1.605496883392334, 2.864037036895752) ,
+(10, 1, 1.6401121616363525, 2.970099925994873) ,
+(10, 2, 1.46710205078125, 2.7231831550598145) ,
+(10, 3, 1.4193780422210693, 2.6087639331817627) ,
+(10, 4, 1.4657118320465088, 2.6246678829193115) ,
+(10, 5, 1.5052611827850342, 2.6542458534240723) ,
+(10, 6, 1.5214400291442871, 2.7243161201477051) ,
+(10, 10, 1.6116268634796143, 2.956165075302124)]
+
+    #valid time, full time
+    speed_unroll_patch_noshape=[2.0109100341796875, 5.8175678253173828]
+    #valid time, full time
+    speed_unroll_patch_shape=[1.2967290878295898, 5.5283889770507812]
    
    def c_compile_args(self):
        #when the ksph==(1,1) gcc 4.3.0 segfault during the compilation with -O3.
@@ -232,9 +295,11 @@ class ConvOp(Op):

    def __init__(self, imshp=None, kshp=None, nkern=None, bsize=None, 
            dx=None, dy=None,
-            output_mode='valid', unroll_batch=0,
-            unroll_kern=0,
-            unroll_patch=True,
+            output_mode='valid',
+
+            unroll_batch=None,
+            unroll_kern=None,
+            unroll_patch=None,
            imshp_logical=None,
            kshp_logical=None,
            kshp_logical_top_aligned=True,
@@ -246,10 +311,16 @@ class ConvOp(Op):
        code.

        NOTES ON OPTIMIZATION:
-        If ALL (imshp, kshp, nkern and bsize) parameters are provided, we can
-        generate faster c-code. This make a significant difference for the
-        'full' output_mode with unroll_patch=True. The current fastest
-        implementation on x86-64 uses {unroll_batch=4, unroll_kern=4,
+        There are two types of optimization. The first is the selection of the
+        fastest algo when bsize and nkern are provided with imshp and kshp.
+        By default we try to select the fastest version. You can specify it
+        with the unroll_batch, unroll_kern, and unroll_patch parameters.
+
+        The second type of optimization is hardcoding some dimensions into the code
+        when all shapes are known.
+        This makes a significant difference for the 'full' output_mode.
+
+        Sometimes, the fastest implementation on x86-64 uses {unroll_batch=4, unroll_kern=4,
        unroll_patch=False} with all other shape parameters being provided.

        For optimizing other architectures, see:
@@ -351,6 +422,7 @@ class ConvOp(Op):
        self.unroll_kern=unroll_kern
        self.unroll_patch=unroll_patch

+        #lower unroll_batch if it is not a divisor of the batch size
        if self.unroll_batch>0 and self.bsize % self.unroll_batch!=0:

            if self.bsize<=self.unroll_batch:
@@ -364,12 +436,13 @@ class ConvOp(Op):

                warnstr = "OPTIMISATION WARNING: in ConvOp.__init__() unroll_batch(%i)"\
                      "must be 0 or a divisor of bsize(%i). We revert it to %i. This"\
-                      "won't change the result, but may make it slower."
+                      " won't change the result, but may make it slower."
                _warn(warnstr % (self.unroll_batch, self.bsize, new))

                self.unroll_batch=new

-        if self.unroll_kern>0 and self.nkern % unroll_kern!=0:
+        #lower unroll_kern if it is not a divisor of the number of kernels
+        if self.unroll_kern>0 and self.nkern % self.unroll_kern!=0:

            if self.nkern<=self.unroll_kern:
                self.unroll_kern = self.nkern
@@ -404,6 +477,41 @@ class ConvOp(Op):
                    "(Hint: kerns must fit inside image in valid mode)")%
                    (self.imshp_logical,self.kshp_logical))

+        if self.unroll_kern is None and self.unroll_batch is None and self.unroll_patch is None:
+            #no version specified. Find the fastest we have
+            if self.bsize is None and self.nkern is None:
+                self.unroll_patch = True
+            elif self.bsize is not None and self.nkern is not None:
+                bsize=self.bsize
+                nkern=self.nkern
+                if bsize is None:
+                    bsize=1
+                if nkern is None:
+                    nkern=1
+                mode_idx=0
+                if self.out_mode!="valid":
+                    mode_idx=1
+                if all_shape:
+                    time_unroll_patch = self.speed_unroll_patch_shape[mode_idx]
+                else:
+                    time_unroll_patch = self.speed_unroll_patch_noshape[mode_idx]
+                time_unroll_batch_kern = 9999999
+                for i in range(len(self.speed_unroll_batch_kern)):
+                    if bsize%self.speed_unroll_batch_kern[i][0]==0 and nkern%self.speed_unroll_batch_kern[i][1]==0:
+                        if self.speed_unroll_batch_kern[i][2+mode_idx]<time_unroll_batch_kern:
+                            time_unroll_batch_kern=self.speed_unroll_batch_kern[i][2+mode_idx]
+                            time_unroll_batch_kern_idx=i
+                if time_unroll_patch < time_unroll_batch_kern:
+                    self.unroll_patch = True
+                else:
+                    self.unroll_batch=self.speed_unroll_batch_kern[time_unroll_batch_kern_idx][0]
+                    self.unroll_kern=self.speed_unroll_batch_kern[time_unroll_batch_kern_idx][1]
+                    self.unroll_patch = False
+
+            print "AUTO FIND VERSION OF C_CODE OF CONV OP"
+            print self.unroll_batch, self.unroll_kern, self.unroll_patch, self.bsize, self.nkern, time_unroll_patch, time_unroll_batch_kern
+
+
        self._rehash()
        if config.op.set_flops:
            self.set_flops()
@@ -673,7 +781,7 @@ class ConvOp(Op):
                _warn("OPTIMISATION WARNING: in ConvOp.grad() we can't determine "\
                      "a good unroll value for the batch. Maybe you can optimize this!")

-        if un_k!=0 and nkern%un_k!=0:
+        if all_shape and un_k!=0 and nkern%un_k!=0:
            if nkern<un_k:
                un_k = nkern
            else:
@@ -740,7 +848,7 @@ class ConvOp(Op):
        return ['<numpy/noprefix.h>', '<iostream>', '<sstream>' ]

    def c_code_cache_version(self):
-        return (1)
+        return (2)
    
    def c_support_code(self):
        return """

--- a/theano/tensor/nnet/tests/speed_test_conv.py
+++ b/theano/tensor/nnet/tests/speed_test_conv.py
@@ -42,7 +42,7 @@ global_rng = N.random.RandomState(3423489)

 dmatrix4=T.TensorType('float64', (False, False, False, False))

-def exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp, kshps, nkerns, 
+def exec_multilayer_conv_nnet_old(conv_mode, ss, bsize, imshp, kshps, nkerns, 
        unroll_batch=0, unroll_kern=0, img=T.dmatrix(), validate=True, 
        conv_op_py=False, do_print=True, repeat=1, 
        unroll_patch=False, unroll_patch_size=False, verbose=0):
@@ -136,15 +136,72 @@ def exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp, kshps, nkerns,

        return tctot, tpytot, ntot

+def exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp, kshps, nkerns, 
+        unroll_batch=0, unroll_kern=0, img=T.dmatrix(),
+        do_print=True, repeat=1, 
+        unroll_patch=False, unroll_patch_size=False, verbose=0):
+        # Time `repeat` calls of a compiled ConvOp for each layer; returns (tctot, tpytot, ntot). `img` is not used.
+        # random input batch of shape (bsize, imshp[0], imshp[1], imshp[2])
+        imgval = global_rng.rand(bsize, imshp[0], imshp[1], imshp[2])
+
+        a=T.dmatrix()
+        kerns = [a for i in nkerns]
+        inputs4=dmatrix4()
+        kerns4=dmatrix4()
+
+        # accumulators: only tctot is updated in the loop below; tpytot and ntot are returned as 0
+        ntot=0 
+        tctot=0
+        tpytot=0
+
+        for kshp, kern, nkern, n_layer in zip(kshps, kerns, nkerns, range(len(nkerns))):
+            if do_print:
+                print '************* layer %i ***************' % n_layer
+                
+                print conv_mode, ss, n_layer, kshp, nkern
+
+            # random kernel values of shape (nkern, imshp[0]) + kshp, then passed through flip()
+            w = global_rng.random_sample(N.r_[nkern,imshp[0],kshp])
+            w_flip = flip(w,kshp).reshape(w.shape)
+
+            outshp = N.hstack((nkern, ConvOp.getOutputShape(imshp[1:], kshp, ss, conv_mode)))
+            # NOTE: this time1 is overwritten just before the timing loop, so it is never read
+            time1 = time.time()
+            outval = N.zeros(N.r_[bsize,outshp])
+
+            # ConvOp: built without shape arguments when only unroll_patch is requested, otherwise with all shapes and the unroll settings
+            if unroll_patch and not unroll_patch_size:
+                conv_op = ConvOp(dx=ss[0],dy=ss[1], output_mode=conv_mode,
+                                 unroll_patch=unroll_patch, verbose=verbose)(inputs4, kerns4)
+            else:
+                conv_op = ConvOp(imshp, kshp, nkern, bsize, ss[0],ss[1], conv_mode,
+                                 unroll_batch=unroll_batch, unroll_kern=unroll_kern, unroll_patch=unroll_patch, verbose=verbose)(inputs4, kerns4)
+            l1shp=N.hstack((nkern,
+                            ConvOp.getOutputShape(imshp[1:], kshp, ss, conv_mode)))
+            propup2 = function([inputs4, kerns4], conv_op)
+            # only the `repeat` calls to propup2 are timed; conv_op and propup2 are built before time1 is reset
+            time1 = time.time()
+            for i in range(repeat):
+                hidval2_ = propup2(imgval,w_flip)
+            hidval2 = hidval2_#[:,:,0::ss[0],0::ss[1]]
+            tctot += time.time() - time1
+            # input for the next layer: shape outshp, values taken from the all-zero outval (hidval2 is not reused)
+            imshp = tuple(outshp)
+            imgval = outval.reshape(bsize,outshp[0],outshp[1],outshp[2])
+
+        return tctot, tpytot, ntot
+
+

 def speed_multilayer_conv():
        # calculate the speed up of different combination of unroll
        # put the paramter to the same you will try. 
        
        validate=False# we don't validate the result to have it much faster!
+        repeat = 3
        verbose=1
-        unroll_batch = [1,2,3,4,5,10]#15, 30, 60 always much slower
-        unroll_kern = [1,2,3,4,5,10]#15, 30, 60 always much slower
+        unroll_batch = [1,2,3,4,5,6,10]#15, 30, 60 always much slower
+        unroll_kern = [1,2,3,4,5,6,10]#15, 30, 60 always much slower
        #unroll_batch = [1,4,5]
        #unroll_kern = [1,4,5]
        #unroll_batch = [1,4]
@@ -153,8 +210,8 @@ def speed_multilayer_conv():
        
        bsize = 60 # batch size
        imshp_start = (1,48,48)#un square shape to test more corner case.
-        kshps = ([11,12],[12,11])#un square shape to test more corner case.
-        nkerns = [60,60] # per output pixel
+        kshps = ([11,12],)#un square shape to test more corner case.
+        nkerns = [60] # per output pixel
        ssizes = [(1,1),]#(1,1)]#(2,2) bugged
        convmodes = ['valid','full']
        do_convolve2=False
@@ -168,9 +225,6 @@ def speed_multilayer_conv():
        #calculate the timing with unrolling

        print 'time unroll batch kern'
-        t_=[[ 7.60572791,  3.95069814,  3.74271464], [ 4.05631089,  2.90384555,  2.93613672], [ 3.90551591,  2.92595196,  3.00102282]]
-        best=[0.52690219879150391, 2.4266397953033447]
-        worst=[0.92042708396911621, 6.8822150230407715]
        best=[]
        worst=[]
        t_=[]
@@ -181,7 +235,8 @@ def speed_multilayer_conv():
                    tctot, tpytot, ntot=[],[],[]
                    for conv_mode, n_mode in zip(convmodes,range(len(convmodes))):
                        for ss, n_ss in zip(ssizes,range(len(ssizes))):
-                            tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp_start, kshps, nkerns, unroll_batch=unroll_b, unroll_kern=unroll_k, validate=validate, verbose=verbose,do_print=False)
+#                            tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet_old(conv_mode, ss, bsize, imshp_start, kshps, nkerns, unroll_batch=unroll_b, unroll_kern=unroll_k, validate=validate, verbose=verbose,do_print=False)
+                            tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp_start, kshps, nkerns, unroll_batch=unroll_b, unroll_kern=unroll_k, verbose=verbose,do_print=False, repeat=repeat)
                            tctot+=[tctot_]
                            tpytot+=[tpytot_]
                            ntot+=[ntot_]
@@ -199,13 +254,13 @@ def speed_multilayer_conv():
        t=N.asarray(t)
        #calculate the old timing
        print 'time old version'
-        tctot_=[0.52555489540100098, 6.6634182929992676]
        tctot,tpytot,ntot=[],[],[]
        tctot_=[]
        if not tctot_:
            for conv_mode, n_mode in zip(convmodes,range(len(convmodes))):
                for ss, n_ss in zip(ssizes,range(len(ssizes))):
-                    tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp_start, kshps, nkerns, unroll_batch=0, unroll_kern=0, validate=validate, verbose=verbose,do_print=False)
+#                    tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet_old(conv_mode, ss, bsize, imshp_start, kshps, nkerns, unroll_batch=0, unroll_kern=0, validate=validate, verbose=verbose,do_print=False)
+                    tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp_start, kshps, nkerns, unroll_batch=0, unroll_kern=0, verbose=verbose,do_print=False, repeat=repeat)
                    tctot+=[tctot_]
                    tpytot+=[tpytot_]
                    ntot+=[ntot_]
@@ -217,7 +272,7 @@ def speed_multilayer_conv():
        print "unroll_batch/unroll_kern valid_mode full_mode"
        for n_b in range(len(unroll_batch)):
            for n_k in range(len(unroll_kern)):
-                print unroll_batch[n_b],"/",unroll_kern[n_k], " ",t[n_b,n_k]
+                print (unroll_batch[n_b], unroll_kern[n_k]) + tuple(t[n_b,n_k]),','
        t_detail=t
        t = t.sum(axis=2)
        print "max %.3fs"%t.max(), "max param(batch unloop size/kernel unloop size)", t_b_k[t.argmax()]
@@ -231,9 +286,11 @@ def speed_multilayer_conv():
        tctot_patch_size = []
        for conv_mode, n_mode in zip(convmodes,range(len(convmodes))):
            for ss, n_ss in zip(ssizes,range(len(ssizes))):
-                tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp_start, kshps, nkerns, unroll_batch=0, unroll_kern=0, validate=validate,unroll_patch=True,verbose=verbose,do_print=False)
+                #tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet_old(conv_mode, ss, bsize, imshp_start, kshps, nkerns, unroll_batch=0, unroll_kern=0, validate=validate,unroll_patch=True,verbose=verbose,do_print=False)
+                tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp_start, kshps, nkerns, unroll_batch=0, unroll_kern=0, unroll_patch=True,verbose=verbose,do_print=False, repeat=repeat)
                tctot_patch += [tctot_]
-                tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp_start, kshps, nkerns, unroll_batch=0, unroll_kern=0, validate=validate,unroll_patch=True,verbose=verbose,do_print=False,unroll_patch_size=True)
+                #tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet_old(conv_mode, ss, bsize, imshp_start, kshps, nkerns, unroll_batch=0, unroll_kern=0, validate=validate,unroll_patch=True,verbose=verbose,do_print=False,unroll_patch_size=True)
+                tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp_start, kshps, nkerns, unroll_batch=0, unroll_kern=0, unroll_patch=True,verbose=verbose,do_print=False,unroll_patch_size=True, repeat=repeat)
                tctot_patch_size += [tctot_]

        t_patch=sum(tctot_patch)