提交 32bf9b72 authored 作者: Frederic Bastien's avatar Frederic Bastien

added a new unrolled version of ConvOp (not used by default). It unrolls the batch…

added a new unrolled version of ConvOp (not used by default). It unrolls the batch and the kernel at the same time. This gives the biggest speed up in my tests. Also modified the test_multilayer_conf function to allow testing the different parameters for the unroll.
上级 e9adfb12
...@@ -37,7 +37,7 @@ class ConvOp(Op): ...@@ -37,7 +37,7 @@ class ConvOp(Op):
self.unroll_batch=unroll_batch self.unroll_batch=unroll_batch
self.unroll_kern=unroll_kern self.unroll_kern=unroll_kern
assert not(unroll_batch>0 and unroll_kern>0)
if self.unroll_batch>0 and self.bsize % self.unroll_batch!=0: if self.unroll_batch>0 and self.bsize % self.unroll_batch!=0:
raise Exception("unroll_batch(%s) should be 0 or a multiple of bsize(%s)"%(str(self.unroll_batch),str(self.bsize))) raise Exception("unroll_batch(%s) should be 0 or a multiple of bsize(%s)"%(str(self.unroll_batch),str(self.bsize)))
if self.unroll_kern>0 and self.nkern % unroll_kern!=0: if self.unroll_kern>0 and self.nkern % unroll_kern!=0:
...@@ -175,10 +175,14 @@ using namespace std; ...@@ -175,10 +175,14 @@ using namespace std;
if node.inputs[0].type.dtype=="float32": d["type"]="float" if node.inputs[0].type.dtype=="float32": d["type"]="float"
elif node.inputs[0].type.dtype=="float64": d["type"]="double" elif node.inputs[0].type.dtype=="float64": d["type"]="double"
else: raise Exception("Type %s not implemented"%node.inputs[0].type.dtype) else: raise Exception("Type %s not implemented"%node.inputs[0].type.dtype)
if self.unroll_kern>0: if self.unroll_kern>0 and self.unroll_batch>0:
print "return unrolled batch and kern code by",self.unroll_batch, self.unroll_kern
return gen_conv_code_unroll_batch_kern(d, self.unroll_batch,
self.unroll_kern)
elif self.unroll_kern>0:
print "return unrolled kern code by",self.unroll_kern print "return unrolled kern code by",self.unroll_kern
return gen_conv_code_unroll_kern(d, self.unroll_kern) return gen_conv_code_unroll_kern(d, self.unroll_kern)
if self.unroll_batch>0: elif self.unroll_batch>0:
print "return unrolled batch code by",self.unroll_batch print "return unrolled batch code by",self.unroll_batch
return gen_conv_code_unroll_batch(d, self.unroll_batch) return gen_conv_code_unroll_batch(d, self.unroll_batch)
...@@ -1105,3 +1109,251 @@ Py_XDECREF(img2d); ...@@ -1105,3 +1109,251 @@ Py_XDECREF(img2d);
Py_XDECREF(filtersflipped); Py_XDECREF(filtersflipped);
"""%d """%d
return ret return ret
def gen_conv_code_unroll_batch_kern(d, unloop_bsize=1, unloop_ksize=1):
    """Generate C code for ConvOp with the batch loop and the kernel loop
    unrolled at the same time.

    :param d: dict of template substitution values used to fill the C code
        skeleton (e.g. ``type``, ``img2d``, ``filtersflipped``, ``z``,
        ``fail``, the ``self_*`` shape/size entries and ``affectation``).
        NOTE: it is mutated in place — the ``unloop_*`` keys are written
        into it during generation.
    :param unloop_bsize: number of batch iterations unrolled per loop step.
        The caller must ensure the batch size is a multiple of this.
    :param unloop_ksize: number of kernel iterations unrolled per loop step.
        The caller must ensure the number of kernels is a multiple of this.
    :return: the generated C code as one string.
    """
    d["unloop_bsize"] = unloop_bsize
    d["unloop_ksize"] = unloop_ksize

    def my_dup(st, size):
        # Emit `st` `size` times, with %(unloop_iter)s running 0..size-1.
        s = ""
        for i in range(size):
            d["unloop_iter"] = i
            s += st % d
        return s + "\n"

    def my_dup2(st):
        # Emit `st` once per (batch, kernel) unroll pair, exposing
        # %(unloop_biter)s, %(unloop_kiter)s and a flat %(unloop_iter)s index.
        s = ""
        count = 0  # renamed from `iter` to avoid shadowing the builtin
        for i in range(unloop_bsize):
            d["unloop_biter"] = i
            for j in range(unloop_ksize):
                d["unloop_kiter"] = j
                d["unloop_iter"] = count
                count += 1
                s += st % d
        return s + "\n"

    # Fixed preamble: reshape both inputs to 4d, force them contiguous if
    # needed, validate types/mode, and (re)allocate the output array.
    ret = """
int mode=-1,typenum=0, typenum_f=0;
PyArrayObject *ain1=NULL, *ain2=NULL, *filtersflipped_arr=NULL, *img2d_arr=NULL;
const %(type)s fill_value = 0;
int type_im=PyArray_TYPE(%(img2d)s);
int type_ker=PyArray_TYPE(%(filtersflipped)s);
npy_intp dim_zz[2]={%(self_outshp0)s,%(self_outshp1)s};
npy_intp dim_im[2]={%(self_imshp1)s,%(self_imshp2)s};
npy_intp dim_ker[2]={%(self_kshp0)s,%(self_kshp1)s};
PyArray_Dims img2d_shape;
npy_intp img2d_dim[4]={1,1,0,0};
img2d_shape.ptr=img2d_dim;
img2d_shape.len=4;
PyArray_Dims kerns_shape;
npy_intp kerns_dim[4]={1,1,0,0};
kerns_shape.ptr=kerns_dim;
kerns_shape.len=4;
PyObject *img2d=NULL, *contig, *filtersflipped=NULL;
string s="%(self_out_mode)s";
if(%(img2d)s->nd==2){
img2d_dim[3]=%(img2d)s->dimensions[1];
img2d_dim[2]=%(img2d)s->dimensions[0];
}else if(%(img2d)s->nd==3){
img2d_dim[3]=%(img2d)s->dimensions[2];
img2d_dim[2]=%(img2d)s->dimensions[1];
img2d_dim[0]=%(img2d)s->dimensions[0];
}else if(%(img2d)s->nd==4){
img2d_dim[3]=%(img2d)s->dimensions[3];
img2d_dim[2]=%(img2d)s->dimensions[2];
img2d_dim[1]=%(img2d)s->dimensions[1];
img2d_dim[0]=%(img2d)s->dimensions[0];
}else {
PyErr_SetString(PyExc_ValueError, "img don't have a good shape");
%(fail)s;
}
if(%(filtersflipped)s->nd==3){
kerns_dim[3]=%(filtersflipped)s->dimensions[2];
kerns_dim[2]=%(filtersflipped)s->dimensions[1];
kerns_dim[0]=%(filtersflipped)s->dimensions[0];
}else if(%(filtersflipped)s->nd==4){
kerns_dim[3]=%(filtersflipped)s->dimensions[3];
kerns_dim[2]=%(filtersflipped)s->dimensions[2];
kerns_dim[1]=%(filtersflipped)s->dimensions[1];
kerns_dim[0]=%(filtersflipped)s->dimensions[0];
}else{
PyErr_SetString(PyExc_ValueError, "kernel don't have a good shape");
%(fail)s;
}
img2d = PyArray_Newshape(%(img2d)s,&img2d_shape, PyArray_CORDER);
img2d_arr = (PyArrayObject*)img2d;
if ((img2d_arr->strides[3] != sizeof(%(type)s))
|| (img2d_arr->strides[2] != img2d_arr->dimensions[3]*sizeof(%(type)s))){
contig = (PyObject*)(PyArray_GETCONTIGUOUS((PyArrayObject*)img2d));
Py_DECREF(img2d);
img2d = contig;
if (!PyArray_ISCONTIGUOUS(img2d)){
PyErr_SetString(PyExc_ValueError, "img2d isn't contiguous");
%(fail)s;
}
}
img2d_arr = (PyArrayObject*)img2d;
filtersflipped = PyArray_Newshape(%(filtersflipped)s,&kerns_shape, PyArray_CORDER);
filtersflipped_arr = (PyArrayObject*)filtersflipped;
if ((filtersflipped_arr->strides[3] != sizeof(%(type)s))
|| (filtersflipped_arr->strides[2] != filtersflipped_arr->dimensions[3]*sizeof(%(type)s))){
contig = (PyObject*)(PyArray_GETCONTIGUOUS((PyArrayObject*)filtersflipped));
Py_DECREF(filtersflipped);
filtersflipped = contig;
if (!PyArray_ISCONTIGUOUS(filtersflipped)){
PyErr_SetString(PyExc_ValueError, "filtersflipped isn't contiguous");
%(fail)s;
}
}
filtersflipped_arr = (PyArrayObject*)filtersflipped;
if(s=="valid") mode=0;
else if(s=="full") mode=2;
else {PyErr_SetString(PyExc_ValueError, "invalid mode, only full and valid are supported"); %(fail)s;};
typenum = PyArray_ObjectType((PyObject*)%(img2d)s, 0);
typenum_f = PyArray_ObjectType((PyObject*)%(filtersflipped)s, 0);
if (typenum < 0) {PyErr_SetString(PyExc_ValueError, "Invalid type"); %(fail)s;}
if (typenum != typenum_f) {PyErr_SetString(PyExc_ValueError, "Input types must match"); %(fail)s;}
if (!img2d) %(fail)s;
if (!filtersflipped) %(fail)s;
if ((!%(z)s)
|| *PyArray_DIMS(%(z)s)!=4
||(%(z)s->dimensions[0] != %(self_bsize)s)
||(%(z)s->dimensions[1] != %(self_nkern)s)
||(%(z)s->dimensions[2] != dim_zz[0])
|| (%(z)s->dimensions[3] != dim_zz[1])
)
{
if (%(z)s) Py_DECREF(%(z)s);
npy_intp dims[4] = {0,0,0,0};
if(!dims) %(fail)s;
dims[0]=%(self_bsize)s;
dims[1]=%(self_nkern)s;
dims[2]=dim_zz[0];
dims[3]=dim_zz[1];
%(z)s = (PyArrayObject*) PyArray_ZEROS(4, dims, typenum,0);
}else{
//PyArray_FILLWBYTE((PyObject*)%(z)s,0);
}
int Os[2];
if (mode == FULL) {Os[0] = dim_im[0]+dim_ker[0]-1; Os[1] = dim_im[1]+dim_ker[1]-1;}
else {Os[0] = dim_im[0]-dim_ker[0]+1; Os[1] = dim_im[1]-dim_ker[1]+1;}
for(int b=0;b< %(self_bsize)s ;b+=%(unloop_bsize)s){
for(int n_kern=0;n_kern<%(self_nkern)s;n_kern+=%(unloop_ksize)s){
//assertions
if (%(z)s->strides[0] != %(z)s->dimensions[1] *%(z)s->dimensions[2] *%(z)s->dimensions[3] * sizeof(%(type)s)) %(fail)s;
if (%(z)s->strides[1] != %(z)s->dimensions[2] * %(z)s->dimensions[3] * sizeof(%(type)s)) %(fail)s;
if (%(z)s->strides[2] != %(z)s->dimensions[3] * sizeof(%(type)s)) %(fail)s;
if (%(z)s->strides[3] != sizeof(%(type)s)) %(fail)s;
"""%d
    # One output pointer per unrolled (batch, kernel) pair, zero-initialized.
    ret += my_dup2("%(type)s * __restrict__ out%(unloop_iter)s=(%(type)s *)(PyArray_GETPTR2(%(z)s,b+%(unloop_biter)s,n_kern+%(unloop_kiter)s));")
    ret += my_dup("for (int i = 0; i < dim_zz[0]*dim_zz[1]; ++i) out%(unloop_iter)s[i] = 0;", unloop_bsize*unloop_ksize)
    ret += """
for(int stack_size=0;stack_size<%(self_imshp0)s;stack_size++){
"""%d
    # Per-batch image pointers and per-kernel filter pointers for this stack.
    ret += my_dup("const %(type)s * __restrict__ in%(unloop_iter)d=(%(type)s *)(PyArray_GETPTR2(img2d,b+%(unloop_iter)s,stack_size));", unloop_bsize)
    ret += my_dup("const %(type)s * __restrict__ hvals%(unloop_iter)s=(%(type)s *)(PyArray_GETPTR2(filtersflipped,n_kern+%(unloop_iter)s,stack_size));", unloop_ksize)
    ret += """
int new_m;
for (int m=0; m < Os[0]; m++) {
// Reposition index into input image based on requested output size
if (mode == FULL) new_m = m ;
else new_m = (m+dim_ker[0]-1);
for (int n=0; n < Os[1]; n++) { // loop over columns
"""%d
    # One accumulator per unrolled (batch, kernel) pair.
    ret += my_dup("%(type)s sum%(unloop_iter)s=0;", unloop_bsize*unloop_ksize)
    ret += """
// Sum over kernel, if index into image is out of bounds
// fill with the value
for (int j=0; j < dim_ker[0]; j++) {
int ind0 = (new_m-j);
if(mode==FULL){
"""%d
    ret += my_dup("const %(type)s * idx_hvals%(unloop_iter)s=&hvals%(unloop_iter)s[j*dim_ker[1]];", unloop_ksize)
    ret += """
if(ind0 < 0 || ind0 >= dim_im[0]){
if(fill_value!=0)
for (int k=0; k < dim_ker[1]; k++) {
"""%d
    ret += my_dup2("sum%(unloop_iter)s += idx_hvals%(unloop_kiter)s[k] * fill_value;")
    ret += """
}
}else{
//do the part where kernel is to the right of the img
int k=0,max_k=max((int)(n-dim_im[1])+1,0);
if(fill_value!=0){
for(k=0;k<max_k;k++){
"""%d
    ret += my_dup2("sum%(unloop_iter)s += idx_hvals%(unloop_kiter)s[k] * fill_value;")
    ret += """
}
}else {k=max_k;}
//do the part where the kernel is on the img
max_k=min(n+1,(int)dim_ker[1]);
"""%d
    ret += my_dup("const %(type)s * idx_in%(unloop_iter)s=&in%(unloop_iter)s[ind0*dim_im[1]];", unloop_bsize)
    ret += """
for (int ind1=n-k; k<max_k; k++,ind1--) {
"""%d
    ret += my_dup2("sum%(unloop_iter)s+= idx_hvals%(unloop_kiter)s[k] * idx_in%(unloop_biter)s[ind1];")
    ret += """
}
//do the part to the left of the img
if(fill_value!=0)
for(;k<dim_ker[1];k++){
"""%d
    ret += my_dup2("sum%(unloop_iter)s += idx_hvals%(unloop_kiter)s[k] * fill_value;")
    ret += """
}
}
}else{
"""%d
    ret += my_dup("const %(type)s* idx_in%(unloop_iter)s=&in%(unloop_iter)s[ind0*dim_im[1]];", unloop_bsize)
    ret += my_dup("const %(type)s* idx_hvals%(unloop_iter)s=&hvals%(unloop_iter)s[j*dim_ker[1]];", unloop_ksize)
    ret += """
int new_n = (n+dim_ker[1]-1);
for (int k=0,last=new_n; k < dim_ker[1]; k++,last--) {
"""%d
    ret += my_dup2("sum%(unloop_iter)s+=idx_hvals%(unloop_kiter)s[k]*idx_in%(unloop_biter)s[last];")
    ret += """
}
}
}//for j
"""%d
    # Write back every accumulator (set or accumulate, per %(affectation)s).
    ret += my_dup("out%(unloop_iter)s[m*dim_zz[1]+n] %(affectation)s sum%(unloop_iter)s;", unloop_bsize*unloop_ksize)
    ret += """
}//for n
}//for m
}//for stack_size
}//for n_kern
}//for b
Py_XDECREF(img2d);
Py_XDECREF(filtersflipped);
"""
    return ret
...@@ -207,13 +207,13 @@ class TestConvOp(unittest.TestCase): ...@@ -207,13 +207,13 @@ class TestConvOp(unittest.TestCase):
ssizes = [(1,1),(2,2)]#2,2)] ssizes = [(1,1),(2,2)]#2,2)]
#test speed #test speed
bsize = 10 # batch size # bsize = 10 # batch size
imshp_start = (1,50,49) # imshp_start = (1,50,49)#un square shape to test more corner case.
kshps = ([11,12],[12,11]) # kshps = ([11,12],[12,11])#un square shape to test more corner case.
nkerns = [20,20] # per output pixel # nkerns = [20,20] # per output pixel
ssizes = [(1,1),]#(1,1)]#(2,2) bugged # ssizes = [(1,1),]#(1,1)]#(2,2) bugged
convmodes = ['valid','full'] # convmodes = ['valid','full']
do_theano=False # do_theano=False
N.set_printoptions(threshold=N.nan) N.set_printoptions(threshold=N.nan)
...@@ -221,23 +221,25 @@ class TestConvOp(unittest.TestCase): ...@@ -221,23 +221,25 @@ class TestConvOp(unittest.TestCase):
kerns = [T.matrix(),T.dmatrix()] kerns = [T.matrix(),T.dmatrix()]
img = T.dmatrix() img = T.dmatrix()
rng = N.random.RandomState(3423489) rng = N.random.RandomState(3423489)
tctot, tpytot, t2ctot, t2pytot, ntot, convtot = [], [], [], [], [], [] tctot, tpytot, ntot = [], [], []
dmatrix4=T.TensorType('float64', (False, False, False, False)) dmatrix4=T.TensorType('float64', (False, False, False, False))
inputs4=dmatrix4() inputs4=dmatrix4()
kerns4=dmatrix4() kerns4=dmatrix4()
assert len(kshps)==len(nkerns)==len(kerns) assert len(kshps)==len(nkerns)==len(kerns)
for conv_mode, n_mode in zip(convmodes,range(len(convmodes))): def do_test(conv_mode, ss, unroll_batch=0, unroll_kern=0, img=img):
for ss, n_ss in zip(ssizes,range(len(ssizes))):
# build actual input images # build actual input images
imgval = rng.rand(bsize, imshp_start[0], imshp_start[1], imshp_start[2]) imgval = rng.rand(bsize, imshp_start[0], imshp_start[1], imshp_start[2])
imshp=imshp_start imshp=imshp_start
# for each layer # for each layer
for kshp, kern, nkern, n_layer in zip(kshps, kerns, nkerns, range(len(kerns))): ntot=0
tctot=0
tpytot=0
for kshp, kern, nkern, n_layer in zip(kshps, kerns, nkerns, range(len(kerns))):
print '************* layer %i ***************' % n_layer print '************* layer %i ***************' % n_layer
print conv_mode, ss, n_layer, kshp, nkern print conv_mode, ss, n_layer, kshp, nkern
...@@ -266,7 +268,7 @@ class TestConvOp(unittest.TestCase): ...@@ -266,7 +268,7 @@ class TestConvOp(unittest.TestCase):
for i in range(imshp[0]): # loop over input feature maps for i in range(imshp[0]): # loop over input feature maps
outval[b,n,...] += _convolve2d(\ outval[b,n,...] += _convolve2d(\
imgval[b,i,...], w_flip[n,i,...],1,val, bval, 0)[0::ss[0],0::ss[1]] imgval[b,i,...], w_flip[n,i,...],1,val, bval, 0)[0::ss[0],0::ss[1]]
ntot += [time.time() - time1] ntot += time.time() - time1
if do_theano: if do_theano:
####### test with new sp.convolve2 function ###### ####### test with new sp.convolve2 function ######
...@@ -290,14 +292,11 @@ class TestConvOp(unittest.TestCase): ...@@ -290,14 +292,11 @@ class TestConvOp(unittest.TestCase):
else: else:
hid = img #we don't need it, but it make the flow easier flow hid = img #we don't need it, but it make the flow easier flow
convtot+=[-1]
tctot+=[-1]
tpytot+=[-1]
hidval=outval.copy()#to keep the same memory hidval=outval.copy()#to keep the same memory
hidval1=outval.copy() hidval1=outval.copy()
# ConvOp # ConvOp
conv_op = ConvOp(imshp, kshp, nkern, bsize, 1,1, conv_mode, unroll_kern=10)(inputs4, kerns4) conv_op = ConvOp(imshp, kshp, nkern, bsize, 1,1, conv_mode, unroll_batch=unroll_batch, unroll_kern=unroll_kern)(inputs4, kerns4)
l1shp=N.hstack((nkern, l1shp=N.hstack((nkern,
getFilterOutShp(imshp, kshp, ss, conv_mode))) getFilterOutShp(imshp, kshp, ss, conv_mode)))
propup2 = function([inputs4, kerns4], conv_op) propup2 = function([inputs4, kerns4], conv_op)
...@@ -306,12 +305,12 @@ class TestConvOp(unittest.TestCase): ...@@ -306,12 +305,12 @@ class TestConvOp(unittest.TestCase):
time1 = time.time() time1 = time.time()
hidval2_ = propup2(imgval,w_flip) hidval2_ = propup2(imgval,w_flip)
hidval2 = hidval2_[:,:,0::ss[0],0::ss[1]] hidval2 = hidval2_[:,:,0::ss[0],0::ss[1]]
t2ctot += [time.time() - time1] tctot += time.time() - time1
time1 = time.time() time1 = time.time()
# hidval3_ = propup3(imgval,w_flip) # hidval3_ = propup3(imgval,w_flip)
# hidval3 = hidval3_[:,:,0::ss[0],0::ss[1]] # hidval3 = hidval3_[:,:,0::ss[0],0::ss[1]]
t2pytot += [time.time() - time1] tpytot += time.time() - time1
# assert (N.abs(hidval2-hidval3)<1e-5).all() # assert (N.abs(hidval2-hidval3)<1e-5).all()
temp = N.abs(outval - hidval2) temp = N.abs(outval - hidval2)
...@@ -322,14 +321,47 @@ class TestConvOp(unittest.TestCase): ...@@ -322,14 +321,47 @@ class TestConvOp(unittest.TestCase):
img, imshp = hid, tuple(outshp) img, imshp = hid, tuple(outshp)
imgval = outval.reshape(bsize,outshp[0],outshp[1],outshp[2]) imgval = outval.reshape(bsize,outshp[0],outshp[1],outshp[2])
return tctot, tpytot, ntot
if False:
unroll_batch = [0,1,2,5,10]
unroll_kern = [0,1,2,5,10,20]
# calculate the speed up of different combination of unroll
for unroll_b in unroll_batch:
for unroll_k in unroll_kern:
tctot, tpytot, ntot=[],[],[]
for conv_mode, n_mode in zip(convmodes,range(len(convmodes))):
for ss, n_ss in zip(ssizes,range(len(ssizes))):
tctot_, tpytot_, ntot_ = do_test(conv_mode, ss,unroll_batch=unroll_b, unroll_kern=unroll_k)
tctot+=[tctot_]
tpytot+=[tpytot_]
ntot+=[ntot_]
print '**** Multilayer Convolution Profiling Results ****'
print 'unroll batch', unroll_b, 'unroll kern',unroll_k
print 'Numpy convolve2d processing time: %.3fs'%sum(ntot),ntot
print 'c Theano(ConvOp) processing time: %.3fs'%sum(tctot),tctot
print 'py Theano(ConvOp) processing time: %.3fs'%sum(tpytot),tpytot
d=N.asarray(ntot)/tctot
print 'speed up c theano(ConvOp) vs convolve2d: %.3f'%d.mean(),d
return
for conv_mode, n_mode in zip(convmodes,range(len(convmodes))):
for ss, n_ss in zip(ssizes,range(len(ssizes))):
tctot_, tpytot_, ntot_ = do_test(conv_mode, ss)
tctot+=[tctot_]
tpytot+=[tpytot_]
ntot+=[ntot_]
print '**** Multilayer Convolution Profiling Results ****' print '**** Multilayer Convolution Profiling Results ****'
print 'Numpy convolve2d processing time: %.3fs'%sum(ntot),ntot print 'Numpy convolve2d processing time: %.3fs'%sum(ntot),ntot
print 'c Theano(ConvOp) processing time: %.3fs'%sum(t2ctot),t2ctot print 'c Theano(ConvOp) processing time: %.3fs'%sum(tctot),tctot
print 'py Theano(ConvOp) processing time: %.3fs'%sum(t2pytot),t2pytot print 'py Theano(ConvOp) processing time: %.3fs'%sum(tpytot),tpytot
print 'convolve processing time: %.3fs'%sum(convtot),convtot d=N.asarray(ntot)/tctot
d=N.asarray(ntot)/t2ctot
print 'speed up c theano(ConvOp) vs convolve2d: %.3f'%d.mean(),d print 'speed up c theano(ConvOp) vs convolve2d: %.3f'%d.mean(),d
d=N.asarray(ntot)/t2pytot d=N.asarray(ntot)/tpytot
print 'speed up py theano(ConvOp) vs convolve2d: %.3f'%d.mean(),d print 'speed up py theano(ConvOp) vs convolve2d: %.3f'%d.mean(),d
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论