[mq]: unroll_patch

dc1aa62a · Frederic Bastien · 3d8a5430 · dc1aa62a · dc1aa62a
--- a/theano/sandbox/conv.py
+++ b/theano/sandbox/conv.py
@@ -29,9 +29,10 @@ class ConvOp(Op):

    #TODO: make the stacksize its own parameter, and make imshp a pair

-    def __init__(self, imshp, kshp, nkern, bsize, dx, dy, output_mode='valid',
-            unroll_batch=4,
-            unroll_kern=4,
+    def __init__(self, imshp=None, kshp=None, nkern=None, bsize=None, dx=None, dy=None, output_mode='valid',
+            unroll_batch=0,
+            unroll_kern=0,
+            unroll_patch=False,
            imshp_logical=None,
            kshp_logical=None,
            kshp_logical_top_aligned=True,
@@ -47,6 +48,7 @@ class ConvOp(Op):
        dx - patch stride rows
        dy - patch stride cols
        out_mode - 'valid', 'full'
+        unroll_patch - c code generation option
        unroll_batch - c code generation option
        unroll_kern - c code generation option
        verbose - passed to GpuConv
@@ -60,6 +62,7 @@ class ConvOp(Op):
        gradient on the filters.


+        unroll_patch. If True will use a version that is faster then without not unroll by unroll the patch loop.
        unroll_batch. If >0 will use a version that will unroll the batch loop by the value of the option. By default don't use this version of the code.
        unroll_nkern. idem as unroll_batch but unroll the kernel loop.

@@ -95,6 +98,7 @@ class ConvOp(Op):

        self.unroll_batch=unroll_batch
        self.unroll_kern=unroll_kern
+        self.unroll_patch=unroll_patch

        if self.unroll_batch>0 and self.bsize % self.unroll_batch!=0:
            if self.bsize<=self.unroll_batch:
@@ -407,6 +411,7 @@ using namespace std;
        d["self_imshp0"]=self.imshp[0]
        d["self_imshp1"]=self.imshp[1]
        d["self_imshp2"]=self.imshp[2]
+        d["mode"]=self.out_mode.upper()
        d["self_kshp0"]=self.kshp[0]
        d["self_kshp1"]=self.kshp[1]
        d["self_kshp_logical_r"] = self.kshp_logical[0]
@@ -439,8 +444,12 @@ using namespace std;
        #print self.out_mode, d["self_imshp_logical_stride_r"]

        if self.imshp != self.imshp_logical or self.kshp != self.kshp_logical:
+#            print "return imshp!=imshp_logical or self.kshp != self.kshp_logical shape version"
            return _conv_op_code_a % d

+        if self.unroll_patch:
+#            print "return unroll patch version",self.dx,self.dy
+            return _conv_op_code_unroll_patch%d
        if self.unroll_batch>0 or self.unroll_kern>0:
            if self.unroll_batch<=0: self.unroll_batch=1
            if self.unroll_kern<=0: self.unroll_kern=1
@@ -1212,3 +1221,295 @@ Py_XDECREF(img2d);
 Py_XDECREF(filtersflipped);
 """
    return ret
+
+_conv_op_code_unroll_patch = """
+const int mode=%(mode)s;
+int typenum=0, typenum_f=0;
+PyArrayObject *ain1=NULL, *ain2=NULL, *filtersflipped_arr=NULL, *img2d_arr=NULL;
+const %(type)s fill_value = 0;
+
+int type_im=PyArray_TYPE(%(img2d)s);
+int type_ker=PyArray_TYPE(%(filtersflipped)s);
+
+npy_intp dim_zz[2]={%(self_outshp0)s,%(self_outshp1)s};
+npy_intp dim_im[2]={%(self_imshp1)s,%(self_imshp2)s};
+npy_intp dim_ker[2]={%(self_kshp0)s,%(self_kshp1)s};
+
+PyArray_Dims img2d_shape;
+npy_intp img2d_dim[4]={1,1,0,0};
+img2d_shape.ptr=img2d_dim;
+img2d_shape.len=4;
+
+PyArray_Dims kerns_shape;
+npy_intp kerns_dim[4]={1,1,0,0};
+kerns_shape.ptr=kerns_dim;
+kerns_shape.len=4;
+PyObject *img2d=NULL, *contig, *filtersflipped=NULL;
+
+if(%(img2d)s->nd==2){
+  img2d_dim[3]=%(img2d)s->dimensions[1];
+  img2d_dim[2]=%(img2d)s->dimensions[0];
+}else if(%(img2d)s->nd==3){
+  img2d_dim[3]=%(img2d)s->dimensions[2];
+  img2d_dim[2]=%(img2d)s->dimensions[1];
+  img2d_dim[0]=%(img2d)s->dimensions[0];
+}else if(%(img2d)s->nd==4){
+  img2d_dim[3]=%(img2d)s->dimensions[3];
+  img2d_dim[2]=%(img2d)s->dimensions[2];
+  img2d_dim[1]=%(img2d)s->dimensions[1];
+  img2d_dim[0]=%(img2d)s->dimensions[0];
+}else {
+    PyErr_SetString(PyExc_ValueError, "img don't have a good shape");
+    %(fail)s;
+}
+
+if(%(filtersflipped)s->nd==3){
+  kerns_dim[3]=%(filtersflipped)s->dimensions[2];
+  kerns_dim[2]=%(filtersflipped)s->dimensions[1];
+  kerns_dim[0]=%(filtersflipped)s->dimensions[0];
+}else if(%(filtersflipped)s->nd==4){
+  kerns_dim[3]=%(filtersflipped)s->dimensions[3];
+  kerns_dim[2]=%(filtersflipped)s->dimensions[2];
+  kerns_dim[1]=%(filtersflipped)s->dimensions[1];
+  kerns_dim[0]=%(filtersflipped)s->dimensions[0];
+}else{
+    std:stringstream temp;
+    temp << "nddim="<<%(filtersflipped)s->nd;
+    std::string param = temp.str();
+    PyErr_SetString(PyExc_ValueError,
+      ("kernel don't have a good shape. " + param).c_str());
+    %(fail)s;
+}
+
+img2d = PyArray_Newshape(%(img2d)s,&img2d_shape, PyArray_CORDER);
+img2d_arr = (PyArrayObject*)img2d;
+if ((img2d_arr->strides[3] != sizeof(%(type)s)) 
+     || (img2d_arr->strides[2] != img2d_arr->dimensions[3]*sizeof(%(type)s))){
+    contig = (PyObject*)(PyArray_GETCONTIGUOUS((PyArrayObject*)img2d));
+    Py_DECREF(img2d);
+    img2d = contig;
+    if (!PyArray_ISCONTIGUOUS(img2d)){
+        PyErr_SetString(PyExc_ValueError, "img2d isn't contiguous");
+        %(fail)s;
+    }
+}
+img2d_arr = (PyArrayObject*)img2d;
+
+filtersflipped = PyArray_Newshape(%(filtersflipped)s,&kerns_shape, PyArray_CORDER);
+filtersflipped_arr = (PyArrayObject*)filtersflipped;
+if ((filtersflipped_arr->strides[3] != sizeof(%(type)s)) 
+     || (filtersflipped_arr->strides[2] != filtersflipped_arr->dimensions[3]*sizeof(%(type)s))){
+    contig = (PyObject*)(PyArray_GETCONTIGUOUS((PyArrayObject*)filtersflipped));
+    Py_DECREF(filtersflipped);
+    filtersflipped = contig;
+    if (!PyArray_ISCONTIGUOUS(filtersflipped)){
+        PyErr_SetString(PyExc_ValueError, "filtersflipped isn't contiguous");
+        %(fail)s;
+    }
+}
+filtersflipped_arr = (PyArrayObject*)filtersflipped;
+
+if(mode != VALID && mode != FULL){
+  PyErr_SetString(PyExc_ValueError, "invalid mode, only full and valid are supported"); %(fail)s;
+}
+typenum = PyArray_ObjectType((PyObject*)%(img2d)s, 0);
+typenum_f = PyArray_ObjectType((PyObject*)%(filtersflipped)s, 0);
+if (typenum < 0) {PyErr_SetString(PyExc_ValueError, "Invalid type"); %(fail)s;}
+if (typenum != typenum_f) {PyErr_SetString(PyExc_ValueError, "Input types must match"); %(fail)s;}
+
+if (!img2d) %(fail)s;
+if (!filtersflipped) %(fail)s;
+if ((!%(z)s)
+  || *PyArray_DIMS(%(z)s)!=4
+  ||(%(z)s->dimensions[0] != %(self_bsize)s)
+  ||(%(z)s->dimensions[1] != %(self_nkern)s)
+  ||(%(z)s->dimensions[2] != dim_zz[0])
+  || (%(z)s->dimensions[3] != dim_zz[1])
+  )
+{
+  if (%(z)s) Py_DECREF(%(z)s);
+  npy_intp dims[4] = {0,0,0,0};
+  if(!dims) %(fail)s;
+  dims[0]=%(self_bsize)s;
+  dims[1]=%(self_nkern)s;
+  dims[2]=dim_zz[0];
+  dims[3]=dim_zz[1];
+  %(z)s = (PyArrayObject*) PyArray_ZEROS(4, dims, typenum,0);
+}else{
+  //PyArray_FILLWBYTE((PyObject*)%(z)s,0);
+}
+
+int Os[2];
+Os[0]=%(self_outshp0)s;
+Os[1]=%(self_outshp1)s;
+//I keep the formula to calculte Os in case we need it in the futur.
+//if (mode == FULL) {Os[0] = (int)ceil((dim_im[0]+dim_ker[0]-1)/float(%(self_dx)s)); Os[1] = ceil((dim_im[1]+dim_ker[1]-1)/float(%(self_dy)s));}
+//else {Os[0] = (int)ceil((dim_im[0]-dim_ker[0]+1)/float(%(self_dx)s)); Os[1] = (int)ceil((dim_im[1]-dim_ker[1]+1)/float(%(self_dy)s));}
+
+for(int b=0;b< %(self_bsize)s;b++){
+  for(int n_kern=0;n_kern<%(self_nkern)s;n_kern++){
+
+    //assertions
+    if (%(z)s->strides[0] != %(z)s->dimensions[1] *%(z)s->dimensions[2] *%(z)s->dimensions[3] * sizeof(%(type)s)) %(fail)s;
+    if (%(z)s->strides[1] != %(z)s->dimensions[2] * %(z)s->dimensions[3] * sizeof(%(type)s)) %(fail)s;
+    if (%(z)s->strides[2] != %(z)s->dimensions[3] * sizeof(%(type)s)) %(fail)s;
+    if (%(z)s->strides[3] != sizeof(%(type)s)) %(fail)s;
+
+    %(type)s * __restrict__ out=(%(type)s *)(PyArray_GETPTR2(%(z)s,b,n_kern));
+    for (int i = 0; i < dim_zz[0]*dim_zz[1]; ++i) out[i] = 0;
+
+    for(int stack_size=0;stack_size<%(self_imshp0)s;stack_size++){
+
+      const %(type)s * __restrict__ in=(%(type)s *)(PyArray_GETPTR2(img2d,b,stack_size));
+      const %(type)s * __restrict__ hvals=(%(type)s *)(PyArray_GETPTR2(filtersflipped,n_kern,stack_size));
+
+      int new_m;
+
+      for (int iter_m=0; iter_m < Os[0]; iter_m++) {
+        // Reposition index into input image based on requested output size
+        int pos_m = iter_m*%(self_dx)s;//The position of the patch in the image
+        if (mode == FULL) new_m = pos_m ;
+        else new_m = (pos_m+dim_ker[0]-1);
+
+        for (int iter_n=0; iter_n < Os[1]; iter_n++) {  // loop over columns
+          int pos_n=iter_n*%(self_dy)s;
+          %(type)s sum=0;
+          %(type)s sum2=0;
+          %(type)s sum3=0;
+          %(type)s sum4=0;
+          int nb_sum=0;
+          // Sum over kernel, if index into image is out of bounds
+          // fill with the value
+          for (int j=0; j < dim_ker[0]; j++) {
+            int ind0 = (new_m-j);
+
+            if(mode==FULL){
+              const %(type)s * idx_hvals=&hvals[j*dim_ker[1]];
+              if(ind0 < 0 || ind0 >= dim_im[0]){
+                if(fill_value!=0)
+                  for (int k=0; k < dim_ker[1]; k++) {
+                    sum+= idx_hvals[k] * fill_value;
+                  }
+              }else{
+                //do the part where kernel is to the right of the img
+//TODO: implement unroll patch for fill_value!=0
+                int k=0,max_k=max((int)(pos_n-dim_im[1])+1,0);
+                if(fill_value!=0){ 
+                
+                  for(k=0;k<max_k;k++){
+                    sum+= idx_hvals[k]*fill_value;
+                  }
+                }else {k=max_k;}
+                
+                //do the part where the kernel is on the img
+                max_k=min(pos_n+1,(int)dim_ker[1]);
+                const %(type)s * idx_in=&in[ind0*dim_im[1]];
+
+                if(iter_n + 4*%(self_dy)s < Os[1]
+                         && iter_n>dim_ker[1]-1+3 
+                         && iter_n<dim_im[1]-dim_ker[1]+1-3){
+                  nb_sum=4;
+//cout<<4<<endl;
+                  for (int ind1=pos_n-k; k<max_k; k++,ind1--) {
+                    sum+=idx_hvals[k]*idx_in[ind1];
+                    sum2+=idx_hvals[k]*idx_in[ind1+%(self_dy)s];
+                    sum3+=idx_hvals[k]*idx_in[ind1+2*%(self_dy)s];
+                    sum4+=idx_hvals[k]*idx_in[ind1+3*%(self_dy)s];
+                  }
+                }else if(iter_n + 2*%(self_dy)s < Os[1] 
+                         && iter_n>dim_ker[1]-1
+                         && iter_n<dim_im[1]-dim_ker[1]+1){
+//cout<<2<<endl;
+                  nb_sum=2;
+//                  if(iter_n==dim_ker[1]-1){//k-1<min(pos_n+%(self_dy)s,(int)dim_ker[1])){
+//                    sum2+=idx_hvals[k-1]*idx_in[pos_n-k-%(self_dy)s];
+//                  }
+                  for (int ind1=pos_n-k; k<max_k; k++,ind1--) {
+                    sum+=idx_hvals[k]*idx_in[ind1];
+                    sum2+=idx_hvals[k]*idx_in[ind1+%(self_dy)s];
+                  }
+//                  sum2+=idx_hvals[k]*idx_in[pos_n-k+%(self_dy)s];
+//                  sum+=idx_hvals[k]*idx_in[pos_n-k];
+//                  k++;
+                }else{
+//cout<<1<<endl;
+                  nb_sum=1;
+                  /*
+                  %(type)s sum_=0;
+                  if((k-max_k) & 0x1 != 0){
+                    sum+= idx_hvals[k] * idx_in[pos_n-k];
+                  }
+                  for (int ind1=pos_n-k; k<max_k; k+=2,ind1-=2) {
+                    sum+= idx_hvals[k] * idx_in[ind1];
+                    sum_+= idx_hvals[k+1] * idx_in[ind1-1];
+                  }
+                  sum+=sum_;
+                  */
+                  for (int ind1=pos_n-k; k<max_k; k++,ind1--) {
+                    sum+=idx_hvals[k]*idx_in[ind1];
+                  }
+                }
+                //do the part to the left of the img
+                if(fill_value!=0)
+                  for(;k<dim_ker[1];k++) sum+= idx_hvals[k]*fill_value;
+              }
+            }else{//valid mode
+              const %(type)s* idx_in=&in[ind0*dim_im[1]];
+              const %(type)s* idx_hvals=&hvals[j*dim_ker[1]];
+              if(iter_n + 4*%(self_dy)s < Os[1]){
+                nb_sum=4;
+                for (int k=dim_ker[1]-1,im_idx=pos_n; k >=0; k--,im_idx++) {
+                  sum+=idx_hvals[k]*idx_in[im_idx];
+                  sum2+=idx_hvals[k]*idx_in[im_idx+%(self_dy)s];
+                  sum3+=idx_hvals[k]*idx_in[im_idx+2*%(self_dy)s];
+                  sum4+=idx_hvals[k]*idx_in[im_idx+3*%(self_dy)s];
+                }
+              }else if(iter_n + 2*%(self_dy)s < Os[1]){
+                nb_sum=2;
+                for (int k=dim_ker[1]-1,im_idx=pos_n; k >=0; k--,im_idx++) {
+                  sum+=idx_hvals[k]*idx_in[im_idx];
+                  sum2+=idx_hvals[k]*idx_in[im_idx+%(self_dy)s];
+                }
+              }else{
+                nb_sum=1;
+                for (int k=dim_ker[1]-1,im_idx=pos_n; k >=0; k--,im_idx++) {
+                  sum+=idx_hvals[k]*idx_in[im_idx];
+                }
+              }
+            }//else valid mode
+          }//for j
+          switch(nb_sum){
+          case 4: out[iter_m*dim_zz[1]+iter_n+3] %(affectation)s sum4;
+          case 3: out[iter_m*dim_zz[1]+iter_n+2] %(affectation)s sum3;
+          case 2: out[iter_m*dim_zz[1]+iter_n+1] %(affectation)s sum2;
+          case 1: out[iter_m*dim_zz[1]+iter_n] %(affectation)s sum;
+          }
+          iter_n+=nb_sum-1;
+/*
+          out[iter_m*dim_zz[1]+iter_n] %(affectation)s sum;
+          if(nb_sum>=2){
+            iter_n++;
+            out[iter_m*dim_zz[1]+iter_n] %(affectation)s sum2;
+          }
+          if(nb_sum>=3){
+            iter_n++;
+            out[iter_m*dim_zz[1]+iter_n] %(affectation)s sum3;
+          }
+          if(nb_sum>=4){
+            iter_n++;
+            out[iter_m*dim_zz[1]+iter_n] %(affectation)s sum4;
+          }
+*/
+        }//for iter_n
+      }//for iter_m
+    }//for stack_size
+    if (0 && (mode==FULL)){
+      for (int i = 0; i < dim_zz[0]*dim_zz[1]; ++i) 
+        std::cout << " " << out[i];
+      std::cout << "\\n";
+    }
+  }//for n_kern
+}//for b
+Py_XDECREF(img2d);
+Py_XDECREF(filtersflipped);
+"""
--- a/theano/sandbox/test_conv.py
+++ b/theano/sandbox/test_conv.py
@@ -41,7 +41,7 @@ def flip(kern, kshp):
 global_rng = N.random.RandomState(3423489)

 dmatrix4=T.TensorType('float64', (False, False, False, False))
-def exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp, kshps, nkerns, unroll_batch=0, unroll_kern=0, img=T.dmatrix(), validate=True, conv_op_py=False, do_convolve2=False, do_print=True, repeat=1):
+def exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp, kshps, nkerns, unroll_batch=0, unroll_kern=0, img=T.dmatrix(), validate=True, conv_op_py=False, do_convolve2=False, do_print=True, repeat=1, unroll_patch=0):

        # build actual input images
        imgval = global_rng.rand(bsize, imshp[0], imshp[1], imshp[2])
@@ -121,7 +121,7 @@ def exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp, kshps, nkerns, unroll
                hidval1=outval.copy()

            # ConvOp
-            conv_op = ConvOp(imshp, kshp, nkern, bsize, ss[0],ss[1], conv_mode, unroll_batch=unroll_batch, unroll_kern=unroll_kern)(inputs4, kerns4)
+            conv_op = ConvOp(imshp, kshp, nkern, bsize, ss[0],ss[1], conv_mode, unroll_batch=unroll_batch, unroll_kern=unroll_kern, unroll_patch=unroll_patch)(inputs4, kerns4)
            l1shp=N.hstack((nkern,
                            getFilterOutShp(imshp, kshp, ss, conv_mode)))
            propup2 = function([inputs4, kerns4], conv_op)
@@ -328,7 +328,7 @@ class TestConvOp(unittest.TestCase):
        ssizess = [[(1,1),(1,2)],[(1,1),(2,2)]]
        convmodes = ['valid','full']
        do_convolve2=True
-        unroll = [(0,0),(1,1),(2,2),(3,2)]#(batch,kern)
+        unroll = [(0,0,False),(0,0,True),(1,1,False),(2,2,False),(3,2,False)]#(batch,kern,patch)
        do_speed_test = False

        # TODO: this version show a bug that was fixed
@@ -338,6 +338,11 @@ class TestConvOp(unittest.TestCase):
 #        nkerns = [2,2] # per output pixel
 #        ssizes = [(1,1),(2,2)]#2,2)]

+#        bsizes = [1,1] # batch size
+#        imshp_starts = [(1,10,10),(1,5,6)]
+#        kshpss = ([[2,3],[3,2]],[[2,2],[2,2]])
+#        nkernss = [[1,1],[1,1]] # per output pixel
+
        N.set_printoptions(threshold=N.nan)

        # symbolic stuff
@@ -356,8 +361,8 @@ class TestConvOp(unittest.TestCase):

            unroll_batch = [1,2,4,5,10,20]
            unroll_kern = [1,2,4,5,10,20]
-            unroll_batch = [1,2,5]
-            unroll_kern = [1,2,5]
+            unroll_batch = [1,4,5]
+            unroll_kern = [1,4,5]
            
            bsize = 20 # batch size
            imshp_start = (1,48,48)#un square shape to test more corner case.
@@ -374,46 +379,86 @@ class TestConvOp(unittest.TestCase):
            timing = N.zeros((len(unroll_batch),len(unroll_kern),3))
            t_b_k=[]
            #calculate the timing with unrolling
+
+            t_=[[ 7.60572791,  3.95069814,  3.74271464], [ 4.05631089,  2.90384555,  2.93613672], [ 3.90551591,  2.92595196,  3.00102282]]
+            best=[]
+            worst=[]
+            best=[0.52690219879150391, 2.4266397953033447]
+            worst=[0.92042708396911621, 6.8822150230407715]
+            t_=[]
            for unroll_b, n_b in zip(unroll_batch,range(len(unroll_batch))):
                for unroll_k, n_k in zip(unroll_kern,range(len(unroll_kern))):
                    t_b_k.append(str(unroll_b)+"/"+str(unroll_k))
-                    tctot, tpytot, ntot=[],[],[]
-                    for conv_mode, n_mode in zip(convmodes,range(len(convmodes))):
-                        for ss, n_ss in zip(ssizes,range(len(ssizes))):
-                            tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp_start, kshps, nkerns, unroll_batch=unroll_b, unroll_kern=unroll_k, validate=validate)
-                            tctot+=[tctot_]
-                            tpytot+=[tpytot_]
-                            ntot+=[ntot_]
-                    timing[n_b,n_k]=[sum(tctot), sum(tpytot), sum(ntot)]
-
+                    if not t_:
+                        tctot, tpytot, ntot=[],[],[]
+                        for conv_mode, n_mode in zip(convmodes,range(len(convmodes))):
+                            for ss, n_ss in zip(ssizes,range(len(ssizes))):
+                                tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp_start, kshps, nkerns, unroll_batch=unroll_b, unroll_kern=unroll_k, validate=validate)
+                                tctot+=[tctot_]
+                                tpytot+=[tpytot_]
+                                ntot+=[ntot_]
+                        if unroll_b==4 and unroll_k==4:
+                            print "unroll 4/4",tctot
+                            best=tctot
+                        if unroll_b==1 and unroll_k==1:
+                            print "unroll 1/1",tctot
+                            worst=tctot
+                        timing[n_b,n_k]=[sum(tctot), sum(tpytot), sum(ntot)]
+            if not t_:
+                t=timing[:,:,0]#We select only the c timing.
+            else:
+                t=t_
+            t=N.asarray(t)
            #calculate the old timing
-            tctot,tpytot,ntot=0,0,0
-            for conv_mode, n_mode in zip(convmodes,range(len(convmodes))):
-                for ss, n_ss in zip(ssizes,range(len(ssizes))):
-                    tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp_start, kshps, nkerns, unroll_batch=0, unroll_kern=0, validate=validate)
-                    tctot+=tctot_
-                    tpytot+=tpytot_
-                    ntot+=ntot_
-            print "old code timing %.3fs"%tctot
-
-#            print timing
-            t=timing[:,:,0]#We select only the c timing.
+            tctot_=[0.52555489540100098, 6.6634182929992676]
+#            tctot_=[]
+            tctot,tpytot,ntot=[],[],[]
+            if not tctot_:
+                for conv_mode, n_mode in zip(convmodes,range(len(convmodes))):
+                    for ss, n_ss in zip(ssizes,range(len(ssizes))):
+                        tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp_start, kshps, nkerns, unroll_batch=0, unroll_kern=0, validate=validate)
+                        tctot+=[tctot_]
+                        tpytot+=[tpytot_]
+                        ntot+=[ntot_]
+            else: tctot=N.asarray(tctot_)
+            print "old code timing %.3fs"%sum(tctot),tctot
+            best=N.asarray(best)
+            worst=N.asarray(worst)
            print "timing for unrolled version"
            print t_b_k
            print t
            print "max %.3fs"%t.max(), "max param(batch unloop size/kernel unloop size)", t_b_k[t.argmax()]
            print "min %.3fs"%t.min(), "min param(batch unloop size/kernel unloop size)", t_b_k[t.argmin()]
-            print "speedup vs (1/1)%.3fx, vs old %.3fx"% (t.max()/t.min(),tctot/t.min())
+            print "speedup vs (1/1)%.3fx, vs old %.3fx"% (t.max()/t.min(),sum(tctot)/t.min())
+            print worst/best,tctot/best
+
+            tctot_patch = []
+            for conv_mode, n_mode in zip(convmodes,range(len(convmodes))):
+                for ss, n_ss in zip(ssizes,range(len(ssizes))):
+                     tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp_start, kshps, nkerns, unroll_batch=0, unroll_kern=0, validate=validate,unroll_patch=2)
+                     tctot_patch += [tctot_]
+
+            t_patch=sum(tctot_patch)
+            print "unroll_patch time", tctot_patch
+            print "speedup vs (1/1)%.3fx, vs old %.3fx"% (t.max()/t_patch,sum(tctot)/t_patch)
+            print best/tctot_patch, worst/tctot_patch
+            
+            print best
+            print worst
+            print tctot
+            print tctot_patch
            return
+
        
        for i in range(len(kshpss)):
            for conv_mode, n_mode in zip(convmodes,range(len(convmodes))):
                for ss, n_ss in zip(ssizess[i],range(len(ssizess[i]))):
-                    for un_b, un_k in unroll:
+                    for un_b, un_k, un_p in unroll:
                        tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet(
                            conv_mode, ss, bsizes[i], imshp_starts[i], 
                            kshpss[i], nkernss[i],
                            img=img, unroll_batch=un_b, unroll_kern=un_k,
+                            unroll_patch=un_p,
                            validate=True)
                        tctot+=[tctot_]
                        tpytot+=[tpytot_]
@@ -428,6 +473,11 @@ class TestConvOp(unittest.TestCase):
        d=N.asarray(ntot)/tpytot
        print 'speed up py theano(ConvOp) vs convolve2d: %.3fx'%d.mean(),d

+
+    def init_data(self,shape):
+        return N.ones(shape)
+        return N.random.random(shape)
+        
    def test_ConvOpGrad(self):
        """
        test the gradient in float and double
@@ -442,9 +492,9 @@ class TestConvOp(unittest.TestCase):
        kshps = [(2,3)]
        imshps = [(2,3,4)]
        modes = ['valid', 'full']
-        unroll = [(0,0),(1,1),(2,3)]
+        unroll = [(0,0,True),(1,1,False),(2,3,False),(1,1,False),(0,0,False)]#(batch,kern,patch)
        ssizes = [(1,1),(2,2)]
-        
+
        for typ in types:
            imgs  = T.TensorType(typ, (False, False, False, False),'imgs')
            kerns = T.TensorType(typ, (False, False, False, False),'kerns')
@@ -457,12 +507,12 @@ class TestConvOp(unittest.TestCase):
                    imgvals = N.array(N.random.random(N.hstack((bsize,imshp))),dtype=imgs.dtype)
                    for kshp in kshps:
                        t=numpy.array([imshp[1]-kshp[0],imshp[2]-kshp[1]])
-                        kernvals = N.array(N.random.rand(nkern,visdim,kshp[0],
-                                                         kshp[1]),dtype=kerns.dtype)
+                        kernvals = N.array(self.init_data((nkern,visdim,kshp[0],
+                                                          kshp[1])),dtype=kerns.dtype)
                        # 'full' mode should support kernels bigger than the input
                        if mode == 'valid' and (t<0).any():
                            continue
-                        for un_b,un_k in unroll:
+                        for un_b,un_k, un_p in unroll:
                                for ss in ssizes:
                                    print 'test_ConvOpGrad'
                                    print 'mode type:', mode, typ
@@ -476,14 +526,14 @@ class TestConvOp(unittest.TestCase):

                                    def test_i(imgs):
                                        convop = ConvOp(imshp, kshp, nkern, bsize, ss[0], ss[1],
-                                                        output_mode=mode, unroll_batch=un_b, unroll_kern=un_k)
+                                                        output_mode=mode, unroll_batch=un_b, unroll_kern=un_k, unroll_patch=un_p)
                                        return convop(imgs, kernvals)

                                    def test_k(kerns):
                                        convop = ConvOp(imshp, kshp, nkern, bsize, ss[0], ss[1],
-                                                        output_mode=mode, unroll_batch=un_b, unroll_kern=un_k)
+                                                        output_mode=mode, unroll_batch=un_b, unroll_kern=un_k, unroll_patch=un_p)
                                        return convop(imgvals, kerns)
-
+                                    print mode, imshp, kshp, un_b, un_k, ss
                                    #TODO the tolerance needed to pass is very high for float32(0.17). Is this acceptable? Expected?
 				    tol = None
 				    if typ=="float32":