Merged

4cc3b5f2 · Olivier Delalleau · 4ce2c854 · 195e49c7 · 4cc3b5f2 · 4cc3b5f2
--- a/theano/gof/cc.py
+++ b/theano/gof/cc.py
@@ -704,12 +704,18 @@ class CLinker(link.Linker):
                instantiate.customize.add_support_code(self.struct_code)
                instantiate.customize.add_support_code(static)
                for extra_arg in (
-                        "-O2", 
+                        "-O3", 
+#                        "-fno-signaling-nans",
+#"-fno-finite-math-only",
+#"-fmath-errno", "-fno-unsafe-math-optimizations", "-fno-finite-math-only", "-frounding-math", "-fsignaling-nans","-fno-cx-limited-range","-fno-fast-math",
                        "-ffast-math",
+#"-fno-finite-math-only",
+#                        "-fno-signaling-nans",
+#"-fmath-errno", "-fno-unsafe-math-optimizations", "-fno-finite-math-only", "-frounding-math", "-fsignaling-nans","-fno-cx-limited-range","-fno-fast-math",
                        #"-fprefetch-loop-arrays",
                        #"-ftree-vect-loop-version",
                        #"-ftree-loop-optimize",
-                        #"-ftree-vectorize"):
+                        #"-ftree-vectorize",
                        "-w" #-w means supress all warnings
                        ):
                    instantiate.customize.add_extra_compile_arg(extra_arg)

--- a/theano/sandbox/conv.py
+++ b/theano/sandbox/conv.py
@@ -8,7 +8,7 @@ def getFilterOutShp(inshp, kshp, (dx,dy)=(1,1), mode='valid'):
    s = -1 if mode=='valid' else 1
    inshp, kshp = N.array(inshp), N.array(kshp)
    return  N.int64(N.ceil((inshp[1:] + s*kshp - s*1)/\
-            N.array([dy,dx], dtype='float')))
+            N.array([dx,dy], dtype='float')))

 class ConvOp(Op):
    """
@@ -44,20 +44,19 @@ class ConvOp(Op):
        self.unroll_kern=unroll_kern

        if self.unroll_batch>0 and self.bsize % self.unroll_batch!=0:
-            if self.bsize<self.unroll_batch:
+            if self.bsize<=self.unroll_batch:
                self.unroll_batch = self.bsize
            else:
-                self.unroll_batch=1
                print "OPTIMISATION WARNING: in ConvOp.__init__() unroll_batch(%s) must be 0 or a multiple of bsize(%s). We revert it to 1. This won't change the result, but may make it slower."%(str(self.unroll_batch),str(self.bsize))
+                self.unroll_batch=1
        if self.unroll_kern>0 and self.nkern % unroll_kern!=0:
-            if self.nkern<self.unroll_kern:
+            if self.nkern<=self.unroll_kern:
                self.unroll_kern = self.nkern
            else:
-                self.unroll_kern=1
                print "OPTIMISATION WARNING: in ConvOp.__init__() unroll_kern(%s) should be 0 or a multiple of nkern(%s)We revert it to 1. This won't change the result, but may make it slower."%(str(self.unroll_kern),str(self.nkern))
-        if self.dx!=1 or self.dy!=1:
-            print "Warning, dx!=1 or dy!=1 only supported in python mode!"
-            raise NotImplementedError()
+                self.unroll_kern=1
+        if (self.dx!=1 or self.dy!=1):
+            print "WARNING: dx(%d)!=1 or dy(%d)!=1. The gradient is not implemented for those case."%(self.dx,self.dy)
        self.outshp = getFilterOutShp(self.imshp, kshp, (dx,dy), output_mode)
        self.out_mode = output_mode
        if not self.out_mode in ["valid", "full"]:
@@ -92,7 +91,7 @@ class ConvOp(Op):
            raise Exception("The image and the kernel must have the same type."
                            "inputs(%s), kerns(%s)"%(inputs.dtype, kerns.dtype))
        output = tensor.tensor(dtype=inputs.type.dtype,
-                               broadcastable=[False]*outdim, 
+                               broadcastable=[False]*outdim,
                               name="ConvOp_Output");

        return gof.Apply(self, [inputs, kerns], [output])
@@ -131,7 +130,9 @@ class ConvOp(Op):
        * inputs needs to be a 4D tensor. Couldn't get 3D to work
        * will crash if filter the same size as input image
        """
-
+        if self.dx!=1 or self.dy!=1:
+            raise NotImplementedError("I don't know how to implement the grad when dx!=1 or dy!=1! Is this possible?")
+        
        ####### Determine gradient on kernels ########
        if inputs.ndim == 3:
            inputs = tensor.shape_padleft(inputs,1)
@@ -145,25 +146,27 @@ class ConvOp(Op):
            (bsize, nkern) = (self.imshp[0], self.nkern)
            imshp = N.hstack((self.bsize, self.imshp[1:]))
            kshp  = self.outshp
+            un_b = self.unroll_batch
+            un_k = self.unroll_kern
        elif self.out_mode == 'full':
            (img, filters) = (newgz, newin)
            (bsize, nkern) = (self.nkern, self.imshp[0])
            imshp = N.hstack((self.bsize, self.outshp))
            kshp  = self.imshp[1:]
+            un_b = self.unroll_kern
+            un_k = self.unroll_batch
        else:
            raise NotImplementedError('Only [full,valid] modes are currently supported.')

        filters = filters[:,:,::-1,::-1]
        
        #find good value for the unroll
-        un_b = self.unroll_batch
-        un_k = self.unroll_kern
        if un_b!=0 and bsize%un_b!=0:
            if bsize<un_b:
                un_b = bsize
            else:
                un_b = 1
-                print "OPTIMISATION WARNING: in ConvOp.grad() we can't determine a good unroll value for the batch. Maybe you can optimize this!"
+                print "OPTIMISATION WARNING: in ConvOp.grad() we can't determine a good unroll value for the batch. Maybe you can optimize this!", bsize, un_b, self.unroll_batch, self.unroll_kern
        if un_k!=0 and nkern%un_k!=0:
            if nkern<un_k:
                un_k = nkern
@@ -238,7 +241,7 @@ using namespace std;
                                                   self.unroll_kern)

        #TODO: should we choose the unroll size automatically with the bigger divisor under 5? 
-        if self.out_mode == 'valid':
+        if self.out_mode == 'valid' and self.dx==1 and self.dy==1: #gemm path only for unit stride (presumably it has no dx/dy support -- TODO confirm)
 #            print "return gemm version"
            return _conv_op_code_valid_gemm % d
        else:
@@ -388,8 +391,11 @@ if ((!%(z)s)
 }

 int Os[2];
-if (mode == FULL) {Os[0] = dim_im[0]+dim_ker[0]-1; Os[1] = dim_im[1]+dim_ker[1]-1;}
-else {Os[0] = dim_im[0]-dim_ker[0]+1; Os[1] = dim_im[1]-dim_ker[1]+1;}
+Os[0]=%(self_outshp0)s;
+Os[1]=%(self_outshp1)s;
+//I keep the formula to calculate Os in case we need it in the future.
+//if (mode == FULL) {Os[0] = (int)ceil((dim_im[0]+dim_ker[0]-1)/float(%(self_dx)s)); Os[1] = ceil((dim_im[1]+dim_ker[1]-1)/float(%(self_dy)s));}
+//else {Os[0] = (int)ceil((dim_im[0]-dim_ker[0]+1)/float(%(self_dx)s)); Os[1] = (int)ceil((dim_im[1]-dim_ker[1]+1)/float(%(self_dy)s));}

 for(int b=0;b< %(self_bsize)s;b++){
  for(int n_kern=0;n_kern<%(self_nkern)s;n_kern++){
@@ -410,12 +416,14 @@ for(int b=0;b< %(self_bsize)s;b++){

      int new_m;

-      for (int m=0; m < Os[0]; m++) {
+      for (int iter_m=0; iter_m < Os[0]; iter_m++) {
        // Reposition index into input image based on requested output size
-        if (mode == FULL) new_m = m ;
-        else new_m = (m+dim_ker[0]-1);
+        int pos_m = iter_m*%(self_dx)s;//The position of the patch in the image
+        if (mode == FULL) new_m = pos_m ;
+        else new_m = (pos_m+dim_ker[0]-1);

-        for (int n=0; n < Os[1]; n++) {  // loop over columns 
+        for (int iter_n=0; iter_n < Os[1]; iter_n++) {  // loop over columns
+          int pos_n=iter_n*%(self_dy)s;
          %(type)s sum=0;

          // Sum over kernel, if index into image is out of bounds
@@ -433,7 +441,7 @@ for(int b=0;b< %(self_bsize)s;b++){
              }else{
                //do the part where kernel is to the right of the img

-                int k=0,max_k=max((int)(n-dim_im[1])+1,0);
+                int k=0,max_k=max((int)(pos_n-dim_im[1])+1,0);
                if(fill_value!=0){ 
                
                  for(k=0;k<max_k;k++){
@@ -442,9 +450,9 @@ for(int b=0;b< %(self_bsize)s;b++){
                }else {k=max_k;}
                
                //do the part where the kernel is on the img
-                max_k=min(n+1,(int)dim_ker[1]);
+                max_k=min(pos_n+1,(int)dim_ker[1]);
                const %(type)s * idx_in=&in[ind0*dim_im[1]];
-                for (int ind1=n-k; k<max_k; k++,ind1--) {
+                for (int ind1=pos_n-k; k<max_k; k++,ind1--) {
                  sum+= idx_hvals[k] * idx_in[ind1];
                }
                //do the part to the left of the img
@@ -454,14 +462,13 @@ for(int b=0;b< %(self_bsize)s;b++){
            }else{
              const %(type)s* idx_in=&in[ind0*dim_im[1]]; //JB: should be dim_im[1] right? (was dim_im[0])
              const %(type)s* idx_hvals=&hvals[j*dim_ker[1]];
-              int new_n = (n+dim_ker[1]-1);
-
+              int new_n = (pos_n+dim_ker[1]-1);
              for (int k=0,last=new_n; k < dim_ker[1]; k++,last--) {
                sum+=idx_hvals[k]*idx_in[last];
              }
            }
          }//for j
-          out[m*dim_zz[1]+n] %(affectation)s sum;
+          out[iter_m*dim_zz[1]+iter_n] %(affectation)s sum;
        }//for n
      }//for m
    }//for stack_size
@@ -763,7 +770,11 @@ if(%(img2d)s->nd==2){
  img2d_dim[1]=%(img2d)s->dimensions[1];
  img2d_dim[0]=%(img2d)s->dimensions[0];
 }else {
-    PyErr_SetString(PyExc_ValueError, "img don't have a good shape");
+    std::stringstream temp;
+    temp << "nddim="<<%(img2d)s->nd;
+    std::string param = temp.str();
+    PyErr_SetString(PyExc_ValueError,
+      ("img don't have a good shape. " + param).c_str());
    %(fail)s;
 }

@@ -777,11 +788,7 @@ if(%(filtersflipped)s->nd==3){
  kerns_dim[1]=%(filtersflipped)s->dimensions[1];
  kerns_dim[0]=%(filtersflipped)s->dimensions[0];
 }else{
-    std:stringstream temp;
-    temp << "nddim="<<%(filtersflipped)s->nd;
-    std::string param = temp.str();
-    PyErr_SetString(PyExc_ValueError,
-      ("kernel don't have a good shape. " + param).c_str());
+    PyErr_SetString(PyExc_ValueError, "kernel don't have a good shape");
    %(fail)s;
 }

@@ -844,8 +851,12 @@ if ((!%(z)s)
 }

 int Os[2];
-if (mode == FULL) {Os[0] = dim_im[0]+dim_ker[0]-1; Os[1] = dim_im[1]+dim_ker[1]-1;}
-else {Os[0] = dim_im[0]-dim_ker[0]+1; Os[1] = dim_im[1]-dim_ker[1]+1;}
+Os[0]=%(self_outshp0)s;
+Os[1]=%(self_outshp1)s;
+//I keep the formula to calculate Os in case we need it in the future.
+//if (mode == FULL) {Os[0] = (int)ceil((dim_im[0]+dim_ker[0]-1)/float(%(self_dx)s)); Os[1] = ceil((dim_im[1]+dim_ker[1]-1)/float(%(self_dy)s));}
+//else {Os[0] = (int)ceil((dim_im[0]-dim_ker[0]+1)/float(%(self_dx)s)); Os[1] = (int)ceil((dim_im[1]-dim_ker[1]+1)/float(%(self_dy)s));}
+
 for(int b=0;b< %(self_bsize)s ;b+=%(unroll_bsize)s){
  for(int n_kern=0;n_kern<%(self_nkern)s;n_kern+=%(unroll_ksize)s){

@@ -866,12 +877,14 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unroll_bsize)s){

      int new_m;

-      for (int m=0; m < Os[0]; m++) {
+      for (int iter_m=0; iter_m < Os[0]; iter_m++) {
        // Reposition index into input image based on requested output size
-        if (mode == FULL) new_m = m ;
-        else new_m = (m+dim_ker[0]-1);
+        int pos_m = iter_m*%(self_dx)s;//The position of the patch in the image
+        if (mode == FULL) new_m = pos_m ;
+        else new_m = (pos_m+dim_ker[0]-1);

-        for (int n=0; n < Os[1]; n++) {  // loop over columns 
+        for (int iter_n=0; iter_n < Os[1]; iter_n++) {  // loop over columns 
+          int pos_n=iter_n*%(self_dy)s;
        """%d
    ret+=my_dup("%(type)s sum%(unroll_iter)s=0;", unroll_bsize*unroll_ksize)
    ret+="""
@@ -895,7 +908,7 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unroll_bsize)s){
              }else{
                //do the part where kernel is to the right of the img

-                int k=0,max_k=max((int)(n-dim_im[1])+1,0);
+                int k=0,max_k=max((int)(pos_n-dim_im[1])+1,0);
                if(fill_value!=0){ 
                
                  for(k=0;k<max_k;k++){
@@ -906,11 +919,11 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unroll_bsize)s){
                }else {k=max_k;}
                
                //do the part where the kernel is on the img
-                max_k=min(n+1,(int)dim_ker[1]);
+                max_k=min(pos_n+1,(int)dim_ker[1]);
 """%d
    ret+=my_dup("const %(type)s * idx_in%(unroll_iter)s=&in%(unroll_iter)s[ind0*dim_im[1]];", unroll_bsize)
    ret+="""
-                for (int ind1=n-k; k<max_k; k++,ind1--) {
+                for (int ind1=pos_n-k; k<max_k; k++,ind1--) {

 """%d
    ret+=my_dup2("sum%(unroll_iter)s+= idx_hvals%(unroll_kiter)s[k] * idx_in%(unroll_biter)s[ind1];")
@@ -929,7 +942,7 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unroll_bsize)s){
    ret+=my_dup("const %(type)s* idx_in%(unroll_iter)s=&in%(unroll_iter)s[ind0*dim_im[1]];", unroll_bsize)
    ret+=my_dup("const %(type)s* idx_hvals%(unroll_iter)s=&hvals%(unroll_iter)s[j*dim_ker[1]];",unroll_ksize)
    ret+="""
-              int new_n = (n+dim_ker[1]-1);
+              int new_n = (pos_n+dim_ker[1]-1);

              for (int k=0,last=new_n; k < dim_ker[1]; k++,last--) {
 """%d
@@ -940,7 +953,7 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unroll_bsize)s){

          }//for j
 """%d
-    ret+=my_dup("out%(unroll_iter)s[m*dim_zz[1]+n] %(affectation)s sum%(unroll_iter)s;", unroll_bsize*unroll_ksize)
+    ret+=my_dup("out%(unroll_iter)s[iter_m*dim_zz[1]+iter_n] %(affectation)s sum%(unroll_iter)s;", unroll_bsize*unroll_ksize)
    ret+="""
        }//for n
      }//for m

--- a/theano/sandbox/test_conv.py
+++ b/theano/sandbox/test_conv.py
@@ -90,16 +90,18 @@ def exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp, kshps, nkerns, unroll
                ####### test with new sp.convolve2 function ######
                time1 = time.time()
                hid, outshp2 = convolve2(kern, kshp, nkern, img, imshp,  
-                                         bsize, (1,1), mode=conv_mode)
+                                         bsize, (ss[0],ss[1]), mode=conv_mode)
                propup = function([kern, img], hid)
                propup1 = function([kern, img], hid,mode=Mode(linker="py"))

                hidval  = propup(w_flip.reshape(nkern,-1), imgval.reshape(bsize,-1))
-                hidval  = hidval.reshape(bsize,nkern,outshp2[-2],outshp2[-1])[:,:,::ss[0],::ss[1]]
+                hidval  = hidval.reshape(bsize,nkern,outshp2[-2],outshp2[-1])
+#                hidval = hidval[:,:,::ss[0],::ss[1]]
                hidval = hidval.reshape(bsize, -1)
                for i in range(repeat):
                    hidval1 = propup1(w_flip.reshape(nkern,-1), imgval.reshape(bsize,-1))
-                hidval1  = hidval1.reshape(bsize,nkern,outshp2[-2],outshp2[-1])[:,:,::ss[0],::ss[1]]
+                hidval1  = hidval1.reshape(bsize,nkern,outshp2[-2],outshp2[-1])
+#                hidval1  = hidval1[:,:,::ss[0],::ss[1]]
                hidval1 = hidval1.reshape(bsize, -1)

                assert (N.abs(hidval-hidval1)<1e-5).all()
@@ -113,7 +115,7 @@ def exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp, kshps, nkerns, unroll
                hidval1=outval.copy()

            # ConvOp
-            conv_op = ConvOp(imshp, kshp, nkern, bsize, 1,1, conv_mode, unroll_batch=unroll_batch, unroll_kern=unroll_kern)(inputs4, kerns4)
+            conv_op = ConvOp(imshp, kshp, nkern, bsize, ss[0],ss[1], conv_mode, unroll_batch=unroll_batch, unroll_kern=unroll_kern)(inputs4, kerns4)
            l1shp=N.hstack((nkern,
                            getFilterOutShp(imshp, kshp, ss, conv_mode)))
            propup2 = function([inputs4, kerns4], conv_op)
@@ -122,14 +124,14 @@ def exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp, kshps, nkerns, unroll
            time1 = time.time()
            for i in range(repeat):
                hidval2_ = propup2(imgval,w_flip)
-            hidval2 = hidval2_[:,:,0::ss[0],0::ss[1]]
+            hidval2 = hidval2_#[:,:,0::ss[0],0::ss[1]]
            tctot += time.time() - time1

            if conv_op_py:
                time1 = time.time()
                for i in range(repeat):
                    hidval3_ = propup3(imgval,w_flip)
-                hidval3 = hidval3_[:,:,0::ss[0],0::ss[1]]
+                hidval3 = hidval3_#[:,:,0::ss[0],0::ss[1]]
                tpytot += time.time() - time1
                assert (N.abs(hidval2-hidval3)<1e-5).all()
            else:
@@ -235,7 +237,7 @@ class TestConvOp(unittest.TestCase):

                    # compute with new convolve2 (no timing info)
                    output4, outshp4  = convolve2(kerns, kshp, nkern, input,\
-                            imshp, bsize, (1,1), bias=bias, mode=conv_mode)
+                            imshp, bsize, (ss[0],ss[1]), bias=bias, mode=conv_mode)
 #                    print 'output4', output4

                    ttime1 = time.time()
@@ -244,7 +246,7 @@ class TestConvOp(unittest.TestCase):
 #                    print 'out4', out4, img1d, filtersflipped
                    tconv2 += [time.time() - ttime1]
                    out4 = out4.reshape(bsize, nkern, outshp4[1], outshp4[2])
-                    out4 = out4[:,:,0::ss[0],0::ss[1]]
+                    out4 = out4#[:,:,0::ss[0],0::ss[1]]
                    out4 = out4.reshape(bsize, -1)

                    # compute with ConvOp
@@ -252,18 +254,18 @@ class TestConvOp(unittest.TestCase):
                    inputs=dmatrix3()
                    kerns3=dmatrix3()
                    bia=T.dscalar()
-                    conv_op = ConvOp(imshp, kshp, nkern, bsize, 1,1, conv_mode)(inputs, kerns3)
+                    conv_op = ConvOp(imshp, kshp, nkern, bsize, ss[0],ss[1], conv_mode)(inputs, kerns3)
                    f2 = function([inputs, kerns3], conv_op, mode=Mode(linker="c"))
                    f3 = function([inputs, kerns3], conv_op, mode=Mode(linker="py"))

                    ttime1 = time.time()
                    out2_ = f2(img2d, filtersflipped)
-                    out2__ = out2_[:,:,0::ss[0],0::ss[1]]
+                    out2__ = out2_#[:,:,0::ss[0],0::ss[1]]
                    tconvop += [time.time() - ttime1]
                    out2___ = out2__.copy()
                    out2 = out2___ + biasvals.reshape(1,nkern,1,1)
                    out3_ = f3(img2d, filtersflipped)
-                    out3__ = out3_[:,:,0::ss[0],0::ss[1]]
+                    out3__ = out3_#[:,:,0::ss[0],0::ss[1]]
                    out3___ = out3__.copy()
                    out3 = out3___ + biasvals.reshape(1,nkern,1,1)
                    assert (N.abs(out2_-out3_)<1e-5).all()
@@ -302,15 +304,21 @@ class TestConvOp(unittest.TestCase):
        print 'speed up ConvOp vs convolve2d: %.3f'%d.mean(),d

    def test_multilayer_conv(self):
+        print '\n\n*************************************************'
+        print '           TEST MULTILAYER CONVOLUTION' 
+        print '*************************************************'
+
        # fixed parameters
+        # test multiple configuration at the same time
        bsizes = [6,6] # batch size
-        imshp_starts = [(1,28,28),(1,4,4)]
+        imshp_starts = [(1,13,14),(1,4,5)]
        kshpss = ([[5,6],[7,4]],[[2,2],[2,2]])
        nkernss = [[20,40],[2,2]] # per output pixel
-        ssizess = [[(1,1),(2,2)],[(1,1),(2,2)]]
+        ssizess = [[(1,1),(1,2)],[(1,1),(2,2)]]
        convmodes = ['valid','full']
        do_convolve2=True
        unroll = [(0,0),(1,1),(2,2),(3,2)]#(batch,kern)
+        do_speed_test = False

        # TODO: this version show a bug that was fixed
        # the test is included in the upper test.
@@ -319,15 +327,6 @@ class TestConvOp(unittest.TestCase):
 #        nkerns = [2,2] # per output pixel
 #        ssizes = [(1,1),(2,2)]#2,2)]

-        #test speed
-#        bsize = 10 # batch size
-#        imshp_start = (1,50,49)#un square shape to test more corner case.
-#        kshps = ([11,12],[12,11])#un square shape to test more corner case.
-#        nkerns = [20,20] # per output pixel
-#        ssizes = [(1,1),]#(1,1)]#(2,2) bugged
-#        convmodes = ['valid','full']
-#        do_convolve2=False
-
        N.set_printoptions(threshold=N.nan)

        # symbolic stuff
@@ -338,7 +337,7 @@ class TestConvOp(unittest.TestCase):
        for i in range(len(kshpss)):
            assert len(kshpss[i])==len(nkernss[i])==len(kerns)

-        if False:
+        if do_speed_test:
            # calculate the speed up of different combination of unroll
            # put the paramter to the same you will try. 
            
@@ -418,16 +417,19 @@ class TestConvOp(unittest.TestCase):
        d=N.asarray(ntot)/tpytot
        print 'speed up py theano(ConvOp) vs convolve2d: %.3fx'%d.mean(),d

-
    def test_ConvOpGrad(self):
        """
        test the gradient in float and double
        """
+        print '\n\n*************************************************'
+        print '           TEST ConvOp.grad' 
+        print '*************************************************'
+
        nkern = 4
        bsize = 3
        types = ["float32", "float64"]
        kshps = [(5,5), (6,7)]
-        imshps = [(1,5,5), (2,8,8), (3,8,7)]
+        imshps = [(1,5,5), (2,8,7)]
        modes = ['valid', 'full']
        unroll_batch=[0,1,3]
        unroll_kern=[0,1,4]
@@ -468,19 +470,22 @@ class TestConvOp(unittest.TestCase):
                                                tol=None if typ!="float32" else 0.16)

 if __name__ == '__main__':
-#    t = TestConvOp("test_convolution")
+    t = TestConvOp("test_convolution")
 #    t.test_convolution()
-#    t.test_multilayer_conv()
+    t.test_multilayer_conv()
 #    from theano.tests import main
 #    main("test_sp")
-    bsize = 20 # batch size
-    imshp_start = (1,100,100)#un square shape to test more corner case.
-    kshps = ([11,12],[12,11])#un square shape to test more corner case.
-    nkerns = [20,20] # per output pixel
-    ssizes = [(1,1),]#(1,1)]#(2,2) bugged
-    convmodes = ['valid','full']
-    unroll_batch = 5
-    unroll_kern = 2
-    ctot=0
-    tctot, tpytot, ntot = exec_multilayer_conv_nnet(convmodes[1], ssizes[0], bsize, imshp_start, kshps, nkerns, unroll_batch=unroll_batch, unroll_kern=unroll_kern, validate=False, do_print=False,repeat=5)
-    print "total exec time %.3fs"%tctot
+    if False:
+        #used to launch 8 jobs at the same time.
+        bsize = 20 # batch size
+        imshp_start = (1,100,100)#un square shape to test more corner case.
+        kshps = ([11,12],[12,11])#un square shape to test more corner case.
+        nkerns = [20,20] # per output pixel
+        ssizes = [(1,1),]#(1,1)]#(2,2) bugged
+        convmodes = ['valid','full']
+        unroll_batch = 5
+        unroll_kern = 2
+        ctot=0
+        tctot, tpytot, ntot = exec_multilayer_conv_nnet(convmodes[1], ssizes[0], bsize, imshp_start, kshps, nkerns, unroll_batch=unroll_batch, unroll_kern=unroll_kern, validate=False, do_print=False,repeat=5)
+        print "total exec time %.3fs"%tctot
+        
--- a/theano/sparse/basic.py
+++ b/theano/sparse/basic.py
@@ -30,7 +30,7 @@ _mtypes = [sparse.csc_matrix, sparse.csr_matrix]
 _mtype_to_str = {sparse.csc_matrix: "csc", sparse.csr_matrix: "csr"}

 import scipy
-if scipy.__version__ != '0.7.0':
+if not scipy.__version__.startswith('0.7.'):
    sys.stderr.write("WARNING: scipy version = %s. We prefer version >=0.7.0 because it has bugs fixed in the sparse matrix code.\n" % scipy.__version__)

 def _is_sparse_variable(x):

--- a/theano/tensor/nnet.py
+++ b/theano/tensor/nnet.py
@@ -764,8 +764,10 @@ class CrossentropyCategorical1Hot(gof.Op):
        _true_one_of_n = tensor.as_tensor_variable(true_one_of_n)
        if _coding_dist.type.ndim != 2:
            raise TypeError('matrix required for argument: coding_dist')
-        if _true_one_of_n.type != tensor.lvector:
-            raise TypeError('integer vector required for argument: true_one_of_n')
+        if _true_one_of_n.type not in (tensor.lvector, tensor.ivector):
+            raise TypeError('integer vector required for argument: true_one_of_n'
+                    ' (got type: %s instead of: %s or %s)' % (_true_one_of_n.type,
+                        tensor.lvector, tensor.ivector))

        return gof.Apply(self, [_coding_dist, _true_one_of_n], [tensor.dvector()])