Commit 0923873d authored by gdesjardins

* moved tensor/nnet.py to tensor/nnet/nnet.py

* moved sandbox/conv.py to tensor/nnet/conv.py * created tensor/signal/conv.py which implements generic 2D convolution (i.e. traditional signal processing definition) + tests * added tensor/nnet/tests/speed_test_conv.py which contains Fred's speed tests for ConvOp * added appropriate documentation to sphinx library docs
Parent 3dac2fd1
......@@ -18,8 +18,7 @@ They are grouped into the following sections:
:maxdepth: 1
basic
nnet/index
raw_random
shared_randomstreams
nnet
signal
signal/index
.. _libdoc_tensor_signal:
.. _libdoc_tensor_nnet_conv:
======================================================
:mod:`signal` -- Signal processing
======================================================
==========================================================
:mod:`conv` -- Ops for convolutional neural nets
==========================================================
.. module:: signal
.. module:: conv
:platform: Unix, Windows
:synopsis: ops for signal processing
.. moduleauthor:: LISA
......@@ -12,9 +12,7 @@
TODO: Give examples for how to use these things! They are pretty complicated.
.. function:: conv2D(*todo)
.. function:: downsample2D(*todo)
.. autofunction:: theano.tensor.nnet.conv.conv2d
.. function:: fft(*todo)
......
.. _libdoc_tensor_nnet:
==================================================
:mod:`nnet` -- Ops related to neural networks
==================================================
.. module:: nnet
:platform: Unix, Windows
:synopsis: various ops relating to neural networks
.. moduleauthor:: LISA
Theano was originally developped for machine learning applications, particularly
for the topic of deep learning. As such, our lab has developed many functions
and ops which are particular to neural networks and deep learning.
.. toctree::
:maxdepth: 1
conv
nnet
.. _libdoc_tensor_nnet:
.. _libdoc_tensor_nnet_nnet:
======================================================
:mod:`nnet` -- Ops for neural networks
......
.. _libdoc_tensor_signal_conv:
======================================================
:mod:`conv` -- Convolution
======================================================
.. module:: conv
:platform: Unix, Windows
:synopsis: ops for performing convolutions
.. moduleauthor:: LISA
.. autofunction:: theano.tensor.signal.conv.conv2d
.. function:: fft(*todo)
[James has some code for this, but hasn't gotten it into the source tree yet.]
.. _libdoc_tensor_signal_downsample:
======================================================
:mod:`downsample` -- Down-Sampling
======================================================
.. module:: downsample
:platform: Unix, Windows
:synopsis: ops for performing various forms of downsampling
.. moduleauthor:: LISA
.. autofunction:: theano.tensor.signal.downsample.max_pool2D
.. function:: fft(*todo)
[James has some code for this, but hasn't gotten it into the source tree yet.]
.. _libdoc_tensor:
=====================================================
:mod:`signal` -- Signal Processing
=====================================================
Signal Processing
-----------------
.. module:: signal
:platform: Unix, Windows
:synopsis: various ops for performing basic signal processing
(convolutions, subsampling, fft, etc.)
.. moduleauthor:: LISA
The signal subpackage contains ops which are useful for performing various
forms of signal processing.
.. toctree::
:maxdepth: 1
conv
downsample
import sys, time, unittest
import numpy
import numpy as N
from theano.tests import unittest_tools as utt
from theano import function, Mode
import theano.tensor as T
from conv import ConvOp, getFilterOutShp
def flip(kern, kshp):
    """Flip the kernel(s) the way scipy.signal.convolve2d flips them.

    Each trailing 2D kernel of shape ``kshp`` is reversed element-wise
    (row-major order), which is equivalent to a 180-degree rotation.

    :param kern: 2D, 3D or 4D ndarray whose last two (logical) dims form
        kernels of shape ``kshp``
    :param kshp: (rows, cols) of each individual 2D kernel
    :return: ndarray of the same shape as ``kern`` with every 2D kernel flipped
    :raises NotImplementedError: if ``kern`` is not 2D, 3D or 4D
    """
    # Do not shadow the function name with the result buffer.
    flipped = N.zeros(kern.shape)
    if len(kern.shape) == 2:
        kern = kern.reshape(-1)
        it = reversed(kern)
        for i in range(kshp[0]):
            for j in range(kshp[1]):
                # next(it) works on Python 2.6+ and 3.x; it.next() was Py2-only.
                flipped[i, j] = next(it)
    elif len(kern.shape) == 3:
        kern = kern.reshape(kern.shape[0], -1)
        for k in range(kern.shape[0]):
            it = reversed(kern[k, :])
            for i in range(kshp[0]):
                for j in range(kshp[1]):
                    flipped[k, i, j] = next(it)
    elif len(kern.shape) == 4:
        kern = kern.reshape(kern.shape[0], kern.shape[1], -1)
        for k in range(kern.shape[0]):
            for m in range(kern.shape[1]):
                it = reversed(kern[k, m, :])
                for i in range(kshp[0]):
                    for j in range(kshp[1]):
                        flipped[k, m, i, j] = next(it)
    else:
        raise NotImplementedError()
    return flipped
global_rng = N.random.RandomState(3423489)
dmatrix4=T.TensorType('float64', (False, False, False, False))
def exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp, kshps, nkerns, unroll_batch=0, unroll_kern=0, img=T.dmatrix(), validate=True, conv_op_py=False, do_print=True, repeat=1, unroll_patch=False, unroll_patch_size=False, verbose=0):
    """Run a stack of ConvOp layers and time them against a scipy reference.

    For each layer described by (kshps, nkerns) this convolves random images
    with random kernels three ways: a manual scipy ``_convolve2d`` loop (only
    when ``validate``), the compiled ConvOp, and optionally the python-linker
    ConvOp (``conv_op_py``), asserting the outputs agree within 1e-5.

    :param conv_mode: 'valid' or 'full' border mode, forwarded to ConvOp
    :param ss: (row, col) subsampling steps
    :param bsize: batch size
    :param imshp: (stacksize, rows, cols) of the first layer's input
    :param kshps: per-layer kernel shapes; zipped with ``nkerns``
    :param nkerns: per-layer number of kernels
    :param repeat: number of timed ConvOp executions per layer
    :return: (tctot, tpytot, ntot) cumulative seconds for the c ConvOp,
        the python ConvOp and the manual scipy loop

    NOTE(review): the ``img=T.dmatrix()`` default is a mutable default
    argument shared across calls; it appears unused in the body — confirm
    before removing.
    """
    # build actual input images
    imgval = global_rng.rand(bsize, imshp[0], imshp[1], imshp[2])

    a=T.dmatrix()
    kerns = [a for i in nkerns]  # one (shared) symbolic kernel per layer
    inputs4=dmatrix4()
    kerns4=dmatrix4()

    # for each layer
    ntot=0
    tctot=0
    tpytot=0
    for kshp, kern, nkern, n_layer in zip(kshps, kerns, nkerns, range(len(nkerns))):
        if do_print:
            print '************* layer %i ***************' % n_layer
            print conv_mode, ss, n_layer, kshp, nkern

        # actual values: one kernel per (output kernel, input feature map)
        w = global_rng.random_sample(N.r_[nkern,imshp[0],kshp])
        w_flip = flip(w,kshp).reshape(w.shape)

        ## manual implementation
        # check first stage.  'full' mode zero-pads the image so the kernel
        # can be applied to partial patches as well.
        padimg = imgval
        if conv_mode == 'full':
            padimg_shp = N.array(imshp[1:]) + 2*(N.array(kshp) - N.array([1,1]))
            padimg = N.zeros(N.r_[bsize,imshp[0],padimg_shp])
            padimg[:, :, kshp[0]-1:-kshp[0]+1,
                   kshp[1]-1:-kshp[1]+1] = imgval

        outshp = N.hstack((nkern, getFilterOutShp(imshp, kshp, ss, conv_mode)))

        time1 = time.time()
        outval = N.zeros(N.r_[bsize,outshp])
        if validate:
            # causes an atexit problem
            from scipy.signal.sigtools import _convolve2d
            from scipy.signal.signaltools import _valfrommode, _bvalfromboundary
            val = _valfrommode(conv_mode)
            bval = _bvalfromboundary('fill')
            for b in range(bsize): # loop over batches
                for n in range(nkern): # loop over filters
                    for i in range(imshp[0]): # loop over input feature maps
                        # accumulate over the input stack, subsampling by ss
                        outval[b,n,...] += _convolve2d(\
                            imgval[b,i,...], w_flip[n,i,...],1,val, bval, 0)[0::ss[0],0::ss[1]]
        ntot += time.time() - time1

        # ConvOp
        if unroll_patch and not unroll_patch_size:
            # no shape info given: unroll_patch works without it
            conv_op = ConvOp(dx=ss[0],dy=ss[1], output_mode=conv_mode,
                             unroll_patch=unroll_patch, verbose=verbose)(inputs4, kerns4)
        else:
            conv_op = ConvOp(imshp, kshp, nkern, bsize, ss[0],ss[1], conv_mode,
                             unroll_batch=unroll_batch, unroll_kern=unroll_kern, unroll_patch=unroll_patch, verbose=verbose)(inputs4, kerns4)
        l1shp=N.hstack((nkern,
                        getFilterOutShp(imshp, kshp, ss, conv_mode)))
        # default (c) linker vs explicit python linker for comparison
        propup2 = function([inputs4, kerns4], conv_op)
        propup3 = function([inputs4, kerns4], conv_op, mode=Mode(linker="py"))

        time1 = time.time()
        for i in range(repeat):
            hidval2_ = propup2(imgval,w_flip)
        hidval2 = hidval2_#[:,:,0::ss[0],0::ss[1]]
        tctot += time.time() - time1

        if conv_op_py:
            time1 = time.time()
            for i in range(repeat):
                hidval3_ = propup3(imgval,w_flip)
            hidval3 = hidval3_#[:,:,0::ss[0],0::ss[1]]
            tpytot += time.time() - time1
            assert (N.abs(hidval2-hidval3)<1e-5).all()
        else:
            tpytot += 0

        if validate:
            temp = N.abs(outval - hidval2)
            assert (temp < 1e-5).all()
        if validate and conv_op_py:
            temp = N.abs(outval - hidval3)
            assert (temp < 1e-5).all()

        # the output of this layer becomes the input of the next one
        imshp = tuple(outshp)
        imgval = outval.reshape(bsize,outshp[0],outshp[1],outshp[2])

    return tctot, tpytot, ntot
class TestConvOp(unittest.TestCase):
    """NOTE: we test only when we pass 4d tensor.

    Correctness tests compare ConvOp (c and py linkers) against
    scipy.signal.convolve2d; speed_multilayer_conv is a manual benchmark.
    """

    def setUp(self):
        # make random-dependent tests reproducible
        utt.seed_rng()

    def test_convolution(self):
        """Compare ConvOp output against scipy convolve2d on one layer."""
        print '\n\n*************************************************'
        print ' TEST CONVOLUTION'
        print '*************************************************'
        from scipy.signal import convolve2d

        # exactly one of the following parameter sets is active
        if 0:
            # fixed parameters
            bsize = 10 # batch size
            imshp = (1,28,28)# image shape
            kshps = [(5,5),(6,7),(12,8)] # kernel shaped
            nkern = 5 # nb kernel
            ssizes = ((1,1),(2,2),(3,3),(4,4))#step size
            convmodes = ('full','valid')
        elif 0:
            # fixed parameters
            bsize = 10 # batch size
            imshp = (1,50,50)# image shape
            print >> sys.stderr, "WARNING: only square shape tested"
            kshps = [(12,12), (12,12)]
            nkern = 20 # nb kernel
            ssizes = [(1,1)] #step size
            convmodes = ('full','valid')
        elif 0:
            # fixed parameters
            bsize = 7 # batch size
            imshp = (1,5,4)# image shape
            kshps = [(2,3)]
            nkern = 6 # nb kernel
            ssizes = [(1,1)] #step size
            convmodes = ('valid',)
        else:
            # fixed parameters
            bsize = 7 # batch size
            imshp = (1,5,4)# image shape
            kshps = [(2,3)]
            nkern = 6 # nb kernel
            ssizes = [(1,1)] #step size
            convmodes = ('valid',)

        # TODO: ask Fred about this
        # this combination trigered a bug.
        # bsize=1
        # imshp=(1,9,9)#fail with 9,9
        # kshp=(2,2)
        # nkern=5
        # ssizes=((1,1),)

        # this combination trigered a bug.
        # bsize = 1 # batch size
        # imshp = (1,3,3)# image shape
        # kshp = (2,3)#(5,5) # kernel shaped
        # nkern = 1 # nb kernel
        # ssizes = ((1,1),)#(2,2),(3,3),(4,4))#step size
        # convmodes = ('full','valid')

        # symbolic stuff
        bias = T.dvector()
        kerns = T.dmatrix()
        input = T.dmatrix()
        rng = N.random.RandomState(3423489)
        biasvals = rng.randn(nkern)
        #profmode = wraplinker.ProfileMode(OpWiseCLinker(), 'fast_run')

        # timing accumulators: ConvOp, scipy, (unused) convolve2
        tconvop, tscipy, tconv2 = [], [], []
        for conv_mode in convmodes:
            for kshp in kshps:
                filters = rng.randn(nkern,N.prod(kshp))
                for ss in ssizes:
                    # now test with real values
                    img2d = 1 + N.arange(bsize*N.prod(imshp)).reshape((bsize,)+imshp)
                    # print 'img2d', img2d
                    img1d = img2d.reshape(bsize,-1)

                    # create filters (flipped so ConvOp matches convolve2d)
                    filtersflipped = flip(filters.reshape((nkern,)+kshp), kshp)

                    # compute with ConvOp, once per linker
                    dmatrix3=T.TensorType('float64', (False, False, False))
                    inputs4=dmatrix4()
                    kerns4=dmatrix4()
                    bia=T.dscalar()
                    conv_op = ConvOp(imshp, kshp, nkern, bsize, ss[0],ss[1], conv_mode)(inputs4, kerns4)
                    f2 = function([inputs4, kerns4], conv_op, mode=Mode(linker="c"))
                    f3 = function([inputs4, kerns4], conv_op, mode=Mode(linker="py"))
                    ttime1 = time.time()
                    out2_ = f2(img2d, filtersflipped.reshape(nkern,1,*kshp))
                    out2__ = out2_
                    tconvop += [time.time() - ttime1]
                    out2___ = out2__.copy()
                    out2 = out2___ + biasvals.reshape(1,nkern,1,1)
                    out3_ = f3(img2d, filtersflipped.reshape(nkern,1,*kshp))
                    out3__ = out3_
                    out3___ = out3__.copy()
                    out3 = out3___ + biasvals.reshape(1,nkern,1,1)
                    # c and py linkers must agree
                    assert (N.abs(out2_-out3_)<1e-5).all()

                    # REFERENCE IMPLEMENTATION: compute output with convolve2d
                    if conv_mode=='valid':
                        fulloutshp = N.array(imshp[1:]) - N.array(kshp) + 1
                    else:
                        fulloutshp = N.array(imshp[1:]) + N.array(kshp) - 1
                    ntime1 = time.time()
                    refout = N.zeros((bsize,)+tuple(fulloutshp)+(nkern,))
                    for b in range(bsize):
                        for n in range(nkern):
                            refout[b,...,n] = convolve2d(\
                                img2d[b,0,:,:], filtersflipped[n,...],conv_mode)
                    tscipy += [time.time() - ntime1]

                    # need to flatten images
                    bench1 = refout[:,0::ss[0],0::ss[1],:].reshape(bsize,-1,nkern)
                    bench1 += biasvals.reshape(1,1,nkern)

                    # swap the last two dimensions (output needs to be nkern x outshp)
                    bench1 = N.swapaxes(bench1,1,2)

                    # compare benchmark with ConvOp
                    temp = bench1.flatten() - out2.flatten()
                    assert (temp < 1e-5).all()

        print '**** Convolution Profiling Results ****'
        print 'Scipy convolve2d processing time: %.3fs'%sum(tscipy),tscipy
        print 'ConvOp processing time: %.3fs'%sum(tconvop),tconvop
        print 'convolve2 processing time: %.3fs'%sum(tconv2),tconv2
        d=N.asarray(tscipy)/tconvop
        print 'speed up ConvOp vs convolve2d: %.3f'%d.mean(),d

    def speed_multilayer_conv(self):
        # calculate the speed up of different combination of unroll
        # put the paramter to the same you will try.
        validate=False# we don't validate the result to have it much faster!
        verbose=1
        # the second assignments below deliberately override the first ones
        unroll_batch = [1,2,4,5,10,20]
        unroll_kern = [1,2,4,5,10,20]
        unroll_batch = [1,4,5]
        unroll_kern = [1,4,5]
        unroll_patch = [True, False]

        bsize = 20 # batch size
        imshp_start = (1,48,48)#un square shape to test more corner case.
        kshps = ([11,12],[12,11])#un square shape to test more corner case.
        nkerns = [20,20] # per output pixel
        ssizes = [(1,1),]#(1,1)]#(2,2) bugged
        convmodes = ['valid','full']
        do_convolve2=False
        a=T.dmatrix()
        kerns = [a for i in nkerns]

        assert len(kshps)==len(nkerns)==len(kerns)
        timing = N.zeros((len(unroll_batch),len(unroll_kern),3,len(convmodes)*len(ssizes)))
        t_b_k=[]
        #calculate the timing with unrolling
        print 'time unroll batch kern'
        # hard-coded values below are previously-measured results; emptying
        # them right after forces a fresh measurement
        t_=[[ 7.60572791, 3.95069814, 3.74271464], [ 4.05631089, 2.90384555, 2.93613672], [ 3.90551591, 2.92595196, 3.00102282]]
        best=[0.52690219879150391, 2.4266397953033447]
        worst=[0.92042708396911621, 6.8822150230407715]
        best=[]
        worst=[]
        t_=[]
        for unroll_b, n_b in zip(unroll_batch,range(len(unroll_batch))):
            for unroll_k, n_k in zip(unroll_kern,range(len(unroll_kern))):
                t_b_k.append(str(unroll_b)+"/"+str(unroll_k))
                if not t_:
                    tctot, tpytot, ntot=[],[],[]
                    for conv_mode, n_mode in zip(convmodes,range(len(convmodes))):
                        for ss, n_ss in zip(ssizes,range(len(ssizes))):
                            tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp_start, kshps, nkerns, unroll_batch=unroll_b, unroll_kern=unroll_k, validate=validate, verbose=verbose,do_print=False)
                            tctot+=[tctot_]
                            tpytot+=[tpytot_]
                            ntot+=[ntot_]
                    if unroll_b==4 and unroll_k==4:
                        #print "unroll 4/4",tctot
                        best=tctot
                    if unroll_b==1 and unroll_k==1:
                        #print "unroll 1/1",tctot
                        worst=tctot
                    timing[n_b,n_k]=[tctot, tpytot, ntot]#[sum(tctot), sum(tpytot), sum(ntot)]
        if not t_:
            t=timing[:,:,0,:]#We select only the c timing.
        else:
            t=t_
        t=N.asarray(t)

        #calculate the old timing
        print 'time old version'
        tctot_=[0.52555489540100098, 6.6634182929992676]
        tctot,tpytot,ntot=[],[],[]
        tctot_=[]
        if not tctot_:
            for conv_mode, n_mode in zip(convmodes,range(len(convmodes))):
                for ss, n_ss in zip(ssizes,range(len(ssizes))):
                    tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp_start, kshps, nkerns, unroll_batch=0, unroll_kern=0, validate=validate, verbose=verbose,do_print=False)
                    tctot+=[tctot_]
                    tpytot+=[tpytot_]
                    ntot+=[ntot_]
        else: tctot=N.asarray(tctot_)
        print "old code timing %.3fs"%sum(tctot),tctot
        best=N.asarray(best)
        worst=N.asarray(worst)
        print "timing for unrolled version"
        print t_b_k
        print t
        t_detail=t
        t = t.sum(axis=2)
        print "max %.3fs"%t.max(), "max param(batch unloop size/kernel unloop size)", t_b_k[t.argmax()]
        print "min %.3fs"%t.min(), "min param(batch unloop size/kernel unloop size)", t_b_k[t.argmin()]
        print "speedup vs (1/1)%.3fx, vs old %.3fx"% (t.max()/t.min(),sum(tctot)/t.min())
        print worst/best,tctot/best

        #calculate the timing of unroll_patch
        print 'time unroll_patch'
        tctot_patch = []
        tctot_patch_size = []
        for conv_mode, n_mode in zip(convmodes,range(len(convmodes))):
            for ss, n_ss in zip(ssizes,range(len(ssizes))):
                tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp_start, kshps, nkerns, unroll_batch=0, unroll_kern=0, validate=validate,unroll_patch=True,verbose=verbose,do_print=False)
                tctot_patch += [tctot_]
                tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp_start, kshps, nkerns, unroll_batch=0, unroll_kern=0, validate=validate,unroll_patch=True,verbose=verbose,do_print=False,unroll_patch_size=True)
                tctot_patch_size += [tctot_]
        t_patch=sum(tctot_patch)
        print "unroll_patch without shape time", tctot_patch
        print "speedup vs (1/1)%.3fx, vs old %.3fx"% (t.max()/t_patch,sum(tctot)/t_patch)
        print best/tctot_patch, worst/tctot_patch
        t_patch_size=sum(tctot_patch_size)
        print "unroll_patch with shape time", tctot_patch_size
        print "speedup vs (1/1)%.3fx, vs old %.3fx"% (t.max()/t_patch_size,sum(tctot)/t_patch_size)
        print best/tctot_patch_size, worst/tctot_patch_size
        return

    def test_multilayer_conv(self):
        """Run multi-layer configurations through exec_multilayer_conv_nnet."""
        print '\n\n*************************************************'
        print ' TEST MULTILAYER CONVOLUTION'
        print '*************************************************'
        # fixed parameters
        # test multiple configuration at the same time
        bsizes = [6,6] # batch size
        imshp_starts = [(1,13,14),(1,4,5)]
        kshpss = ([[5,6],[7,4]],[[2,2],[2,2]])
        nkernss = [[20,40],[2,2]] # per output pixel
        ssizess = [[(1,1),(1,2)],[(1,1),(2,2)]]
        convmodes = ['valid','full']
        do_convolve2=True
        unroll = [(0,0,True),(0,0,False),(1,1,False),(2,2,False),(3,2,False)]#(batch,kern,patch)

        # TODO: this version show a bug that was fixed
        # the test is included in the upper test.
        # imshp_start = (1,4,4)
        # kshps = ([2,2],[2,2])#,[7,4])
        # nkerns = [2,2] # per output pixel
        # ssizes = [(1,1),(2,2)]#2,2)]
        # bsizes = [1,1] # batch size
        # imshp_starts = [(1,10,10),(1,5,6)]
        # kshpss = ([[2,3],[3,2]],[[2,2],[2,2]])
        # nkernss = [[1,1],[1,1]] # per output pixel

        N.set_printoptions(threshold=N.nan)

        # symbolic stuff
        kerns = [T.matrix(),T.dmatrix()]
        img = T.dmatrix()
        rng = N.random.RandomState(3423489)

        tctot, tpytot, ntot = [], [], []
        for i in range(len(kshpss)):
            assert len(kshpss[i])==len(nkernss[i])==len(kerns)
        for i in range(len(kshpss)):
            for conv_mode, n_mode in zip(convmodes,range(len(convmodes))):
                for ss, n_ss in zip(ssizess[i],range(len(ssizess[i]))):
                    for un_b, un_k, un_p in unroll:
                        tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet(
                            conv_mode, ss, bsizes[i], imshp_starts[i],
                            kshpss[i], nkernss[i],
                            img=img, unroll_batch=un_b, unroll_kern=un_k,
                            unroll_patch=un_p,
                            validate=True)
                        tctot+=[tctot_]
                        tpytot+=[tpytot_]
                        ntot+=[ntot_]

        print '**** Multilayer Convolution Profiling Results ****'
        print 'Numpy convolve2d processing time: %.3fs'%sum(ntot),ntot
        print 'c Theano(ConvOp) processing time: %.3fs'%sum(tctot),tctot
        print 'py Theano(ConvOp) processing time: %.3fs'%sum(tpytot),tpytot
        d=N.asarray(ntot)/tctot
        print 'speed up c theano(ConvOp) vs convolve2d: %.3fx'%d.mean(),d
        d=N.asarray(ntot)/tpytot
        print 'speed up py theano(ConvOp) vs convolve2d: %.3fx'%d.mean(),d

    def init_data(self,shape):
        """Return initial kernel data of the given shape.

        NOTE(review): the second return is unreachable; ones are used,
        presumably to make gradient checks deterministic — confirm intent.
        """
        return N.ones(shape)
        return N.random.random(shape)

    def test_ConvOpGrad(self):
        """
        test the gradient in float and double
        """
        print '\n\n*************************************************'
        print ' TEST ConvOp.grad'
        print '*************************************************'
        nkern = 3
        bsize = 2
        types = ["float32", "float64"]
        kshps = [(2,3)]
        imshps = [(2,3,4)]
        modes = ['valid', 'full']
        unroll = [(0,0,True),(1,1,False),(2,3,False),(1,1,False),(0,0,False)]#(batch,kern,patch)
        ssizes = [(1,1)]#,(2,2)]#grad for ss!=(1,1) is currently disabled!

        for typ in types:
            imgs = T.TensorType(typ, (False, False, False, False),'imgs')
            kerns = T.TensorType(typ, (False, False, False, False),'kerns')
            for mode in modes:
                for imshp in imshps:
                    if len(imshp)!=3:
                        visdim = 1
                    else:
                        visdim = imshp[0]
                    imgvals = N.array(N.random.random(N.hstack((bsize,imshp))),dtype=imgs.dtype)
                    for kshp in kshps:
                        # t<0 means the kernel is bigger than the image
                        t=numpy.array([imshp[1]-kshp[0],imshp[2]-kshp[1]])
                        kernvals = N.array(self.init_data((nkern,visdim,kshp[0],
                            kshp[1])),dtype=kerns.dtype)
                        # 'full' mode should support kernels bigger than the input
                        if mode == 'valid' and (t<0).any():
                            continue
                        for un_b,un_k, un_p in unroll:
                            for ss in ssizes:
                                print 'test_ConvOpGrad'
                                # print 'mode:',mode,'type:', typ
                                # print 'imshp:', imshp,
                                # print 'kshp:', kshp
                                # print 'un_b:', un_b,
                                # print 'un_k:', un_k,
                                # print 'un_p:', un_p
                                # print 'ss:', ss,
                                # print 'bsize:', bsize,
                                # print 'nkern:', nkern
                                # closures over the surrounding loop variables:
                                # one checks the gradient wrt images, one wrt kernels
                                def test_i(imgs):
                                    if un_p and ss[0]==1 and ss[1]==1:
                                        convop = ConvOp(dx=ss[0], dy=ss[1],
                                            output_mode=mode, unroll_patch=un_p)
                                    else:
                                        convop = ConvOp(imshp, kshp, nkern, bsize, ss[0], ss[1],
                                            output_mode=mode, unroll_batch=un_b, unroll_kern=un_k, unroll_patch=un_p)
                                    return convop(imgs, kernvals)
                                def test_k(kerns):
                                    if un_p and ss[0]==1 and ss[1]==1:
                                        convop = ConvOp(dx=ss[0], dy=ss[1],
                                            output_mode=mode, unroll_patch=un_p)
                                    else:
                                        convop = ConvOp(imshp, kshp, nkern, bsize, ss[0], ss[1],
                                            output_mode=mode, unroll_batch=un_b, unroll_kern=un_k, unroll_patch=un_p)
                                    return convop(imgvals, kerns)
                                print mode, imshp, kshp, un_b, un_k, ss
                                #TODO the tolerance needed to pass is very high for float32(0.17). Is this acceptable? Expected?
                                tol = None
                                if typ=="float32" and (ss[0]!=1 or ss[1]!=1):
                                    tol = 0.1
                                utt.verify_grad(test_i, [imgvals],
                                    cast_to_output_type=True,
                                    tol=tol)
                                utt.verify_grad(test_k, [kernvals],
                                    cast_to_output_type=True,
                                    tol=tol)
if __name__ == '__main__':
    # run a single hand-picked test instead of the full unittest discovery
    t = TestConvOp("test_convolution")
    # t.test_convolution()
    t.test_multilayer_conv()
    # from theano.tests import main
    # main("test_sp")

# NOTE(review): dead experiment kept for reference; indentation of this
# block relative to the __main__ guard was lost in extraction — confirm.
if False:
    #used to lanch 8 jobs at the same time.
    bsize = 20 # batch size
    imshp_start = (1,100,100)#un square shape to test more corner case.
    kshps = ([11,12],[12,11])#un square shape to test more corner case.
    nkerns = [20,20] # per output pixel
    ssizes = [(1,1),]#(1,1)]#(2,2) bugged
    convmodes = ['valid','full']
    unroll_batch = 5
    unroll_kern = 2
    ctot=0
    tctot, tpytot, ntot = exec_multilayer_conv_nnet(convmodes[1], ssizes[0], bsize, imshp_start, kshps, nkerns, unroll_batch=unroll_batch, unroll_kern=unroll_kern, validate=False, do_print=False,repeat=5)
    print "total exec time %.3fs"%tctot
import numpy as N
"""
Contains an op for convolving input images with a set of filters. This was
developed especially for Convolutional Neural Networks.
"""
__docformat__ = "restructuredtext en"
import numpy
import theano
import theano.tensor as T
import theano.tensor as tensor
from theano import gof, Op, tensor, config
from theano.printing import Print
def getFilterOutShp(inshp, kshp, ds=(1,1), mode='valid'):
    """
    Computes the shape (nb_rows, nb_col) of each output image.

    :type inshp: tuple, list or 1D ndarray of length 2 or 3
    :param inshp: shape of each input image; when of length 3 the leading
        stack dimension is ignored (only inshp[1:] is used)
    :type kshp: tuple, list or 1D ndarray of length 2
    :param kshp: shape of each (2D) kernel filter
    :type ds: tuple of len 2
    :param ds: (dx, dy) subsampling steps in rows and cols
    :type mode: string
    :param mode: 'valid' or 'full' (see 'border_mode' in conv2d's doc)
    :rtype: numpy 1D ndarray of len 2
    :return: shape of each output "image" (or feature map)
    """
    # Tuple parameter unpacking "(dx,dy)=(1,1)" in the signature is
    # Python-2-only syntax (removed by PEP 3113); unpack explicitly instead.
    (dx, dy) = ds
    # 'valid' shrinks the output by kshp-1; 'full' grows it by kshp-1.
    if mode == 'valid':
        s = -1
    else:
        s = 1
    inshp, kshp = N.array(inshp), N.array(kshp)
    # astype('int64') is the portable spelling of the old N.int64(array) cast
    return N.ceil((inshp[1:] + s*kshp - s*1) /
                  N.array([dx, dy], dtype='float')).astype('int64')
import logging
_logger=logging.getLogger("theano.signal.conv")
def _debug(*msg):
    """Emit the given fragments, space-separated, at DEBUG level."""
    text = ' '.join(msg)
    _logger.debug(text)
def _warn(*msg):
    """Emit the given fragments, space-separated, at WARNING level."""
    # Logger.warn() is a deprecated alias; warning() is the documented API.
    _logger.warning(' '.join(msg))
def conv2d(input, filters, border_mode='valid', subsample=(1,1),
image_shape=None, filter_shape=None, **kargs):
def conv2d(input, filters, image_shape=None, filter_shape=None,
border_mode='valid', subsample=(1,1), **kargs):
"""
This function returns an instanciated ConvOp through a simple interface.
We do this instead of changing the ConvOp interface so as not to change
previous code based on the ConvOp.
This function will build the symbolic graph for convolving a stack of input
images with a set of filters. The implementation is modelled after
Convolutional Neural Networks (CNN). It is simply a wrapper to the ConvOp but
provides a much cleaner interface.
:type input: symbolic 4D tensor
:param input: tensor containing mini-batch of input feature maps that are
2D. Indexing is thus: (batch, feature map, image row, image col).
:param input: mini-batch of feature map stacks, of shape image_shape.
:type filters: symbolic 4D tensor
:param filters: tensor containing filters for convolutional neural net.
Indexing is: (filter, filter input feature map, filter row, filter col).
:type border_mode: string
:param border_mode:'valid'(only apply kernel over complete patch of the image) or
'full'(padd the image with 0 and apply the kernel over all full patch and partial patch of
the image
:param filters: set of filters used in CNN layer of shape filter_shape
:param border_mode:
'valid'-- only apply filter to complete patches of the image. Generates
output of shape: image_shape - filter_shape + 1
'full' -- zero-pads image to multiple of filter shape to generate output of
shape: image_shape + filter_shape - 1
:type subsample: tuple of len 2
:param subsample: how many pixel we move in the (row,col) direction of the image when we
change of patch
:param subsample: factor by which to subsample the output
:type image_shape: tuple of len 4
:param image_shape: (batch size, stack size, nb row, nb col)
:type filter_shape: tuple of len 4
:param filter_shape: (nb kernel, stack size, nb row, nb col)
:param filter_shape: (nb filters, stack size, nb row, nb col)
:param kwargs: kwargs are passed onto ConvOp. Can be used to set the following:
unroll_batch, unroll_kern, unroll_patch (see ConvOp doc)
:rtype: symbolic 4D tensor
:return: set of feature maps generated by convolutional layer. Tensor is of shape
(batch size, nb filters, output row, output col)
"""
if image_shape and filter_shape:
assert image_shape[1]==filter_shape[1]
......@@ -72,8 +75,24 @@ def conv2d(input, filters, border_mode='valid', subsample=(1,1),
class ConvOp(Op):
"""
A convolution op that should behave like scipy.signal.convolve2d,
but much faster!
This Op serves a dual purpose: it can implement a vanilla 2D convolution
(as taught in any signal processing class) or implement the
convolutional layers found in Convolutional Neural Networks.
In this setting, a set of 3D images is convolved with a set of 3D kernels,
with the particularity that their leading dimensions are of equal length.
Vanilla 2D convolution is treated as a special case of this.
The input parameter represents a mini-batch of multiple images. Its shape is:
batch size x num. input feature maps x image height x image width
The kernel parameter represents a set of 3D kernels. Its shape is:
number of filters x num. input images x filter height x filter width
The output of ConvOp is a 4D tensor, generated as follows:
output[b,k,:,:] = \sum_i input[b,i,:,:] * filter[k,i,:,:] \forall b,k
where b is the mini-batch index, k the filter index and * is the convolution
operator.
"""
__attrnames = ['imshp', 'kshp', 'nkern', 'bsize', 'dx', 'dy', 'out_mode',
......@@ -81,6 +100,24 @@ class ConvOp(Op):
'imshp_logical', 'kshp_logical', 'kshp_logical_top_aligned']
"""These attributes uniquely identify the behaviour of this op for given inputs"""
@staticmethod
def getOutputShape(inshp, kshp, (dx,dy)=(1,1), mode='valid'):
"""
Computes the output dimensions of convolving an image of shape "inshp"
with kernels of shape "kshp".
:param inshp: (rows,cols) of input image
:param kshp: (rows,cols) of filters
:param mode: 'valid' or 'full' (see 'border_mode' in conv2d's doc)
:return: (rows,cols) of output image
"""
if mode=='valid': s = -1
else: s = 1
inshp, kshp = numpy.array(inshp), numpy.array(kshp)
return numpy.int64(numpy.ceil((inshp[1:] + s*kshp - s*1)/\
numpy.array([dx,dy], dtype='float')))
def __init__(self, imshp=None, kshp=None, nkern=None, bsize=None,
dx=None, dy=None,
output_mode='valid', unroll_batch=0,
......@@ -92,37 +129,29 @@ class ConvOp(Op):
verbose=0,
version=-1):
"""
This Op implement the convolution of a kernel(tensor 4d,(nkern, stacksize, nb row, nb
col)) on an image(tensor 4d, (batchsize, stacksize, nb row, nb col). The batch size is
multiple image that we want to apply the same kernel over. The nkern is numtiple kernel
that we want to apply to each image. The stack size is mostly used when their is
multiple layer in the network. It is the sum of the convolution of multiple 2d image
and kernel.
The reason that this op does the summation over convolutions within the 'stack' is that
it allows us to be memory-efficient about how gradients are calculated. If, for
example, we had a convolution op that took a list of images, a list of kernels, and
gave you back each image as filtered by each kernel (JB thought he wanted this at one
point) then we would have to sum over a potentially very large tensor to get the
gradient on the filters.
If the imshp, kshp, nkern and bsize are provided, we can generate more optimal code.
This make a significant difference for the full mode with unroll_patch version. The
most frequent faster code currently available on 64_x86 computer is unroll_batch=4,
unroll_kern=4, unroll_patch=False and this request that all the optional shape
information are gived. Those number are empirically tested and backed up by the
article: Anatomy of High-Performance Matrix Multiplication by Kazushige Goto and Robert
A. Van De Geijn, ACM Transactions on Mathematical Software, vol 34, No. 3, article 12,
May 2008. It is in figure 12, it give the value mr x nr, those value are the optimum to
use for unroll_batch and unroll_kern. For x86_64 bits computer it is 4x4. Other
architecture can have different value.(2x4 for x86, 8x8 for itanium,...)
:type out_mode: string
:param out_mode: 'valid'(give an output smaller then the image, 'full'(give an output
bigger then the image)
optional parameters: (will generate more optimal c code)
Initializes a ConvOp with given output_mode (full/valid). All other
parameters are optional and are only used to generate more optimized c
code.
NOTES ON OPTIMIZATION:
If ALL (imshp, kshp, nkern and bsize) parameters are provided, we can
generate faster c-code. This make a significant difference for the
'full' output_mode with unroll_patch=True. The current fastest
implementation on x86-64 uses {unroll_batch=4, unroll_kern=4,
unroll_patch=False} with all other shape parameters being provided.
For optimizing other architectures, see:
Kazushige Goto and Robert A. Van De Geijn, Anatomy of High-Performance
Matrix Multiplication, (mr x nr). ACM Transactions on Mathematical
Software, May 2008.
Figure 12: (mr x nr). For x86 use 2x4, itanium 8x8, etc.
:type output_mode: string
:param output_mode: 'valid' -- gives an output smaller then the image
'full' -- gives an output bigger then the image
Optional parameters: (will generate more optimal c code)
:type imshp: tuple of len 2 or 3: 2 for 2d image, 3 for a stack of 2d images.
:param imshp: (stacksize, nb image row, nb image col)
:type kshp: tuple of len 2
......@@ -136,7 +165,8 @@ class ConvOp(Op):
:type dy: int
:param dx: patch stride cols
param to select the version of code used:
Params which select the version of code used:
:type unroll_patch: bool
:param unroll_patch: use a version of c_code that unroll the patch loop that don't
request all shape information to work, but if all shape information are present, will
......@@ -149,6 +179,7 @@ class ConvOp(Op):
:param unroll_kern: use a version of c_code that unroll the batch(by unroll_batch) and
the nkern(by unroll_kern) loop. The size must by a multiple of bsize or nkern
respectively.
:type verbose: int
:param verbose: passed to GpuConv
:type version: int
......@@ -157,18 +188,14 @@ class ConvOp(Op):
:param imshp_logical: used internally when we generate the gradient when dx!=1 or dy!=1
:param kshp_logical: idem
:param kshp_logical_top_aligned: idem
"""
all_shape = imshp is not None and kshp is not None and \
nkern is not None and bsize is not None
if (unroll_batch>0 or unroll_kern>0) and not all_shape:
raise Exception("In ConvOp, when using unroll_batch and unroll_nkern, all shape are needed")
if not all_shape and (imshp is not None or kshp is not None \
or nkern is not None or bsize is not None):
print "OPTIMISATION WARNING: passing only a few shape to ConvOp "\
"for faster code is useless. We use all of them or none."
if not all_shape:
unroll_patch = True
......@@ -221,10 +248,10 @@ class ConvOp(Op):
while self.bsize % new!=0:
new-=1
print "OPTIMISATION WARNING: in ConvOp.__init__() unroll_batch(%s)"\
"must be 0 or a divisor of bsize(%s). We revert it to %d. This"\
"won't change the result, but may make it slower."%\
(str(self.unroll_batch),str(self.bsize),new)
warnstr = "OPTIMISATION WARNING: in ConvOp.__init__() unroll_batch(%i)"\
"must be 0 or a divisor of bsize(%i). We revert it to %i. This"\
"won't change the result, but may make it slower."
_warn(warnstr % (self.unroll_batch, self.bsize, new))
self.unroll_batch=new
......@@ -238,15 +265,16 @@ class ConvOp(Op):
assert(new>=1)
while self.nkern % new!=0:
new-=1
print "OPTIMISATION WARNING: in ConvOp.__init__() unroll_kern(%s)"\
"should be 0 or a divisor of nkern(%s)We revert it to %d."\
"This won't change the result, but may make it slower."\
%(str(self.unroll_kern),str(self.nkern),new)
warnstr = "OPTIMISATION WARNING: in ConvOp.__init__() unroll_kern(%i)"\
"should be 0 or a divisor of nkern(%i). We revert it to %i."\
"This won't change the result, but may make it slower."
_warn(warnstr % (self.unroll_kern, self.nkern, new))
self.unroll_kern=new
if all_shape:
self.outshp = getFilterOutShp(self.imshp_logical, self.kshp_logical, (dx,dy), output_mode)
self.fulloutshp = getFilterOutShp(self.imshp_logical, self.kshp_logical, (1,1), output_mode)
self.outshp = ConvOp.getOutputShape(self.imshp_logical, self.kshp_logical, (dx,dy), output_mode)
self.fulloutshp = ConvOp.getOutputShape(self.imshp_logical, self.kshp_logical, (1,1), output_mode)
else:
self.outshp = None
self.fulloutshp = None
......@@ -377,10 +405,10 @@ class ConvOp(Op):
if self.fulloutshp is not None:
fulloutshp = tuple(self.fulloutshp)
else:
fulloutshp = tuple(getFilterOutShp(imshp_logical, kshp_logical, (1,1), self.out_mode))
fulloutshp = tuple(ConvOp.getOutputShape(imshp_logical, kshp_logical, (1,1), self.out_mode))
if z[0] is None:
z[0] = N.zeros((bsize,)+(nkern,)+fulloutshp,
z[0] = numpy.zeros((bsize,)+(nkern,)+fulloutshp,
dtype=img2d.dtype)
zz=z[0]
val = _valfrommode(self.out_mode)
......@@ -393,17 +421,17 @@ class ConvOp(Op):
if self.imshp != self.imshp_logical:
# assuming that to get from imshp to imshp logical we insert zeros in missing spots
rstride = int(N.ceil(imshp_logical[1] / float(imshp[1])))
cstride = int(N.ceil(imshp_logical[2] / float(imshp[2])))
buf = N.zeros((bsize,)+ imshp_logical, dtype=img2d.dtype)
rstride = int(numpy.ceil(imshp_logical[1] / float(imshp[1])))
cstride = int(numpy.ceil(imshp_logical[2] / float(imshp[2])))
buf = numpy.zeros((bsize,)+ imshp_logical, dtype=img2d.dtype)
buf[:,:,::rstride, ::cstride] = img2d
img2d = buf
del buf, rstride, cstride
if kshp != kshp_logical:
rstride = int(N.ceil(kshp_logical[0] / float(kshp[0])))
cstride = int(N.ceil(kshp_logical[1] / float(kshp[1])))
buf = N.zeros((nkern,stacklen)+ self.kshp_logical, dtype=filtersflipped.dtype)
rstride = int(numpy.ceil(kshp_logical[0] / float(kshp[0])))
cstride = int(numpy.ceil(kshp_logical[1] / float(kshp[1])))
buf = numpy.zeros((nkern,stacklen)+ self.kshp_logical, dtype=filtersflipped.dtype)
if self.kshp_logical_top_aligned:
roffset=coffset=0
else:
......@@ -421,6 +449,7 @@ class ConvOp(Op):
for im0 in range(stacklen):
zz[b,n,...] += _convolve2d(\
img2d[b,im0,...], filtersflipped[n,im0,...],1,val, bval, 0)
#We copy it to remove the Stride mismatch warning from DEBUG_MODE.
#The copy make that we return an object with the same stride as the c version.
#The copy don't affect the performence during our experience as in that case we
......@@ -432,20 +461,14 @@ class ConvOp(Op):
def grad(self, (inputs, kerns), (gz,)):
"""
In development. Works for test cases in test_sp.py
WARNING: a few known issues:
* doesn't work for rectangular images or filters
* inputs needs to be a 4D tensor. Couldn't get 3D to work
* will crash if filter the same size as input image
"""
if self.imshp != self.imshp_logical or self.kshp != self.kshp_logical:
raise NotImplementedError('todo')
if self.dx!=1 or self.dy!=1:
raise Exception("ERROR: We disable ConvOp.grad now when dx!=1 or "\
"dy!=1 as we think their is a high probability of bug in it."\
"We need to raise the error on the gradient to .1!")
#if self.dx!=1 or self.dy!=1:
#raise Exception("ERROR: We disable ConvOp.grad now when dx!=1 or "\
#"dy!=1 as we think their is a high probability of bug in it."\
#"We need to raise the error on the gradient to .1!")
all_shape = self.imshp is not None and self.kshp is not None and \
self.nkern is not None and self.bsize is not None
......@@ -454,15 +477,6 @@ class ConvOp(Op):
raise Exception("ConvOp.grad when dx!=1 or dy!=1 we must have all "\
"the optional shape information")
grad_hack_necessary = False
if grad_hack_necessary:
if self.dx!=1 or self.dy!=1:
upgz = T.as_tensor(N.zeros((self.bsize,self.nkern)+tuple(self.fulloutshp),
dtype=gz.type.dtype))
gz = T.SetSubtensor([slice(self.bsize), slice(self.nkern),
slice(0,self.fulloutshp[0],self.dy),
slice(0,self.fulloutshp[1],self.dx)])(upgz,gz)
####### Determine gradient on kernels ########
assert inputs.ndim==4 and kerns.ndim==4
......@@ -485,7 +499,6 @@ class ConvOp(Op):
kshp = self.outshp
un_b = self.unroll_batch
un_k = self.unroll_kern
#print 'dw_valid', imshp, kshp, nkern, bsize
elif self.out_mode == 'full':
(img, filters) = (newgz, newin)
kshp_logical = None
......@@ -497,7 +510,6 @@ class ConvOp(Op):
kshp = self.imshp[1:]
un_b = self.unroll_kern
un_k = self.unroll_batch
#print 'dw_full', imshp, kshp, nkern, bsize
else:
raise NotImplementedError('Only [full,valid] modes are currently supported.')
......@@ -509,17 +521,16 @@ class ConvOp(Op):
un_b = bsize
else:
un_b = 1
print "OPTIMISATION WARNING: in ConvOp.grad() we can't determine "\
"a good unroll value for the batch. Maybe you can optimize this!",\
bsize, un_b, self.unroll_batch, self.unroll_kern
_warn("OPTIMISATION WARNING: in ConvOp.grad() we can't determine "\
"a good unroll value for the batch. Maybe you can optimize this!")
if un_k!=0 and nkern%un_k!=0:
if nkern<un_k:
un_k = nkern
else:
un_k = 1
print "OPTIMISATION WARNING: in ConvOp.grad() we can't determine "\
"a good unroll value for the kernel. Maybe you can optimize this!"
_warn("OPTIMISATION WARNING: in ConvOp.grad() we can't determine "\
"a good unroll value for the kernel. Maybe you can optimize this!")
dw = ConvOp(imshp, kshp, nkern, bsize, 1,1, output_mode='valid',
unroll_batch=un_b, unroll_kern=un_k, unroll_patch=un_p,
......@@ -572,6 +583,7 @@ class ConvOp(Op):
din = din(gz,filters)
assert (din.owner.op.outshp is None and self.imshp is None) or \
(din.owner.op.outshp is None) or \
(din.owner.op.outshp==self.imshp[1:]).all()
return [din, dw]
......@@ -622,12 +634,12 @@ using namespace std;
d["self_kshp1"]=self.kshp[1]
d["self_kshp_logical_r"] = self.kshp_logical[0]
d["self_kshp_logical_c"] = self.kshp_logical[1]
d["self_kshp_logical_stride_r"] = int(N.ceil(self.kshp_logical[0] / float(self.kshp[0])))
d["self_kshp_logical_stride_c"] = int(N.ceil(self.kshp_logical[1] / float(self.kshp[1])))
d["self_imshp_logical_r"] = self.imshp_logical[1] #N.B. 1 not 0
d["self_imshp_logical_c"] = self.imshp_logical[2]#N.B. 2 not 1
d["self_imshp_logical_stride_r"] = int(N.ceil(self.imshp_logical[1] / float(self.imshp[1])))
d["self_imshp_logical_stride_c"] = int(N.ceil(self.imshp_logical[2] / float(self.imshp[2])))
d["self_kshp_logical_stride_r"] = int(numpy.ceil(self.kshp_logical[0] / float(self.kshp[0])))
d["self_kshp_logical_stride_c"] = int(numpy.ceil(self.kshp_logical[1] / float(self.kshp[1])))
d["self_imshp_logical_r"] = self.imshp_logical[1] #numpy.B. 1 not 0
d["self_imshp_logical_c"] = self.imshp_logical[2]#numpy.B. 2 not 1
d["self_imshp_logical_stride_r"] = int(numpy.ceil(self.imshp_logical[1] / float(self.imshp[1])))
d["self_imshp_logical_stride_c"] = int(numpy.ceil(self.imshp_logical[2] / float(self.imshp[2])))
if not self.imshp[0]==1: d["affectation"]="+="
d["all_shape"]=1
d["dim_zz_const"]="const"
......@@ -663,29 +675,30 @@ using namespace std;
if self.imshp != self.imshp_logical or self.kshp != self.kshp_logical:
if self.verbose:
print "return imshp!=imshp_logical or self.kshp != self.kshp_logical shape version"
_debug("return imshp!=imshp_logical or self.kshp != self.kshp_logical shape version")
return _conv_op_code_a % d
if self.unroll_patch:
if self.verbose:
print "return unroll patch version. all_shape=", all_shape
_debug("return unroll patch version. all_shape=", all_shape)
return _conv_op_code_unroll_patch%d
if self.unroll_batch>0 or self.unroll_kern>0:
if self.unroll_batch<=0: self.unroll_batch=1
if self.unroll_kern<=0: self.unroll_kern=1
if self.verbose:
print "return unrolled batch and kern code by",self.unroll_batch, self.unroll_kern
_debug("return unrolled batch (%s) and kern code (%s)",
str(self.unroll_batch), str(self.unroll_kern))
return gen_conv_code_unroll_batch_kern(d, self.unroll_batch,
self.unroll_kern)
#TODO: should we choose the unroll size automatically with the bigger divisor under 5?
if self.out_mode == 'valid' and self.dx==0 and self.dy==0:
if self.verbose:
print "return gemm version"
_debug("return gemm version")
return _conv_op_code_valid_gemm % d
else:
if self.verbose:
print "return no gemm version"
_debug("return no gemm version")
return _conv_op_code_a % d
......
......@@ -7,11 +7,11 @@ from theano import gof
from theano import scalar
from theano import printing
from theano.printing import pprint
import basic as tensor
import elemwise
import numpy
import opt
from theano.tensor import basic as tensor
from theano.tensor import elemwise
from theano.tensor import opt
from theano.compile import optdb
import numpy
############
#
......
......@@ -7,7 +7,7 @@ from theano.tests import unittest_tools as utt
from theano import function, Mode
import theano.tensor as T
from theano.tensor.signal.conv import ConvOp
from theano.tensor.nnet.conv import ConvOp
def flip(kern, kshp):
"flip the kernel as scipy.convolv2d do it flipped."
......
import sys, time, unittest
import numpy
from scipy import signal
import theano
import theano.tensor as T
from theano import function, Mode
from theano.tests import unittest_tools as utt
from theano.tensor.nnet import conv
from theano.tensor.basic import _allclose
class TestConv2D(unittest.TestCase):
    """
    Compare conv.conv2d against a nested-loop scipy.signal.convolve2d
    reference implementation, for various shapes, border modes,
    subsampling factors and loop-unrolling settings.
    """
    def setUp(self):
        utt.seed_rng()
        self.input = T.dtensor4('input')
        self.filters = T.dtensor4('filters')

    def validate(self, image_shape, filter_shape,
                 border_mode='valid', subsample=(1,1),
                 N_image_shape=None, N_filter_shape=None,
                 input=None, filters=None,
                 unroll_batch=0, unroll_kern=0, unroll_patch=True,
                 verify_grad=True):
        """
        Build a conv2d graph with the given (possibly partial) shape hints,
        run it on random data of shape N_image_shape / N_filter_shape, and
        check the result against a scipy.signal.convolve2d reference.
        Optionally check the gradient with verify_grad.
        """
        if N_image_shape is None:
            N_image_shape = image_shape
        if N_filter_shape is None:
            N_filter_shape = filter_shape

        # BUGFIX: was `if not input:` -- truthiness of a theano variable is
        # not a reliable "argument was omitted" test; compare with the None
        # default explicitly.
        if input is None:
            input = self.input
        if filters is None:
            filters = self.filters

        ############# THEANO IMPLEMENTATION ############

        # we create a symbolic function so that verify_grad can work
        def sym_conv2d(input, filters):
            # define theano graph and function
            return conv.conv2d(input, filters, image_shape, filter_shape,
                          border_mode, subsample, unroll_batch=unroll_batch,
                          unroll_kern=unroll_kern, unroll_patch=unroll_patch)
        output = sym_conv2d(input, filters)
        theano_conv = theano.function([input, filters], output)

        # initialize input and compute result
        image_data = numpy.random.random(N_image_shape)
        filter_data = numpy.random.random(N_filter_shape)
        theano_output = theano_conv(image_data, filter_data)

        ############# REFERENCE IMPLEMENTATION ############
        # BUGFIX: was `border_mode is 'full'` -- identity comparison of a
        # string literal relies on interning; use equality.
        s = 1. if border_mode == 'full' else -1.
        out_shape2d = numpy.array(N_image_shape[-2:]) +\
                      s*numpy.array(N_filter_shape[-2:]) - s
        out_shape2d = numpy.ceil(out_shape2d / numpy.array(subsample))
        # cast the (float) ceil results so the shape tuple is valid for zeros
        out_shape = (N_image_shape[0], N_filter_shape[0]) +\
                    tuple(int(x) for x in out_shape2d)
        ref_output = numpy.zeros(out_shape)

        # loop over output feature maps
        for k in range(N_filter_shape[0]):
            # loop over input feature maps
            for l in range(N_filter_shape[1]):
                filter2d = filter_data[k,l,:,:]
                # loop over mini-batches
                for b in range(N_image_shape[0]):
                    image2d = image_data[b,l,:,:]
                    output2d = signal.convolve2d(image2d, filter2d, border_mode)
                    ref_output[b,k,:,:] +=\
                        output2d[::subsample[0],::subsample[1]]

        self.failUnless(_allclose(theano_output, ref_output))

        ############# TEST GRADIENT ############
        if verify_grad:
            utt.verify_grad(sym_conv2d, [image_data, filter_data])

    def test_basic(self):
        """
        Tests that basic convolutions work for odd and even dimensions of image and filter
        shapes, as well as rectangular images and filters.
        """
        self.validate((3,2,8,8), (4,2,5,5), 'valid')
        self.validate((3,2,7,5), (5,2,2,3), 'valid')
        self.validate((3,2,7,5), (5,2,3,2), 'valid')
        self.validate((3,2,8,8), (4,2,5,5), 'full')
        self.validate((3,2,7,5), (5,2,2,3), 'full')
        # test filter same size as input
        self.validate((3,2,3,3), (4,2,3,3), 'valid')

    def test_unroll_patch_false(self):
        """
        unroll_patch is True by default. Test basic convs with False.
        """
        self.validate((3,2,7,5), (5,2,2,3), 'valid', unroll_patch=False)
        self.validate((3,2,7,5), (5,2,2,3), 'full', unroll_patch=False)
        self.validate((3,2,3,3), (4,2,3,3), 'valid', unroll_patch=False)

    def test_unroll_special(self):
        """
        (unroll_kern, unroll_batch) in (0,1),(1,0) is special case.
        """
        self.validate((6,2,3,3), (3,2,2,2), 'valid', unroll_batch=1)

    def test_unroll_batch(self):
        """
        Test mini-batch unrolling for various legal values.
        """
        # mini-batch of size 6 is multiple of 2 and 3. Should work.
        self.validate((6,2,3,3), (3,2,2,2), 'valid', unroll_batch=2, verify_grad=False)
        self.validate((6,2,3,3), (3,2,2,2), 'valid', unroll_batch=3, verify_grad=False)

    def test_unroll_kern(self):
        """
        Test kernel unrolling for various legal values.
        """
        # 6 filters is a multiple of 2 and 3. Should work.
        self.validate((2,3,3,3), (6,3,2,2), 'valid', unroll_kern=2, verify_grad=False)
        self.validate((2,3,3,3), (6,3,2,2), 'valid', unroll_kern=3, verify_grad=False)

    def test_subsample(self):
        """
        Tests convolution where subsampling != (1,1)
        """
        self.validate((3,2,7,5), (5,2,2,3), 'valid', subsample=(2,2))
        self.validate((3,2,7,5), (5,2,2,3), 'full', subsample=(2,2))
        self.validate((3,2,7,5), (5,2,2,3), 'valid', subsample=(2,1))

    def test_invalid_filter_shape(self):
        """
        Tests scenario where filter_shape[1] != input_shape[1]
        """
        def f():
            self.validate((3,2,8,8), (4,3,5,5), 'valid')
        self.failUnlessRaises(AssertionError, f)

    def test_missing_info(self):
        """
        Test convolutions for various pieces of missing info.
        """
        self.validate(None, None,
                      N_image_shape=(3,2,8,8),
                      N_filter_shape=(4,2,5,5))
        self.validate((3,2,None,None), None,
                      N_image_shape=(3,2,8,8),
                      N_filter_shape=(4,2,5,5))
        self.validate((None,2,None,None), (None,2,5,5),
                      N_image_shape=(3,2,8,8),
                      N_filter_shape=(4,2,5,5))

    def test_full_mode(self):
        """
        Tests basic convolution in full mode and case where filter
        is larger than the input image.
        """
        self.validate((3,2,5,5), (4,2,8,8), 'full')
        def f():
            self.validate((3,2,5,5), (4,2,8,8), 'valid')
        self.failUnlessRaises(Exception, f)

    def test_wrong_input(self):
        """
        Make sure errors are raised when image and kernel are not 4D tensors
        """
        # BUGFIX: the original wrapped all three calls plus self.fail() in a
        # bare try/except: pass -- the AssertionError raised by self.fail()
        # was swallowed too, so this test could never fail.  Assert each bad
        # input raises individually instead.
        self.failUnlessRaises(Exception, self.validate,
                              (3,2,8,8), (4,2,5,5), 'valid', input=T.dmatrix())
        self.failUnlessRaises(Exception, self.validate,
                              (3,2,8,8), (4,2,5,5), 'valid', filters=T.dvector())
        self.failUnlessRaises(Exception, self.validate,
                              (3,2,8,8), (4,2,5,5), 'valid', input=T.dtensor3())
......@@ -3,9 +3,9 @@ import unittest
import theano
from theano import tensor as T
from theano import gof
import test_basic as TT
import numpy
from theano.tests import unittest_tools as utt
from theano.tensor.tests import test_basic as TT
from theano.tensor.nnet import *
......
"""
Contains an op for convolving input images with a set of filters. This was
developed especially for Convolutional Neural Networks.
Contains a wrapper function for tensor.nnet.ConvOp, which can be used to perform
generic 2D convolution.
"""
__docformat__ = "restructuredtext en"
......@@ -8,7 +8,9 @@ __docformat__ = "restructuredtext en"
import numpy
import theano
import theano.tensor as tensor
import theano.tensor.nnet as nnet
from theano import gof, Op, tensor, config
from theano.tensor.nnet import conv
import logging
_logger=logging.getLogger("theano.signal.conv")
......@@ -19,1664 +21,76 @@ def _warn(*msg):
def conv2d(input, filters, image_shape=None, filter_shape=None,
           border_mode='valid', subsample=(1,1), **kargs):
    """
    This function will build the symbolic graph for convolving a stack of
    input images with a set of filters. The implementation is modelled after
    Convolutional Neural Networks (CNN). It is simply a wrapper to the ConvOp
    but provides a much cleaner interface.

    :type input: symbolic 4D tensor
    :param input: mini-batch of feature map stacks, of shape image_shape.
    :type filters: symbolic 4D tensor
    :param filters: set of filters used in CNN layer of shape filter_shape

    :param border_mode:
        'valid' -- only apply filter to complete patches of the image.
        Generates output of shape: image_shape - filter_shape + 1.
        'full' -- zero-pads image to multiple of filter shape to generate
        output of shape: image_shape + filter_shape - 1.

    :type subsample: tuple of len 2
    :param subsample: factor by which to subsample the output

    :type image_shape: tuple of len 4
    :param image_shape: (batch size, stack size, nb row, nb col)
    :type filter_shape: tuple of len 4
    :param filter_shape: (nb filters, stack size, nb row, nb col)

    :param kargs: kargs are passed onto ConvOp. Can be used to set the
        following: unroll_batch, unroll_kern, unroll_patch (see ConvOp doc)
    """
    if image_shape and filter_shape:
        # the stack size (number of input feature maps) must agree
        assert image_shape[1]==filter_shape[1]

    # split the 4D shape hints into the pieces ConvOp expects
    if filter_shape is not None:
        nkern = filter_shape[0]
        kshp = filter_shape[2:]
    else:
        nkern, kshp = None, None

    if image_shape is not None:
        bsize = image_shape[0]
        imshp = image_shape[1:]
    else:
        bsize, imshp = None, None

    op = ConvOp(output_mode=border_mode, dx=subsample[0], dy=subsample[1],
                imshp=imshp, kshp=kshp, nkern=nkern, bsize=bsize, **kargs)
    return op(input, filters)
class ConvOp(Op):
"""
This Op serves a dual purpose: it can implement a vanilla 2D convolution
(as taught in any signal processing class) or implement the
convolutional layers found in Convolutional Neural Networks.
Shape parameters are optional and will result in faster execution.
In this setting, a set of 3D images is convolved with a set of 3D kernels,
with the particularity that their leading dimensions are of equal length.
Vanilla 2D convolution is treated as a special case of this.
The input parameter represents a mini-batch of multiple images. Its shape is:
batch size x num. input feature maps x image height x image width
The kernel parameter represents a set of 3D kernels. Its shape is:
number of filters x num. input images x filter height x filter width
The output of ConvOp is a 4D tensor, generated as follows:
output[b,k,:,:] = \sum_i input[b,i,:,:] * filter[k,i,:,:] \forall b,k
where b is the mini-batch index, k the filter index and * is the convolution
operator.
:type input: dmatrix of dtensor3
:param input: symbolic variable for images to be filtered
:type filters: dmatrix of dtensor3
:param filters: symbolic variable containing filter values
:param border_mode: 'valid' or 'full'. see scipy.signal.convolve2d
:param subsample: factor by which to subsample output
:type image_shape: tuple of length 2 or 3
:param image_shape: ([number images,] image height, image width)
:type filter_shape: tuple of length 2 or 3
:param filter_shape: ([number filters,] filter height, filter width)
:param kwargs: see theano.tensor.nnet.conv.conv2d
:rtype: symbolic 2D,3D or 4D tensor
:return: tensor of filtered images, with shape
([number images,] [number filters,] image height, image width)
"""
assert input.ndim in (2,3)
assert filters.ndim in (2,3)
__attrnames = ['imshp', 'kshp', 'nkern', 'bsize', 'dx', 'dy', 'out_mode',
'unroll_batch', 'unroll_kern', 'unroll_patch',
'imshp_logical', 'kshp_logical', 'kshp_logical_top_aligned']
"""These attributes uniquely identify the behaviour of this op for given inputs"""
    @staticmethod
    def getOutputShape(inshp, kshp, (dx,dy)=(1,1), mode='valid'):
        """
        Computes the output dimensions of convolving an image of shape
        "inshp" with kernels of shape "kshp", with strides (dx,dy).

        :param inshp: input image shape; only inshp[1:] is used below, so
            callers pass the full (stacksize, rows, cols) triple -- TODO
            confirm against all call sites
        :param kshp: (rows,cols) of filters
        :param dx: stride along the rows
        :param dy: stride along the columns
        :param mode: 'valid' or 'full' (see 'border_mode' in conv2d's doc)
        :return: (rows,cols) of output image, as numpy int64
        """
        # 'valid' shrinks each dimension by kshp-1; 'full' grows it by kshp-1
        if mode=='valid': s = -1
        else: s = 1
        inshp, kshp = numpy.array(inshp), numpy.array(kshp)
        # ceil accounts for a partial final stride at the border
        return  numpy.int64(numpy.ceil((inshp[1:] + s*kshp - s*1)/\
                numpy.array([dx,dy], dtype='float')))
    def __init__(self, imshp=None, kshp=None, nkern=None, bsize=None,
                 dx=None, dy=None,
                 output_mode='valid', unroll_batch=0,
                 unroll_kern=0,
                 unroll_patch=True,
                 imshp_logical=None,
                 kshp_logical=None,
                 kshp_logical_top_aligned=True,
                 verbose=0,
                 version=-1):
        """
        Initializes a ConvOp with given output_mode (full/valid). All other
        parameters are optional and are only used to generate more optimized
        c code.

        NOTES ON OPTIMIZATION:
        If ALL (imshp, kshp, nkern and bsize) parameters are provided, we can
        generate faster c-code. This makes a significant difference for the
        'full' output_mode with unroll_patch=True. The current fastest
        implementation on x86-64 uses {unroll_batch=4, unroll_kern=4,
        unroll_patch=False} with all other shape parameters being provided.

        For optimizing other architectures, see:
        Kazushige Goto and Robert A. Van De Geijn, Anatomy of High-Performance
        Matrix Multiplication, (mr x nr). ACM Transactions on Mathematical
        Software, May 2008.
        Figure 12: (mr x nr). For x86 use 2x4, itanium 8x8, etc.

        :type output_mode: string
        :param output_mode: 'valid' -- gives an output smaller than the image.
                            'full' -- gives an output bigger than the image.

        Optional parameters: (will generate more optimal c code)

        :type imshp: tuple of len 2 or 3: 2 for 2d image, 3 for a stack of 2d images.
        :param imshp: (stacksize, nb image row, nb image col)
        :type kshp: tuple of len 2
        :param kshp: (nb kernel row, nb kernel col)
        :type nkern: int
        :param nkern: the number of kernels
        :type bsize: int
        :param bsize: the size of the minibatch
        :type dx: int
        :param dx: patch stride rows
        :type dy: int
        :param dy: patch stride cols

        Params which select the version of code used:

        :type unroll_patch: bool
        :param unroll_patch: use a version of c_code that unrolls the patch
            loop; it does not require full shape information to work, but if
            all shapes are present it hardcodes them for faster code.
        :type unroll_batch: int
        :param unroll_batch: use a version of c_code that unrolls the batch
            (by unroll_batch) and the nkern (by unroll_kern) loop. The size
            must be a multiple of bsize or nkern respectively.
        :type unroll_kern: int
        :param unroll_kern: see unroll_batch.

        :type verbose: int
        :param verbose: passed to GpuConv
        :type version: int
        :param version: passed to GpuConv

        :param imshp_logical: used internally when we generate the gradient
            when dx!=1 or dy!=1
        :param kshp_logical: idem
        :param kshp_logical_top_aligned: idem
        """
        # True only when every shape hint was supplied; several of the
        # faster c-code paths below require this.
        all_shape = imshp is not None and kshp is not None and \
                    nkern is not None and bsize is not None

        if (unroll_batch>0 or unroll_kern>0) and not all_shape:
            raise Exception("In ConvOp, when using unroll_batch and unroll_nkern, all shape are needed")
        # without full shape info only the patch-unrolled c code works
        if not all_shape:
            unroll_patch = True

        if imshp is not None:
            imshp = tuple(imshp)
            if len(imshp)==2:
                # promote a plain 2d image shape to a stack of size 1
                imshp = (1,)+imshp
            elif len(imshp)==3:
                imshp = imshp
            else:
                raise Exception("bad len for imshp")
        self.imshp = imshp

        if kshp is not None:
            kshp = tuple(kshp)
        self.kshp = kshp
        self.nkern = nkern
        self.bsize=bsize
        self.dx=dx
        self.dy=dy
        self.verbose=verbose
        self.version=version

        # a triple: logical image shape; differs from imshp only in the
        # gradient code, where subsampling inserts zeros into the image
        self.imshp_logical = self.imshp
        if imshp_logical is not None: self.imshp_logical = tuple(imshp_logical)
        # NOTE(review): len(None) would raise here if exactly one of
        # imshp / imshp_logical is None -- presumably callers never do that.
        assert (self.imshp is None and self.imshp_logical is None) or \
               (len(self.imshp) == len(self.imshp_logical))

        # a pair: logical kernel shape (same role as imshp_logical)
        self.kshp_logical = self.kshp
        if kshp_logical is not None: self.kshp_logical = tuple(kshp_logical)
        self.kshp_logical_top_aligned = kshp_logical_top_aligned

        self.unroll_batch=unroll_batch
        self.unroll_kern=unroll_kern
        self.unroll_patch=unroll_patch

        # unroll_batch must divide bsize; otherwise fall back to the largest
        # divisor below the requested value (or to bsize itself if smaller)
        if self.unroll_batch>0 and self.bsize % self.unroll_batch!=0:
            if self.bsize<=self.unroll_batch:
                self.unroll_batch = self.bsize
            else:
                #find the maximum value under unroll_batch that would work
                new=self.unroll_batch
                assert(new>=1)
                while self.bsize % new!=0:
                    new-=1
                warnstr = "OPTIMISATION WARNING: in ConvOp.__init__() unroll_batch(%i)"\
                    "must be 0 or a divisor of bsize(%i). We revert it to %i. This"\
                    "won't change the result, but may make it slower."
                _warn(warnstr % (self.unroll_batch, self.bsize, new))
                self.unroll_batch=new

        # same fallback for unroll_kern vs nkern.
        # NOTE(review): the modulo below uses the `unroll_kern` argument, not
        # self.unroll_kern -- they are still equal at this point, but the
        # asymmetry with the batch branch above looks unintentional.
        if self.unroll_kern>0 and self.nkern % unroll_kern!=0:
            if self.nkern<=self.unroll_kern:
                self.unroll_kern = self.nkern
            else:
                #find the maximum value under unroll_kern that would work
                new=self.unroll_kern
                assert(new>=1)
                while self.nkern % new!=0:
                    new-=1
                warnstr = "OPTIMISATION WARNING: in ConvOp.__init__() unroll_kern(%i)"\
                    "should be 0 or a divisor of nkern(%i). We revert it to %i."\
                    "This won't change the result, but may make it slower."
                _warn(warnstr % (self.unroll_kern, self.nkern, new))
                self.unroll_kern=new

        if all_shape:
            # outshp accounts for the (dx,dy) subsampling; fulloutshp is the
            # un-subsampled output shape
            self.outshp = ConvOp.getOutputShape(self.imshp_logical, self.kshp_logical, (dx,dy), output_mode)
            self.fulloutshp = ConvOp.getOutputShape(self.imshp_logical, self.kshp_logical, (1,1), output_mode)
        else:
            self.outshp = None
            self.fulloutshp = None

        self.out_mode = output_mode
        if not self.out_mode in ["valid", "full"]:
            raise Exception("Mode %s not implemented"%self.out_mode)

        if all_shape and not (self.outshp > 0).all():
            raise Exception(("Bad size for the output shape. Verify that [post-"\
                    "supersampling] input shape (%s) and kern shape(%s) are ok. "\
                    "(Hint: kerns must fit inside image in valid mode)")%
                            (self.imshp_logical,self.kshp_logical))

        # the hash over __attrnames is precomputed once; the op is treated
        # as immutable after construction
        self._rehash()

        if config.op.set_flops:
            self.set_flops()
def __eq__(self, other):
if type(self) != type(other):
return False
for a in self.__attrnames:
if getattr(self, a) != getattr(other, a):
return False
return True
    def __setstate__(self, d):
        """Restore pickled state, then recompute the cached hash value
        instead of trusting whatever (if anything) was pickled for it."""
        self.__dict__.update(d)
        self._rehash()
def _rehash(self):
hashval = hash(type(self))
for a in self.__attrnames:
hashval = hashval ^ hash(getattr(self, a))
self.__hashval = hashval
    def __hash__(self):
        # return the value cached by _rehash() (set at construction and
        # after unpickling)
        return self.__hashval
def __str__(self):
return "ConvOp{" +",".join(str((a, getattr(self, a))) for a in self.__attrnames) + "}"
def set_flops(self):
""" Usefull with the hack in profilemode to print the MFlops"""
if self.out_mode=="valid":
self.flops=self.kshp[0]*self.kshp[1]*2#nb mul and add by output pixed
self.flops*=self.outshp[0]*self.outshp[1]#nb flops by output image
self.flops*=self.imshp[0]*self.nkern*self.bsize#for all outputs images#n_stack==self.imshp[0]
else: #full mode not implemented
self.flops=0
for out_row in range(self.outshp[0]):#loop over output row
for out_col in range(self.outshp[0]):#loop over output col
for row in range(self.kshp[0]):#loop over kern row
if (row+out_row-self.kshp[0]+1<0 or
row+out_row-self.kshp[0]+1>=self.imshp[1]):
continue
col=0
max_col=self.kshp[1]
img_col=out_col-self.kshp[1]+1
max_col=min(max_col,self.imshp[2]-img_col)
if img_col<0:
col=-img_col
img_col+=col
while col < max_col: #loop over kern col
self.flops+=2
col+=1
self.flops*=self.imshp[0]*self.nkern*self.bsize#for all outputs images#n_stack==self.imshp[0]
assert self.flops == self.bsize * self.nkern * self.imshp[0] * \
self.kshp[0] * self.kshp[1] * self.imshp[1] * self.imshp[2] * 2
def make_node(self, inputs, kerns):
# TODO: find a way to make ConvOp work for N-D (after NIPS09)
"""
inputs - 4 dim: batches x stacksize x rows x cols
kerns - 4 dim: nkern x stackidx x rows x cols
"""
outdim = kerns.ndim
_inputs = tensor.as_tensor_variable(inputs)
_kerns = tensor.as_tensor_variable(kerns)
# TODO: lift this restriction by upcasting either inputs or kerns
if _inputs.ndim != 4:
raise TypeError('make_node requires 4D tensor of inputs')
if _kerns.ndim != 4:
raise TypeError('make_node requires 4D tensor of kernels')
if _inputs.type.dtype != _kerns.type.dtype:
raise NotImplementedError("The image and the kernel must have the same type."
"inputs(%s), kerns(%s)"%(_inputs.dtype, _kerns.dtype))
output = tensor.tensor(dtype=_inputs.type.dtype,
broadcastable=[_inputs.broadcastable[0],
_kerns.broadcastable[0], False, False]);
return gof.Apply(self, [_inputs, _kerns], [output])
def perform(self,node, (img2d, filtersflipped), (z,)):
"""
By default if len(img2d.shape)==3, we
"""
# TODO: move these back out to global scope when they no longer cause an atexit error
from scipy.signal.signaltools import _valfrommode, _bvalfromboundary
from scipy.signal.sigtools import _convolve2d
imshp = self.imshp
if imshp is None:
imshp = tuple(img2d.shape[1:])
kshp = self.kshp
if kshp is None:
kshp = tuple(filtersflipped.shape[2:])
bsize = self.bsize
if bsize is None:
bsize = img2d.shape[0]
nkern = self.nkern
if nkern is None:
nkern = filtersflipped.shape[0]
imshp_logical = self.imshp_logical
if imshp_logical is None:
imshp_logical = imshp
kshp_logical = self.kshp_logical
if kshp_logical is None:
kshp_logical = kshp
if self.fulloutshp is not None:
fulloutshp = tuple(self.fulloutshp)
### use shape information if it is given to us ###
if filter_shape and image_shape:
if input.ndim==3:
bsize = image_shape[0]
else:
fulloutshp = tuple(ConvOp.getOutputShape(imshp_logical, kshp_logical, (1,1), self.out_mode))
if z[0] is None:
z[0] = numpy.zeros((bsize,)+(nkern,)+fulloutshp,
dtype=img2d.dtype)
zz=z[0]
val = _valfrommode(self.out_mode)
bval = _bvalfromboundary('fill')
stacklen = imshp[0]
img2d = img2d.reshape((bsize,)+ imshp)
filtersflipped = filtersflipped.reshape((nkern,stacklen)+kshp)
if self.imshp != self.imshp_logical:
# assuming that to get from imshp to imshp logical we insert zeros in missing spots
rstride = int(numpy.ceil(imshp_logical[1] / float(imshp[1])))
cstride = int(numpy.ceil(imshp_logical[2] / float(imshp[2])))
buf = numpy.zeros((bsize,)+ imshp_logical, dtype=img2d.dtype)
buf[:,:,::rstride, ::cstride] = img2d
img2d = buf
del buf, rstride, cstride
if kshp != kshp_logical:
rstride = int(numpy.ceil(kshp_logical[0] / float(kshp[0])))
cstride = int(numpy.ceil(kshp_logical[1] / float(kshp[1])))
buf = numpy.zeros((nkern,stacklen)+ self.kshp_logical, dtype=filtersflipped.dtype)
if self.kshp_logical_top_aligned:
roffset=coffset=0
else:
roffset=(kshp_logical[0] - (kshp[0]*rstride) - 1+rstride) % rstride
coffset=(kshp_logical[1] - (kshp[1]*cstride) - 1+cstride) % cstride
assert roffset >= 0
assert coffset >= 0
buf[:,:,roffset::rstride, coffset::cstride] = filtersflipped
filtersflipped = buf
del buf, rstride, cstride
for b in range(bsize):
for n in range(nkern):
zz[b,n,...].fill(0)
for im0 in range(stacklen):
zz[b,n,...] += _convolve2d(\
img2d[b,im0,...], filtersflipped[n,im0,...],1,val, bval, 0)
#We copy it to remove the Stride mismatch warning from DEBUG_MODE.
#The copy make that we return an object with the same stride as the c version.
#The copy don't affect the performence during our experience as in that case we
#execute the c version which is much faster.
if self.dx>1 or self.dy>1:
zz = zz[:,:,0::self.dx,0::self.dy].copy()
z[0]=zz
def grad(self, (inputs, kerns), (gz,)):
if self.imshp != self.imshp_logical or self.kshp != self.kshp_logical:
raise NotImplementedError('todo')
#if self.dx!=1 or self.dy!=1:
#raise Exception("ERROR: We disable ConvOp.grad now when dx!=1 or "\
#"dy!=1 as we think their is a high probability of bug in it."\
#"We need to raise the error on the gradient to .1!")
all_shape = self.imshp is not None and self.kshp is not None and \
self.nkern is not None and self.bsize is not None
if not all_shape and (self.dx!=1 or self.dy!=1):
raise Exception("ConvOp.grad when dx!=1 or dy!=1 we must have all "\
"the optional shape information")
####### Determine gradient on kernels ########
assert inputs.ndim==4 and kerns.ndim==4
newin = inputs.dimshuffle((1,0,2,3))
newgz = gz.dimshuffle((1,0,2,3))
(bsize, nkern) = None, None
imshp = None
kshp = None
un_p = self.unroll_patch
imshp_logical = None
if self.out_mode == 'valid':
(img, filters) = (newin, newgz)
kshp_logical = self.fulloutshp
kshp_logical_top_aligned=False
if all_shape:
(bsize, nkern) = (self.imshp[0], self.nkern)
imshp = (self.bsize, self.imshp[1], self.imshp[2])
kshp = self.outshp
un_b = self.unroll_batch
un_k = self.unroll_kern
elif self.out_mode == 'full':
(img, filters) = (newgz, newin)
kshp_logical = None
kshp_logical_top_aligned=True
if all_shape:
imshp_logical = (self.bsize, self.fulloutshp[0], self.fulloutshp[1])
(bsize, nkern) = (self.nkern, self.imshp[0])
imshp = (self.bsize, self.outshp[0], self.outshp[1])
kshp = self.imshp[1:]
un_b = self.unroll_kern
un_k = self.unroll_batch
bsize = 1
imshp = (1,) + tuple(image_shape[-2:])
if filters.ndim==3:
nkern = filter_shape[0]
else:
raise NotImplementedError('Only [full,valid] modes are currently supported.')
filters = filters[:,:,::-1,::-1] #flip them
#find good value for the unroll
if all_shape and un_b!=0 and bsize%un_b!=0:
if bsize<un_b:
un_b = bsize
else:
un_b = 1
_warn("OPTIMISATION WARNING: in ConvOp.grad() we can't determine "\
"a good unroll value for the batch. Maybe you can optimize this!")
if un_k!=0 and nkern%un_k!=0:
if nkern<un_k:
un_k = nkern
else:
un_k = 1
_warn("OPTIMISATION WARNING: in ConvOp.grad() we can't determine "\
"a good unroll value for the kernel. Maybe you can optimize this!")
dw = ConvOp(imshp, kshp, nkern, bsize, 1,1, output_mode='valid',
unroll_batch=un_b, unroll_kern=un_k, unroll_patch=un_p,
imshp_logical=imshp_logical,
kshp_logical=kshp_logical,
kshp_logical_top_aligned=kshp_logical_top_aligned,
version=self.version,
verbose=self.verbose)
if hasattr(self,'flops'):
dw.set_flops()
dw = dw(img,filters)
if all_shape:
assert (dw.owner.op.outshp==self.kshp).all()
if self.out_mode == 'valid':
# before DimShuffle, dw is of shape visdim x nkern x kshp[0] x kshp[1]
dw = dw.dimshuffle((1,0,2,3))
dw = dw[:,:,::-1,::-1]
####### Determine gradient on inputs ########
mode = 'valid'
if not self.out_mode == 'full':
mode = 'full'
filters = kerns.dimshuffle((1,0,2,3))
filters = filters[:,:,::-1,::-1]
nkern = None
imshp = None
imshp_logical = None
kshp = None
if all_shape:
nkern = self.imshp[0]
imshp = (self.nkern, self.outshp[0], self.outshp[1])
imshp_logical=(self.nkern, self.fulloutshp[0], self.fulloutshp[1])
din = ConvOp(imshp, self.kshp, nkern, self.bsize,
1,1, output_mode=mode,
unroll_batch=un_b, unroll_kern=un_k, unroll_patch=un_p,
imshp_logical=imshp_logical,
kshp_logical=None,
version=-1,#we we change the mode, we don't forward the version.
verbose=self.verbose)
if hasattr(self,'flops'):
din.set_flops()
din = din(gz,filters)
nkern = 1
kshp = filter_shape[-2:]
else:
nkern, kshp = None, None
bsize, imshp = None, None
print 'self.imshp = ', self.imshp
print 'din.owner.op.outshp = ', din.owner.op.outshp
assert (din.owner.op.outshp is None and self.imshp is None) or \
(din.owner.op.outshp is None) or \
(din.owner.op.outshp==self.imshp[1:]).all()
return [din, dw]
### reshape tensors to 4D, for compatibility with ConvOp ###
if input.ndim==3:
sym_bsize = input.shape[0]
else:
sym_bsize = 1
def c_headers(self):
    """Return the C headers needed by the generated code.

    ``numpy/noprefix.h`` gives the un-prefixed NumPy C API used by the
    templates; ``iostream``/``sstream`` are used for debug printing and
    for building error messages.
    """
    return ['<numpy/noprefix.h>', '<iostream>', '<sstream>' ]
if filters.ndim==3:
sym_nkern = filters.shape[0]
else:
sym_nkern = 1
def c_code_cache_version(self):
    """Return the version of the generated C code, for the compile cache.

    Theano's module cache expects a *tuple* of ints here.  The previous
    ``return (1)`` evaluated to the plain integer ``1`` (parentheses alone
    do not make a tuple); a trailing comma is required.
    """
    return (1,)
new_input_shape = tensor.join(0, tensor.stack(sym_bsize,1), input.shape[-2:])
input4D = tensor.reshape(input, new_input_shape, ndim=4)
def c_support_code(self):
    """Return support code shared by all the C code templates.

    Defines the STRIDES helper and the FULL/SAME/VALID border-mode
    constants (the templates compare ``mode`` against these), plus a MOD
    macro so ``%%`` never appears literally in the %-substituted
    templates.  The BLAS header text is appended so the gemm-based
    template can declare/call sgemm_/dgemm_.
    """
    return """
#define STRIDES(arr) ((arr)->strides)
#define FULL 2
#define SAME 1
#define VALID 0
#define MOD %
using namespace std;
""" + tensor.blas.blas_header_text()
def c_libraries(self):
    """Return linker flags so the generated code links against the
    BLAS library selected by the Theano configuration."""
    return tensor.blas.ldflags()
def c_code(self, node, name, (img2d, filtersflipped), (z, ), sub):
    """Generate C code for this apply node.

    Builds a substitution dictionary ``d`` and %-formats one of several
    C templates, chosen from the op's configuration:

    * ``_conv_op_code_a`` when logical shapes differ from physical ones
      (strided/dilated case), or as the generic fallback;
    * ``_conv_op_code_unroll_patch`` when ``unroll_patch`` is set;
    * ``gen_conv_code_unroll_batch_kern(...)`` when batch/kernel
      unrolling is requested;
    * ``_conv_op_code_valid_gemm`` for plain 'valid' mode with unit
      subsampling (uses BLAS gemm).

    NOTE: Python 2 tuple-unpacking parameters; ``img2d``/``filtersflipped``
    and ``z`` are the C variable names of the inputs/output.
    """
    # Mixed-precision convolution is not implemented.
    if node.inputs[0].type.dtype != node.inputs[1].type.dtype:
        raise NotImplementedError()
    assert node.inputs[0].type.dtype == node.inputs[1].type.dtype
    # Seed the template-substitution dict with the local names
    # (img2d, filtersflipped, z, ...) then add the compiler-provided
    # entries such as %(fail)s.
    d=locals()
    d.update(sub)
    # True only when every shape parameter was given at op-construction
    # time; enables the fully-static code paths below.
    all_shape = self.imshp is not None and self.kshp is not None and \
                self.nkern is not None and self.bsize is not None
    d["self_out_mode"]=self.out_mode
    d["self_dx"]=self.dx
    d["self_dy"]=self.dy
    d["mode"]=self.out_mode.upper()
    # NOTE(review): duplicate of the previous line — harmless but redundant.
    d["mode"]=self.out_mode.upper()
    # "=" overwrites the output; switched to "+=" when accumulating over
    # an input stack (imshp[0] > 1) or when shapes are dynamic.
    d["affectation"]="="
    if all_shape:
        # Static shapes: bake every dimension into the generated C.
        d["self_bsize"]=self.bsize
        d["self_nkern"]=self.nkern
        d["self_outshp0"]=self.outshp[0]
        d["self_outshp1"]=self.outshp[1]
        d["self_imshp0"]=self.imshp[0]
        d["self_imshp1"]=self.imshp[1]
        d["self_imshp2"]=self.imshp[2]
        d["self_kshp0"]=self.kshp[0]
        d["self_kshp1"]=self.kshp[1]
        d["self_kshp_logical_r"] = self.kshp_logical[0]
        d["self_kshp_logical_c"] = self.kshp_logical[1]
        # Stride between physical elements within the logical kernel/image
        # (ceil of logical/physical extent).
        d["self_kshp_logical_stride_r"] = int(numpy.ceil(self.kshp_logical[0] / float(self.kshp[0])))
        d["self_kshp_logical_stride_c"] = int(numpy.ceil(self.kshp_logical[1] / float(self.kshp[1])))
        d["self_imshp_logical_r"] = self.imshp_logical[1] #numpy.B. 1 not 0
        d["self_imshp_logical_c"] = self.imshp_logical[2]#numpy.B. 2 not 1
        d["self_imshp_logical_stride_r"] = int(numpy.ceil(self.imshp_logical[1] / float(self.imshp[1])))
        d["self_imshp_logical_stride_c"] = int(numpy.ceil(self.imshp_logical[2] / float(self.imshp[2])))
        # With more than one input channel we must accumulate over the stack.
        if not self.imshp[0]==1: d["affectation"]="+="
        d["all_shape"]=1
        d["dim_zz_const"]="const"
    else:
        # Dynamic shapes: read dimensions from the ndarray structs at runtime.
        d["self_bsize"]="%(img2d)s->dimensions[0]"%d
        d["self_nkern"]="%(filtersflipped)s->dimensions[0]"%d
        d["self_outshp0"]="-1"
        d["self_outshp1"]="-1"
        d["self_imshp0"]="%(img2d)s->dimensions[1]"%d
        d["self_imshp1"]="%(img2d)s->dimensions[2]"%d
        d["self_imshp2"]="%(img2d)s->dimensions[3]"%d
        d["self_kshp0"]="%(filtersflipped)s->dimensions[2]"%d
        d["self_kshp1"]="%(filtersflipped)s->dimensions[3]"%d
        d["affectation"]="+="
        d["all_shape"]=0
        d["dim_zz_const"]=""
    if self.kshp_logical_top_aligned:
        d["self_kshp_logical_offset_r"] = 0
        d["self_kshp_logical_offset_c"] = 0
    elif self.imshp != self.imshp_logical or self.kshp != self.kshp_logical:
        # Bottom-aligned logical kernel: offset of the first physical tap
        # inside the logical kernel window.
        rstride = d["self_kshp_logical_stride_r"]
        cstride = d["self_kshp_logical_stride_c"]
        d["self_kshp_logical_offset_r"] = (self.kshp_logical[0] - (self.kshp[0]*rstride) - 1+rstride) % rstride
        d["self_kshp_logical_offset_c"] = (self.kshp_logical[1] - (self.kshp[1]*cstride) - 1+cstride) % cstride
        del rstride, cstride
    # Only float32/float64 are supported by the C templates.
    if node.inputs[0].type.dtype=="float32": d["type"]="float"
    elif node.inputs[0].type.dtype=="float64": d["type"]="double"
    else: raise Exception("Type %s not implemented"%node.inputs[0].type.dtype)
    d["gemm"]='dgemm_'
    if not d["type"]=="double":d["gemm"]='sgemm_'
    # --- template dispatch, in priority order ---
    if self.imshp != self.imshp_logical or self.kshp != self.kshp_logical:
        if self.verbose:
            _debug("return imshp!=imshp_logical or self.kshp != self.kshp_logical shape version")
        return _conv_op_code_a % d
    if self.unroll_patch:
        if self.verbose:
            _debug("return unroll patch version. all_shape=", all_shape)
        return _conv_op_code_unroll_patch%d
    if self.unroll_batch>0 or self.unroll_kern>0:
        if self.unroll_batch<=0: self.unroll_batch=1
        if self.unroll_kern<=0: self.unroll_kern=1
        if self.verbose:
            _debug("return unrolled batch (%s) and kern code (%s)",
                   str(self.unroll_batch), str(self.unroll_kern))
        return gen_conv_code_unroll_batch_kern(d, self.unroll_batch,
                                               self.unroll_kern)
    #TODO: should we choose the unroll size automatically with the bigger divisor under 5?
    if self.out_mode == 'valid' and self.dx==0 and self.dy==0:
        if self.verbose:
            _debug("return gemm version")
        return _conv_op_code_valid_gemm % d
    else:
        if self.verbose:
            _debug("return no gemm version")
        return _conv_op_code_a % d
_conv_op_code_a = """
const int mode=%(mode)s;
int typenum=0, typenum_f=0;
PyArrayObject *ain1=NULL, *ain2=NULL, *filtersflipped_arr=NULL, *img2d_arr=NULL;
const %(type)s fill_value = 0;
int type_im=PyArray_TYPE(%(img2d)s);
int type_ker=PyArray_TYPE(%(filtersflipped)s);
npy_intp dim_zz[2]={%(self_outshp0)s,%(self_outshp1)s};
npy_intp dim_im_phys[2]={%(self_imshp1)s,%(self_imshp2)s};
npy_intp dim_im_log[2]={%(self_imshp_logical_r)s,%(self_imshp_logical_c)s};
npy_intp dim_ker_phys[2]={%(self_kshp0)s,%(self_kshp1)s};
npy_intp dim_ker_log[2]={%(self_kshp_logical_r)s,%(self_kshp_logical_c)s};
PyArray_Dims img2d_shape;
npy_intp img2d_dim[4]={1,1,0,0};
img2d_shape.ptr=img2d_dim;
img2d_shape.len=4;
PyArray_Dims kerns_shape;
npy_intp kerns_dim[4]={1,1,0,0};
kerns_shape.ptr=kerns_dim;
kerns_shape.len=4;
PyObject *img2d=NULL, *contig, *filtersflipped=NULL;
if(%(img2d)s->nd==2){
img2d_dim[3]=%(img2d)s->dimensions[1];
img2d_dim[2]=%(img2d)s->dimensions[0];
}else if(%(img2d)s->nd==3){
img2d_dim[3]=%(img2d)s->dimensions[2];
img2d_dim[2]=%(img2d)s->dimensions[1];
img2d_dim[0]=%(img2d)s->dimensions[0];
}else if(%(img2d)s->nd==4){
img2d_dim[3]=%(img2d)s->dimensions[3];
img2d_dim[2]=%(img2d)s->dimensions[2];
img2d_dim[1]=%(img2d)s->dimensions[1];
img2d_dim[0]=%(img2d)s->dimensions[0];
}else {
PyErr_SetString(PyExc_ValueError, "img don't have a good shape");
%(fail)s;
}
if(%(filtersflipped)s->nd==3){
kerns_dim[3]=%(filtersflipped)s->dimensions[2];
kerns_dim[2]=%(filtersflipped)s->dimensions[1];
kerns_dim[0]=%(filtersflipped)s->dimensions[0];
}else if(%(filtersflipped)s->nd==4){
kerns_dim[3]=%(filtersflipped)s->dimensions[3];
kerns_dim[2]=%(filtersflipped)s->dimensions[2];
kerns_dim[1]=%(filtersflipped)s->dimensions[1];
kerns_dim[0]=%(filtersflipped)s->dimensions[0];
}else{
std:stringstream temp;
temp << "nddim="<<%(filtersflipped)s->nd;
std::string param = temp.str();
PyErr_SetString(PyExc_ValueError,
("kernel don't have a good shape. " + param).c_str());
%(fail)s;
}
img2d = PyArray_Newshape(%(img2d)s,&img2d_shape, PyArray_CORDER);
img2d_arr = (PyArrayObject*)img2d;
if ((img2d_arr->strides[3] != (npy_intp)sizeof(%(type)s))
|| (img2d_arr->strides[2] != img2d_arr->dimensions[3]*(npy_intp)sizeof(%(type)s))){
contig = (PyObject*)(PyArray_GETCONTIGUOUS((PyArrayObject*)img2d));
Py_DECREF(img2d);
img2d = contig;
if (!PyArray_ISCONTIGUOUS(img2d)){
PyErr_SetString(PyExc_ValueError, "img2d isn't contiguous");
%(fail)s;
}
}
img2d_arr = (PyArrayObject*)img2d;
filtersflipped = PyArray_Newshape(%(filtersflipped)s,&kerns_shape, PyArray_CORDER);
filtersflipped_arr = (PyArrayObject*)filtersflipped;
if ((filtersflipped_arr->strides[3] != (npy_intp)sizeof(%(type)s))
|| (filtersflipped_arr->strides[2] != filtersflipped_arr->dimensions[3]*(npy_intp)sizeof(%(type)s))){
contig = (PyObject*)(PyArray_GETCONTIGUOUS((PyArrayObject*)filtersflipped));
Py_DECREF(filtersflipped);
filtersflipped = contig;
if (!PyArray_ISCONTIGUOUS(filtersflipped)){
PyErr_SetString(PyExc_ValueError, "filtersflipped isn't contiguous");
%(fail)s;
}
}
filtersflipped_arr = (PyArrayObject*)filtersflipped;
if(mode != VALID && mode != FULL){
PyErr_SetString(PyExc_ValueError, "invalid mode, only full and valid are supported"); %(fail)s;
}
typenum = PyArray_ObjectType((PyObject*)%(img2d)s, 0);
typenum_f = PyArray_ObjectType((PyObject*)%(filtersflipped)s, 0);
if (typenum < 0) {PyErr_SetString(PyExc_ValueError, "Invalid type"); %(fail)s;}
if (typenum != typenum_f) {PyErr_SetString(PyExc_ValueError, "Input types must match"); %(fail)s;}
if (!img2d) %(fail)s;
if (!filtersflipped) %(fail)s;
if ((!%(z)s)
|| *PyArray_DIMS(%(z)s)!=4
||(%(z)s->dimensions[0] != %(self_bsize)s)
||(%(z)s->dimensions[1] != %(self_nkern)s)
||(%(z)s->dimensions[2] != dim_zz[0])
|| (%(z)s->dimensions[3] != dim_zz[1])
)
{
{Py_XDECREF(%(z)s);}
npy_intp dims[4] = {0,0,0,0};
dims[0]=%(self_bsize)s;
dims[1]=%(self_nkern)s;
dims[2]=dim_zz[0];
dims[3]=dim_zz[1];
%(z)s = (PyArrayObject*) PyArray_ZEROS(4, dims, typenum,0);
}else{
//PyArray_FILLWBYTE((PyObject*)%(z)s,0);
}
int Os[2];
Os[0]=%(self_outshp0)s;
Os[1]=%(self_outshp1)s;
for(int b=0;b< %(self_bsize)s;b++){
for(int n_kern=0;n_kern<%(self_nkern)s;n_kern++){
//assertions
if (%(z)s->strides[0] != %(z)s->dimensions[1] *%(z)s->dimensions[2] *%(z)s->dimensions[3] * (npy_intp)sizeof(%(type)s)) %(fail)s;
if (%(z)s->strides[1] != %(z)s->dimensions[2] * %(z)s->dimensions[3] * (npy_intp)sizeof(%(type)s)) %(fail)s;
if (%(z)s->strides[2] != %(z)s->dimensions[3] * (npy_intp)sizeof(%(type)s)) %(fail)s;
if (%(z)s->strides[3] != (npy_intp)sizeof(%(type)s)) %(fail)s;
%(type)s * __restrict__ out=(%(type)s *)(PyArray_GETPTR2(%(z)s,b,n_kern));
for (int i = 0; i < dim_zz[0]*dim_zz[1]; ++i) out[i] = 0;
for(int stack_size=0;stack_size<%(self_imshp0)s;stack_size++){
const %(type)s * __restrict__ in=(%(type)s *)(PyArray_GETPTR2(img2d,b,stack_size));
const %(type)s * __restrict__ hvals=(%(type)s *)(PyArray_GETPTR2(filtersflipped,n_kern,stack_size));
for (int iter_m=0; iter_m < Os[0]; iter_m++) {
/// Reposition index into input image based on requested output size
int pos_m = iter_m*%(self_dx)s; //row position in logical output image
int new_m; //row anchor in logical input image (we will loop upward from here)
if (mode == FULL) new_m = pos_m ;
else new_m = (pos_m+dim_ker_log[0]-1);
for (int iter_n=0; iter_n < Os[1]; iter_n++) { // loop over columns
int pos_n=iter_n*%(self_dy)s; // current col position in logical output image
%(type)s sum=0;
// Sum over kernel, if index into image is out of bounds
// fill with the value
for (int j_log=0; j_log < %(self_kshp_logical_r)s; j_log++) { // loop over logical rows in kernel
int ind0_log = (new_m-j_log); // ind0_log: row position in logical input image
if ((j_log < %(self_kshp_logical_offset_r)s) || (j_log - %(self_kshp_logical_offset_r)s) MOD %(self_kshp_logical_stride_r)s)
continue;
if (ind0_log MOD %(self_imshp_logical_stride_r)s)
continue;
int j_phys = ((j_log- %(self_kshp_logical_offset_r)s) / %(self_kshp_logical_stride_r)s);
int ind0_phys = (ind0_log / %(self_imshp_logical_stride_r)s);
//std::cerr <<"j_log" << j_log << " j_phys " << j_phys << " " << ind0_phys << "\\n";
if(mode==FULL){
const %(type)s * idx_hvals=&hvals[j_phys*dim_ker_phys[1]]; //This is a pointer to the current row of the kernel
if(ind0_log < 0 || ind0_log >= dim_im_log[0]){
// the current row of the kernel is off the image
}else{
int k = max((int)(pos_n-dim_im_log[1])+1,0);
int max_k=min(pos_n+1,(int)dim_ker_log[1]);
const %(type)s * idx_in=&in[ind0_phys*dim_im_phys[1]];
for (int ind1_log=pos_n-k; k<max_k; k++,ind1_log--) {
if (1)
{
if ((k < %(self_kshp_logical_offset_c)s) || (k - %(self_kshp_logical_offset_c)s) MOD %(self_kshp_logical_stride_c)s)
continue;
if (ind1_log MOD %(self_imshp_logical_stride_c)s)
continue;
}
sum+= idx_hvals[(k-%(self_kshp_logical_offset_c)s) / %(self_kshp_logical_stride_c)s] * idx_in[ind1_log / %(self_imshp_logical_stride_c)s];
}
}
}else{
const %(type)s* idx_in=&in[ind0_phys*dim_im_phys[1]]; //JB: should be dim_im[1] right? (was dim_im[0])
const %(type)s* idx_hvals=&hvals[j_phys*dim_ker_phys[1]];
int new_n = (pos_n+dim_ker_log[1]-1);
if (%(self_imshp_logical_stride_c)s != 1) // a general loop
{
for (int k=0,last=new_n; k < dim_ker_log[1]; k++,last--) {
if ((k < %(self_kshp_logical_offset_c)s) || (k - %(self_kshp_logical_offset_c)s) MOD %(self_kshp_logical_stride_c)s)
continue;
else if (last MOD %(self_imshp_logical_stride_c)s)
continue;
else
{
sum+=idx_hvals[(k-%(self_kshp_logical_offset_c)s) / %(self_kshp_logical_stride_c)s]*idx_in[last/%(self_imshp_logical_stride_c)s];
}
}
}
else // self_imshp_stride_c == 1
{
int offset = %(self_kshp_logical_offset_c)s;
int k_phys=0;
for (int k_log=offset,last=new_n-offset; k_log < dim_ker_log[1]; ) {
sum += idx_hvals[k_phys]*idx_in[last];
++k_phys;
last -= %(self_kshp_logical_stride_c)s;
k_log += %(self_kshp_logical_stride_c)s;
}
}
}
}//for j
out[iter_m*dim_zz[1]+iter_n] %(affectation)s sum;
}//for n
}//for m
}//for stack_size
if (0 && (mode==FULL)){
for (int i = 0; i < dim_zz[0]*dim_zz[1]; ++i)
std::cout << " " << out[i];
std::cout << "\\n";
}
}//for n_kern
}//for b
Py_XDECREF(img2d);
Py_XDECREF(filtersflipped);
"""
#########
######### ConvOp c_code for valid mode (uses gemm)
#########
_conv_op_code_valid_gemm = """
int typenum=0, typenum_f=0;
PyArrayObject *ain1=NULL, *ain2=NULL, *img2d_arr=NULL;
const int NKERN = %(self_nkern)s;
int type_im=PyArray_TYPE(%(img2d)s);
int type_ker=PyArray_TYPE(%(filtersflipped)s);
npy_intp dim_zz[2]={%(self_outshp0)s,%(self_outshp1)s};
npy_intp dim_im[2]={%(self_imshp1)s,%(self_imshp2)s};
npy_intp dim_ker[2]={%(self_kshp0)s,%(self_kshp1)s};
PyArray_Dims img2d_shape;
npy_intp img2d_dim[4]={1,1,0,0};
img2d_shape.ptr=img2d_dim;
img2d_shape.len=4;
PyArray_Dims kerns_shape;
npy_intp kerns_dim[4]={1,1,0,0};
kerns_shape.ptr=kerns_dim;
kerns_shape.len=4;
PyObject *img2d=NULL, *contig;
if(%(img2d)s->nd==2){
img2d_dim[3]=%(img2d)s->dimensions[1];
img2d_dim[2]=%(img2d)s->dimensions[0];
}else if(%(img2d)s->nd==3){
img2d_dim[3]=%(img2d)s->dimensions[2];
img2d_dim[2]=%(img2d)s->dimensions[1];
img2d_dim[0]=%(img2d)s->dimensions[0];
}else if(%(img2d)s->nd==4){
img2d_dim[3]=%(img2d)s->dimensions[3];
img2d_dim[2]=%(img2d)s->dimensions[2];
img2d_dim[1]=%(img2d)s->dimensions[1];
img2d_dim[0]=%(img2d)s->dimensions[0];
}else {
PyErr_SetString(PyExc_ValueError, "img don't have a good shape");
%(fail)s;
}
if(%(filtersflipped)s->nd==3){
kerns_dim[3]=%(filtersflipped)s->dimensions[2];
kerns_dim[2]=%(filtersflipped)s->dimensions[1];
kerns_dim[0]=%(filtersflipped)s->dimensions[0];
}else if(%(filtersflipped)s->nd==4){
kerns_dim[3]=%(filtersflipped)s->dimensions[3];
kerns_dim[2]=%(filtersflipped)s->dimensions[2];
kerns_dim[1]=%(filtersflipped)s->dimensions[1];
kerns_dim[0]=%(filtersflipped)s->dimensions[0];
}else{
std:stringstream temp;
temp << "nddim="<<%(filtersflipped)s->nd;
std::string param = temp.str();
PyErr_SetString(PyExc_ValueError,
("kernel don't have a good shape. " + param).c_str());
%(fail)s;
}
if (NKERN != kerns_dim[0])
{
PyErr_SetString(PyExc_NotImplementedError, "nonsense nkern");
%(fail)s;
}
img2d = PyArray_Newshape(%(img2d)s,&img2d_shape, PyArray_CORDER);
img2d_arr = (PyArrayObject*)img2d;
if ((img2d_arr->strides[3] != (npy_intp)sizeof(%(type)s))
|| (img2d_arr->strides[2] != img2d_arr->dimensions[3]*(npy_intp)sizeof(%(type)s))){
contig = (PyObject*)(PyArray_GETCONTIGUOUS((PyArrayObject*)img2d));
Py_DECREF(img2d);
img2d = contig;
if (!PyArray_ISCONTIGUOUS(img2d)){
PyErr_SetString(PyExc_ValueError, "img2d isn't contiguous");
%(fail)s;
}
}
img2d_arr = (PyArrayObject*)img2d;
typenum = PyArray_ObjectType((PyObject*)%(img2d)s, 0);
typenum_f = PyArray_ObjectType((PyObject*)%(filtersflipped)s, 0);
if (typenum < 0) {PyErr_SetString(PyExc_ValueError, "Invalid type"); %(fail)s;}
if (typenum != typenum_f) {PyErr_SetString(PyExc_ValueError, "Input types must match"); %(fail)s;}
if (!img2d) {
PyErr_SetString(PyExc_ValueError, "Null argument img2d");
%(fail)s;
}
if ((!%(z)s)
|| *PyArray_DIMS(%(z)s)!=4
||(%(z)s->dimensions[0] != %(self_bsize)s)
||(%(z)s->dimensions[1] != %(self_nkern)s)
||(%(z)s->dimensions[2] != dim_zz[0])
|| (%(z)s->dimensions[3] != dim_zz[1])
)
{
{Py_XDECREF(%(z)s);}
npy_intp dims[4] = {0,0,0,0};
dims[0]=%(self_bsize)s;
dims[1]=%(self_nkern)s;
dims[2]=dim_zz[0];
dims[3]=dim_zz[1];
%(z)s = (PyArrayObject*) PyArray_ZEROS(4, dims, typenum,0);
}else{
PyArray_FILLWBYTE((PyObject*)%(z)s,0);
}
int Os[2];
Os[0] = dim_im[0]-dim_ker[0]+1;
Os[1] = dim_im[1]-dim_ker[1]+1;
// allocate a temporary buffer for storing the inner product of each nth kernel row
// with each row of an image
{
%(type)s * kbuf = (%(type)s *)malloc((Os[0] * NKERN + PyArray_Size((PyObject*)%(filtersflipped)s))* (npy_intp)sizeof(%(type)s));
int kbufstride = NKERN;
%(type)s * myfilters = kbuf + Os[0] * NKERN;
//copy out filtersflipped into filters un-flipped format
//std::cerr << "__filling myfilters__\\n";
for(int i=0;i < kerns_dim[0];++i){
for(int j=0;j < kerns_dim[1];++j){
for(int k=0;k < kerns_dim[2];++k){
for(int l=0;l < kerns_dim[3];++l){
%(type)s * ff = ((%(filtersflipped)s)->nd == 3)
? (%(type)s *)PyArray_GETPTR3(%(filtersflipped)s, i, kerns_dim[2]-1-k, kerns_dim[3]-1-l)
: (%(type)s *)PyArray_GETPTR4(%(filtersflipped)s, i, j, kerns_dim[2]-1-k, kerns_dim[3]-1-l);
myfilters[i * (kerns_dim[1]*kerns_dim[2]*kerns_dim[3])
+ j * (kerns_dim[2]*kerns_dim[3])
+ k * (kerns_dim[3])
+ l] = ff[0];
//std::cerr << " " << ff[0];
}
//std::cerr << "\\n";
}
//std::cerr << "(end of stack/batch " <<j << "/" << i << " ) \\n";
}
}
new_filter_shape = tensor.join(0, tensor.stack(sym_nkern,1), filters.shape[-2:])
filters4D = tensor.reshape(filters, new_filter_shape, ndim=4)
//std::cerr << "-----new loop ----\\n";
for(int b=0;b< %(self_bsize)s;b++){
for (int img_col = 0; img_col < Os[1]; ++img_col){
for (int filter_row = 0; filter_row < kerns_dim[2]; ++filter_row){
for (int stackidx = 0; stackidx < %(self_imshp0)s; ++stackidx){
%(type)s * img_colview =
(%(type)s *)(PyArray_GETPTR4(img2d, b, stackidx, filter_row, img_col));
%(type)s * filter_rows = myfilters + stackidx * (kerns_dim[2]*kerns_dim[3]) +
filter_row * kerns_dim[3];
//std::cerr << "filterview offset: " << filter_rows - myfilters << "\\n";
char N = 'N'; char T = 'T';
int Nz0 = Os[0];
int Nz1 = NKERN;
int K = kerns_dim[3];
%(type)s alpha = 1.0;
%(type)s beta = stackidx ? 1.0 : 0.0;
int imgview_stride = dim_im[1];
int filter_rows_stride =kerns_dim[1]*kerns_dim[2]*kerns_dim[3];
//remember, Fortran wants a column-major interpretation
assert(img2d->strides[3] == (npy_intp)sizeof(%(type)s));
if (0){
std::cerr << "b " << b << " img_col " << img_col << " filterrow " << filter_row << " stackidx " <<stackidx << "\\n";
std::cerr << "colview (physical layout) stride: " << imgview_stride << "\\n";
for (int ii = 0; ii < Nz0; ++ii){
for (int jj = 0; jj < K; ++jj){
std::cerr << " " << img_colview[ii * imgview_stride + jj];
}
std::cerr << "\\n";
}
std::cerr << "filterview ("<<filter_row<<"'th rows) stride: " << filter_rows_stride << "\\n";
for (int ii = 0; ii < Nz1; ++ii){
for (int jj = 0; jj < K; ++jj){
std::cerr << " " << filter_rows[ii * filter_rows_stride + jj];
}
std::cerr << "\\n";
}
std::cerr << Nz1 << " " << Nz0 << " " << K << "\\n" ;
}
%(gemm)s(&T, &N,
&Nz1, &Nz0, &K,
&alpha,
filter_rows, &filter_rows_stride,
img_colview, &imgview_stride,
&beta, kbuf, &kbufstride);
if (0){
std::cerr << "z (logical layout) beta" << beta << "\\n";
for (int ii = 0; ii < Nz0; ++ii){
for (int jj = 0; jj < Nz1; ++jj){
std::cerr << " " << kbuf[ii * kbufstride + jj];
}
std::cerr << "\\n";
}
}
}
// now kbuf the sum over the stack, put it into the outbuf
for (int img_row = 0; img_row < Os[0]; ++img_row) {
for (int kernel_idx = 0; kernel_idx < NKERN; ++kernel_idx) {
%(type)s * z_p = (%(type)s *)PyArray_GETPTR4(%(z)s, b, kernel_idx, img_row, img_col);
if (0)
{
if (b >= %(z)s->dimensions[0]) %(fail)s;
if (kernel_idx >= %(z)s->dimensions[1]) %(fail)s;
if (img_row >= %(z)s->dimensions[2]) %(fail)s;
if (img_col >= %(z)s->dimensions[3]) %(fail)s;
}
z_p[0] += kbuf[img_row * kbufstride + kernel_idx];
}
}
}
}
}
free(kbuf);
}
Py_XDECREF(img2d);
"""
def gen_conv_code_unroll_batch_kern(d,unroll_bsize=1, unroll_ksize=1):
    """Build the C code for ConvOp with the batch and kernel loops unrolled.

    :param d: template-substitution dict prepared by ``ConvOp.c_code``;
        it is copied, not mutated.
    :param unroll_bsize: number of batch iterations handled per loop step.
    :param unroll_ksize: number of kernel iterations handled per loop step.
    :returns: the complete C code string (already %-substituted).
    """
    assert unroll_bsize>0 and unroll_ksize>0
    # The unroll_* keys are reserved for the duplication helpers below.
    if d.has_key("unroll_bsize") or d.has_key("unroll_ksize") or d.has_key("unroll_iter") or d.has_key("unroll_biter") or d.has_key("unroll_kiter"):
        raise Exception("We can't use this dictionnary as we will overwrite some of its containt")
    d=d.copy()  # don't mutate the caller's dict
    d["unroll_bsize"]=unroll_bsize
    d["unroll_ksize"]=unroll_ksize
    def my_dup(st,size):
        # Emit `st` `size` times, substituting unroll_iter = 0..size-1.
        s=""
        for i in range(size):
            d["unroll_iter"]=i
            s+=st%d
        return s+"\n"
    def my_dup2(st):
        # Emit `st` once per (batch, kernel) unroll pair, with unroll_biter,
        # unroll_kiter and a flat unroll_iter counter substituted.
        s=""
        iter=0
        for i in range(unroll_bsize):
            d["unroll_biter"]=i
            for j in range(unroll_ksize):
                d["unroll_kiter"]=j
                d["unroll_iter"]=iter
                iter+=1
                s+=st%d
        return s+"\n"
    # --- fixed preamble: input validation, reshaping to 4D, output alloc ---
    ret = """
const int mode=%(mode)s;
int typenum=0, typenum_f=0;
PyArrayObject *ain1=NULL, *ain2=NULL, *filtersflipped_arr=NULL, *img2d_arr=NULL;
const %(type)s fill_value = 0;
int type_im=PyArray_TYPE(%(img2d)s);
int type_ker=PyArray_TYPE(%(filtersflipped)s);
npy_intp dim_zz[2]={%(self_outshp0)s,%(self_outshp1)s};
npy_intp dim_im[2]={%(self_imshp1)s,%(self_imshp2)s};
npy_intp dim_ker[2]={%(self_kshp0)s,%(self_kshp1)s};
PyArray_Dims img2d_shape;
npy_intp img2d_dim[4]={1,1,0,0};
img2d_shape.ptr=img2d_dim;
img2d_shape.len=4;
PyArray_Dims kerns_shape;
npy_intp kerns_dim[4]={1,1,0,0};
kerns_shape.ptr=kerns_dim;
kerns_shape.len=4;
PyObject *img2d=NULL, *contig, *filtersflipped=NULL;
if(%(img2d)s->nd==2){
  img2d_dim[3]=%(img2d)s->dimensions[1];
  img2d_dim[2]=%(img2d)s->dimensions[0];
}else if(%(img2d)s->nd==3){
  img2d_dim[3]=%(img2d)s->dimensions[2];
  img2d_dim[2]=%(img2d)s->dimensions[1];
  img2d_dim[0]=%(img2d)s->dimensions[0];
}else if(%(img2d)s->nd==4){
  img2d_dim[3]=%(img2d)s->dimensions[3];
  img2d_dim[2]=%(img2d)s->dimensions[2];
  img2d_dim[1]=%(img2d)s->dimensions[1];
  img2d_dim[0]=%(img2d)s->dimensions[0];
}else {
  std:stringstream temp;
  temp << "nddim="<<%(img2d)s->nd;
  std::string param = temp.str();
  PyErr_SetString(PyExc_ValueError,
    ("img don't have a good shape. " + param).c_str());
  %(fail)s;
}
if(%(filtersflipped)s->nd==3){
  kerns_dim[3]=%(filtersflipped)s->dimensions[2];
  kerns_dim[2]=%(filtersflipped)s->dimensions[1];
  kerns_dim[0]=%(filtersflipped)s->dimensions[0];
}else if(%(filtersflipped)s->nd==4){
  kerns_dim[3]=%(filtersflipped)s->dimensions[3];
  kerns_dim[2]=%(filtersflipped)s->dimensions[2];
  kerns_dim[1]=%(filtersflipped)s->dimensions[1];
  kerns_dim[0]=%(filtersflipped)s->dimensions[0];
}else{
  PyErr_SetString(PyExc_ValueError, "kernel don't have a good shape");
  %(fail)s;
}
img2d = PyArray_Newshape(%(img2d)s,&img2d_shape, PyArray_CORDER);
img2d_arr = (PyArrayObject*)img2d;
if ((img2d_arr->strides[3] != (npy_intp)sizeof(%(type)s))
     || (img2d_arr->strides[2] != img2d_arr->dimensions[3]*(npy_intp)sizeof(%(type)s))){
    contig = (PyObject*)(PyArray_GETCONTIGUOUS((PyArrayObject*)img2d));
    Py_DECREF(img2d);
    img2d = contig;
    if (!PyArray_ISCONTIGUOUS(img2d)){
        PyErr_SetString(PyExc_ValueError, "img2d isn't contiguous");
        %(fail)s;
    }
}
img2d_arr = (PyArrayObject*)img2d;
filtersflipped = PyArray_Newshape(%(filtersflipped)s,&kerns_shape, PyArray_CORDER);
filtersflipped_arr = (PyArrayObject*)filtersflipped;
if ((filtersflipped_arr->strides[3] != (npy_intp)sizeof(%(type)s))
     || (filtersflipped_arr->strides[2] != filtersflipped_arr->dimensions[3]*(npy_intp)sizeof(%(type)s))){
    contig = (PyObject*)(PyArray_GETCONTIGUOUS((PyArrayObject*)filtersflipped));
    Py_DECREF(filtersflipped);
    filtersflipped = contig;
    if (!PyArray_ISCONTIGUOUS(filtersflipped)){
        PyErr_SetString(PyExc_ValueError, "filtersflipped isn't contiguous");
        %(fail)s;
    }
}
filtersflipped_arr = (PyArrayObject*)filtersflipped;
if(mode != VALID && mode != FULL){
  PyErr_SetString(PyExc_ValueError, "invalid mode, only full and valid are supported"); %(fail)s;
}
typenum = PyArray_ObjectType((PyObject*)%(img2d)s, 0);
typenum_f = PyArray_ObjectType((PyObject*)%(filtersflipped)s, 0);
if (typenum < 0) {PyErr_SetString(PyExc_ValueError, "Invalid type"); %(fail)s;}
if (typenum != typenum_f) {PyErr_SetString(PyExc_ValueError, "Input types must match"); %(fail)s;}
if (!img2d) %(fail)s;
if (!filtersflipped) %(fail)s;
if ((!%(z)s)
  || *PyArray_DIMS(%(z)s)!=4
  ||(%(z)s->dimensions[0] != %(self_bsize)s)
  ||(%(z)s->dimensions[1] != %(self_nkern)s)
  ||(%(z)s->dimensions[2] != dim_zz[0])
  || (%(z)s->dimensions[3] != dim_zz[1])
  )
{
  {Py_XDECREF(%(z)s);}
  npy_intp dims[4] = {0,0,0,0};
  dims[0]=%(self_bsize)s;
  dims[1]=%(self_nkern)s;
  dims[2]=dim_zz[0];
  dims[3]=dim_zz[1];
  %(z)s = (PyArrayObject*) PyArray_ZEROS(4, dims, typenum,0);
}else{
  //PyArray_FILLWBYTE((PyObject*)%(z)s,0);
}
int Os[2];
Os[0]=%(self_outshp0)s;
Os[1]=%(self_outshp1)s;
for(int b=0;b< %(self_bsize)s ;b+=%(unroll_bsize)s){
  for(int n_kern=0;n_kern<%(self_nkern)s;n_kern+=%(unroll_ksize)s){
    //assertions
    if (%(z)s->strides[0] != %(z)s->dimensions[1] *%(z)s->dimensions[2] *%(z)s->dimensions[3] * (npy_intp)sizeof(%(type)s)) %(fail)s;
    if (%(z)s->strides[1] != %(z)s->dimensions[2] * %(z)s->dimensions[3] * (npy_intp)sizeof(%(type)s)) %(fail)s;
    if (%(z)s->strides[2] != %(z)s->dimensions[3] * (npy_intp)sizeof(%(type)s)) %(fail)s;
    if (%(z)s->strides[3] != (npy_intp)sizeof(%(type)s)) %(fail)s;
"""%d
    # One output pointer and zero-fill per unrolled (batch, kernel) pair.
    ret+=my_dup2("%(type)s * __restrict__ out%(unroll_iter)s=(%(type)s *)(PyArray_GETPTR2(%(z)s,b+%(unroll_biter)s,n_kern+%(unroll_kiter)s));")
    ret+=my_dup("for (int i = 0; i < dim_zz[0]*dim_zz[1]; ++i) out%(unroll_iter)s[i] = 0;",unroll_bsize*unroll_ksize)
    ret+="""
    for(int stack_size=0;stack_size<%(self_imshp0)s;stack_size++){
"""%d
    # Per-batch input pointers and per-kernel filter pointers.
    ret+=my_dup("const %(type)s * __restrict__ in%(unroll_iter)d=(%(type)s *)(PyArray_GETPTR2(img2d,b+%(unroll_iter)s,stack_size));", unroll_bsize)
    ret+=my_dup("const %(type)s * __restrict__ hvals%(unroll_iter)s=(%(type)s *)(PyArray_GETPTR2(filtersflipped,n_kern+%(unroll_iter)s,stack_size));",unroll_ksize)
    ret+="""
      int new_m;
      for (int iter_m=0; iter_m < Os[0]; iter_m++) {
        // Reposition index into input image based on requested output size
        int pos_m = iter_m*%(self_dx)s;//The position of the patch in the image
        if (mode == FULL) new_m = pos_m ;
        else new_m = (pos_m+dim_ker[0]-1);
        for (int iter_n=0; iter_n < Os[1]; iter_n++) {  // loop over columns
          int pos_n=iter_n*%(self_dy)s;
"""%d
    # One accumulator per unrolled (batch, kernel) pair.
    ret+=my_dup("%(type)s sum%(unroll_iter)s=0;", unroll_bsize*unroll_ksize)
    ret+="""
          // Sum over kernel, if index into image is out of bounds
          // fill with the value
          for (int j=0; j < dim_ker[0]; j++) {
            int ind0 = (new_m-j);
            if(mode==FULL){
"""%d
    ret+=my_dup("const %(type)s * idx_hvals%(unroll_iter)s=&hvals%(unroll_iter)s[j*dim_ker[1]];",unroll_ksize)
    ret+="""
              if(ind0 < 0 || ind0 >= dim_im[0]){
                if(fill_value!=0)
                  for (int k=0; k < dim_ker[1]; k++) {
"""%d
    ret+=my_dup2("sum%(unroll_iter)s += idx_hvals%(unroll_kiter)s[k] * fill_value;")
    ret+="""
                  }
              }else{
                //do the part where kernel is to the right of the img
                int k=0,max_k=max((int)(pos_n-dim_im[1])+1,0);
                if(fill_value!=0){
                  for(k=0;k<max_k;k++){
"""%d
    ret+=my_dup2("sum%(unroll_iter)s += idx_hvals%(unroll_kiter)s[k] * fill_value;")
    ret+="""
                  }
                }else {k=max_k;}
                //do the part where the kernel is on the img
                max_k=min(pos_n+1,(int)dim_ker[1]);
"""%d
    ret+=my_dup("const %(type)s * idx_in%(unroll_iter)s=&in%(unroll_iter)s[ind0*dim_im[1]];", unroll_bsize)
    ret+="""
                for (int ind1=pos_n-k; k<max_k; k++,ind1--) {
"""%d
    ret+=my_dup2("sum%(unroll_iter)s+= idx_hvals%(unroll_kiter)s[k] * idx_in%(unroll_biter)s[ind1];")
    ret+="""
                }
                //do the part to the left of the img
                if(fill_value!=0)
                  for(;k<dim_ker[1];k++){
"""%d
    ret+=my_dup2("sum%(unroll_iter)s += idx_hvals%(unroll_kiter)s[k] * fill_value;")
    ret+="""
                  }
              }
            }else{//valid mode
"""%d
    ret+=my_dup("const %(type)s* idx_in%(unroll_iter)s=&in%(unroll_iter)s[ind0*dim_im[1]];", unroll_bsize)
    ret+=my_dup("const %(type)s* idx_hvals%(unroll_iter)s=&hvals%(unroll_iter)s[j*dim_ker[1]];",unroll_ksize)
    ret+="""
              int new_n = (pos_n+dim_ker[1]-1);
              for (int k=0,last=new_n; k < dim_ker[1]; k++,last--) {
"""%d
    ret+=my_dup2("sum%(unroll_iter)s+=idx_hvals%(unroll_kiter)s[k]*idx_in%(unroll_biter)s[last];")
    ret+="""
              }
            }
          }//for j
"""%d
    # Write back each accumulator (= or += depending on d["affectation"]).
    ret+=my_dup("out%(unroll_iter)s[iter_m*dim_zz[1]+iter_n] %(affectation)s sum%(unroll_iter)s;", unroll_bsize*unroll_ksize)
    ret+="""
        }//for n
      }//for m
    }//for stack_size
  }//for n_kern
}//for b
Py_XDECREF(img2d);
Py_XDECREF(filtersflipped);
"""
    return ret
_conv_op_code_unroll_patch = """
const int mode=%(mode)s;
int typenum=0, typenum_f=0;
PyArrayObject *ain1=NULL, *ain2=NULL, *filtersflipped_arr=NULL, *img2d_arr=NULL;
const %(type)s fill_value = 0;//only value of 0 are currently tested and correctly implemented
int type_im=PyArray_TYPE(%(img2d)s);
int type_ker=PyArray_TYPE(%(filtersflipped)s);
const npy_intp dim_im[2]={%(self_imshp1)s,%(self_imshp2)s};
const npy_intp dim_ker[2]={%(self_kshp0)s,%(self_kshp1)s};
%(dim_zz_const)s npy_intp dim_zz[2]={%(self_outshp0)s,%(self_outshp1)s};
#if !%(all_shape)s
if (mode == FULL) {
dim_zz[0] = (int)ceil((dim_im[0]+dim_ker[0]-1)/float(%(self_dx)s));
dim_zz[1] = (int)ceil((dim_im[1]+dim_ker[1]-1)/float(%(self_dy)s));
} else {
dim_zz[0] = (int)ceil((dim_im[0]-dim_ker[0]+1)/float(%(self_dx)s));
dim_zz[1] = (int)ceil((dim_im[1]-dim_ker[1]+1)/float(%(self_dy)s));
}
#endif
PyArray_Dims img2d_shape;
npy_intp img2d_dim[4]={1,1,0,0};
img2d_shape.ptr=img2d_dim;
img2d_shape.len=4;
PyArray_Dims kerns_shape;
npy_intp kerns_dim[4]={1,1,0,0};
kerns_shape.ptr=kerns_dim;
kerns_shape.len=4;
PyObject *img2d=NULL, *contig, *filtersflipped=NULL;
if(%(img2d)s->nd==2){
img2d_dim[3]=%(img2d)s->dimensions[1];
img2d_dim[2]=%(img2d)s->dimensions[0];
}else if(%(img2d)s->nd==3){
img2d_dim[3]=%(img2d)s->dimensions[2];
img2d_dim[2]=%(img2d)s->dimensions[1];
img2d_dim[0]=%(img2d)s->dimensions[0];
}else if(%(img2d)s->nd==4){
img2d_dim[3]=%(img2d)s->dimensions[3];
img2d_dim[2]=%(img2d)s->dimensions[2];
img2d_dim[1]=%(img2d)s->dimensions[1];
img2d_dim[0]=%(img2d)s->dimensions[0];
}else {
PyErr_Format(PyExc_ValueError,
"image don't have a good number of dimensions %%d. ", %(filtersflipped)s->nd);
%(fail)s;
}
if(%(filtersflipped)s->nd==3){
kerns_dim[3]=%(filtersflipped)s->dimensions[2];
kerns_dim[2]=%(filtersflipped)s->dimensions[1];
kerns_dim[0]=%(filtersflipped)s->dimensions[0];
}else if(%(filtersflipped)s->nd==4){
kerns_dim[3]=%(filtersflipped)s->dimensions[3];
kerns_dim[2]=%(filtersflipped)s->dimensions[2];
kerns_dim[1]=%(filtersflipped)s->dimensions[1];
kerns_dim[0]=%(filtersflipped)s->dimensions[0];
}else{
PyErr_Format(PyExc_ValueError,
"kernel don't have a good number of dimensions %%d. ", %(filtersflipped)s->nd);
%(fail)s;
}
img2d = PyArray_Newshape(%(img2d)s,&img2d_shape, PyArray_CORDER);
img2d_arr = (PyArrayObject*)img2d;
if ((img2d_arr->strides[3] != sizeof(%(type)s))
|| (img2d_arr->strides[2] != img2d_arr->dimensions[3]*sizeof(%(type)s))){
contig = (PyObject*)(PyArray_GETCONTIGUOUS((PyArrayObject*)img2d));
Py_DECREF(img2d);
img2d = contig;
if (!PyArray_ISCONTIGUOUS(img2d)){
PyErr_SetString(PyExc_ValueError, "img2d isn't contiguous");
%(fail)s;
}
}
img2d_arr = (PyArrayObject*)img2d;
filtersflipped = PyArray_Newshape(%(filtersflipped)s,&kerns_shape, PyArray_CORDER);
filtersflipped_arr = (PyArrayObject*)filtersflipped;
if ((filtersflipped_arr->strides[3] != sizeof(%(type)s))
|| (filtersflipped_arr->strides[2] != filtersflipped_arr->dimensions[3]*sizeof(%(type)s))){
contig = (PyObject*)(PyArray_GETCONTIGUOUS((PyArrayObject*)filtersflipped));
Py_DECREF(filtersflipped);
filtersflipped = contig;
if (!PyArray_ISCONTIGUOUS(filtersflipped)){
PyErr_SetString(PyExc_ValueError, "filtersflipped isn't contiguous");
%(fail)s;
}
}
filtersflipped_arr = (PyArrayObject*)filtersflipped;
if(mode != VALID && mode != FULL){
PyErr_SetString(PyExc_ValueError, "invalid mode, only full and valid are supported"); %(fail)s;
}
if(dim_zz[0]<=0 || dim_zz[1]<=0){
PyErr_Format(PyExc_ValueError,
"Output dimensions are not valid %%dx%%d",dim_zz[0],dim_zz[1]);
%(fail)s;
}
typenum = PyArray_ObjectType((PyObject*)%(img2d)s, 0);
typenum_f = PyArray_ObjectType((PyObject*)%(filtersflipped)s, 0);
if (typenum < 0) {PyErr_SetString(PyExc_ValueError, "Invalid type"); %(fail)s;}
if (typenum != typenum_f) {PyErr_SetString(PyExc_ValueError, "Input types must match"); %(fail)s;}
if (!img2d) %(fail)s;
if (!filtersflipped) %(fail)s;
if ((!%(z)s)
|| *PyArray_DIMS(%(z)s)!=4
||(%(z)s->dimensions[0] != %(self_bsize)s)
||(%(z)s->dimensions[1] != %(self_nkern)s)
||(%(z)s->dimensions[2] != dim_zz[0])
|| (%(z)s->dimensions[3] != dim_zz[1])
)
{
if (%(z)s) Py_DECREF(%(z)s);
npy_intp dims[4] = {0,0,0,0};
if(!dims) %(fail)s;
dims[0]=%(self_bsize)s;
dims[1]=%(self_nkern)s;
dims[2]=dim_zz[0];
dims[3]=dim_zz[1];
%(z)s = (PyArrayObject*) PyArray_ZEROS(4, dims, typenum,0);
}else{
//PyArray_FILLWBYTE((PyObject*)%(z)s,0);
}
for(int b=0;b< %(self_bsize)s;b++){
for(int n_kern=0;n_kern<%(self_nkern)s;n_kern++){
//assertions
if (%(z)s->strides[0] != %(z)s->dimensions[1] *%(z)s->dimensions[2] *%(z)s->dimensions[3] * sizeof(%(type)s)) %(fail)s;
if (%(z)s->strides[1] != %(z)s->dimensions[2] * %(z)s->dimensions[3] * sizeof(%(type)s)) %(fail)s;
if (%(z)s->strides[2] != %(z)s->dimensions[3] * sizeof(%(type)s)) %(fail)s;
if (%(z)s->strides[3] != sizeof(%(type)s)) %(fail)s;
%(type)s * __restrict__ out=(%(type)s *)(PyArray_GETPTR2(%(z)s,b,n_kern));
for (int i = 0; i < dim_zz[0]*dim_zz[1]; ++i) out[i] = 0;
for(int stack_size=0;stack_size<%(self_imshp0)s;stack_size++){
const %(type)s * __restrict__ in=(%(type)s *)(PyArray_GETPTR2(img2d,b,stack_size));
const %(type)s * __restrict__ hvals=(%(type)s *)(PyArray_GETPTR2(filtersflipped,n_kern,stack_size));
int new_m;
for (int iter_m=0; iter_m < dim_zz[0]; iter_m++) {
// Reposition index into input image based on requested output size
int pos_m = iter_m*%(self_dx)s;//The position of the patch in the image
if (mode == FULL) new_m = pos_m ;
else new_m = (pos_m+dim_ker[0]-1);
### perform actual convolution ###
op = conv.ConvOp(output_mode=border_mode,
dx=subsample[0], dy=subsample[1],
imshp=imshp, kshp=kshp, nkern=nkern, bsize=bsize,**kargs)
for (int iter_n=0; iter_n < dim_zz[1]; iter_n++) { // loop over columns
int pos_n=iter_n*%(self_dy)s;
%(type)s sum=0;
%(type)s sum2=0;
%(type)s sum3=0;
%(type)s sum4=0;
int nb_sum=0;
// Sum over kernel, if index into image is out of bounds
// fill with the value
for (int j=0; j < dim_ker[0]; j++) {
int ind0 = (new_m-j);
output = op(input4D, filters4D)
if(mode==FULL){
const %(type)s * idx_hvals=&hvals[j*dim_ker[1]];
if(ind0 < 0 || ind0 >= dim_im[0]){
if(fill_value!=0)
for (int k=0; k < dim_ker[1]; k++) {
sum+= idx_hvals[k] * fill_value;
}
}else{
//do the part where kernel is to the right of the img
int k=0,max_k=max((int)(pos_n-dim_im[1])+1,0);
if(fill_value!=0){
for(k=0;k<max_k;k++){
sum+= idx_hvals[k]*fill_value;
}
}else {k=max_k;}
//do the part where the kernel is on the img
max_k=min(pos_n+1,(int)dim_ker[1]);
const %(type)s * idx_in=&in[ind0*dim_im[1]];
# flatten to 3D tensor if convolving with single filter or single image
if input.ndim==2 or filters.ndim==2:
output = tensor.flatten(output.T, outdim=3).T
if(iter_n + 4*%(self_dy)s < dim_zz[1]
&& iter_n>dim_ker[1]-1+3
&& iter_n<dim_im[1]-dim_ker[1]+1-3){
nb_sum=4;
for (int ind1=pos_n-k; k<max_k; k++,ind1--) {
sum+=idx_hvals[k]*idx_in[ind1];
sum2+=idx_hvals[k]*idx_in[ind1+%(self_dy)s];
sum3+=idx_hvals[k]*idx_in[ind1+2*%(self_dy)s];
sum4+=idx_hvals[k]*idx_in[ind1+3*%(self_dy)s];
}
}else if(iter_n + 2*%(self_dy)s < dim_zz[1]
&& iter_n>dim_ker[1]-1
&& iter_n<dim_im[1]-dim_ker[1]+1){
nb_sum=2;
for (int ind1=pos_n-k; k<max_k; k++,ind1--) {
sum+=idx_hvals[k]*idx_in[ind1];
sum2+=idx_hvals[k]*idx_in[ind1+%(self_dy)s];
}
}else{
nb_sum=1;
/*
%(type)s sum_=0;
if((k-max_k) & 0x1 != 0){
sum+= idx_hvals[k] * idx_in[pos_n-k];
}
for (int ind1=pos_n-k; k<max_k; k+=2,ind1-=2) {
sum+= idx_hvals[k] * idx_in[ind1];
sum_+= idx_hvals[k+1] * idx_in[ind1-1];
}
sum+=sum_;
*/
for (int ind1=pos_n-k; k<max_k; k++,ind1--) {
sum+=idx_hvals[k]*idx_in[ind1];
}
}
//do the part to the left of the img
if(fill_value!=0)
for(;k<dim_ker[1];k++) sum+= idx_hvals[k]*fill_value;
}
}else{//valid mode
const %(type)s* idx_in=&in[ind0*dim_im[1]];
const %(type)s* idx_hvals=&hvals[j*dim_ker[1]];
if(iter_n + 4*%(self_dy)s < dim_zz[1]){
nb_sum=4;
for (int k=dim_ker[1]-1,im_idx=pos_n; k >=0; k--,im_idx++) {
sum+=idx_hvals[k]*idx_in[im_idx];
sum2+=idx_hvals[k]*idx_in[im_idx+%(self_dy)s];
sum3+=idx_hvals[k]*idx_in[im_idx+2*%(self_dy)s];
sum4+=idx_hvals[k]*idx_in[im_idx+3*%(self_dy)s];
}
}else if(iter_n + 2*%(self_dy)s < dim_zz[1]){
nb_sum=2;
for (int k=dim_ker[1]-1,im_idx=pos_n; k >=0; k--,im_idx++) {
sum+=idx_hvals[k]*idx_in[im_idx];
sum2+=idx_hvals[k]*idx_in[im_idx+%(self_dy)s];
}
}else{
nb_sum=1;
for (int k=dim_ker[1]-1,im_idx=pos_n; k >=0; k--,im_idx++) {
sum+=idx_hvals[k]*idx_in[im_idx];
}
}
}//else valid mode
}//for j
switch(nb_sum){
case 4: out[iter_m*dim_zz[1]+iter_n+3] %(affectation)s sum4;
case 3: out[iter_m*dim_zz[1]+iter_n+2] %(affectation)s sum3;
case 2: out[iter_m*dim_zz[1]+iter_n+1] %(affectation)s sum2;
case 1: out[iter_m*dim_zz[1]+iter_n] %(affectation)s sum;
}
iter_n+=nb_sum-1;
}//for iter_n
}//for iter_m
}//for stack_size
}//for n_kern
}//for b
Py_XDECREF(img2d);
Py_XDECREF(filtersflipped);
"""
return output
......@@ -15,13 +15,13 @@ def max_pool2D(input, ds, ignore_border=False):
Takes as input a N-D tensor, where N >= 2. It downscales the input image by
the specified factor, by keeping only the maximum value of non-overlapping
patches of size (ds[0],ds[1])
:type input: N-D theano tensor of input images.
:param input: input images. Max pooling will be done over the 2 last dimensions.
:type ds: tuple of length 2
:param ds: factor by which to downscale. (2,2) will halve the image in each
dimension.
:param ignore_border: boolean value. When True, (5,5) input with ds=(2,2)
will generate a (2,2) output. (3,3) otherwise.
:param ds: factor by which to downscale. (2,2) will halve the image in each dimension.
:param ignore_border: boolean value. When True, (5,5) input with ds=(2,2) will generate a
(2,2) output. (3,3) otherwise.
"""
if input.ndim < 2:
raise NotImplementedError('max_pool2D requires a dimension >= 2')
......
......@@ -11,169 +11,78 @@ from theano.tensor.signal import conv
from theano.tensor.basic import _allclose
class TestConv2D(unittest.TestCase):
class TestSignalConv2D(unittest.TestCase):
def setUp(self):
utt.seed_rng()
self.input = T.dtensor4('input')
self.filters = T.dtensor4('filters')
def validate(self, image_shape, filter_shape,
border_mode='valid', subsample=(1,1),
N_image_shape=None, N_filter_shape=None,
input=None, filters=None,
unroll_batch=0, unroll_kern=0, unroll_patch=True,
verify_grad=True):
if N_image_shape is None:
N_image_shape = image_shape
if N_filter_shape is None:
N_filter_shape = filter_shape
if not input:
input = self.input
if not filters:
filters = self.filters
def validate(self, image_shape, filter_shape, verify_grad=True):
image_dim = len(image_shape)
filter_dim = len(filter_shape)
input = T.TensorType('float64', [False]*image_dim)()
filters = T.TensorType('float64', [False]*filter_dim)()
bsize = image_shape[0] if image_dim==3 else 1
nkern = filter_shape[0] if filter_dim==3 else 1
############# THEANO IMPLEMENTATION ############
# we create a symbolic function so that verify_grad can work
def sym_conv2d(input, filters):
# define theano graph and function
return conv.conv2d(input, filters, image_shape, filter_shape,
border_mode, subsample, unroll_batch=unroll_batch,
unroll_kern=unroll_kern, unroll_patch=unroll_patch)
return conv.conv2d(input, filters)
output = sym_conv2d(input, filters)
theano_conv = theano.function([input, filters], output)
# initialize input and compute result
image_data = numpy.random.random(N_image_shape)
filter_data = numpy.random.random(N_filter_shape)
image_data = numpy.random.random(image_shape)
filter_data = numpy.random.random(filter_shape)
theano_output = theano_conv(image_data, filter_data)
############# REFERENCE IMPLEMENTATION ############
s = 1. if border_mode is 'full' else -1.
out_shape2d = numpy.array(N_image_shape[-2:]) +\
s*numpy.array(N_filter_shape[-2:]) - s
out_shape2d = numpy.ceil(out_shape2d / numpy.array(subsample))
out_shape = (N_image_shape[0],N_filter_shape[0]) + tuple(out_shape2d)
ref_output = numpy.zeros(out_shape)
out_shape2d = numpy.array(image_shape[-2:]) -\
numpy.array(filter_shape[-2:]) + 1
ref_output = numpy.zeros(tuple(out_shape2d))
# loop over output feature maps
for k in range(N_filter_shape[0]):
# loop over input feature maps
for l in range(N_filter_shape[1]):
# reshape as 3D input tensors to make life easier
image_data3d = image_data.reshape((bsize,)+image_shape[-2:])
filter_data3d = filter_data.reshape((nkern,)+filter_shape[-2:])
# reshape theano output as 4D to make life easier
theano_output4d = theano_output.reshape((bsize,nkern,)+theano_output.shape[-2:])
filter2d = filter_data[k,l,:,:]
# loop over mini-batches (if required)
for b in range(bsize):
# loop over mini-batches
for b in range(N_image_shape[0]):
image2d = image_data[b,l,:,:]
output2d = signal.convolve2d(image2d, filter2d, border_mode)
# loop over filters (if required)
for k in range(nkern):
ref_output[b,k,:,:] +=\
output2d[::subsample[0],::subsample[1]]
image2d = image_data3d[b,:,:]
filter2d = filter_data3d[k,:,:]
output2d = signal.convolve2d(image2d, filter2d, 'valid')
self.failUnless(_allclose(theano_output, ref_output))
self.failUnless(_allclose(theano_output4d[b,k,:,:], output2d))
############# TEST GRADIENT ############
if verify_grad:
utt.verify_grad(sym_conv2d, [image_data, filter_data])
def test_basic(self):
"""
Tests that basic convolutions work for odd and even dimensions of image and filter
shapes, as well as rectangular images and filters.
"""
self.validate((3,2,8,8), (4,2,5,5), 'valid')
self.validate((3,2,7,5), (5,2,2,3), 'valid')
self.validate((3,2,7,5), (5,2,3,2), 'valid')
self.validate((3,2,8,8), (4,2,5,5), 'full')
self.validate((3,2,7,5), (5,2,2,3), 'full')
# test filter same size as input
self.validate((3,2,3,3), (4,2,3,3), 'valid')
def test_unroll_patch_false(self):
"""
unroll_patch is True by default. Test basic convs with False.
"""
self.validate((3,2,7,5), (5,2,2,3), 'valid', unroll_patch=False)
self.validate((3,2,7,5), (5,2,2,3), 'full', unroll_patch=False)
self.validate((3,2,3,3), (4,2,3,3), 'valid', unroll_patch=False)
def test_unroll_special(self):
"""
(unroll_kern, unroll_batch) in (0,1),(1,0) is special case.
"""
self.validate((6,2,3,3), (3,2,2,2), 'valid', unroll_batch=1)
def test_unroll_batch(self):
"""
Test mini-batch unrolling for various legal values.
"""
# mini-batch of size 6 is multiple of 2 and 3. Should work.
self.validate((6,2,3,3), (3,2,2,2), 'valid', unroll_batch=2, verify_grad=False)
self.validate((6,2,3,3), (3,2,2,2), 'valid', unroll_batch=3, verify_grad=False)
def test_unroll_kern(self):
"""
Test kernel unrolling for various legal values.
"""
# 6 filters is a multiple of 2 and 3. Should work.
self.validate((2,3,3,3), (6,3,2,2), 'valid', unroll_kern=2, verify_grad=False)
self.validate((2,3,3,3), (6,3,2,2), 'valid', unroll_kern=3, verify_grad=False)
def test_subsample(self):
"""
Tests convolution where subsampling != (1,1)
"""
self.validate((3,2,7,5), (5,2,2,3), 'valid', subsample=(2,2))
self.validate((3,2,7,5), (5,2,2,3), 'full', subsample=(2,2))
self.validate((3,2,7,5), (5,2,2,3), 'valid', subsample=(2,1))
def test_invalid_filter_shape(self):
"""
Tests scenario where filter_shape[1] != input_shape[1]
"""
def f():
self.validate((3,2,8,8), (4,3,5,5), 'valid')
self.failUnlessRaises(AssertionError, f)
def test_missing_info(self):
"""
Test convolutions for various pieces of missing info.
"""
self.validate(None, None,
N_image_shape=(3,2,8,8),
N_filter_shape=(4,2,5,5))
self.validate((3,2,None,None), None,
N_image_shape=(3,2,8,8),
N_filter_shape=(4,2,5,5))
self.validate((None,2,None,None), (None,2,5,5),
N_image_shape=(3,2,8,8),
N_filter_shape=(4,2,5,5))
def test_full_mode(self):
"""
Tests basic convolution in full mode and case where filter
is larger than the input image.
Basic functionality of nnet.conv.ConvOp is already tested by its own test suite. We
just have to test whether or not signal.conv.conv2d can support inputs and filters of
type matrix or tensor3.
"""
self.validate((3,2,5,5), (4,2,8,8), 'full')
def f():
self.validate((3,2,5,5), (4,2,8,8), 'valid')
self.failUnlessRaises(Exception, f)
self.validate((3,7,5), (5,2,3), 'valid')
self.validate((7,5), (5,2,3), 'valid')
self.validate((3,7,5), (2,3), 'valid')
self.validate((7,5), (2,3), 'valid')
def test_wrong_input(self):
def test_fail(self):
"""
Make sure errors are raised when image and kernel are not 4D tensors
Test that conv2d fails for dimensions other than 2 or 3.
"""
try:
self.validate((3,2,8,8), (4,2,5,5), 'valid', input = T.dmatrix())
self.validate((3,2,8,8), (4,2,5,5), 'valid', filters = T.dvector())
self.validate((3,2,8,8), (4,2,5,5), 'valid', input = T.dtensor3())
# should never reach here
conv.conv2d(T.dtensor4(), T.dtensor3())
conv.conv2d(T.dtensor3(), T.dvector())
self.fail()
except:
except:
pass
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论