Advance the new GpuConv

baf12f54 · Frederic · 50677701 · baf12f54 · baf12f54 · baf12f54
--- a/theano/sandbox/gpuarray/conv.py
+++ b/theano/sandbox/gpuarray/conv.py
+import copy
+import os
+
+import theano
+from theano import gof
+
+
+class GpuConv(gof.Op):
+    """
+    Implement the batched and stacked 2d convolution on the gpu.
+    """
+    @staticmethod
+    def logical_output_shape_2d(imshp, kshp, mode):
+        """
+        Return the (rows, cols) shape of a 2d convolution output.
+
+        :param imshp: (rows, cols) of the image.
+        :param kshp: (rows, cols) of the kernel.
+        :param mode: 'valid' or 'full'.
+        :raises ValueError: if `mode` is neither 'valid' nor 'full'.
+        """
+        if mode not in ('valid', 'full'):
+            raise ValueError(mode)
+        img_rows, img_cols = imshp[0], imshp[1]
+        kern_rows, kern_cols = kshp[0], kshp[1]
+        if mode == 'full':
+            return img_rows + kern_rows - 1, img_cols + kern_cols - 1
+        return img_rows - kern_rows + 1, img_cols - kern_cols + 1
+
+    def __init__(self, border_mode,
+            subsample=(1, 1),
+            logical_img_hw=None,
+            logical_kern_hw=None,
+            logical_kern_align_top=True,
+            version=-1,
+            verbose=0,
+            kshp=None,
+            imshp=None,
+            max_threads_dim0=None):
+        """
+        :param border_mode: 'valid' or 'full'; the only two modes the
+                        generated C code accepts.
+        :param subsample: pair of subsampling factors, handed to the C
+                        code as (dx, dy).
+        :param logical_img_hw: optional pair (h, w); stored as a tuple.
+        :param logical_kern_hw: optional pair (h, w); stored as a tuple.
+        :param logical_kern_align_top: stored as given.
+        :param version: each version of c_code implements many kernels for
+                        the convolution. By default we try to guess the
+                        best one. You can force one version with this
+                        parameter. This parameter is used by the tests.
+        :param verbose: for values of 1, 2 and 3, print more information
+                        during the execution of the convolution. Mostly
+                        used for optimization or debugging.
+        :param kshp:    The size of the kernel. If provided, can generate
+                        faster code. If the GpuConv op is automatically
+                        inserted, we take its value automatically from the
+                        Conv op.
+        :param imshp:   The size of the image. Not used for code generation
+                        but allows selecting an experimental new version in
+                        another repo.
+        :param max_threads_dim0: The maximum number of threads for the
+                        block size dimension 0 (blockDim.x) used by the
+                        GPU function.
+
+        """
+        #TODO: reconsider the logical_*_hw arguments... since shapes are
+        # not given in the constructor, maybe a multiplier + offset is a
+        # more appropriate way of passing these logical grids
+        if logical_img_hw is not None:
+            # The unpacking only checks that exactly two values were given.
+            h, w = logical_img_hw
+            logical_img_hw = tuple(logical_img_hw)
+        if logical_kern_hw is not None:
+            # The unpacking only checks that exactly two values were given.
+            h, w = logical_kern_hw
+            logical_kern_hw = tuple(logical_kern_hw)
+
+        self.border_mode = border_mode
+        self.subsample = subsample
+        self.logical_img_hw = logical_img_hw
+        self.logical_kern_hw = logical_kern_hw
+        self.logical_kern_align_top = logical_kern_align_top
+        self.version = version
+        self.verbose = verbose
+        self.kshp = kshp
+        self.imshp = imshp
+        self.max_threads_dim0 = max_threads_dim0
+
+    def __eq__(self, other):
+        """
+        Two ops are equal when they have the same type and the same value
+        for every constructor parameter.
+        """
+        if type(self) != type(other):
+            return False
+        for name in ('border_mode', 'subsample', 'logical_img_hw',
+                     'logical_kern_hw', 'logical_kern_align_top',
+                     'version', 'verbose', 'kshp', 'imshp',
+                     'max_threads_dim0'):
+            if not (getattr(self, name) == getattr(other, name)):
+                return False
+        return True
+
+    def __setstate__(self, d):
+        """
+        Restore the state `d`, setting `imshp` and `max_threads_dim0` to
+        None when the restored state does not provide them.
+        """
+        self.__dict__.update(d)
+        for name in ("imshp", "max_threads_dim0"):
+            if not hasattr(self, name):
+                setattr(self, name, None)
+
+    def __hash__(self):
+        """Combine the type and every constructor parameter with XOR."""
+        # self.version is mixed in raw: don't use hash(self.version) as
+        # hash(-1)==-2 and hash(-2)==-2 in python!
+        result = hash(type(self)) ^ self.version
+        for value in (self.border_mode,
+                      self.subsample,
+                      self.logical_img_hw,
+                      self.logical_kern_hw,
+                      self.logical_kern_align_top,
+                      self.verbose,
+                      self.kshp,
+                      self.imshp,
+                      self.max_threads_dim0):
+            result ^= hash(value)
+        return result
+
+    def __str__(self):
+        """
+        Return the class name followed by border mode, subsample, logical
+        image/kernel sizes, kernel alignment, image shape and kernel shape.
+        """
+        shown = (self.__class__.__name__,
+                 self.border_mode,
+                 str(self.subsample),
+                 str(self.logical_img_hw),
+                 str(self.logical_kern_hw),
+                 str(self.logical_kern_align_top),
+                 str(self.imshp),
+                 str(self.kshp))
+        return '%s{%s, %s, %s, %s, %s, %s, %s}' % shown
+
+    def make_node(self, img, kern):
+        """
+        Build the Apply node convolving `img` with `kern`.
+
+        :param img: 4D variable holding the images.
+        :param kern: 4D variable holding the kernels.
+        :raises TypeError: if `img` or `kern` is not 4D.
+
+        The single output is 4D. Its first two dimensions take their
+        broadcastable flag from the first dimension of `img` and of `kern`
+        respectively; the last two are never broadcastable.
+        """
+        # Imported locally: this module does not import the CUDA type at
+        # file level, and the generated C code produces a CudaNdarray.
+        from theano.sandbox.cuda.type import CudaNdarrayType
+
+        if img.type.ndim != 4:
+            raise TypeError('img must be 4D tensor')
+        if kern.type.ndim != 4:
+            raise TypeError('kern must be 4D tensor')
+
+        broadcastable = [img.type.broadcastable[0], kern.type.broadcastable[0],
+                         False, False]
+        out_type = CudaNdarrayType(broadcastable)
+        # `Apply` is reached through the already imported `gof` package.
+        return gof.Apply(self, [img, kern], [out_type()])
+
+    def flops(self, inputs, outputs):
+        """
+        Useful with the hack in profilemode to print the MFlops.
+
+        :param inputs: the (images, kerns) shapes.
+        :param outputs: one-element sequence holding the output shape.
+        :returns: the number of multiplications and additions performed.
+        """
+        images, kerns = inputs
+        out, = outputs
+        assert images[1] == kerns[1]
+        if self.border_mode != "valid":
+            return (images[0] * kerns[0] * images[1] *
+                    kerns[2] * kerns[3] *
+                    images[2] * images[3] * 2)
+        # one multiplication and one addition per kernel element
+        per_output_pixel = kerns[2] * kerns[3] * 2
+        per_output_image = per_output_pixel * (out[2] * out[3])
+        # one output image per (stack entry, kernel, batch entry)
+        nb_patches = images[1] * kerns[0] * images[0]
+        return per_output_image * nb_patches
+
+    def make_thunk(self, node, storage_map, compute_map, no_recycling):
+        """
+        Make sure `max_threads_dim0` is known, then defer to the parent
+        class's make_thunk.
+
+        When `max_threads_dim0` is None it is read from the properties of
+        the device currently used by `theano.sandbox.cuda`.
+        """
+        # Shallow copy: the copied node still refers to the very same op
+        # (checked by the assert), so the assignment to
+        # node_.op.max_threads_dim0 below also changes the original op.
+        node_ = copy.copy(node)
+        assert node.op is node_.op
+        if node_.op.max_threads_dim0 is None:
+            cuda = theano.sandbox.cuda
+            device_id = cuda.use.device_number
+            if device_id is None:
+                # No device number recorded yet: call cuda.use() so that
+                # one gets set, then read it again.
+                cuda.use("gpu",
+                         force=False,
+                         default_to_move_computation_to_gpu=False,
+                         move_shared_float32_to_gpu=False,
+                         enable_cuda=False,
+                         test_driver=True)
+                device_id = cuda.use.device_number
+            cuda_ndarray = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray
+            prop = cuda_ndarray.device_properties(device_id)
+            node_.op.max_threads_dim0 = prop['maxThreadsDim0']
+        return super(GpuConv, node_.op).make_thunk(node_, storage_map,
+                                                   compute_map, no_recycling)
+
+    def c_compile_args(self):
+        """
+        Define THEANO_KERN_WID as kshp[1], or 0 when kshp is not known.
+        """
+        if self.kshp is None:
+            kern_wid = 0
+        else:
+            kern_wid = self.kshp[1]
+        # '-g' and '-G' can be appended to this list when debugging
+        return ['-DTHEANO_KERN_WID=' + str(kern_wid)]
+
+    def c_headers(self):
+        """Return the headers needed by the generated C code."""
+        headers = ['cuda_ndarray.cuh', '<stdio.h>']
+        return headers
+
+    def c_code_cache_version(self):
+        """Return the version tuple of the generated C code."""
+        # raise this whenever modifying any of the support_code_files
+        cache_version = (0, 20)
+        return cache_version
+
+    def c_support_code_apply(self, node, nodename):
+        """
+        Return the support code: the content of the kernel source files,
+        concatenated in order.
+
+        The files are read from the directory containing this module.
+        """
+        # REMEMBER TO RAISE c_code_cache_version when changing any of
+        # these files
+        files = ['conv_kernel.cu', 'conv_full_kernel.cu', 'conv.cu']
+        src_dir = os.path.split(__file__)[0]
+        codes = []
+        for fname in files:
+            # try/finally so that the file is closed even if read() fails
+            src = open(os.path.join(src_dir, fname))
+            try:
+                codes.append(src.read())
+            finally:
+                src.close()
+        return ''.join(codes)
+
+    def c_code(self, node, nodename, inp, out_, sub):
+        """
+        Return the C code that calls CudaNdarray_Conv on the two inputs.
+
+        :raises NotImplementedError: if `max_threads_dim0` is still None,
+            i.e. make_thunk() did not fill it in beforehand.
+        """
+        img, kern = inp
+        out, = out_
+        dx = self.subsample[0]
+        dy = self.subsample[1]
+        border_mode = self.border_mode
+        version = self.version
+        verbose = self.verbose
+        # Work on a copy so that the caller's dict is left untouched.
+        sub = sub.copy()
+        max_threads_dim0 = self.max_threads_dim0
+        if max_threads_dim0 is None:
+            raise NotImplementedError("GpuConv.c_code should not be called "
+                                      "directly. It should be called by "
+                                      "make_thunk() that add some information "
+                                      "related to the selected GPU.")
+        # The template below is filled from the local variables by name
+        # (img, kern, out, dx, dy, border_mode, version, verbose,
+        # max_threads_dim0), so those names must not be changed.
+        sub.update(locals())
+        return """
+    //Mandatory args
+    const char *mode_str = "%(border_mode)s";
+
+    //Optional args
+    int version = %(version)s;
+    int verbose = %(verbose)s;
+    int dx = %(dx)s;
+    int dy = %(dy)s;
+
+    int mode;
+    if (strcmp(mode_str, "full") == 0)
+    {
+        mode = ConvMode_FULL;
+    }
+    else if (strcmp(mode_str, "valid") == 0)
+    {
+        mode = ConvMode_VALID;
+    }
+    else
+    {
+        PyErr_SetString(PyExc_ValueError,
+                        "mode must be one of 'full' or 'valid'");
+        return NULL;
+    }
+
+    // TODO, make out be decref before we alloc out2!
+    CudaNdarray * out2 = (CudaNdarray *)CudaNdarray_Conv(%(img)s, %(kern)s,
+                                                         %(out)s, mode,
+                                                         dx, dy,
+                                                         version, verbose,
+                                                         %(max_threads_dim0)s);
+    Py_XDECREF(%(out)s);
+    %(out)s = out2;
+
+    if (%(out)s==NULL){
+        %(fail)s
+    }
+""" % sub
--- a/theano/sandbox/gpuarray/opt.py
+++ b/theano/sandbox/gpuarray/opt.py
@@ -9,8 +9,8 @@ from theano.gof import (local_optimizer, EquilibriumDB,
                        InconsistencyError, EquilibriumOptimizer)

 from theano.gof.python25 import all, any
+from theano.tensor.nnet.conv import ConvOp
 from theano.sandbox.gpuarray.type import GpuArrayType
-
 from theano.sandbox.gpuarray.basic_ops import (host_from_gpu,
                                               gpu_from_host,
                                               gpu_alloc,
@@ -20,6 +20,7 @@ from theano.sandbox.gpuarray.basic_ops import (host_from_gpu,
                                               GpuReshape,
                                               GpuEye)
 from theano.sandbox.gpuarray.blas import gpu_dot22, GpuGemv, GpuGemm
+from theano.sandbox.gpuarray.conv import GpuConv
 from theano.sandbox.gpuarray.nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias,
                                          GpuCrossentropySoftmax1HotWithBiasDx)
 from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar,
@@ -372,7 +373,7 @@ def local_gpu_conv(node):
    if node.op == gpu_from_host:
        #gpu_from_host(conv) -> gpu_conv(gpu_from_host)
        host_input = node.inputs[0]
-        if host_input.owner and isinstance(host_input.owner.op, conv.ConvOp):
+        if host_input.owner and isinstance(host_input.owner.op, ConvOp):
            gpu_conv = GpuConvOp_from_ConvOp(host_input.owner.op)
            if gpu_conv is None:
                return
@@ -386,7 +387,7 @@ def local_gpu_conv(node):
            # differently then the gpu ConvOp
            return [out]

-    if isinstance(node.op, conv.ConvOp):
+    if isinstance(node.op, ConvOp):
        #conv(host_from_gpu) -> host_from_gpu(gpu_conv)
        img, kern = node.inputs
        img_on_gpu = (img.owner and img.owner.op == host_from_gpu)

--- a/theano/sandbox/gpuarray/tests/test_conv_cuda_ndarray.py
+++ b/theano/sandbox/gpuarray/tests/test_conv_cuda_ndarray.py
+"""
+Tests for GPU convolution
+"""
+import sys
+import time
+import unittest
+
+
+import numpy
+
+from nose.plugins.skip import SkipTest
+imported_scipy_convolve2d = False
+try:
+    from scipy.signal import convolve2d
+    imported_scipy_convolve2d = True
+except ImportError:
+    pass
+
+import theano
+from theano import tensor
+from theano.gof.python25 import any
+from theano.tests.unittest_tools import seed_rng
+
+# We let that import do the init of the back-end if needed.
+from theano.sandbox.gpuarray.tests.test_basic_ops import (mode_with_gpu,
+                                                          mode_without_gpu)
+from theano.sandbox.gpuarray.type import GpuArrayType
+import pygpu
+# Type of the symbolic inputs used by the tests: 4d float32 GPU arrays
+# with no broadcastable dimension.
+gftensor4 = GpuArrayType('float32', [False] * 4)
+
+# Properties of the device in use; the tests read 'maxThreadsDim0' from
+# them to skip shapes that are too wide.
+# NOTE(review): theano.sandbox.cuda is read here before the explicit import
+# two lines below -- presumably an earlier import already loaded it; confirm.
+device_id = theano.sandbox.cuda.use.device_number
+# TODO: do this with the new back-end.
+from theano.sandbox.cuda import cuda_ndarray
+cuda_ndarray = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray
+device_prop = cuda_ndarray.device_properties(device_id)
+
+
+def py_conv_valid_numpy(img, kern):
+    """
+    Reference 'valid' convolution written with numpy only.
+
+    :param img: 4d array indexed as (batch, stack, row, col).
+    :param kern: 4d array indexed as (nkern, stack, row, col).
+    :returns: float32 array of shape (batch, nkern, img rows - kern rows
+        + 1, img cols - kern cols + 1).
+    """
+    assert img.shape[1] == kern.shape[1]
+    nb_batch, nb_kern = img.shape[0], kern.shape[0]
+    kern_rows, kern_cols = kern.shape[2], kern.shape[3]
+    out = numpy.zeros((nb_batch, nb_kern,
+                       img.shape[2] - kern_rows + 1,
+                       img.shape[3] - kern_cols + 1), dtype='float32')
+    for b in xrange(nb_batch):
+        for k in xrange(nb_kern):
+            filt = kern[k, :, :, :]
+            for row in xrange(out.shape[2]):
+                for col in xrange(out.shape[3]):
+                    # (row, col) is the upper-left corner of the img patch
+                    patch = img[b, :, row:row + kern_rows,
+                                col:col + kern_cols]
+                    # the patch is flipped on both axes before the product
+                    out[b, k, row, col] = (patch[:, ::-1, ::-1] * filt).sum()
+    return out
+
+
+def py_conv_full_numpy(img, kern):
+    """
+    Reference 'full' convolution: zero-pad the image by (kernel size - 1)
+    on every side, then run py_conv_valid_numpy on the padded image.
+    """
+    row_pad = kern.shape[2] - 1
+    col_pad = kern.shape[3] - 1
+    img_rows, img_cols = img.shape[2], img.shape[3]
+    padded_img = numpy.zeros((img.shape[0], img.shape[1],
+                              img_rows + 2 * row_pad,
+                              img_cols + 2 * col_pad),
+                             dtype=img.dtype)
+    padded_img[:, :, row_pad:row_pad + img_rows,
+               col_pad:col_pad + img_cols] = img
+    return py_conv_valid_numpy(padded_img, kern)
+
+
+def py_conv(img, kern, mode, subsample):
+    """
+    Compute the reference convolution on the CPU.
+
+    Use the scipy implementation when scipy is available (it is faster),
+    otherwise the numpy one followed by the subsampling.
+    """
+    if imported_scipy_convolve2d:
+        return py_conv_scipy(img, kern, mode, subsample)
+    if mode == 'valid':
+        dense = py_conv_valid_numpy(img, kern)
+    elif mode == 'full':
+        dense = py_conv_full_numpy(img, kern)
+    else:
+        raise Exception("Can't execute this kernel.")
+    return dense[:, :, ::subsample[0], ::subsample[1]]
+
+
+def py_conv_scipy(img, kern, mode, subsample):
+    """
+    Reference convolution built on scipy's convolve2d.
+
+    The output is sized for 'valid' when `mode` is 'valid' and for 'full'
+    otherwise; it is subsampled on its last two dimensions before being
+    returned.
+    """
+    assert img.shape[1] == kern.shape[1]
+    if mode == 'valid':
+        out_rows = img.shape[2] - kern.shape[2] + 1
+        out_cols = img.shape[3] - kern.shape[3] + 1
+    else:
+        out_rows = img.shape[2] + kern.shape[2] - 1
+        out_cols = img.shape[3] + kern.shape[3] - 1
+    out = numpy.zeros((img.shape[0], kern.shape[0], out_rows, out_cols),
+                      dtype='float32')
+    for b in xrange(out.shape[0]):
+        for k in xrange(out.shape[1]):
+            # view on the output image; summed over the stack in place
+            plane = out[b, k, :, :]
+            for s in xrange(img.shape[1]):
+                plane += convolve2d(img[b, s, :, :], kern[k, s, :, :], mode)
+    return out[:, :, ::subsample[0], ::subsample[1]]
+
+
+def _params_allgood_header():
+    """Print the titles of the columns printed by _params_allgood."""
+    header = "ishape kshape #Mflops CPU Mflops GPU Mflops Speedup"
+    print header
+
+
+def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
+                    kern_stride=(1, 1), version=-1, verbose=0, random=True,
+                    print_=None, id=None, rtol=1e-5, atol=1e-8,
+                    nb_iter=0, ones=False, compile_kshp=None):
+    """
+    Run one convolution on the GPU and compare it to the CPU reference.
+
+    This function is the core of several of the big unit-test drivers,
+    but it can also be used very directly on its own to test a specific
+    kind of convolution.
+
+    :param ishape: (4d) The shape of the image at run time.
+    :param kshape: (4d) The shape of the kernel at run time.
+    :param mode: 'valid' or 'full'.
+    :param subsample: subsampling factors given to the op and applied to
+                      the CPU reference.
+    :param img_stride: steps used to slice the last two dimensions of the
+                       image after its transfer to the GPU.
+    :param kern_stride: same as `img_stride`, for the kernel.
+    :param version: forwarded to GpuConv.
+    :param verbose: forwarded to GpuConv; when > 0 the timings are printed.
+    :param random: use random data (ignored when `ones` is set).
+    :param print_: unless False, print the arrays when the results differ.
+    :param id: label printed in the failure message.
+    :param rtol: relative tolerance of the comparison.
+    :param atol: absolute tolerance of the comparison.
+    :param nb_iter: number of extra GPU runs whose result must equal the
+                    first one.
+    :param ones: use constant data; requires random=False.
+    :param compile_kshp: (2d) hardcode the shape of the kernel in
+                         the generated code. This is supposed to be
+                         faster, but we need to check that we raise
+                         an error if the input has the wrong shape.
+    :returns: True when the GPU result has the shape of the CPU result and
+              is close to it, False otherwise.
+    """
+    if ones:
+        assert not random
+        npy_img = theano._asarray(numpy.ones(ishape), dtype='float32')
+        npy_kern = -theano._asarray(numpy.ones(kshape), dtype='float32')
+    elif random:
+        npy_img = theano._asarray(numpy.random.rand(*ishape) + 1,
+                                  dtype='float32')
+        npy_kern = theano._asarray(numpy.random.rand(*kshape) - 2,
+                                   dtype='float32')
+    else:
+        npy_img = theano._asarray(numpy.arange(
+                numpy.prod(ishape)).reshape(ishape), dtype='float32') + 1
+        npy_kern = -(theano._asarray(numpy.arange(
+                    numpy.prod(kshape)).reshape(kshape), dtype='float32') + 1)
+    img = pygpu.array(npy_img)
+    kern = pygpu.array(npy_kern)
+
+    #we take the stride after the transfer as we make c_contiguous
+    #data on the GPU.
+    if img_stride != (1, 1):
+        img = img[:, :, ::img_stride[0], ::img_stride[1]]
+        npy_img = npy_img[:, :, ::img_stride[0], ::img_stride[1]]
+    if kern_stride != (1, 1):
+        kern = kern[:, :, ::kern_stride[0], ::kern_stride[1]]
+        npy_kern = npy_kern[:, :, ::kern_stride[0], ::kern_stride[1]]
+
+    t2 = None
+    rval = True
+    # Set once both results exist as numpy arrays of the same shape, i.e.
+    # once an element-wise comparison of the two makes sense.
+    comparable = False
+    try:
+        t0 = time.time()
+        cpuval = py_conv(npy_img, npy_kern, mode, subsample)
+        t1 = time.time()
+        i = gftensor4()
+        k = gftensor4()
+        op = theano.sandbox.cuda.blas.GpuConv(border_mode=mode,
+                                              subsample=subsample,
+                                              version=version,
+                                              verbose=verbose,
+                                              kshp=compile_kshp)(i, k)
+        f = theano.function([i, k], op, mode=mode_with_gpu)
+        gpuval = f(img, kern)
+        t2 = time.time()
+        # The loop counter is deliberately not named `i`, which is the
+        # symbolic input above.
+        for _ in range(nb_iter):
+            gpuval2 = f(img, kern)
+            assert numpy.allclose(numpy.asarray(gpuval),
+                                  numpy.asarray(gpuval2))
+            assert (numpy.asarray(gpuval) == numpy.asarray(gpuval2)).all()
+        gpuval = numpy.asarray(gpuval)
+        if gpuval.shape != cpuval.shape:
+            print >> sys.stdout, "ERROR: shape mismatch",
+            print >> sys.stdout, gpuval.shape, cpuval.shape
+            rval = False
+        else:
+            comparable = True
+            # atol is forwarded so that the verdict uses the same
+            # tolerances as the "nb close" diagnostic below.
+            rval = numpy.allclose(cpuval, gpuval, rtol=rtol, atol=atol)
+            assert numpy.all(numpy.isfinite(gpuval))
+    except NotImplementedError, e:
+        print >> sys.stdout, '_params_allgood Failed allclose', e
+        rval = False
+
+    if (t2 is not None):
+        if mode == 'valid':
+            approx_fp = cpuval.size * ishape[1] * kshape[2] * kshape[3] * 2
+        else:
+            approx_fp = (ishape[0] * kshape[0] * kshape[1] * kshape[2] *
+                         kshape[3] * ishape[2] * ishape[3] * 2)
+        approx_fp /= 1e6
+        cpu_mflops = approx_fp / (t1 - t0)
+        gpu_mflops = approx_fp / (t2 - t1)
+        if verbose > 0:
+            print >> sys.stdout, '%15s' % str(ishape), '%15s' % str(kshape),
+            print >> sys.stdout, '%12.5f  %7.2f %7.2f %7.1f' % (approx_fp,
+                    cpu_mflops, gpu_mflops, (t1 - t0) / (t2 - t1))
+    if not rval:
+        print >> sys.stdout, ('test_' + mode + ' id=' + str(id) +
+                              ' FAILED for ishape, kshape, mode, subsample,' +
+                              ' img_stride, kern_stride, version', ishape,
+                              kshape, mode, subsample, img_stride, kern_stride,
+                              version)
+    if rval or not comparable:
+        # Either the test passed, or there is no GPU result of the right
+        # shape to diff against (NotImplementedError or shape mismatch):
+        # the element-wise diagnostics below cannot be computed.
+        return rval
+
+    diff = cpuval - gpuval
+    diffabs = numpy.absolute(diff)
+    pr_diff = diffabs / numpy.absolute(cpuval)
+    nb_close = (diffabs <= (atol + rtol * numpy.absolute(gpuval))).sum()
+    print "max absolute diff:", (diffabs.max(), "avg abs diff:",
+                                 numpy.average(diffabs))
+    print "median abs diff:", (numpy.median(diffabs), "nb close:",
+                               nb_close, "/", diff.size)
+    print "max relatif diff:", (pr_diff.max(), "avg rel diff:",
+                                numpy.average(pr_diff))
+    if print_ != False:
+        if npy_img.shape[0] > 5:
+            # many images in the batch: only show the first one
+            print "img", npy_img[0]
+            print "kern", npy_kern[0]
+            print "gpu", gpuval[0][0]
+            print "cpu", cpuval[0][0]
+            print "diff", diff[0][0]
+        else:
+            print "img", npy_img
+            print "kern", npy_kern
+            print "gpu", gpuval
+            print "cpu", cpuval
+            print "diff", diff
+
+    return rval
+
+
+def exec_conv(version, shapes, verbose, random, mode,
+              print_=None, rtol=1e-5, ones=False):
+    """
+    Run _params_allgood on every (version, shape) combination and fail
+    with an AssertionError if any of them did not pass.
+
+    :param version: iterable of kernel versions to force.
+    :param shapes: list of (ishape, kshape, subsample, img_stride,
+                   kern_stride) tuples.
+    :param mode: 'valid' or 'full'.
+
+    The remaining parameters are forwarded to _params_allgood.
+    """
+    if verbose > 0:
+        _params_allgood_header()
+
+    nb_tests = 0
+    failed_version = set()
+    failed_id = []
+    for ver in version:
+        for shape_id, spec in enumerate(shapes):
+            ishape, kshape, subshape, istride, kstride = spec
+            nb_tests += 1
+            try:
+                passed = _params_allgood(ishape, kshape, mode,
+                                         subsample=subshape,
+                                         img_stride=istride,
+                                         kern_stride=kstride,
+                                         version=ver,
+                                         verbose=verbose,
+                                         random=random,
+                                         id=shape_id,
+                                         print_=print_,
+                                         rtol=rtol,
+                                         ones=ones)
+            except Exception, e:
+                # An exception is reported, then counted as a failure.
+                print ver, shape_id, (ishape, kshape, subshape,
+                                      istride, kstride)
+                print e
+                passed = False
+            if not passed:
+                failed_version.add(ver)
+                failed_id.append(shape_id)
+
+    nb_failed = len(failed_id)
+    if nb_failed > 0:
+        print "nb_failed", nb_failed, "on", nb_tests,
+        print "failed_version", failed_version, "failed_id", failed_id
+        assert nb_failed == 0, nb_failed
+    else:
+        print 'Executed', nb_tests, 'different shapes'
+
+def get_basic_shapes():
+    """
+    Return small single-image, single-kernel test cases as a list of
+    (ishape, kshape, subsample, img_stride, kern_stride) tuples; the last
+    three entries are always (1, 1).
+    """
+    unit = (1, 1)
+    img_kern_pairs = [
+        # basic test of image and kernel shape
+        ((1, 1, 1, 1), (1, 1, 1, 1)),
+        ((1, 1, 2, 2), (1, 1, 2, 2)),
+        ((1, 1, 3, 3), (1, 1, 2, 2)),
+        # basic test for unsquare kernel and image
+        ((1, 1, 2, 4), (1, 1, 2, 2)),
+        ((1, 1, 3, 4), (1, 1, 2, 2)),
+        ((1, 1, 4, 3), (1, 1, 2, 2)),
+        ((1, 1, 4, 4), (1, 1, 3, 2)),
+        ((1, 1, 4, 4), (1, 1, 2, 3)),
+    ]
+    return [(ishape, kshape, unit, unit, unit)
+            for ishape, kshape in img_kern_pairs]
+
+
+def get_shapes(imshp=(1, 1), kshp=(1, 1), subsample=(1, 1),
+               img_stride=(1, 1), kern_stride=(1, 1)):
+    """
+    Return the cases combining a batch size, a number of kernels and a
+    stack size greater than one, for the given image shape, kernel shape,
+    subsample and strides.
+
+    Each case is an (ishape, kshape, subsample, img_stride, kern_stride)
+    tuple where ishape is (batch, stack) + imshp and kshape is
+    (nkern, stack) + kshp.
+    """
+    # (batch size, number of kernels, stack size)
+    cases = [
+        (1, 1, 2),  # stack only
+        (3, 1, 1),  # batch only
+        (1, 2, 1),  # nkern only
+        (3, 2, 1),  # batch and nkern
+        (3, 1, 2),  # batch and stack
+        (1, 2, 2),  # stack and nkern
+        (2, 2, 2),  # batch, nkern and stack
+        (3, 4, 2),  # batch, nkern and stack
+    ]
+    return [((batch, stack) + imshp, (nkern, stack) + kshp,
+             subsample, img_stride, kern_stride)
+            for batch, nkern, stack in cases]
+
+
+def get_shapes2(scales_img=(1, 1), scales_kern=(1, 1), subsample=(1, 1),
+                img_stride=(1, 1), kern_stride=(1, 1)):
+    """
+    Return the get_shapes() cases for a series of image and kernel sizes.
+
+    Every base size below is multiplied element-wise by `scales_img`
+    (image) or `scales_kern` (kernel) before being given to get_shapes().
+    """
+    # (base image size, base kernel size)
+    base_sizes = [
+        ((1, 1), (1, 1)),  # smallest image and kernel
+        ((2, 2), (2, 2)),  # kernel as big as the image
+        ((3, 3), (2, 2)),  # kernel smaller than the image
+        ((4, 3), (2, 2)),  # not square image
+        ((3, 4), (2, 2)),  # not square image
+        ((4, 4), (3, 2)),  # not square kernel
+        ((4, 4), (2, 3)),  # not square kernel
+    ]
+    shapes = []
+    for (img_rows, img_cols), (kern_rows, kern_cols) in base_sizes:
+        shapes += get_shapes((img_rows * scales_img[0],
+                              img_cols * scales_img[1]),
+                             (kern_rows * scales_kern[0],
+                              kern_cols * scales_kern[1]),
+                             subsample, img_stride, kern_stride)
+    return shapes
+
+
+def get_valid_shapes():
+    """
+    Return the list of cases used by the 'valid' mode tests.
+
+    Each case is an (img shape, kern shape, subsample, img_stride,
+    kern_stride) tuple.
+    """
+    one = (1, 1)
+
+    shapes = get_basic_shapes()
+    shapes += get_shapes2()
+
+    # test image stride, then kernel stride
+    for stride in [(1, 2), (2, 1), (2, 2), (-1, -1)]:
+        shapes += get_shapes2(scales_img=(2, 2), img_stride=stride)
+    shapes += get_shapes2(scales_img=(2, 2), kern_stride=(-1, -1))
+
+    # test subsample done in a separate fct
+
+    # other test
+    shapes += [
+        ((2, 1, 2, 2), (1, 1, 2, 2), one, one, one),
+        ((3, 2, 4, 4), (4, 2, 4, 4), one, one, one),
+        ((4, 1, 10, 10), (1, 1, 2, 2), one, one, one),
+        ((1, 1, 4, 4), (1, 1, 2, 3), one, one, one),
+        ((4, 1, 10, 10), (1, 1, 2, 3), one, one, one),
+        ((4, 1, 10, 10), (1, 1, 2, 10), one, one, one),
+        ((4, 1, 20, 10), (1, 1, 2, 10), one, one, one),
+        # stack, nkern, bsize
+        ((3, 2, 8, 8), (4, 2, 4, 4), one, one, one),
+        # stack, nkern, bsize, non-square image
+        ((3, 2, 8, 6), (4, 2, 4, 4), one, one, one),
+        # stack, nkern, bsize, non-square image, non-square kern
+        ((3, 2, 8, 6), (4, 2, 4, 3), one, one, one),
+        # stack, nkern, bsize, non-square image, non-square kern,
+        # kernsize==imgsize on one dim
+        ((3, 2, 8, 6), (4, 2, 4, 6), one, one, one),
+        # a big one
+        ((16, 5, 64, 64), (8, 5, 8, 8), one, one, one),
+        # MNIST LeNET layer 1
+        ((16, 1, 28, 28), (20, 1, 5, 5), one, one, one),
+        # layer 1 backprop to weights
+        ((20, 16, 32, 32), (1, 16, 28, 28), one, one, one),
+        # added a test case that fail from test_nnet.py.test_conv_nnet2
+        ((60, 20, 28, 28), (10, 20, 5, 5), one, (2, 2), one),
+        # test precedent but reduced that trigger the error
+        ((10, 5, 28, 28), (10, 5, 5, 5), one, (2, 2), one),
+        # Test more than maxThreadsDim0
+        ((2, 4, 13, 1050), (3, 4, 10, 11), one, one, one),
+        ((2, 4, 1050, 13), (3, 4, 10, 11), one, one, one),
+    ]
+
+    shapes += [
+        # test_lenet_28 1 layers
+        ((60, 1, 28, 28), (20, 1, 5, 5), one, one, one),
+        # test_lenet_28 2 layers
+        ((60, 20, 12, 12), (30, 20, 5, 5), one, one, one),
+        # test_lenet_28 bprop 1 full
+        ((60, 30, 8, 8), (20, 30, 5, 5), one, one, one),
+        # test_lenet_28 bprop 2 valid
+        ((20, 60, 12, 12), (30, 60, 8, 8), one, one, one),
+        # disabled: test_lenet_28 bprop 2 valid
+        # ((1, 60, 28, 28), (20, 60, 24, 24), one, one, one),
+        # test_lenet_64 1 layers
+        ((10, 1, 64, 64), (20, 1, 7, 7), one, one, one),
+        # test_lenet_64 2 layers
+        ((10, 20, 29, 29), (30, 20, 7, 7), one, one, one),
+        # test_lenet_64 full
+        ((10, 30, 23, 23), (20, 30, 7, 7), one, one, one),
+        # disabled: test_lenet_64 bprop 1
+        # ((20, 10, 29, 29), (30, 10, 23, 23), one, one, one),
+        # disabled: test_lenet_64 bprop 2
+        # ((1, 10, 64, 64), (20, 10, 58, 58), one, one, one),
+    ]
+    return shapes
+
+
+def test_valid_0_2():
+    # Run versions 0 and 2 in 'valid' mode on the non-stacked, unsubsampled
+    # shapes that pass the width and size limits below.
+    seed_rng()
+    ones = False
+    random = not ones
+    # presumably the usable shared memory, in bytes -- TODO confirm
+    size_limit = 16 * 1024 - 150
+
+    selected = []
+    for spec in get_valid_shapes():
+        ishape, kshape, subshape, istride, kstride = spec
+        # number of columns of the 'valid' output
+        out_cols = ishape[3] - kshape[3] + 1
+        if out_cols > device_prop['maxThreadsDim0']:
+            continue
+        if ishape[1] > 1:
+            continue
+        # image plus kernel, 4 bytes per float32 element
+        needed = (numpy.prod(ishape[2:]) + numpy.prod(kshape[2:])) * 4
+        if needed > size_limit:
+            continue
+        if subshape == (1, 1):
+            selected.append(spec)
+
+    exec_conv([0, 2], selected, 0, random, 'valid',
+              print_=False, ones=ones, rtol=1.1e-5)
+
+
+def test_valid_1_3_11_12():
+    # Run versions 1, 3, 11 and 12 in 'valid' mode on the unsubsampled
+    # shapes that pass the width and size limits below.
+    seed_rng()
+    ones = False
+    random = not ones
+    # presumably the usable shared memory, in bytes -- TODO confirm
+    size_limit = 16 * 1024 - 150
+
+    selected = []
+    for spec in get_valid_shapes():
+        ishape, kshape, subshape, istride, kstride = spec
+        # number of columns of the 'valid' output
+        out_cols = ishape[3] - kshape[3] + 1
+        if out_cols > device_prop['maxThreadsDim0']:
+            continue
+        # image plus kernel, 4 bytes per float32 element
+        needed = (numpy.prod(ishape[2:]) + numpy.prod(kshape[2:])) * 4
+        if needed > size_limit:
+            continue
+        if subshape == (1, 1):
+            selected.append(spec)
+
+    exec_conv([1, 3, 11, 12], selected, 0, random, 'valid',
+              print_=False, ones=ones, rtol=1.1e-5)
+
+
+def test_valid_4():
+    # Run version 4 in 'valid' mode on the non-stacked, unsubsampled shapes
+    # that pass the width and size limits below.
+    seed_rng()
+    ones = False
+    random = not ones
+    # presumably the usable shared memory, in bytes -- TODO confirm
+    size_limit = 16 * 1024 - 150
+
+    selected = []
+    for spec in get_valid_shapes():
+        ishape, kshape, subshape, istride, kstride = spec
+        # number of columns of the 'valid' output
+        out_cols = ishape[3] - kshape[3] + 1
+        if out_cols > device_prop['maxThreadsDim0']:
+            continue
+        if ishape[1] > 1:
+            continue
+        # kshape[2] rows of the image plus the kernel, 4 bytes per element
+        needed = kshape[2] * ishape[3] * 4 + numpy.prod(kshape[2:]) * 4
+        if needed > size_limit:
+            continue
+        if subshape == (1, 1):
+            selected.append(spec)
+
+    exec_conv([4], selected, 0, random, 'valid',
+              print_=False, ones=ones, rtol=1.1e-5)
+
+
+def test_valid_5():
+    """Run the 'valid' convolution check on version 5.
+
+    The shapes from get_valid_shapes() are filtered before being handed
+    to exec_conv: a shape is kept only if it is not subsampled and passes
+    the thread-count and size limits below.
+    """
+    seed_rng()
+    candidates = get_valid_shapes()
+    version = [5]
+    verbose = 0
+
+    # Debugging knobs: flipping ``ones`` to True turns random inputs off.
+    print_ = False
+    ones = False
+    random = not ones
+
+    # NOTE(review): presumably a 16 KiB shared-memory budget minus a
+    # reserve, with 4 bytes per float32 element -- confirm against the
+    # kernels.
+    max_bytes = 16 * 1024 - 150
+
+    shapes = []
+    for ishape, kshape, subshape, istride, kstride in candidates:
+        # Size of the last dimension of the 'valid' output.
+        out_last = ishape[3] - kshape[3] + 1
+        if out_last > device_prop['maxThreadsDim0']:
+            continue
+        needed = kshape[2] * ishape[3] * 4 + numpy.prod(kshape[2:]) * 4
+        if needed > max_bytes:
+            continue
+        # Subsampled shapes are not part of this test.
+        if subshape == (1, 1):
+            shapes.append((ishape, kshape, subshape, istride, kstride))
+
+    exec_conv(version, shapes, verbose, random, 'valid',
+              print_=print_, ones=ones, rtol=1.1e-5)
+
+
+def test_valid_7_8_13():
+    """Run the 'valid' convolution check on versions 7, 8 and 13.
+
+    The shapes from get_valid_shapes(), plus one extra large shape, are
+    filtered before being handed to exec_conv: a shape is kept only if it
+    is not subsampled and passes the thread-count and size limits below.
+    """
+    seed_rng()
+    candidates = get_valid_shapes()
+    # This is to test the "new" lower shared memory usage.
+    candidates.append(((10, 30, 60, 60), (20, 30, 40, 40),
+                       (1, 1), (1, 1), (1, 1)))
+    version = [7, 8, 13]
+    verbose = 0
+
+    # Debugging knobs: flipping ``ones`` to True turns random inputs off.
+    print_ = False
+    ones = False
+    random = not ones
+
+    # NOTE(review): presumably a 16 KiB shared-memory budget minus a
+    # reserve, with 4 bytes per float32 element -- confirm against the
+    # kernels.
+    max_bytes = 16 * 1024 - 150
+
+    shapes = []
+    for ishape, kshape, subshape, istride, kstride in candidates:
+        # Number of elements in one 'valid' output image.
+        out_size = ((ishape[2] - kshape[2] + 1) *
+                    (ishape[3] - kshape[3] + 1))
+        if out_size > device_prop['maxThreadsDim0']:
+            continue
+        needed = max(numpy.prod(ishape[2:]) * 4 + 2 * kshape[3] * 4,
+                     out_size * 4 * 2)
+        if needed > max_bytes:
+            continue
+        # Subsampled shapes are not part of this test.
+        if subshape == (1, 1):
+            shapes.append((ishape, kshape, subshape, istride, kstride))
+
+    exec_conv(version, shapes, verbose, random, 'valid',
+              print_=print_, ones=ones, rtol=1.1e-5)
+
+
+def test_valid_9_10():
+    """Run the 'valid' convolution check on versions 9 and 10.
+
+    The shapes from get_valid_shapes() are filtered before being handed
+    to exec_conv: a shape is kept only if it is not subsampled and passes
+    the thread-count and size limits below.
+    """
+    seed_rng()
+    candidates = get_valid_shapes()
+    version = [9, 10]
+    verbose = 0
+
+    # Debugging knobs: flipping ``ones`` to True turns random inputs off.
+    print_ = False
+    ones = False
+    random = not ones
+
+    # NOTE(review): presumably a 16 KiB shared-memory budget minus a
+    # reserve -- confirm against the kernels.
+    max_bytes = 16 * 1024 - 150
+
+    shapes = []
+    for ishape, kshape, subshape, istride, kstride in candidates:
+        # Size of the last dimension of the 'valid' output.
+        out_last = ishape[3] - kshape[3] + 1
+        if out_last > device_prop['maxThreadsDim0']:
+            continue
+        # NOTE(review): ishape[3] is not multiplied by 4 here, unlike
+        # kshape[3] -- confirm whether that is intended.
+        if (kshape[3] * 4 + ishape[3]) > max_bytes:
+            continue
+        # Subsampled shapes are not part of this test.
+        if subshape == (1, 1):
+            shapes.append((ishape, kshape, subshape, istride, kstride))
+
+    exec_conv(version, shapes, verbose, random, 'valid',
+              print_=print_, ones=ones, rtol=1.1e-5)
+
+
+def test_valid():
+    """Run the 'valid' convolution check on versions -2, -1 and 6.
+
+    Every shape returned by get_valid_shapes() is used, without any
+    filtering.
+    """
+    seed_rng()
+    all_shapes = get_valid_shapes()
+
+    # -1 is listed in case a version was forgotten in the other tests;
+    # -2 exercises the reference version.
+    versions = [-2, -1, 6]
+
+    # Debugging knob: flipping ``ones`` to True turns random inputs off.
+    ones = False
+    use_random = not ones
+
+    exec_conv(versions, all_shapes, 0, use_random, 'valid',
+              print_=False, ones=ones, rtol=1.1e-5)
+
+
+def test_full():
+    """Run the 'full' convolution check on versions -2, -1 and 0 to 5.
+
+    The shape list is the basic shapes, several get_shapes2() variants
+    with strided images or kernels, and a hand-written table of extra
+    cases.  Subsampling is tested in a separate function.
+    """
+    seed_rng()
+    shapes = get_basic_shapes()
+    shapes += get_shapes2()
+    # Image strides first, then a kernel stride.
+    for stride_kwargs in ({'img_stride': (1, 2)},
+                          {'img_stride': (2, 1)},
+                          {'img_stride': (2, 2)},
+                          {'img_stride': (-1, -1)},
+                          {'kern_stride': (-1, -1)}):
+        shapes += get_shapes2(scales_img=(2, 2), **stride_kwargs)
+
+    # Each (image shape, kernel shape) pair below is completed with three
+    # (1, 1) entries to form one test case.
+    unit = (1, 1)
+    img_kern_pairs = [
+        # other test
+        ((2, 1, 2, 2), (1, 1, 2, 2)),
+        ((3, 2, 4, 4), (4, 2, 4, 4)),
+        ((4, 1, 10, 10), (1, 1, 2, 2)),
+        ((1, 1, 4, 4), (1, 1, 2, 3)),
+        ((4, 1, 10, 10), (1, 1, 2, 3)),
+        ((4, 1, 10, 10), (1, 1, 2, 10)),
+        ((4, 1, 20, 10), (1, 1, 2, 10)),
+        # stack, nkern, bsize
+        ((3, 2, 8, 8), (4, 2, 4, 4)),
+        # stack, nkern, bsize, non-square image
+        ((3, 2, 8, 6), (4, 2, 4, 4)),
+        # stack, nkern, bsize, non-square image, non-square kern
+        ((3, 2, 8, 6), (4, 2, 4, 3)),
+        # stack, nkern, bsize, non-square image, non-square kern,
+        # kernsize == imgsize on one dim
+        ((3, 2, 8, 6), (4, 2, 4, 6)),
+        # a big one
+        ((16, 5, 64, 64), (8, 5, 8, 8)),
+        # MNIST LeNET layer 1
+        ((16, 1, 28, 28), (20, 1, 5, 5)),
+        # layer 1 backprop to weights
+        ((20, 16, 32, 32), (1, 16, 28, 28)),
+
+        # other test
+        # kernel bigger than image
+        ((3, 1, 1, 1), (2, 1, 5, 3)),
+        ((3, 2, 1, 1), (4, 2, 1, 1)),
+        ((3, 2, 4, 4), (4, 2, 2, 6)),
+        # kernel bigger than image
+        ((3, 2, 4, 4), (4, 2, 8, 6)),
+        ((4, 2, 10, 10), (3, 2, 2, 12)),
+
+        # test_lenet_28 bprop 1 full
+        ((60, 30, 8, 8), (20, 30, 5, 5)),
+        # test_lenet_64 full
+        ((10, 30, 23, 23), (20, 30, 7, 7)),
+        # Test more than maxThreadsDim0
+        ((2, 4, 13, 1050), (3, 4, 10, 11)),
+        ((2, 4, 1050, 13), (3, 4, 10, 11)),
+    ]
+    shapes += [(img, kern, unit, unit, unit)
+               for img, kern in img_kern_pairs]
+
+    versions = [-2, -1] + list(range(6))
+    exec_conv(versions, shapes, 0, True, 'full')
+
+
+def test_subsample():
+    """Run the 'valid' and 'full' convolution checks on subsampled shapes.
+
+    The same shape list is passed to exec_conv twice, once per border
+    mode, each time with its own list of versions.
+    """
+    seed_rng()
+    unit = (1, 1)
+    shapes = [((1, 1, 1, 1), (1, 1, 1, 1), unit, unit, unit),
+              ((1, 1, 1, 1), (1, 1, 1, 1), (2, 2), unit, unit)]
+    # Same image and kernel with three different subsampling factors.
+    for subsample in [(1, 3), (3, 3), (3, 1)]:
+        shapes.append(((4, 2, 10, 10), (3, 2, 2, 2), subsample, unit, unit))
+    for subsample in [(1, 1), (1, 2), (2, 1), (2, 2)]:
+        shapes += get_shapes2(scales_img=(2, 2), subsample=subsample)
+
+    # Only the versions that implement subsampling are listed, to keep the
+    # test fast.
+    versions_by_mode = [('valid', [-2, -1, 1, 3, 11, 12]),
+                        ('full', [-2, -1])]
+
+    # Debugging knob: flipping ``ones`` to True turns random inputs off.
+    ones = False
+    use_random = not ones
+
+    for border_mode, versions in versions_by_mode:
+        exec_conv(versions, shapes, 0, use_random, border_mode,
+                  print_=False, ones=ones)
+
+
+class TestConv2DGPU(unittest.TestCase):
+    """Convolution tests that rely on unittest.TestCase helpers."""
+
+    def test_logical_shapes(self):
+        """Compile and run a 'full' conv2d that uses logical shapes.
+
+        For each stride from 1 to 3 the graph is compiled with
+        mode_with_gpu, checked to contain a GpuConv node, and executed
+        once on random float32 inputs.  Only successful execution is
+        checked, not the numerical result.
+        """
+        seed_rng()
+        kshp = (10, 2, 10, 10)
+        featshp = (3, 10, 11, 11)
+        # Shape of the kernel once its first two dimensions are swapped.
+        kshp_rotated = (kshp[1], kshp[0], kshp[2], kshp[3])
+
+        for stride in range(1, 4):
+            a = tensor.ftensor4()
+            A = tensor.ftensor4()
+
+            # Swap the first two dimensions of the kernel.
+            kernel_rotated = tensor.transpose(A, axes=[1, 0, 2, 3])
+
+            # The logical image is the given one with its last two
+            # dimensions multiplied by the stride.
+            featshp_logical = (featshp[0], featshp[1], featshp[2] * stride,
+                               featshp[3] * stride)
+            image_estimate = tensor.nnet.conv2d(
+                a, kernel_rotated,
+                border_mode='full',
+                image_shape=featshp,
+                filter_shape=kshp_rotated,
+                imshp_logical=featshp_logical[1:],
+                kshp_logical=kshp[2:])
+
+            func = theano.function([a, A], image_estimate,
+                                   mode=mode_with_gpu)
+            # NOTE(review): this looks for the sandbox.cuda GpuConv --
+            # confirm that it is the op expected in this test module.
+            assert any(isinstance(node.op, theano.sandbox.cuda.blas.GpuConv)
+                       for node in func.maker.fgraph.toposort())
+
+            a_in = numpy.random.randn(*featshp).astype("float32")
+            A_in = numpy.random.randn(*kshp).astype("float32")
+
+            func(a_in, A_in)
+
+    def test_invalid_input_shape(self):
+        """
+        Tests that when the shape given at build time is not the same as
+        at run time we raise an error.
+        """
+        seed_rng()
+        verbose = 0
+        print_ = False
+        # Debugging knob: flipping ``ones`` to True turns random inputs
+        # off.
+        ones = False
+        random = not ones
+
+        # Each entry holds the two shapes passed positionally to
+        # _params_allgood, then the shape passed as compile_kshp.
+        # We use only the number of columns, so a (8, 5) compile-time
+        # shape is not tested.
+        cases = [((3, 2, 8, 8), (4, 2, 5, 5), (8, 8)),
+                 ((3, 2, 8, 8), (4, 2, 5, 5), (5, 8))]
+
+        global mode_with_gpu
+        mode_with_gpu_orig = mode_with_gpu
+        try:
+            # Under DebugMode, replace the module-level mode by FAST_RUN
+            # while the checks run.
+            if theano.config.mode in ['DebugMode', 'DEBUG_MODE']:
+                mode_with_gpu = theano.compile.mode.get_mode(
+                    'FAST_RUN').including('gpu')
+
+            # These checks must stay outside the ``if`` above, otherwise
+            # they only execute when theano.config.mode is DebugMode.
+            for mode in ['valid', 'full']:
+                for ishape, kshape, compile_kshp in cases:
+                    self.assertRaises(ValueError, _params_allgood,
+                                      ishape, kshape,
+                                      verbose=verbose, random=random,
+                                      mode=mode,
+                                      print_=print_, ones=ones,
+                                      compile_kshp=compile_kshp)
+        finally:
+            # Always restore the module-level mode for the other tests.
+            mode_with_gpu = mode_with_gpu_orig
+
+
+def benchmark():
+    """Run exec_conv on LeNet-like shapes in 'valid' and 'full' mode.
+
+    Only version -1 is used, with verbose set to 1.
+    """
+    unit = (1, 1)
+
+    # (image shape, kernel shape) pairs for the 'valid' mode.
+    valid_pairs = [
+        # test_lenet_28 shape
+        ((20, 60, 12, 12), (30, 60, 8, 8)),
+        ((60, 20, 12, 12), (30, 20, 5, 5)),
+        ((60, 1, 28, 28), (20, 1, 5, 5)),
+        ((1, 60, 28, 28), (20, 60, 24, 24)),
+        # test_lenet_32 shape
+        ((20, 60, 14, 14), (30, 60, 10, 10)),
+        ((60, 20, 14, 14), (30, 20, 5, 5)),
+        ((60, 1, 32, 32), (20, 1, 5, 5)),
+        ((1, 60, 32, 32), (20, 60, 28, 28)),
+        # test_lenet_64 shape
+        ((10, 20, 29, 29), (30, 20, 7, 7)),
+        ((20, 10, 29, 29), (30, 10, 23, 23)),
+        ((10, 1, 64, 64), (20, 1, 7, 7)),
+        ((1, 10, 64, 64), (20, 10, 58, 58)),
+        # test_lenet_108 shape
+        ((10, 20, 51, 51), (30, 20, 7, 7)),
+        ((20, 10, 51, 51), (30, 10, 45, 45)),
+        ((10, 1, 108, 108), (20, 1, 7, 7)),
+        ((1, 10, 108, 108), (20, 10, 102, 102)),
+        # test_lenet_256 shape
+        ((2, 20, 124, 124), (30, 20, 9, 9)),
+        ((20, 2, 124, 124), (30, 2, 116, 116)),
+        ((2, 1, 256, 256), (20, 1, 9, 9)),
+        ((1, 2, 256, 256), (20, 2, 248, 248)),
+    ]
+
+    # (image shape, kernel shape) pairs for the 'full' mode.
+    full_pairs = [
+        # test_lenet_28 shape
+        ((60, 30, 8, 8), (20, 30, 5, 5)),
+        # test_lenet_32 shape: conv_full_patch_stack_padded, N=1
+        ((60, 30, 10, 10), (20, 30, 5, 5)),
+        # test_lenet_64 shape: conv_full_patch_stack_padded, N=3
+        ((10, 30, 23, 23), (20, 30, 7, 7)),
+        # test_lenet_108 shape: conv_full_patch_stack_padded, N=9
+        ((10, 30, 45, 45), (20, 30, 7, 7)),
+        # test_lenet_256 shape: conv_reference_full
+        ((2, 30, 116, 116), (20, 30, 9, 9)),
+    ]
+
+    # Every pair is completed with three (1, 1) entries.
+    shapes_valid = [(img, kern, unit, unit, unit)
+                    for img, kern in valid_pairs]
+    shapes_full = [(img, kern, unit, unit, unit)
+                   for img, kern in full_pairs]
+
+    versions = [-1]
+    exec_conv(versions, shapes_valid, 1, True, 'valid',
+              print_=None, rtol=1e-3)
+    exec_conv(versions, shapes_full, 1, True, 'full')
+
+
+def test_stack_rows_segfault_070312():
+    """Regression test: run a ConvOp on a 1x80x96x96 image on the GPU.
+
+    Nothing is asserted in the lines below; they only compile and call
+    the function once.
+    """
+    seed_rng()
+    # 07/03/2012
+    # Running this unittest with cuda-memcheck exposes an illegal read.
+    # THEANO_FLAGS=device=gpu cuda-memcheck nosetests \
+    # test_conv_cuda_ndarray.py:test_stack_rows_segfault_070312
+    img = theano.shared(numpy.random.rand(1, 80, 96, 96).astype('float32'))
+    kern = theano.shared(numpy.random.rand(1, 80, 9, 9).astype('float32'))
+    # Shared variable that receives the convolution result through the
+    # update below.
+    out = theano.shared(numpy.random.rand(1, 2, 2, 3).astype('float32'))
+    op = theano.tensor.nnet.conv.ConvOp(imshp=(80, 96, 96), kshp=(9, 9),
+            nkern=1, bsize=1)
+    # No inputs and no outputs: the convolution runs only as an update of
+    # ``out``.
+    f = theano.function([], [], updates=[(out, op(img, kern))], mode=mode_with_gpu)
+    f()