提交 50968f7a authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #2112 from nouiz/abergeron-dnn

Abergeron dnn
......@@ -99,7 +99,7 @@ There are less methods to define for an Op than for a Type:
module is initialized, before anything else is executed and is
specialized for a particular apply of an :ref:`op`.
.. method:: c_init_code_struct(node, struct_id)
.. method:: c_init_code_struct(node, struct_id, sub)
Allows you to specify code that will be inserted in the struct
constructor of the Op. This is for code which should be
......@@ -108,6 +108,16 @@ There are less methods to define for an Op than for a Type:
`struct_id` is an integer guaranteed to be unique inside the
struct.
`sub` is a dictionary of extra parameters to the
c_init_code_struct method. It contains the following
values:
``sub['fail']``
A string of code that you should execute (after ensuring
that a python exception is set) if your C code needs to
raise an exception.
.. method:: c_support_code()
Allows you to specify helper functions/structs that the
......
......@@ -94,6 +94,13 @@ TODO: Give examples on how to use these things! They are pretty complicated.
f = theano.function(..., mode=mode)
- :func:`GpuDnnConv <theano.sandbox.cuda.dnn.GpuDnnConv>` GPU-only
convolution using NVIDIA's cuDNN library. To enable it (and
other cudnn-accelerated ops), set
``THEANO_FLAGS=optimizer_including=cudnn`` in your environment.
This requires that you have cuDNN installed and available. It
also requires a GPU with compute capability 3.0 or more.
- :func:`conv3D <theano.tensor.nnet.Conv3D.conv3D>`
3D Convolution applying multi-channel 3D filters to batches of
multi-channel 3D images.
......
......@@ -143,6 +143,18 @@ def failure_code(sub):
goto __label_%(id)i;}''' % sub
def failure_code_init(sub):
    """Code for failure in the struct init.

    Builds the C snippet emitted when a step of the struct constructor
    fails: it guarantees a Python exception is set (installing a generic
    RuntimeError when the Op's C code forgot to set one) and then
    returns the failure id from init() so the caller can detect the
    error.

    :param sub: dict with an 'id' entry, the integer failure code
                substituted into the generated C.
    """
    template = '''{
        if (!PyErr_Occurred()) {
            PyErr_SetString(PyExc_RuntimeError,
                "Unexpected error in an Op's C code. "
                "No Python exception was set.");
        }
        return %(id)d;
}'''
    return template % sub
def code_gen(blocks):
"""WRITEME From a list of L{CodeBlock} instances, returns a string
that executes them all in sequence. eg for C{(decl1, task1,
......@@ -205,8 +217,7 @@ def struct_gen(args, struct_builders, blocks, sub):
# be executed if any step in the constructor fails and the
# latter only at destruction time.
struct_decl += block.declare
struct_init_head = struct_init_head + ("\n{\n%s" % block.behavior)
struct_init_tail = ("%s\n}\n" % block.cleanup) + struct_init_tail
struct_init_head = struct_init_head + ("\n%s" % block.behavior)
struct_cleanup += block.cleanup
behavior = code_gen(blocks)
......@@ -258,6 +269,7 @@ def struct_gen(args, struct_builders, blocks, sub):
# TODO: add some error checking to make sure storage_<x> are
# 1-element lists and __ERROR is a 3-elements list.
struct_code = """
namespace {
struct %(name)s {
......@@ -274,13 +286,9 @@ def struct_gen(args, struct_builders, blocks, sub):
int init(PyObject* __ERROR, %(args_decl)s) {
%(storage_incref)s
%(storage_set)s
int %(failure_var)s = 0;
%(struct_init_head)s
this->__ERROR = __ERROR;
return 0;
%(struct_init_tail)s
%(storage_decref)s
%(do_return)s
}
void cleanup(void) {
%(struct_cleanup)s
......@@ -333,7 +341,7 @@ def get_c_init(r, name, sub):
def get_c_extract(r, name, sub):
"""Wrapper around c_extract that initializes py_name from storage."""
if any([getattr(c.op, 'check_input', config.check_input) for (c, _) in
if any([getattr(c.op, 'check_input', config.check_input) for (c, _) in
r.clients]):
c_extract = r.type.c_extract(name, sub, True)
......@@ -419,7 +427,7 @@ def struct_variable_codeblocks(variable, policies, id, symbol_table, sub):
sub = dict(sub)
# sub['name'] = name
sub['id'] = id
sub['fail'] = failure_code(sub)
sub['fail'] = failure_code_init(sub)
sub['py_ptr'] = "py_%s" % name
sub['stor_ptr'] = "storage_%s" % name
# struct_declare, struct_behavior, struct_cleanup, sub)
......@@ -530,9 +538,8 @@ class CLinker(link.Linker):
failure_var = "__failure"
id = 1
sub = dict(failure_var=failure_var)
for variable in self.variables:
sub = dict(failure_var=failure_var)
# it might be possible to inline constant variables as C literals
# policy = [[what to declare in the struct,
......@@ -634,6 +641,10 @@ class CLinker(link.Linker):
sub['struct_id'] = id + 1
sub['fail'] = failure_code(sub)
sub_struct = dict()
sub_struct['id'] = id + 1
sub_struct['fail'] = failure_code_init(sub)
struct_support = ""
struct_init = ""
struct_cleanup = ""
......@@ -661,7 +672,7 @@ class CLinker(link.Linker):
" didn't return a string for c_init_code_apply")
try:
struct_init = op.c_init_code_struct(node, id + 1)
struct_init = op.c_init_code_struct(node, id + 1, sub_struct)
assert isinstance(struct_init, basestring), (
str(node.op) +
" didn't return a string for c_init_code_struct")
......@@ -1418,7 +1429,10 @@ class CLinker(link.Linker):
print >> code, ' return NULL;'
print >> code, ' }'
print >> code, ' %(struct_name)s* struct_ptr = new %(struct_name)s();' % locals()
print >> code, ' struct_ptr->init(', ','.join('PyTuple_GET_ITEM(argtuple, %i)' % n for n in xrange(n_args)), ');'
print >> code, ' if (struct_ptr->init(', ','.join('PyTuple_GET_ITEM(argtuple, %i)' % n for n in xrange(n_args)), ') != 0) {'
print >> code, ' delete struct_ptr;'
print >> code, ' return NULL;'
print >> code, ' }'
if PY3:
print >> code, """\
PyObject* thunk = PyCapsule_New((void*)(&{struct_name}_executor), NULL, {struct_name}_destructor);
......
......@@ -319,7 +319,7 @@ class CLinkerOp(CLinkerObject):
raise utils.MethodNotDefined("c_init_code_apply", type(self),
self.__class__.__name__)
def c_init_code_struct(self, node, struct_id):
def c_init_code_struct(self, node, struct_id, sub):
"""
Optional: return a code string specific to the apply
to be inserted in the struct initialization code.
......@@ -331,6 +331,11 @@ class CLinkerOp(CLinkerObject):
sub parameter named struct_id that will
contain this name.
:param sub: a dictionary of values to substitute in the code.
Most notably it contains a 'fail' entry that you
should place in your code after setting a python
exception to indicate an error.
:Exceptions:
- `MethodNotDefined`: the subclass does not override this method
"""
......
......@@ -89,7 +89,7 @@ class StructOp(Op):
def c_support_code_struct(self, node, struct_id):
return "npy_uint64 counter%d;" % (struct_id,)
def c_init_code_struct(self, node, struct_id):
def c_init_code_struct(self, node, struct_id, sub):
return "counter%d = 0;" % (struct_id,)
def c_code(self, node, name, input_names, outputs_names, sub):
......
......@@ -155,7 +155,8 @@ if compile_cuda_ndarray and cuda_available:
'cuda_ndarray',
code,
location=cuda_ndarray_loc,
include_dirs=[cuda_path], libs=[config.cublas.lib],
include_dirs=[cuda_path],
libs=[config.cublas.lib],
preargs=['-O3'] + compiler.compile_args())
from cuda_ndarray.cuda_ndarray import *
except Exception, e:
......@@ -262,8 +263,8 @@ if cuda_available:
shared_constructor = float32_shared_constructor
import basic_ops
from basic_ops import (
from . import basic_ops
from .basic_ops import (
GpuFromHost, HostFromGpu, GpuElemwise,
GpuDimShuffle, GpuCAReduce, GpuReshape, GpuContiguous,
GpuSubtensor, GpuIncSubtensor,
......@@ -273,11 +274,11 @@ if cuda_available:
ftensor3, ftensor4,
scalar, vector, matrix, row, col,
tensor3, tensor4)
from basic_ops import (host_from_gpu, gpu_from_host,
from .basic_ops import (host_from_gpu, gpu_from_host,
as_cuda_array, as_cuda_ndarray_variable)
import opt
import cuda_ndarray
from rng_curand import CURAND_RandomStreams
from . import opt, dnn
from .rng_curand import CURAND_RandomStreams
def use(device,
......
#ifndef CUDNN_HELPER_H
#define CUDNN_HELPER_H

#include <cudnn.h>

/* Translate a cuDNN status code into a human-readable message.
 * Early cuDNN releases did not ship cudnnGetErrorString, so this header
 * provides a drop-in replacement for use in error reporting.
 * NOTE(review): newer cuDNN versions define their own
 * cudnnGetErrorString; a version guard may be needed — confirm against
 * the cuDNN releases this code is expected to support. */
static inline const char *cudnnGetErrorString(cudnnStatus_t err) {
  switch (err) {
  case CUDNN_STATUS_SUCCESS:
    return "The operation completed successfully.";
  case CUDNN_STATUS_NOT_INITIALIZED:
    /* Fixed typos in the message: "initialized(Is ... enought?)" */
    return "The handle was not initialized (is your driver recent enough?).";
  case CUDNN_STATUS_ALLOC_FAILED:
    /* Fixed typo: "Ressource" -> "Resource". */
    return "Resource allocation failed inside the library.";
  case CUDNN_STATUS_BAD_PARAM:
    return "An incorrect value was passed in.";
  case CUDNN_STATUS_ARCH_MISMATCH:
    return "The current GPU does not support the required features (only cc 3.0+ are supported).";
  case CUDNN_STATUS_MAPPING_ERROR:
    return "An access to GPU memory space failed (probably due to a failure to bind texture).";
  case CUDNN_STATUS_EXECUTION_FAILED:
    return "A kernel failed to execute.";
  case CUDNN_STATUS_INTERNAL_ERROR:
    return "An internal cuDNN operation failed.";
  case CUDNN_STATUS_NOT_SUPPORTED:
    return "The combination of parameters is not currently supported.";
  default:
    return "Unknown error code.";
  }
}

#endif
import copy
import os
import theano
from theano import Apply
from theano import tensor
from theano.compat.six import StringIO
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
gpu_contiguous)
from theano.sandbox.cuda.blas import GpuConv
class GpuDnnConv(GpuOp):
    """GPU-only 2D convolution implemented on top of NVIDIA's cuDNN.

    Takes two 4D CudaNdarray inputs (a batch of images and a bank of
    filters) and produces a 4D output.  Only the 'valid' and 'full'
    border modes and unit subsampling are supported; inputs must be
    C-contiguous (enforced at runtime in the generated C code).

    :param border_mode: one of 'valid' or 'full'.
    """
    __props__ = ('border_mode',)

    def __init__(self, border_mode):
        # 'valid' or 'full'; validated lazily in c_code().
        self.border_mode = border_mode

    def make_node(self, img, kern):
        """Build the Apply node; both inputs must be 4D tensors.

        :raises TypeError: if either input is not 4D.
        """
        if img.type.ndim != 4:
            raise TypeError('img must be 4D tensor')
        if kern.type.ndim != 4:
            raise TypeError('kern must be 4D tensor')
        # Output is (batch, n_filters, H, W); only the broadcast pattern
        # of the first two dimensions can be inferred at graph time.
        broadcastable = (img.type.broadcastable[0],
                         kern.type.broadcastable[0],
                         False, False)
        return Apply(self, [img, kern], [CudaNdarrayType(broadcastable)()])

    def c_headers(self):
        return ['cudnn.h', 'cudnn_helper.h']

    def c_header_dirs(self):
        # cudnn_helper.h lives next to this module.
        return [os.path.dirname(__file__)]

    def c_libraries(self):
        return ['cudnn']

    def c_support_code_struct(self, node, struct_id):
        # Per-apply cuDNN handle and descriptors, stored as fields of
        # the thunk struct and reused across calls.
        return """
cudnnHandle_t handle%(id)d;
cudnnTensor4dDescriptor_t input%(id)d;
cudnnTensor4dDescriptor_t output%(id)d;
cudnnFilterDescriptor_t kerns%(id)d;
cudnnConvolutionDescriptor_t op%(id)d;
""" % dict(id=struct_id)

    def c_init_code_struct(self, node, struct_id, sub):
        # Allocate the cuDNN handle and descriptors once, at struct
        # construction time; sub['fail'] aborts init() on error.
        return """
handle%(id)d = NULL;
input%(id)d = NULL;
output%(id)d = NULL;
kerns%(id)d = NULL;
op%(id)d = NULL;
cudnnStatus_t err%(id)d;
if ((err%(id)d = cudnnCreate(&handle%(id)d)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not create cudnn handle: %%s",
cudnnGetErrorString(err%(id)d));
%(fail)s
}
if ((err%(id)d = cudnnCreateTensor4dDescriptor(&input%(id)d)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
"(inp): %%s", cudnnGetErrorString(err%(id)d));
%(fail)s
}
if ((err%(id)d = cudnnCreateTensor4dDescriptor(&output%(id)d)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
"(out): %%s", cudnnGetErrorString(err%(id)d));
%(fail)s
}
if ((err%(id)d = cudnnCreateFilterDescriptor(&kerns%(id)d)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate filter descriptor: %%s",
cudnnGetErrorString(err%(id)d));
%(fail)s
}
if ((err%(id)d = cudnnCreateConvolutionDescriptor(&op%(id)d)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate convolution "
"descriptor: %%s", cudnnGetErrorString(err%(id)d));
%(fail)s
}
""" % dict(id=struct_id, fail=sub['fail'])

    def c_cleanup_code_struct(self, node, struct_id):
        # Mirror of c_init_code_struct: release descriptors then handle.
        return """
cudnnDestroyTensor4dDescriptor(input%(id)d);
cudnnDestroyTensor4dDescriptor(output%(id)d);
cudnnDestroyFilterDescriptor(kerns%(id)d);
cudnnDestroyConvolutionDescriptor(op%(id)d);
cudnnDestroy(handle%(id)d);
""" % dict(id=struct_id)

    def c_code(self, node, name, inputs, outputs, sub):
        """Generate the per-call C code: set descriptors, allocate the
        output, and run cudnnConvolutionForward.
        """
        img, kern = inputs
        out, = outputs
        # 'valid' means no padding; 'full' pads by kernel_size - 1.
        if self.border_mode == "valid":
            bmode = 1
        else:
            assert self.border_mode == "full"
            bmode = 0
        return """
cudnnStatus_t err%(name)s;
int pad_w%(name)s;
int pad_h%(name)s;
if (!CudaNdarray_is_c_contiguous(%(img)s)) {
PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
%(fail)s
}
if (!CudaNdarray_is_c_contiguous(%(kerns)s)) {
PyErr_SetString(PyExc_ValueError, "Only contiguous filters are supported.");
%(fail)s
}
err%(name)s = cudnnSetTensor4dDescriptorEx(
input%(id)d, CUDNN_DATA_FLOAT,
CudaNdarray_HOST_DIMS(%(img)s)[0],
CudaNdarray_HOST_DIMS(%(img)s)[1],
CudaNdarray_HOST_DIMS(%(img)s)[2],
CudaNdarray_HOST_DIMS(%(img)s)[3],
CudaNdarray_HOST_STRIDES(%(img)s)[0],
CudaNdarray_HOST_STRIDES(%(img)s)[1],
CudaNdarray_HOST_STRIDES(%(img)s)[2],
CudaNdarray_HOST_STRIDES(%(img)s)[3]
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set tensor4d descriptor: %%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
err%(name)s = cudnnSetFilterDescriptor(
kerns%(id)d, CUDNN_DATA_FLOAT,
CudaNdarray_HOST_DIMS(%(kerns)s)[0],
CudaNdarray_HOST_DIMS(%(kerns)s)[1],
CudaNdarray_HOST_DIMS(%(kerns)s)[2],
CudaNdarray_HOST_DIMS(%(kerns)s)[3]
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set filter descriptor: %%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
if (%(bmode)d == 1) {
pad_h%(name)s = 0;
pad_w%(name)s = 0;
} else if (%(bmode)d == 0) {
pad_h%(name)s = CudaNdarray_HOST_DIMS(%(kerns)s)[2] - 1;
pad_w%(name)s = CudaNdarray_HOST_DIMS(%(kerns)s)[3] - 1;
} else {
PyErr_SetString(PyExc_ValueError, "bad border mode");
%(fail)s
}
err%(name)s = cudnnSetConvolutionDescriptor(
op%(id)d, input%(id)d, kerns%(id)d,
pad_h%(name)s,
pad_w%(name)s,
1, 1, 1, 1,
CUDNN_CONVOLUTION
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor: %%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
{
int out_dims[4];
err%(name)s = cudnnGetOutputTensor4dDim(
op%(id)d, CUDNN_CONVOLUTION_FWD,
&out_dims[0], &out_dims[1],
&out_dims[2], &out_dims[3]
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not get output dims: %%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
if (CudaNdarray_prep_output(&%(out)s, 4, out_dims) != 0) {
%(fail)s
}
}
err%(name)s = cudnnSetTensor4dDescriptorEx(
output%(id)d, CUDNN_DATA_FLOAT,
CudaNdarray_HOST_DIMS(%(out)s)[0],
CudaNdarray_HOST_DIMS(%(out)s)[1],
CudaNdarray_HOST_DIMS(%(out)s)[2],
CudaNdarray_HOST_DIMS(%(out)s)[3],
CudaNdarray_HOST_STRIDES(%(out)s)[0],
CudaNdarray_HOST_STRIDES(%(out)s)[1],
CudaNdarray_HOST_STRIDES(%(out)s)[2],
CudaNdarray_HOST_STRIDES(%(out)s)[3]
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set out descriptor: %%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
err%(name)s = cudnnConvolutionForward(
handle%(id)d,
input%(id)d, CudaNdarray_DEV_DATA(%(img)s),
kerns%(id)d, CudaNdarray_DEV_DATA(%(kerns)s),
op%(id)d,
output%(id)d, CudaNdarray_DEV_DATA(%(out)s),
CUDNN_RESULT_NO_ACCUMULATE
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error doing operation: %%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
""" % dict(img=img, kerns=kern, out=out, bmode=bmode,
           fail=sub['fail'], id=sub['struct_id'], name=name)

    def c_code_cache_version(self):
        # Bumped (4 -> 5): the generated C changed (corrected the
        # copy-pasted "could not set op descriptor" error message after
        # cudnnGetOutputTensor4dDim), so cached modules must recompile.
        return (5,)
from theano.sandbox.cuda.opt import (local_optimizer, gpu_contiguous,
gpu_optimizer)
@local_optimizer([GpuConv])
def local_conv_dnn(node):
    """Rewrite a GpuConv node into the cuDNN-backed GpuDnnConv.

    Only fires for unit subsampling and 'full'/'valid' border modes;
    returns None (no replacement) otherwise.
    """
    if not isinstance(node.op, GpuConv):
        return
    if node.op.subsample != (1, 1):
        return
    if node.op.border_mode not in ['full', 'valid']:
        return
    img, kern = node.inputs
    replacement = GpuDnnConv(node.op.border_mode)(gpu_contiguous(img),
                                                  gpu_contiguous(kern))
    return [replacement]

gpu_optimizer.register("conv_cudnn", local_conv_dnn, 'cudnn')
......@@ -26,6 +26,8 @@ from theano.sandbox import cuda
if cuda.cuda_available == False:
raise SkipTest('Optional package cuda disabled')
from theano.sandbox.cuda.dnn import GpuDnnConv
# Needed because the GPU conv ops don't have a perform implementation.
if theano.config.mode == 'FAST_COMPILE':
theano_mode = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
......@@ -615,14 +617,13 @@ def test_valid_9_10():
print_=print_, ones=ones, rtol=1.1e-5)
def test_valid(conv_gemm=False):
def _test_valid(cls, mode=None, extra_shapes=[], version=[-1]):
seed_rng()
shapes = get_valid_shapes()
#shapes=shapes[400:426]
# I put -1 in case we forget to add version in the test to.
# I put -2 to test the reference version.
version = [-2, -1, 6]
verbose = 0
random = True
......@@ -631,28 +632,31 @@ def test_valid(conv_gemm=False):
if ones:
random = False
if conv_gemm:
# Test the GpuCorrMM version
mode = theano_mode.including("conv_gemm")
cls = cuda.blas.BaseGpuCorrMM
# dummy version; not used by GpuCorrMM so one version is enough
version = [-1]
# Add tests with strided inputs by still square images and filters.
shapes += get_shapes2(scales_img=(2, 2), img_stride=(2, 2))
shapes += get_shapes2(scales_kern=(2, 2), kern_stride=(2, 2))
else:
mode = theano_mode
cls = None
shapes += extra_shapes
exec_conv(version, shapes, verbose, random, 'valid',
print_=print_, ones=ones, rtol=1.1e-5,
theano_mode=mode, cls=cls)
def test_valid():
_test_valid(None, version=[-2, -1, 6])
def test_gemm_valid():
test_valid(conv_gemm=True)
extra_shapes = get_shapes2(scales_img=(2, 2), img_stride=(2, 2))
extra_shapes += get_shapes2(scales_kern=(2, 2), kern_stride=(2, 2))
_test_valid(cuda.blas.BaseGpuCorrMM,
mode=theano_mode.including("conv_gemm"),
extra_shapes=extra_shapes)
def test_dnn_valid():
_test_valid(GpuDnnConv, mode=theano_mode.including("cudnn"))
def test_full(conv_gemm=False):
def _test_full(cls, mode=None, version=[-1], extra_shapes=[]):
seed_rng()
shapes = get_basic_shapes()
shapes += get_shapes2()
......@@ -707,25 +711,26 @@ def test_full(conv_gemm=False):
]
# shapes=shapes[:277]
version = [-2, -1, 0, 1, 2, 3, 4, 5]
verbose = 0
random = True
if conv_gemm:
# Test the GpuCorrMM version
mode = theano_mode.including("conv_gemm")
cls = cuda.blas.BaseGpuCorrMM
# dummy version; not used by GpuCorrMM so one version is enough
version = [-1]
else:
mode = theano_mode
cls = None
shapes += extra_shapes
exec_conv(version, shapes, verbose, random, 'full',
theano_mode=mode, cls=cls)
def test_full():
_test_full(None, version=[-2, -1, 0, 1, 2, 3, 4, 5])
def test_gemm_full():
test_full(conv_gemm=True)
_test_full(cuda.blas.BaseGpuCorrMM,
mode=theano_mode.including("conv_gemm"))
def test_dnn_full():
_test_full(GpuDnnConv, mode=theano_mode.including("cudnn"))
def test_subsample(conv_gemm=False):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论