Commit 239b6d80, authored by Frédéric Bastien

Merge pull request #2117 from abergeron/dnn

Add support for subsampling and gradient to GpuDnnConv
......@@ -2,6 +2,7 @@ global-include *.txt
global-include *.c
global-include *.cu
global-include *.cuh
global-include *.h
global-include *.sh
global-include *.pkl
recursive-include docs
......
......@@ -193,7 +193,7 @@ def do_setup():
install_requires=['numpy>=1.5.0', 'scipy>=0.7.2'],
package_data={
'': ['*.txt', '*.rst', '*.cu', '*.cuh', '*.c', '*.sh', '*.pkl',
'ChangeLog'],
'*.h', 'ChangeLog'],
'theano.misc': ['*.sh']
},
scripts=['bin/theano-cache', 'bin/theano-nose', 'bin/theano-test'],
......
import numpy
import theano
from theano.gof.type import *
from theano import Op, Apply
from theano.tensor import TensorType
from theano.gof.type import CDataType
# todo: test generic
class ProdOp(Op):
    """Test helper op: wrap its tensor input into an opaque CDataType pointer.

    The output CDataType is declared with ``py_decref`` as its free
    function, so the wrapped PyObject reference is released when the
    capsule/CObject is destroyed.
    """
    __props__ = ()

    def make_node(self, i):
        # Output is a 'void *' whose destructor decrefs the wrapped object.
        return Apply(self, [i], [CDataType('void *', 'py_decref')()])

    def c_support_code(self):
        return """
void py_decref(void *p) {
  Py_XDECREF((PyObject *)p);
}
"""

    def c_code(self, node, name, inps, outs, sub):
        # Steal a new reference on the input and store it as a raw pointer.
        template = """
Py_XDECREF(%(out)s);
%(out)s = (void *)%(inp)s;
Py_INCREF(%(inp)s);
"""
        return template % {'out': outs[0], 'inp': inps[0]}

    def c_code_cache_version(self):
        return (0,)
class GetOp(Op):
    """Test helper op: unwrap a CDataType pointer back into a float32 vector.

    Inverse of ProdOp: it reinterprets the stored ``void *`` as a
    PyArrayObject and returns it as a 1-D float32 tensor.
    """
    __props__ = ()

    def make_node(self, c):
        return Apply(self, [c], [TensorType('float32', (False,))()])

    def c_support_code(self):
        return """
void py_decref(void *p) {
  Py_XDECREF((PyObject *)p);
}
"""

    def c_code(self, node, name, inps, outs, sub):
        # Cast the opaque pointer back to an ndarray and own a reference.
        template = """
Py_XDECREF(%(out)s);
%(out)s = (PyArrayObject *)%(inp)s;
Py_INCREF(%(out)s);
"""
        return template % {'out': outs[0], 'inp': inps[0]}

    def c_code_cache_version(self):
        return (0,)
def test_cdata():
    """Round-trip a float32 vector through ProdOp and GetOp.

    ProdOp wraps the tensor into an opaque CDataType pointer and GetOp
    unwraps it, so the composed function must be the identity.
    """
    inp = TensorType('float32', (False,))()
    wrapped = ProdOp()(inp)
    unwrapped = GetOp()(wrapped)

    # This should be a passthrough function for vectors
    fn = theano.function([inp], unwrapped)

    data = numpy.random.randn(9).astype('float32')
    result = fn(data)
    assert (result == data).all()
......@@ -2,6 +2,8 @@
__docformat__ = "restructuredtext en"
from theano.compat import PY3
from theano.gof import utils
from theano.gof.utils import MethodNotDefined, object2
from theano.gof import graph
......@@ -158,7 +160,7 @@ class CLinkerType(CLinkerObject):
c_extract_code=self.c_extract(name, sub, check_input))
def c_cleanup(self, name, sub):
"""Optional: Return c code to clean up after `c_extract`.
"""Return c code to clean up after `c_extract`.
This returns C code that should deallocate whatever `c_extract`
allocated or decrease the reference counts. Do not decrease
......@@ -250,7 +252,7 @@ class PureType(object):
# If filter_inplace is defined, it will be called instead of
# filter() This is to allow reusing the old allocated memory. As
# of this writing this is used only when we transfer new data to a
# shared variable on the gpu.
# shared variable on the gpu.
#def filter_inplace(value, storage, strict=False, allow_downcast=None)
......@@ -470,3 +472,97 @@ class Generic(SingletonType):
return self.__class__.__name__
generic = Generic()
class CDataType(Type):
    """
    Represents opaque C data to be passed around. The intent is to
    ease passing arbitrary data between ops C code.

    At the Python level the only valid value is None; the real payload
    lives in a PyCapsule (Python 3) or PyCObject (Python 2) and is only
    touched from generated C code.
    """
    def __init__(self, ctype, freefunc=None):
        """
        Build a type made to represent a C pointer in theano.

        :param ctype: The type of the pointer (complete with the `*`)

        :param freefunc: a function to call to free the pointer. This
                         function must have a `void` return and take a
                         single pointer argument.
        """
        assert isinstance(ctype, basestring)
        self.ctype = ctype
        if freefunc is not None:
            assert isinstance(freefunc, basestring)
        # Always set the attribute (even when None) so __eq__/__hash__
        # never raise AttributeError.
        self.freefunc = freefunc

    def __eq__(self, other):
        # BUG FIX: the original used a comma instead of `and` after the
        # ctype comparison, which made this return a 2-tuple (always
        # truthy), so instances with different freefuncs compared equal.
        return (type(self) == type(other) and
                self.ctype == other.ctype and
                self.freefunc == other.freefunc)

    def __hash__(self):
        return hash((type(self), self.ctype, self.freefunc))

    def filter(self, data, strict=False, allow_downcast=None):
        # Only None is a legal Python-side value for this type.
        if data is not None:
            raise TypeError("only None is valid")

    def is_valid_value(self, a):
        return a is None

    def c_declare(self, name, sub, check_input=True):
        return """
        %(ctype)s %(name)s;
        """ % dict(ctype=self.ctype, name=name)

    def c_init(self, name, sub):
        return "%(name)s = NULL;" % dict(name=name)

    def c_extract(self, name, sub, check_input=True):
        # PyCapsule on Python 3, PyCObject on Python 2.
        if PY3:
            s = """
        %(name)s = (%(ctype)s)PyCapsule_GetPointer(py_%(name)s, NULL);
        if (%(name)s == NULL) %(fail)s
        """
        else:
            s = """
        %(name)s = (%(ctype)s)PyCObject_AsVoidPtr(py_%(name)s);
        """
        return s % dict(name=name, ctype=self.ctype, fail=sub['fail'])

    def c_sync(self, name, sub):
        freefunc = self.freefunc
        if freefunc is None:
            freefunc = "NULL"
        s = """
        Py_XDECREF(py_%(name)s);
        if (%(name)s == NULL) {
          py_%(name)s = Py_None;
          Py_INCREF(py_%(name)s);
        } else """
        if PY3:
            s += """{
          py_%(name)s = PyCapsule_New((void *)%(name)s, NULL,
                                      (void (*)(void *))%(freefunc)s);
        }"""
        else:
            s += """{
          py_%(name)s = PyCObject_FromVoidPtr((void *)%(name)s,
                                              (void (*)(void *))%(freefunc)s);
        }"""
        if self.freefunc is not None:
            # If the capsule could not be created, free the pointer
            # ourselves so it does not leak.
            s += """
        if (py_%(name)s == NULL) { %(freefunc)s(%(name)s); }
        """
        return s % dict(name=name, freefunc=freefunc)

    def c_cleanup(self, name, sub):
        # No need to do anything here since the CObject/Capsule will
        # free the data for us when released.
        return ""

    def c_code_cache_version(self):
        return (1,)

    def __str__(self):
        return "%s{%s}" % (self.__class__.__name__, self.ctype)
......@@ -2,32 +2,53 @@ import copy
import os
import theano
from theano import Apply
from theano import tensor
from theano import Apply, tensor
from theano.gof.type import CDataType
from theano.compat.six import StringIO
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
gpu_contiguous)
from theano.sandbox.cuda.blas import GpuConv
from theano.compat import PY3
class GpuDnnConv(GpuOp):
__props__ = ('border_mode',)
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
def __init__(self, border_mode):
self.border_mode = border_mode
class DnnBase(GpuOp):
"""
Creates a handle for cudnn and pulls in the cudnn libraries and headers.
"""
def c_headers(self):
return ['cudnn.h', 'cudnn_helper.h']
def make_node(self, img, kern):
if img.type.ndim != 4:
raise TypeError('img must be 4D tensor')
if kern.type.ndim != 4:
raise TypeError('kern must be 4D tensor')
def c_header_dirs(self):
return [os.path.dirname(__file__)]
broadcastable = (img.type.broadcastable[0],
kern.type.broadcastable[0],
False, False)
def c_libraries(self):
return ['cudnn']
def c_support_code(self):
return """
cudnnHandle_t _handle = NULL;
"""
def c_init_code(self):
if PY3:
error_out = "NULL"
else:
error_out = ""
return ["""{
cudnnStatus_t err;
if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not create cudnn handle: %%s",
cudnnGetErrorString(err));
return %s;
}
}""" % (error_out,)]
return Apply(self, [img, kern], [CudaNdarrayType(broadcastable)()])
class GpuDnnConvDesc(GpuOp):
__props__ = ('border_mode', 'subsample', 'conv_mode')
def c_headers(self):
return ['cudnn.h', 'cudnn_helper.h']
......@@ -38,28 +59,121 @@ class GpuDnnConv(GpuOp):
def c_libraries(self):
return ['cudnn']
def c_compiler(self):
return NVCC_compiler
def __init__(self, border_mode, subsample=(1, 1), conv_mode='conv'):
assert border_mode in ('valid', 'full')
self.border_mode = border_mode
assert len(subsample) == 2
self.subsample = subsample
assert conv_mode in ('conv', 'cross')
self.conv_mode = conv_mode
def make_node(self, img_shape, kern_shape):
if img_shape.type.ndim != 1 and img_shape.type.dtype != numpy.int64:
raise TypeError('img must be 1D shape tensor')
if kern_shape.type.ndim != 1 and kern_shape.type.dtype != numpy.int64:
raise TypeError('kern must be 1D shape tensor')
return Apply(self, [img_shape, kern_shape],
[CDataType("cudnnConvolutionDescriptor_t")()])
def c_code(self, node, name, inputs, outputs, sub):
img_shape, kern_shape = inputs
desc, = outputs
if self.border_mode == "valid":
bmode = 1
else:
assert self.border_mode == "full"
bmode = 0
if self.conv_mode == 'conv':
conv_flag = 'CUDNN_CONVOLUTION'
else:
conv_flag = 'CUDNN_CROSS_CORRELATION'
return """
{
cudnnStatus_t err;
int pad_h%(name)s;
int pad_w%(name)s;
if ((err = cudnnCreateConvolutionDescriptor(&%(desc)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate convolution "
"descriptor: %%s", cudnnGetErrorString(err));
%(fail)s
}
if (%(bmode)d == 1) {
pad_h%(name)s = 0;
pad_w%(name)s = 0;
} else if (%(bmode)d == 0) {
pad_h%(name)s = *(npy_int64 *)PyArray_GETPTR1(%(kern_shape)s, 2) - 1;
pad_w%(name)s = *(npy_int64 *)PyArray_GETPTR1(%(kern_shape)s, 3) - 1;
} else {
PyErr_SetString(PyExc_ValueError, "bad border mode");
%(fail)s
}
err = cudnnSetConvolutionDescriptorEx(
%(desc)s,
*(npy_int64 *)PyArray_GETPTR1(%(img_shape)s, 0),
*(npy_int64 *)PyArray_GETPTR1(%(img_shape)s, 1),
*(npy_int64 *)PyArray_GETPTR1(%(img_shape)s, 2),
*(npy_int64 *)PyArray_GETPTR1(%(img_shape)s, 3),
*(npy_int64 *)PyArray_GETPTR1(%(kern_shape)s, 0),
*(npy_int64 *)PyArray_GETPTR1(%(kern_shape)s, 2),
*(npy_int64 *)PyArray_GETPTR1(%(kern_shape)s, 3),
pad_h%(name)s,
pad_w%(name)s,
%(subsx)d, %(subsy)d, 1, 1,
%(conv_flag)s
);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor: %%s",
cudnnGetErrorString(err));
%(fail)s
}
}
""" % dict(name=name, img_shape=img_shape, kern_shape=kern_shape, desc=desc,
bmode=bmode, conv_flag=conv_flag, fail=sub['fail'],
subsx=self.subsample[0], subsy=self.subsample[1])
def c_code_cache_version(self):
return (1,)
class GpuDnnConvBase(DnnBase):
__props__ = ()
def make_node(self, img, kern, desc):
if img.type.ndim != 4:
raise TypeError('img must be 4D tensor')
if kern.type.ndim != 4:
raise TypeError('kern must be 4D tensor')
broadcastable = (img.type.broadcastable[0],
kern.type.broadcastable[0],
False, False)
return Apply(self, [img, kern, desc],
[CudaNdarrayType(broadcastable)()])
def c_support_code_struct(self, node, struct_id):
return """
cudnnHandle_t handle%(id)d;
cudnnTensor4dDescriptor_t input%(id)d;
cudnnTensor4dDescriptor_t output%(id)d;
cudnnFilterDescriptor_t kerns%(id)d;
cudnnConvolutionDescriptor_t op%(id)d;
""" % dict(id=struct_id)
def c_init_code_struct(self, node, struct_id, sub):
return """
handle%(id)d = NULL;
cudnnStatus_t err%(id)d;
input%(id)d = NULL;
output%(id)d = NULL;
kerns%(id)d = NULL;
op%(id)d = NULL;
cudnnStatus_t err%(id)d;
if ((err%(id)d = cudnnCreate(&handle%(id)d)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not create cudnn handle: %%s",
cudnnGetErrorString(err%(id)d));
%(fail)s
}
if ((err%(id)d = cudnnCreateTensor4dDescriptor(&input%(id)d)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
"(inp): %%s", cudnnGetErrorString(err%(id)d));
......@@ -75,11 +189,6 @@ if ((err%(id)d = cudnnCreateFilterDescriptor(&kerns%(id)d)) != CUDNN_STATUS_SUCC
cudnnGetErrorString(err%(id)d));
%(fail)s
}
if ((err%(id)d = cudnnCreateConvolutionDescriptor(&op%(id)d)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate convolution "
"descriptor: %%s", cudnnGetErrorString(err%(id)d));
%(fail)s
}
""" % dict(id=struct_id, fail=sub['fail'])
def c_cleanup_code_struct(self, node, struct_id):
......@@ -87,123 +196,109 @@ if ((err%(id)d = cudnnCreateConvolutionDescriptor(&op%(id)d)) != CUDNN_STATUS_SU
cudnnDestroyTensor4dDescriptor(input%(id)d);
cudnnDestroyTensor4dDescriptor(output%(id)d);
cudnnDestroyFilterDescriptor(kerns%(id)d);
cudnnDestroyConvolutionDescriptor(op%(id)d);
cudnnDestroy(handle%(id)d);
""" % dict(id=struct_id)
def c_code(self, node, name, inputs, outputs, sub):
img, kern = inputs
out, = outputs
if self.border_mode == "valid":
bmode = 1
else:
assert self.border_mode == "full"
bmode = 0
def c_set_tensor4d(self, var, desc, err, fail):
return """
cudnnStatus_t err%(name)s;
int pad_w%(name)s;
int pad_h%(name)s;
if (!CudaNdarray_is_c_contiguous(%(img)s)) {
PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
%(fail)s
}
if (!CudaNdarray_is_c_contiguous(%(kerns)s)) {
PyErr_SetString(PyExc_ValueError, "Only contiguous filters are supported.");
%(fail)s
}
err%(name)s = cudnnSetTensor4dDescriptorEx(
input%(id)d, CUDNN_DATA_FLOAT,
CudaNdarray_HOST_DIMS(%(img)s)[0],
CudaNdarray_HOST_DIMS(%(img)s)[1],
CudaNdarray_HOST_DIMS(%(img)s)[2],
CudaNdarray_HOST_DIMS(%(img)s)[3],
CudaNdarray_HOST_STRIDES(%(img)s)[0],
CudaNdarray_HOST_STRIDES(%(img)s)[1],
CudaNdarray_HOST_STRIDES(%(img)s)[2],
CudaNdarray_HOST_STRIDES(%(img)s)[3]
%(err)s = cudnnSetTensor4dDescriptorEx(
%(desc)s, CUDNN_DATA_FLOAT,
CudaNdarray_HOST_DIMS(%(var)s)[0],
CudaNdarray_HOST_DIMS(%(var)s)[1],
CudaNdarray_HOST_DIMS(%(var)s)[2],
CudaNdarray_HOST_DIMS(%(var)s)[3],
CudaNdarray_HOST_STRIDES(%(var)s)[0]?CudaNdarray_HOST_STRIDES(%(var)s)[0]:CudaNdarray_HOST_DIMS(%(var)s)[2]*CudaNdarray_HOST_DIMS(%(var)s)[3]*CudaNdarray_HOST_DIMS(%(var)s)[1],
CudaNdarray_HOST_STRIDES(%(var)s)[1]?CudaNdarray_HOST_STRIDES(%(var)s)[1]:CudaNdarray_HOST_DIMS(%(var)s)[2]*CudaNdarray_HOST_DIMS(%(var)s)[3],
CudaNdarray_HOST_STRIDES(%(var)s)[2]?CudaNdarray_HOST_STRIDES(%(var)s)[2]:CudaNdarray_HOST_DIMS(%(var)s)[3],
CudaNdarray_HOST_STRIDES(%(var)s)[3]?CudaNdarray_HOST_STRIDES(%(var)s)[3]:1
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
if (%(err)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set tensor4d descriptor: %%s",
cudnnGetErrorString(err%(name)s));
cudnnGetErrorString(%(err)s));
%(fail)s
}
err%(name)s = cudnnSetFilterDescriptor(
kerns%(id)d, CUDNN_DATA_FLOAT,
CudaNdarray_HOST_DIMS(%(kerns)s)[0],
CudaNdarray_HOST_DIMS(%(kerns)s)[1],
CudaNdarray_HOST_DIMS(%(kerns)s)[2],
CudaNdarray_HOST_DIMS(%(kerns)s)[3]
""" % dict(var=var, err=err, desc=desc, fail=fail)
def c_set_filter(self, var, desc, err, fail):
return """
%(err)s = cudnnSetFilterDescriptor(
%(desc)s, CUDNN_DATA_FLOAT,
CudaNdarray_HOST_DIMS(%(var)s)[0],
CudaNdarray_HOST_DIMS(%(var)s)[1],
CudaNdarray_HOST_DIMS(%(var)s)[2],
CudaNdarray_HOST_DIMS(%(var)s)[3]
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
if (%(err)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set filter descriptor: %%s",
cudnnGetErrorString(err%(name)s));
cudnnGetErrorString(%(err)s));
%(fail)s
}
if (%(bmode)d == 1) {
pad_h%(name)s = 0;
pad_w%(name)s = 0;
} else if (%(bmode)d == 0) {
pad_h%(name)s = CudaNdarray_HOST_DIMS(%(kerns)s)[2] - 1;
pad_w%(name)s = CudaNdarray_HOST_DIMS(%(kerns)s)[3] - 1;
} else {
PyErr_SetString(PyExc_ValueError, "bad border mode");
%(fail)s
}
err%(name)s = cudnnSetConvolutionDescriptor(
op%(id)d, input%(id)d, kerns%(id)d,
pad_h%(name)s,
pad_w%(name)s,
1, 1, 1, 1,
CUDNN_CONVOLUTION
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor: %%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
""" % dict(var=var, desc=desc, err=err, fail=fail)
def c_code(self, node, name, inputs, outputs, sub):
desc = inputs[2]
out, = outputs
checks = []
for v in inputs[:2]:
checks.append("""
if (!CudaNdarray_is_c_contiguous(%s)) {
PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
%s
}
""" % (v, sub['fail']))
sets = []
for p, v, d in zip(inputs[:2], self.conv_inputs, self.conv_types[:2]):
sets.append(getattr(self, 'c_set_'+d)(p, v + str(sub['struct_id']),
'err' + name, sub['fail']))
set_out = getattr(self, 'c_set_' + self.conv_types[2])(
out, self.conv_output + str(sub['struct_id']), 'err' + name,
sub['fail'])
return """
cudnnStatus_t err%(name)s;
%(checks)s
%(sets)s
{
int out_dims[4];
err%(name)s = cudnnGetOutputTensor4dDim(
op%(id)d, CUDNN_CONVOLUTION_FWD,
&out_dims[0], &out_dims[1],
&out_dims[2], &out_dims[3]
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor: %%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
if (CudaNdarray_prep_output(&%(out)s, 4, out_dims) != 0) {
%(fail)s
int out_dims[4];
err%(name)s = cudnnGetOutputTensor4dDim(
%(desc)s, %(path)s,
&out_dims[0], &out_dims[1],
&out_dims[2], &out_dims[3]
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not get output sizes: %%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
// workaround for cudnn R1 bug
if (%(path)s == CUDNN_CONVOLUTION_WEIGHT_GRAD &&
(out_dims[0] != CudaNdarray_HOST_DIMS(%(input2)s)[1] ||
out_dims[1] != CudaNdarray_HOST_DIMS(%(input1)s)[1])) {
out_dims[0] = CudaNdarray_HOST_DIMS(%(input2)s)[1];
out_dims[1] = CudaNdarray_HOST_DIMS(%(input1)s)[1];
// This is a horrible hack that is unfortulately necessary
int *dd = (int *)%(desc)s;
out_dims[2] = dd[5];
out_dims[3] = dd[6];
}
if (CudaNdarray_prep_output(&%(out)s, 4, out_dims) != 0) {
%(fail)s
}
}
}
err%(name)s = cudnnSetTensor4dDescriptorEx(
output%(id)d, CUDNN_DATA_FLOAT,
CudaNdarray_HOST_DIMS(%(out)s)[0],
CudaNdarray_HOST_DIMS(%(out)s)[1],
CudaNdarray_HOST_DIMS(%(out)s)[2],
CudaNdarray_HOST_DIMS(%(out)s)[3],
CudaNdarray_HOST_STRIDES(%(out)s)[0],
CudaNdarray_HOST_STRIDES(%(out)s)[1],
CudaNdarray_HOST_STRIDES(%(out)s)[2],
CudaNdarray_HOST_STRIDES(%(out)s)[3]
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set out descriptor: %%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
err%(name)s = cudnnConvolutionForward(
handle%(id)d,
input%(id)d, CudaNdarray_DEV_DATA(%(img)s),
kerns%(id)d, CudaNdarray_DEV_DATA(%(kerns)s),
op%(id)d,
output%(id)d, CudaNdarray_DEV_DATA(%(out)s),
%(set_out)s
err%(name)s = %(method)s(
_handle,
%(input1_desc)s, CudaNdarray_DEV_DATA(%(input1)s),
%(input2_desc)s, CudaNdarray_DEV_DATA(%(input2)s),
%(desc)s,
%(output_desc)s, CudaNdarray_DEV_DATA(%(out)s),
CUDNN_RESULT_NO_ACCUMULATE
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
......@@ -211,25 +306,77 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
cudnnGetErrorString(err%(name)s));
%(fail)s
}
""" % dict(img=img, kerns=kern, out=out, bmode=bmode,
fail=sub['fail'], id=sub['struct_id'], name=name)
""" % dict(out=out, desc=desc, fail=sub['fail'], id=sub['struct_id'],
name=name, checks='\n'.join(checks), sets='\n'.join(sets),
set_out=set_out, input1=inputs[0], input2=inputs[1],
input1_desc=self.conv_inputs[0]+str(sub['struct_id']),
input2_desc=self.conv_inputs[1]+str(sub['struct_id']),
output_desc=self.conv_output+str(sub['struct_id']),
method=self.conv_op, path=self.path_flag)
def c_code_cache_version(self):
return (4,)
return (7,)
class GpuDnnConv(GpuDnnConvBase):
    """Forward cuDNN convolution: inputs (img, kerns, desc) -> output."""
    conv_inputs = ('input', 'kerns')
    conv_output = 'output'
    conv_types = ('tensor4d', 'filter', 'tensor4d')
    conv_op = 'cudnnConvolutionForward'
    path_flag = 'CUDNN_CONVOLUTION_FWD'

    def grad(self, inp, grads):
        """Gradients w.r.t. img and kerns; the descriptor is disconnected."""
        img, kerns, desc = inp
        (top,) = grads
        top = gpu_contiguous(top)
        grad_img = GpuDnnConvGradI()(kerns, top, desc)
        grad_kerns = GpuDnnConvGradW()(img, top, desc)
        return grad_img, grad_kerns, theano.gradient.DisconnectedType()()

    def connection_pattern(self, node):
        # The gradient does not flow through the convolution descriptor.
        return [[1], [1], [0]]
class GpuDnnConvGradW(GpuDnnConvBase):
    """cuDNN gradient w.r.t. the filters: (input, top_grad, desc) -> kerns."""
    conv_inputs = ('input', 'output')
    conv_output = 'kerns'
    conv_types = ('tensor4d', 'tensor4d', 'filter')
    conv_op = 'cudnnConvolutionBackwardFilter'
    path_flag = 'CUDNN_CONVOLUTION_WEIGHT_GRAD'
class GpuDnnConvGradI(GpuDnnConvBase):
    """cuDNN gradient w.r.t. the image: (kerns, top_grad, desc) -> input."""
    conv_inputs = ('kerns', 'output')
    conv_output = 'input'
    conv_types = ('filter', 'tensor4d', 'tensor4d')
    conv_op = 'cudnnConvolutionBackwardData'
    path_flag = 'CUDNN_CONVOLUTION_DATA_GRAD'
from theano.sandbox.cuda.opt import (local_optimizer, gpu_contiguous,
gpu_optimizer)
def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
             conv_mode='conv'):
    """Build a cuDNN convolution of `img` by `kerns`.

    Both operands are made contiguous, a convolution descriptor is built
    from their symbolic shapes, and the forward op is applied.
    """
    c_img = gpu_contiguous(img)
    c_kerns = gpu_contiguous(kerns)
    desc_op = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
                             conv_mode=conv_mode)
    desc = desc_op(c_img.shape, c_kerns.shape)
    return GpuDnnConv()(c_img, c_kerns, desc)
@local_optimizer([GpuConv])
def local_conv_dnn(node):
if isinstance(node.op, GpuConv):
if (node.op.subsample != (1, 1) or
node.op.border_mode not in ['full', 'valid']):
if node.op.border_mode not in ['full', 'valid']:
return
img, kern = node.inputs
border_mode = node.op.border_mode
return [GpuDnnConv(border_mode)(gpu_contiguous(img),
gpu_contiguous(kern))]
subsample = node.op.subsample
return [dnn_conv(gpu_contiguous(img), gpu_contiguous(kern),
border_mode=border_mode, subsample=subsample)]
gpu_optimizer.register("conv_cudnn", local_conv_dnn, 'cudnn')
......@@ -4,7 +4,7 @@ Tests for GPU convolution
import sys
import time
import unittest
import traceback
import numpy
......@@ -19,14 +19,14 @@ except ImportError:
import theano
from theano import tensor
from theano.gof.python25 import any
from theano.tests.unittest_tools import seed_rng
from theano.tests.unittest_tools import seed_rng, assert_allclose
# Skip test if cuda is not available.
from theano.sandbox import cuda
if cuda.cuda_available == False:
raise SkipTest('Optional package cuda disabled')
from theano.sandbox.cuda.dnn import GpuDnnConv
from theano.sandbox.cuda.dnn import GpuDnnConv, GpuDnnConvBase, dnn_conv
#needed as the gpu conv don't have a perform implementation.
if theano.config.mode == 'FAST_COMPILE':
......@@ -48,7 +48,7 @@ if device_id is None:
enable_cuda=False,
test_driver=True)
device_id = theano.sandbox.cuda.use.device_number
cuda_ndarray = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray
device_prop = cuda_ndarray.device_properties(device_id)
......@@ -173,40 +173,30 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
npy_kern = npy_kern[:, :, ::kern_stride[0], ::kern_stride[1]]
t2 = None
rval = True
try:
t0 = time.time()
cpuval = py_conv(npy_img, npy_kern, mode, subsample)
t1 = time.time()
i = cuda_tensor4()
k = cuda_tensor4()
op = theano.sandbox.cuda.blas.GpuConv(border_mode=mode,
subsample=subsample,
version=version,
verbose=verbose,
kshp=compile_kshp)(i, k)
f = theano.function([i, k], op, mode=theano_mode)
if cls is not None:
assert any([isinstance(node.op, cls)
for node in f.maker.fgraph.toposort()]), "Cannot find class %r in %r" % (cls, f.maker.fgraph.toposort())
gpuval = f(img, kern)
t2 = time.time()
for i in range(nb_iter):
gpuval2 = f(img, kern)
assert numpy.allclose(numpy.asarray(gpuval),
numpy.asarray(gpuval2))
assert (numpy.asarray(gpuval) == numpy.asarray(gpuval2)).all()
gpuval = numpy.asarray(gpuval)
if gpuval.shape != cpuval.shape:
print >> sys.stdout, "ERROR: shape mismatch",
print >> sys.stdout, gpuval.shape, cpuval.shape
rval = False
if rval:
rval = numpy.allclose(cpuval, gpuval, rtol=rtol)
assert numpy.all(numpy.isfinite(gpuval)), gpuval
except NotImplementedError, e:
print >> sys.stdout, '_params_allgood Failed allclose', e
rval = False
t0 = time.time()
cpuval = py_conv(npy_img, npy_kern, mode, subsample)
t1 = time.time()
i = cuda_tensor4()
k = cuda_tensor4()
op = theano.sandbox.cuda.blas.GpuConv(border_mode=mode,
subsample=subsample,
version=version,
verbose=verbose,
kshp=compile_kshp)(i, k)
f = theano.function([i, k], op, mode=theano_mode)
if cls is not None:
assert any([isinstance(node.op, cls)
for node in f.maker.fgraph.toposort()]), "Cannot find class %r in %r" % (cls, f.maker.fgraph.toposort())
gpuval = f(img, kern)
t2 = time.time()
for i in range(nb_iter):
gpuval2 = f(img, kern)
assert (numpy.asarray(gpuval) == numpy.asarray(gpuval2)).all()
gpuval = numpy.asarray(gpuval)
assert gpuval.shape == cpuval.shape, ("shape mismatch", gpuval.shape, cpuval.shape)
assert_allclose(cpuval, gpuval, rtol=rtol, atol=atol)
assert numpy.all(numpy.isfinite(gpuval)), gpuval
if (t2 is not None):
if mode == 'valid':
......@@ -221,37 +211,6 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
print >> sys.stdout, '%15s' % str(ishape), '%15s' % str(kshape),
print >> sys.stdout, '%12.5f %7.2f %7.2f %7.1f' % (approx_fp,
cpu_mflops, gpu_mflops, (t1 - t0) / (t2 - t1))
if not rval:
print >> sys.stdout, ('test_' + mode + ' id=' + str(id) +
' FAILED for ishape, kshape, mode, subsample,' +
' img_stride, kern_stride, version', ishape,
kshape, mode, subsample, img_stride, kern_stride,
version)
diff = cpuval - gpuval
diffabs = numpy.absolute(diff)
pr_diff = diffabs / numpy.absolute(cpuval)
nb_close = (diffabs <= (atol + rtol * numpy.absolute(gpuval))).sum()
print "max absolute diff:", (diffabs.max(), "avg abs diff:",
numpy.average(diffabs))
print "median abs diff:", (numpy.median(diffabs), "nb close:",
nb_close, "/", diff.size)
print "max relatif diff:", (pr_diff.max(), "avg rel diff:",
numpy.average(pr_diff))
if not rval and print_ != False:
if npy_img.shape[0] > 5:
print "img", npy_img[0]
print "kern", npy_kern[0]
print "gpu", gpuval[0][0]
print "cpu", cpuval[0][0]
print "diff", diff[0][0]
else:
print "img", npy_img
print "kern", npy_kern
print "gpu", gpuval
print "cpu", cpuval
print "diff", diff
return rval
def exec_conv(version, shapes, verbose, random, mode,
......@@ -259,46 +218,13 @@ def exec_conv(version, shapes, verbose, random, mode,
theano_mode=theano_mode, cls=None):
if verbose > 0:
_params_allgood_header()
nb_failed = 0
nb_tests = 0
failed_version = set()
failed_id = []
for ver in version:
for id, (ishape, kshape, subshape,
istride, kstride) in enumerate(shapes):
ret = False
try:
ret = _params_allgood(ishape,
kshape,
mode,
subsample=subshape,
img_stride=istride,
kern_stride=kstride,
version=ver,
verbose=verbose,
random=random,
id=id,
print_=print_,
rtol=rtol,
ones=ones,
theano_mode=theano_mode,
cls=cls)
except Exception, e:
print ver, id, (ishape, kshape, subshape, istride, kstride)
print "Exception", type(e), e
pass
if not ret:
failed_version.add(ver)
failed_id.append(id)
nb_failed += 1
nb_tests += 1
if nb_failed > 0:
print "nb_failed", nb_failed, "on", nb_tests,
print "failed_version", failed_version, "failed_id", failed_id
assert nb_failed == 0, nb_failed
else:
print 'Executed', nb_tests, 'different shapes'
yield (_params_allgood, ishape, kshape, mode, subshape,
istride, kstride, ver, verbose, random, print_, id,
rtol, 1e-8, 0, ones, None, theano_mode, cls)
def get_basic_shapes():
......@@ -453,8 +379,9 @@ def test_valid_0_2():
shapes2.append((ishape, kshape, subshape, istride, kstride))
shapes = shapes2
exec_conv(version, shapes, verbose, random, 'valid',
print_=print_, ones=ones, rtol=1.1e-5)
for t in exec_conv(version, shapes, verbose, random, 'valid',
print_=print_, ones=ones, rtol=1.1e-5):
yield t
def test_valid_1_3_11_12():
......@@ -483,8 +410,9 @@ def test_valid_1_3_11_12():
shapes2.append((ishape, kshape, subshape, istride, kstride))
shapes = shapes2
exec_conv(version, shapes, verbose, random, 'valid',
print_=print_, ones=ones, rtol=1.1e-5)
for t in exec_conv(version, shapes, verbose, random, 'valid',
print_=print_, ones=ones, rtol=1.1e-5):
yield t
def test_valid_4():
......@@ -515,8 +443,9 @@ def test_valid_4():
shapes2.append((ishape, kshape, subshape, istride, kstride))
shapes = shapes2
exec_conv(version, shapes, verbose, random, 'valid',
print_=print_, ones=ones, rtol=1.1e-5)
for t in exec_conv(version, shapes, verbose, random, 'valid',
print_=print_, ones=ones, rtol=1.1e-5):
yield t
def test_valid_5():
......@@ -532,7 +461,6 @@ def test_valid_5():
random = False
shapes2 = []
# print len(shapes)
for id, (ishape, kshape, subshape, istride, kstride) in enumerate(shapes):
oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
numpy.asarray(kshape[2:]) +
......@@ -545,10 +473,10 @@ def test_valid_5():
if subshape == (1, 1):
shapes2.append((ishape, kshape, subshape, istride, kstride))
shapes = shapes2
# print len(shapes2)
exec_conv(version, shapes, verbose, random, 'valid',
print_=print_, ones=ones, rtol=1.1e-5)
for t in exec_conv(version, shapes, verbose, random, 'valid',
print_=print_, ones=ones, rtol=1.1e-5):
yield t
def test_valid_7_8_13():
......@@ -567,7 +495,6 @@ def test_valid_7_8_13():
random = False
shapes2 = []
# print len(shapes)
for id, (ishape, kshape, subshape, istride, kstride) in enumerate(shapes):
oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
numpy.asarray(kshape[2:]) +
......@@ -580,10 +507,10 @@ def test_valid_7_8_13():
if subshape == (1, 1):
shapes2.append((ishape, kshape, subshape, istride, kstride))
shapes = shapes2
# print len(shapes2)
exec_conv(version, shapes, verbose, random, 'valid',
print_=print_, ones=ones, rtol=1.1e-5)
for t in exec_conv(version, shapes, verbose, random, 'valid',
print_=print_, ones=ones, rtol=1.1e-5):
yield t
def test_valid_9_10():
......@@ -599,7 +526,6 @@ def test_valid_9_10():
random = False
shapes2 = []
# print len(shapes)
for id, (ishape, kshape, subshape, istride, kstride) in enumerate(shapes):
oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
numpy.asarray(kshape[2:]) +
......@@ -611,19 +537,16 @@ def test_valid_9_10():
if subshape == (1, 1):
shapes2.append((ishape, kshape, subshape, istride, kstride))
shapes = shapes2
# print len(shapes2)
exec_conv(version, shapes, verbose, random, 'valid',
print_=print_, ones=ones, rtol=1.1e-5)
for t in exec_conv(version, shapes, verbose, random, 'valid',
print_=print_, ones=ones, rtol=1.1e-5):
yield t
def _test_valid(cls, mode=None, extra_shapes=[], version=[-1]):
seed_rng()
shapes = get_valid_shapes()
#shapes=shapes[400:426]
# I put -1 in case we forget to add version in the test to.
# I put -2 to test the reference version.
verbose = 0
random = True
......@@ -634,26 +557,30 @@ def _test_valid(cls, mode=None, extra_shapes=[], version=[-1]):
shapes += extra_shapes
exec_conv(version, shapes, verbose, random, 'valid',
print_=print_, ones=ones, rtol=1.1e-5,
theano_mode=mode, cls=cls)
return exec_conv(version, shapes, verbose, random, 'valid',
print_=print_, ones=ones, rtol=1.1e-5,
theano_mode=mode, cls=cls)
def test_valid():
_test_valid(None, version=[-2, -1, 6])
for t in _test_valid(None, version=[-2, -1, 6]):
yield t
def test_gemm_valid():
extra_shapes = get_shapes2(scales_img=(2, 2), img_stride=(2, 2))
extra_shapes += get_shapes2(scales_kern=(2, 2), kern_stride=(2, 2))
_test_valid(cuda.blas.BaseGpuCorrMM,
mode=theano_mode.including("conv_gemm"),
extra_shapes=extra_shapes)
for t in _test_valid(cuda.blas.BaseGpuCorrMM,
mode=theano_mode.including("conv_gemm"),
extra_shapes=extra_shapes):
yield t
def test_dnn_valid():
_test_valid(GpuDnnConv, mode=theano_mode.including("cudnn"))
for t in _test_valid(GpuDnnConv, mode=theano_mode.including("cudnn")):
yield t
def _test_full(cls, mode=None, version=[-1], extra_shapes=[]):
......@@ -710,30 +637,32 @@ def _test_full(cls, mode=None, version=[-1], extra_shapes=[]):
, ((1,1,44800,1), (6,1,1,1), (1, 1), (1, 1), (1, 1))#This caused crash
]
# shapes=shapes[:277]
verbose = 0
random = True
shapes += extra_shapes
exec_conv(version, shapes, verbose, random, 'full',
theano_mode=mode, cls=cls)
return exec_conv(version, shapes, verbose, random, 'full',
theano_mode=mode, cls=cls)
def test_full():
_test_full(None, version=[-2, -1, 0, 1, 2, 3, 4, 5])
for t in _test_full(None, version=[-2, -1, 0, 1, 2, 3, 4, 5]):
yield t
def test_gemm_full():
_test_full(cuda.blas.BaseGpuCorrMM,
mode=theano_mode.including("conv_gemm"))
for t in _test_full(cuda.blas.BaseGpuCorrMM,
mode=theano_mode.including("conv_gemm")):
yield t
def test_dnn_full():
_test_full(GpuDnnConv, mode=theano_mode.including("cudnn"))
for t in _test_full(GpuDnnConv, mode=theano_mode.including("cudnn")):
yield t
def test_subsample(conv_gemm=False):
def _test_subsample(cls, mode, version_valid=[-1], version_full=[-1]):
seed_rng()
shapes = [((1, 1, 1, 1), (1, 1, 1, 1), (1, 1), (1, 1), (1, 1)),
((1, 1, 1, 1), (1, 1, 1, 1), (2, 2), (1, 1), (1, 1)),
......@@ -748,8 +677,6 @@ def test_subsample(conv_gemm=False):
# We put only the version that implement the subsample to make the
# test faster.
version_valid = [-2, -1, 1, 3, 11, 12]
version_full = [-2, -1]
verbose = 0
random = True
print_ = False
......@@ -757,26 +684,32 @@ def test_subsample(conv_gemm=False):
if ones:
random = False
if conv_gemm:
# Test the GpuCorrMM version
mode = theano_mode.including("conv_gemm")
cls = cuda.blas.BaseGpuCorrMM
# dummy version; not used by GpuCorrMM so one version is enough
version_valid = version_full = [-1]
else:
mode = theano_mode
cls = None
for t in exec_conv(version_valid, shapes, verbose, random, 'valid',
print_=print_, ones=ones,
theano_mode=mode, cls=cls):
yield t
for t in exec_conv(version_full, shapes, verbose, random, 'full',
print_=print_, ones=ones,
theano_mode=mode, cls=cls):
yield t
exec_conv(version_valid, shapes, verbose, random, 'valid',
print_=print_, ones=ones,
theano_mode=mode, cls=cls)
exec_conv(version_full, shapes, verbose, random, 'full',
print_=print_, ones=ones,
theano_mode=mode, cls=cls)
def test_subsample():
    """Generate subsample tests for the default (non-gemm) GPU convolution."""
    cases = _test_subsample(None, theano_mode,
                            version_valid=[-2, -1, 1, 3, 11, 12],
                            version_full=[-2, -1])
    for case in cases:
        yield case
def test_gemm_subsample():
    """Generate subsample tests for the GpuCorrMM (gemm) implementation.

    Fix: remove the stray ``test_subsample(conv_gemm=True)`` call -- a
    leftover of the pre-refactor API.  The refactored ``test_subsample``
    takes no arguments, so that call would raise TypeError as soon as this
    generator was iterated.
    """
    for t in _test_subsample(cuda.blas.BaseGpuCorrMM,
                             theano_mode.including("conv_gemm")):
        yield t
def test_dnn_subsample():
    """Generate subsample tests for the cuDNN convolution implementation."""
    dnn_mode = theano_mode.including('cudnn')
    for case in _test_subsample(GpuDnnConv, dnn_mode):
        yield case
# NOTE(review): the class body was elided by the diff hunk marker below;
# only the tail of what looks like a tearDown-style restoration of the
# module-level `theano_mode` is visible.  Kept byte-identical.
class TestConv2DGPU(unittest.TestCase):
......@@ -850,58 +783,127 @@ class TestConv2DGPU(unittest.TestCase):
theano_mode = theano_mode_orig
def gemm_directly(bs, ch, nf, rImg1, rImg2, rFlt1, rFlt2, subsx, subsy,
                  direction):
    """Compare one GpuCorrMM-family op against the CPU reference py_conv.

    ``direction`` selects the op under test: 'fprop' (GpuCorrMM),
    'bprop img' (GpuCorrMM_gradInputs) or 'bprop kern'
    (GpuCorrMM_gradWeights).  Raises through assert_allclose on mismatch.
    """
    img_shape = (bs, ch, rImg1, rImg2)
    kern_shape = (nf, ch, rFlt1, rFlt2)
    subsample = (subsx, subsy)

    img_vals = theano._asarray(numpy.random.rand(*img_shape),
                               dtype='float32')
    kern_vals = theano._asarray(numpy.random.rand(*kern_shape),
                                dtype='float32')

    img_var = cuda_tensor4()
    kern_var = cuda_tensor4()

    if direction == 'fprop':
        cpuval = py_conv(img_vals, kern_vals, 'valid', subsample)
        op = theano.sandbox.cuda.blas.GpuCorrMM(
            border_mode='valid', subsample=subsample)(img_var, kern_var)
        f = theano.function([img_var, kern_var], op, mode=theano_mode)
        # correlation vs. convolution: flip the kernel for the GPU op
        gpuval = f(img_vals, kern_vals[:, :, ::-1, ::-1])
    elif direction == 'bprop img':
        cpuval = py_conv(img_vals, kern_vals, 'full', subsample)
        op = theano.sandbox.cuda.blas.GpuCorrMM_gradInputs(
            border_mode='valid', subsample=subsample)(img_var, kern_var)
        f = theano.function([img_var, kern_var], op, mode=theano_mode)
        gpuval = f(kern_vals.transpose(1, 0, 2, 3), img_vals)
    elif direction == 'bprop kern':
        cpuval = py_conv(img_vals, kern_vals, 'valid', subsample)
        op = theano.sandbox.cuda.blas.GpuCorrMM_gradWeights(
            border_mode='valid', subsample=subsample)(img_var, kern_var)
        f = theano.function([img_var, kern_var], op, mode=theano_mode)
        gpuval = numpy.array(f(
            img_vals.transpose(1, 0, 2, 3),
            kern_vals.transpose(1, 0, 2, 3)[:, :, ::-1, ::-1],
        )).transpose(1, 0, 2, 3)

    assert_allclose(cpuval, gpuval, rtol=1e-4)
# NOTE(review): this is the pre-refactor, in-place version of
# test_gemm_directly (superseded by the `gemm_directly` helper plus the
# yielding `test_gemm_grads` below).  The paste has flattened its
# indentation, so it is kept byte-identical rather than restyled;
# reconstruct from version control before editing.
def test_gemm_directly():
for direction in ['fprop', 'bprop img', 'bprop kern']:
print 'Testing direction: ' + direction
for bs in range(1, 5):
for ch in range(1,4):
for nf in range(1,4):
for rImg1 in range(5, 9):
for rImg2 in range(5, 9):
for rFlt1 in range(2, 4):
for rFlt2 in range(2, 4):
# subsampling is only exercised for the forward pass
for subsx in range(1, 3) if direction == 'fprop' else [1]:
for subsy in range(1, 3) if direction == 'fprop' else [1]:
ishape = (bs, ch, rImg1, rImg2)
kshape = (nf, ch, rFlt1, rFlt2)
subsample = (subsx, subsy)
npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32')
npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32')
i = cuda_tensor4()
k = cuda_tensor4()
if direction == 'fprop':
cpuval = py_conv(npy_img, npy_kern, 'valid', subsample)
op = theano.sandbox.cuda.blas.GpuCorrMM(border_mode='valid',
                                        subsample=subsample)(i, k)
f = theano.function([i, k], op, mode=theano_mode)
gpuval = f(npy_img, npy_kern[:,:,::-1,::-1])
elif direction == 'bprop img':
cpuval = py_conv(npy_img, npy_kern, 'full', subsample)
op = theano.sandbox.cuda.blas.GpuCorrMM_gradInputs(border_mode='valid',
                                                   subsample=subsample)(i, k)
f = theano.function([i, k], op, mode=theano_mode)
gpuval = f(npy_kern.transpose(1, 0, 2, 3), npy_img)
elif direction == 'bprop kern':
cpuval = py_conv(npy_img, npy_kern, 'valid', subsample)
op = theano.sandbox.cuda.blas.GpuCorrMM_gradWeights(border_mode='valid',
                                                    subsample=subsample)(i, k)
f = theano.function([i, k], op, mode=theano_mode)
gpuval = numpy.array(f(npy_img.transpose(1, 0, 2, 3),
                       npy_kern.transpose(1, 0, 2, 3)[:,:,::-1,::-1])).transpose(1, 0, 2, 3)
if not numpy.allclose(cpuval, gpuval, rtol=1e-4):
print "Test failed for"
print "direction: ", direction
print "ishape: ", ishape
print "kshape: ", kshape
print "subsample: ", subsample
assert False
def test_gemm_grads():
    """Yield ``gemm_directly`` cases covering the GpuCorrMM gradients.

    For every shape combination, both backward directions are emitted with
    unit subsampling, then the forward pass is emitted for every
    (subsx, subsy) pair in {1, 2} x {1, 2}.
    """
    for bs in range(1, 5):
        for ch in range(1, 4):
            for nf in range(1, 4):
                for rImg1 in range(5, 9):
                    for rImg2 in range(5, 9):
                        for rFlt1 in range(2, 4):
                            for rFlt2 in range(2, 4):
                                base = (gemm_directly, bs, ch, nf,
                                        rImg1, rImg2, rFlt1, rFlt2)
                                yield base + (1, 1, 'bprop img')
                                yield base + (1, 1, 'bprop kern')
                                for sx in range(1, 3):
                                    for sy in range(1, 3):
                                        yield base + (sx, sy, 'fprop')
def gemm_op(mode, subsample):
    """Build a GpuCorrMM op emulating border_mode ``mode``.

    'full' convolution is expressed as a 'valid' correlation with full
    padding; any other mode uses no padding.
    """
    if mode == 'full':
        padding = 'full'
    else:
        padding = (0, 0)
    return theano.sandbox.cuda.blas.GpuCorrMM('valid', subsample, padding)
def dnn_op(mode, subsample):
    """Return a callable applying a cuDNN cross-correlation with ``mode``."""
    def apply_conv(img, kern):
        return dnn_conv(img, kern, border_mode=mode, conv_mode='cross',
                        subsample=subsample)
    return apply_conv
# Compare one GPU correlation implementation (built by `op`, e.g. gemm_op
# or dnn_op) against the CPU reference conv2d graph: forward output, first
# derivatives w.r.t. image and kernel, and -- when the reference supports
# it -- the mixed second derivatives.  The kernel is flipped for the
# reference so correlation and convolution agree.
# NOTE(review): body indentation appears flattened in this paste; code is
# kept byte-identical.
def conv_grad(mode, bs, ch, nf, rImg1, rImg2, rFlt1, rFlt2, subsample, op):
ishape = (bs, ch, rImg1, rImg2)
kshape = (nf, ch, rFlt1, rFlt2)
npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32')
npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32')
i = cuda_tensor4()
k = cuda_tensor4()
# TODO: also test custom pad values
corr_op = op(mode, subsample)(i, k)
# try to compile reference implementation without shape,
# so we don't have to compile hundreds of versions
conv_op = tensor.nnet.conv2d(i, k[:,:,::-1,::-1],
                             border_mode=mode, subsample=subsample)
try:
conv_op_di = theano.grad(conv_op.sum(), i)
conv_op_dk = theano.grad(conv_op.sum(), k)
except Exception:
# compile with shape information only when needed
conv_op = tensor.nnet.conv2d(i, k[:,:,::-1,::-1],
                             ishape, kshape, mode, subsample)
conv_op_di = theano.grad(conv_op.sum(), i)
conv_op_dk = theano.grad(conv_op.sum(), k)
corr_op_di = theano.grad(corr_op.sum(), i)
corr_op_dk = theano.grad(corr_op.sum(), k)
# outputs interleave (tested, reference) pairs; the zip below walks them
# two at a time and truncates to the number of labels
outputs = [corr_op, conv_op,
           corr_op_di, conv_op_di,
           corr_op_dk, conv_op_dk]
try:
conv_op_dik = theano.grad(conv_op_di.sum(), k)
conv_op_dki = theano.grad(conv_op_dk.sum(), i)
corr_op_dik = theano.grad(corr_op_di.sum(), k)
corr_op_dki = theano.grad(corr_op_dk.sum(), i)
outputs.extend([corr_op_dik, conv_op_dik,
                corr_op_dki, conv_op_dki])
except Exception:
# skip if the reference implementation can't do it
pass
f = theano.function([i, k], outputs, mode=theano_mode)
allvals = f(npy_img, npy_kern)
for a, b, p in zip(allvals[::2], allvals[1::2],
                   ('top', 'dtop/dbottom', 'dtop/dweight',
                    'dtop/dbottom/dweight', 'dtop/dweight/dbottom')):
assert_allclose(a, b, rtol=1e-4)
# NOTE(review): corrupted diff merge.  The old in-place gradient test body
# (flattened indentation, Python 2 prints) and the new yielding version
# (the final `yield (conv_grad, ...)` loops) are both present, with a
# `......@@` hunk marker standing where lines were lost.  Kept
# byte-identical; reconstruct from version control before editing.
def test_conv_grads():
for mode in 'valid', 'full':
for bs in [1, 5]:
for ch in [4]:
......@@ -910,68 +912,11 @@ def test_gemm_grads():
for rImg2 in [2, 8]:
for rFlt1 in [1, 2]:
for rFlt2 in [1, 2]:
for subsx in [1, 2]:
for subsy in [1, 2] if subsx == 1 else [2]:
ishape = (bs, ch, rImg1, rImg2)
kshape = (nf, ch, rFlt1, rFlt2)
subsample = (subsx, subsy)
npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32')
npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32')
i = cuda_tensor4()
k = cuda_tensor4()
pad = 'full' if mode == 'full' else (0, 0)
# TODO: also test custom pad values
corr_op = theano.sandbox.cuda.blas.GpuCorrMM(
                                    'valid', subsample, pad)(i, k)
# try to compile reference implementation without shape,
# so we don't have to compile hundreds of versions
conv_op = tensor.nnet.conv2d(i, k[:,:,::-1,::-1],
                             border_mode=mode, subsample=subsample)
try:
conv_op_di = theano.grad(conv_op.sum(), i)
conv_op_dk = theano.grad(conv_op.sum(), k)
except Exception:
# compile with shape information only when needed
conv_op = tensor.nnet.conv2d(i, k[:,:,::-1,::-1],
                             ishape, kshape, mode, subsample)
conv_op_di = theano.grad(conv_op.sum(), i)
conv_op_dk = theano.grad(conv_op.sum(), k)
corr_op_di = theano.grad(corr_op.sum(), i)
corr_op_dk = theano.grad(corr_op.sum(), k)
outputs = [corr_op, conv_op,
           corr_op_di, conv_op_di,
           corr_op_dk, conv_op_dk]
try:
conv_op_dik = theano.grad(conv_op_di.sum(), k)
conv_op_dki = theano.grad(conv_op_dk.sum(), i)
except Exception:
# skip if the reference implementation can't do it
print ".",
else:
corr_op_dik = theano.grad(corr_op_di.sum(), k)
corr_op_dki = theano.grad(corr_op_dk.sum(), i)
outputs.extend([corr_op_dik, conv_op_dik,
                corr_op_dki, conv_op_dki])
print ":",
f = theano.function([i, k], outputs, mode=theano_mode)
allvals = f(npy_img, npy_kern)
for a, b, p in zip(allvals[::2], allvals[1::2],
                   ('top', 'dtop/dbottom', 'dtop/dweight',
                    'dtop/dbottom/dweight', 'dtop/dweight/dbottom')):
if (a.shape != b.shape) or not numpy.allclose(a, b, rtol=1e-4):
print "Test failed for", p
print "mode: ", mode
print "ishape: ", ishape
print "kshape: ", kshape
print "subsample: ", subsample
assert False
sys.stdout.flush()
# New yielding form: defer the actual work to conv_grad with either the
# gemm-based or the cuDNN-based op factory.
for subsample in (1, 1), (1, 2), (2, 2):
for op in [gemm_op, dnn_op]:
yield (conv_grad, mode, bs, ch, nf,
       rImg1, rImg2, rFlt1, rFlt2,
       subsample, op)
# NOTE(review): corrupted diff merge.  Both the old in-place exec_conv
# calls and the new loops that consume exec_conv as a generator (calling
# each yielded case as t[0](*t[1:])) are present, and the `......@@` hunk
# marker hides the shape tables.  Kept byte-identical; reconstruct from
# version control before editing.
def benchmark():
......@@ -1017,15 +962,15 @@ def benchmark():
,((2, 30,116,116), (20, 30, 9,9), (1, 1), (1, 1), (1, 1))#full conv_reference_full
]
# shapes_valid=shapes_valid[-1:]
# shapes_full=shapes_full[-1:]
version = [-1]
verbose = 1
random = True
exec_conv(version, shapes_valid, verbose, random, 'valid',
          print_=None, rtol=1e-3)
exec_conv(version, shapes_full, verbose, random, 'full')
for t in exec_conv(version, shapes_valid, verbose, random, 'valid',
                   print_=None, rtol=1e-3):
    t[0](*t[1:])
for t in exec_conv(version, shapes_full, verbose, random, 'full'):
    t[0](*t[1:])
def test_stack_rows_segfault_070312():
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论