提交 c072d669 authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #5267 from gvtulder/f-abstractconv-differences

Minor inconsistency in AbstractConv_gradInput implementations
...@@ -528,7 +528,7 @@ class BaseGpuCorrMM(CGpuKernelBase): ...@@ -528,7 +528,7 @@ class BaseGpuCorrMM(CGpuKernelBase):
def c_code_cache_version(self): def c_code_cache_version(self):
# Raise this whenever modifying the code below. # Raise this whenever modifying the code below.
return (2,) return (7,)
def c_code_helper(self, bottom, weights, top, direction, sub, height=None, width=None): def c_code_helper(self, bottom, weights, top, direction, sub, height=None, width=None):
""" """
...@@ -558,19 +558,19 @@ class BaseGpuCorrMM(CGpuKernelBase): ...@@ -558,19 +558,19 @@ class BaseGpuCorrMM(CGpuKernelBase):
sub sub
Dictionary of substitutions useable to help generating the C code. Dictionary of substitutions useable to help generating the C code.
height height
If self.subsample[0] != 1, a variable giving the height of the Required if self.subsample[0] != 1, a variable giving the height of
filters for direction="backprop weights" or the height of the input the filters for direction="backprop weights" or the height of the
images for direction="backprop inputs". input images for direction="backprop inputs".
If self.border_mode == 'half', a variable giving the height of the Required if self.border_mode == 'half', a variable giving the height
filters for direction="backprop weights". of the filters for direction="backprop weights".
Ignored otherwise. Not required otherwise, but if a value is given this will be checked.
width width
If self.subsample[1] != 1, a variable giving the width of the Required if self.subsample[1] != 1, a variable giving the width of
filters for direction="backprop weights" or the width of the the filters for direction="backprop weights" or the width of the
input images for direction="backprop inputs". input images for direction="backprop inputs".
If self.border_mode == 'half', a variable giving the width of the Required if self.border_mode == 'half', a variable giving the width
filters for direction="backprop weights". of the filters for direction="backprop weights".
Ignored otherwise. Not required otherwise, but if a value is given this will be checked.
""" """
dH, dW = self.subsample dH, dW = self.subsample
...@@ -599,18 +599,18 @@ class BaseGpuCorrMM(CGpuKernelBase): ...@@ -599,18 +599,18 @@ class BaseGpuCorrMM(CGpuKernelBase):
# When subsampling, we cannot unambiguously infer the height and width # When subsampling, we cannot unambiguously infer the height and width
# of bottom and weights from top, so we require them to be given. # of bottom and weights from top, so we require them to be given.
# Similarly, when pad="half", we cannot infer the weight size. # Similarly, when pad="half", we cannot infer the weight size.
if height:
height = '(*(npy_int*)(PyArray_DATA(%s)))' % height
else:
if ((direction != 0) and (dH != 1)) or ((direction == 1) and (padH == -1)): if ((direction != 0) and (dH != 1)) or ((direction == 1) and (padH == -1)):
if not height:
raise ValueError("height must be given for backprop with vertical sampling or pad='half'") raise ValueError("height must be given for backprop with vertical sampling or pad='half'")
height = '(*(npy_int*)(PyArray_DATA(%s)))' % height height = '-1'
if width:
width = '(*(npy_int*)(PyArray_DATA(%s)))' % width
else: else:
height = '0'
if ((direction != 0) and (dW != 1)) or ((direction == 1) and (padW == -1)): if ((direction != 0) and (dW != 1)) or ((direction == 1) and (padW == -1)):
if not width:
raise ValueError("width must be given for backprop with horizontal sampling or pad='half'") raise ValueError("width must be given for backprop with horizontal sampling or pad='half'")
width = '(*(npy_int*)(PyArray_DATA(%s)))' % width width = '-1'
else:
width = '0'
sync = "" sync = ""
if config.gpuarray.sync: if config.gpuarray.sync:
sync = """ sync = """
...@@ -643,15 +643,15 @@ class BaseGpuCorrMM(CGpuKernelBase): ...@@ -643,15 +643,15 @@ class BaseGpuCorrMM(CGpuKernelBase):
// Obtain or infer kernel width and height // Obtain or infer kernel width and height
// (we need to know it early to be able to handle auto-padding) // (we need to know it early to be able to handle auto-padding)
size_t kH, kW; size_t kH, kW, dil_kH, dil_kW;
if (direction != 1) { if (direction != 1) {
// weight is an input variable, we can just read its shape // weight is an input variable, we can just read its shape
kH = PyGpuArray_DIMS(weights)[2]; kH = PyGpuArray_DIMS(weights)[2];
kW = PyGpuArray_DIMS(weights)[3]; kW = PyGpuArray_DIMS(weights)[3];
} }
else { else {
if ((dH != 1) || (padH == -1)) { if (%(height)s != -1) {
// vertical subsampling or half padding, kernel height is specified // kernel height is specified (perhaps vertical subsampling or half padding)
kH = %(height)s; kH = %(height)s;
} }
else if (padH == -2) { else if (padH == -2) {
...@@ -662,7 +662,7 @@ class BaseGpuCorrMM(CGpuKernelBase): ...@@ -662,7 +662,7 @@ class BaseGpuCorrMM(CGpuKernelBase):
// explicit padding, we can infer the kernel height // explicit padding, we can infer the kernel height
kH = (PyGpuArray_DIMS(bottom)[2] + 2*padH - (PyGpuArray_DIMS(top)[2] - 1) * dH - 1) / dilH + 1 ; kH = (PyGpuArray_DIMS(bottom)[2] + 2*padH - (PyGpuArray_DIMS(top)[2] - 1) * dH - 1) / dilH + 1 ;
} }
if ((dW != 1) || (padW == -1)) { if (%(width)s != -1) {
kW = %(width)s; kW = %(width)s;
} }
else if (padW == -2) { else if (padW == -2) {
...@@ -674,8 +674,8 @@ class BaseGpuCorrMM(CGpuKernelBase): ...@@ -674,8 +674,8 @@ class BaseGpuCorrMM(CGpuKernelBase):
} }
// Implicit dilated kernel size // Implicit dilated kernel size
size_t dil_kH = (kH - 1) * dilH + 1; dil_kH = (kH - 1) * dilH + 1;
size_t dil_kW = (kW - 1) * dilW + 1; dil_kW = (kW - 1) * dilW + 1;
// Auto-padding if requested // Auto-padding if requested
if (padH == -1) { // vertical half padding if (padH == -1) { // vertical half padding
...@@ -700,7 +700,9 @@ class BaseGpuCorrMM(CGpuKernelBase): ...@@ -700,7 +700,9 @@ class BaseGpuCorrMM(CGpuKernelBase):
} }
// Infer output shape and type // Infer output shape and type
size_t out_dim[4]; // The inferred shape can be negative.
long long out_dim[4];
size_t out_dim_size[4];
int out_typecode; int out_typecode;
PyGpuContextObject *out_context; PyGpuContextObject *out_context;
switch(direction) { switch(direction) {
...@@ -713,6 +715,20 @@ class BaseGpuCorrMM(CGpuKernelBase): ...@@ -713,6 +715,20 @@ class BaseGpuCorrMM(CGpuKernelBase):
out_dim[3] = (PyGpuArray_DIMS(bottom)[3] + 2*padW - ((PyGpuArray_DIMS(weights)[3]-1)*dilW + 1)) / dW + 1; out_dim[3] = (PyGpuArray_DIMS(bottom)[3] + 2*padW - ((PyGpuArray_DIMS(weights)[3]-1)*dilW + 1)) / dW + 1;
out_typecode = bottom->ga.typecode; out_typecode = bottom->ga.typecode;
out_context = bottom->context; out_context = bottom->context;
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
{
PyErr_Format(PyExc_ValueError,
"GpuCorrMM: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
PyGpuArray_DIMS(bottom)[0], PyGpuArray_DIMS(bottom)[1],
PyGpuArray_DIMS(bottom)[2], PyGpuArray_DIMS(bottom)[3],
PyGpuArray_DIMS(weights)[0], PyGpuArray_DIMS(weights)[1],
PyGpuArray_DIMS(weights)[2], PyGpuArray_DIMS(weights)[3],
out_dim[0], out_dim[1], out_dim[2], out_dim[3]);
%(fail)s
}
break; break;
case 1: // backprop wrt. weights case 1: // backprop wrt. weights
// output is weights: (num_filters, num_channels, height, width) // output is weights: (num_filters, num_channels, height, width)
...@@ -723,27 +739,60 @@ class BaseGpuCorrMM(CGpuKernelBase): ...@@ -723,27 +739,60 @@ class BaseGpuCorrMM(CGpuKernelBase):
out_dim[3] = kW; // how convenient out_dim[3] = kW; // how convenient
out_typecode = top->ga.typecode; out_typecode = top->ga.typecode;
out_context = top->context; out_context = top->context;
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
{
PyErr_Format(PyExc_ValueError,
"GpuCorrMM backprop wrt. weights: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
PyGpuArray_DIMS(bottom)[0], PyGpuArray_DIMS(bottom)[1],
PyGpuArray_DIMS(bottom)[2], PyGpuArray_DIMS(bottom)[3],
out_dim[0], out_dim[1], out_dim[2], out_dim[3],
PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3]);
%(fail)s
}
break; break;
case 2: // backprop wrt. inputs case 2: // backprop wrt. inputs
// output is bottom: (batchsize, num_channels, height, width) // output is bottom: (batchsize, num_channels, height, width)
// height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad // height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
out_dim[0] = PyGpuArray_DIMS(top)[0]; out_dim[0] = PyGpuArray_DIMS(top)[0];
out_dim[1] = PyGpuArray_DIMS(weights)[1]; out_dim[1] = PyGpuArray_DIMS(weights)[1];
out_dim[2] = (dH != 1) ? %(height)s : (PyGpuArray_DIMS(top)[2] - 1) * dH + (PyGpuArray_DIMS(weights)[2]-1)*dilH + 1 - 2*padH; out_dim[2] = (%(height)s != -1) ? %(height)s : (PyGpuArray_DIMS(top)[2] - 1) * dH + (PyGpuArray_DIMS(weights)[2]-1)*dilH + 1 - 2*padH;
out_dim[3] = (dW != 1) ? %(width)s : (PyGpuArray_DIMS(top)[3] - 1) * dW + (PyGpuArray_DIMS(weights)[3]-1)*dilW + 1 - 2*padW; out_dim[3] = (%(width)s != -1) ? %(width)s : (PyGpuArray_DIMS(top)[3] - 1) * dW + (PyGpuArray_DIMS(weights)[3]-1)*dilW + 1 - 2*padW;
out_typecode = top->ga.typecode; out_typecode = top->ga.typecode;
out_context = top->context; out_context = top->context;
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
{
PyErr_Format(PyExc_ValueError,
"GpuCorrMM backprop wrt. inputs: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weight shape: %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
out_dim[0], out_dim[1], out_dim[2], out_dim[3],
PyGpuArray_DIMS(weights)[0], PyGpuArray_DIMS(weights)[1],
PyGpuArray_DIMS(weights)[2], PyGpuArray_DIMS(weights)[3],
PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3]);
%(fail)s
}
break; break;
default: default:
PyErr_SetString(PyExc_ValueError, "BaseGpuCorrMM: direction must be 0, 1, or 2\\n"); PyErr_SetString(PyExc_ValueError, "BaseGpuCorrMM: direction must be 0, 1, or 2\\n");
%(fail)s %(fail)s
} }
out_dim_size[0] = (size_t)out_dim[0];
out_dim_size[1] = (size_t)out_dim[1];
out_dim_size[2] = (size_t)out_dim[2];
out_dim_size[3] = (size_t)out_dim[3];
// Prepare output array // Prepare output array
if (theano_prep_output(&%(out)s, 4, out_dim, out_typecode, GA_C_ORDER, out_context) != 0) if (theano_prep_output(&%(out)s, 4, out_dim_size, out_typecode, GA_C_ORDER, out_context) != 0)
{ {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"BaseGpuCorrMM: Failed to allocate output of %%ld x %%ld x %%ld x %%ld", "BaseGpuCorrMM: Failed to allocate output of %%lld x %%lld x %%lld x %%lld",
out_dim[0], out_dim[1], out_dim[2], out_dim[3]); out_dim[0], out_dim[1], out_dim[2], out_dim[3]);
%(fail)s %(fail)s
} }
...@@ -875,15 +924,15 @@ class GpuCorrMM_gradWeights(BaseGpuCorrMM): ...@@ -875,15 +924,15 @@ class GpuCorrMM_gradWeights(BaseGpuCorrMM):
raise TypeError('img must be 4D tensor') raise TypeError('img must be 4D tensor')
if topgrad.type.ndim != 4: if topgrad.type.ndim != 4:
raise TypeError('topgrad must be 4D tensor') raise TypeError('topgrad must be 4D tensor')
if self.subsample != (1, 1) or self.border_mode == "half":
if shape is None: if shape is None:
if self.subsample != (1, 1) or self.border_mode == "half":
raise ValueError('shape must be given if subsample != (1, 1)' raise ValueError('shape must be given if subsample != (1, 1)'
' or border_mode == "half"') ' or border_mode == "half"')
height_width = []
else:
height_width = [shape[0], shape[1]] height_width = [shape[0], shape[1]]
assert shape[0].ndim == 0 assert shape[0].ndim == 0
assert shape[1].ndim == 0 assert shape[1].ndim == 0
else:
height_width = []
broadcastable = [topgrad.type.broadcastable[1], img.type.broadcastable[1], broadcastable = [topgrad.type.broadcastable[1], img.type.broadcastable[1],
False, False] False, False]
...@@ -946,10 +995,12 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM): ...@@ -946,10 +995,12 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM):
raise TypeError('kern must be 4D tensor') raise TypeError('kern must be 4D tensor')
if topgrad.type.ndim != 4: if topgrad.type.ndim != 4:
raise TypeError('topgrad must be 4D tensor') raise TypeError('topgrad must be 4D tensor')
if self.subsample != (1, 1) and shape is None: if shape is None:
if self.subsample != (1, 1):
raise ValueError('shape must be given if subsample != (1, 1)') raise ValueError('shape must be given if subsample != (1, 1)')
height_width = [shape[0], shape[1]] if self.subsample != (1, 1) else [] height_width = []
if height_width: else:
height_width = [shape[0], shape[1]]
assert shape[0].ndim == 0 assert shape[0].ndim == 0
assert shape[1].ndim == 0 assert shape[1].ndim == 0
...@@ -1074,7 +1125,7 @@ class BaseGpuCorr3dMM(CGpuKernelBase): ...@@ -1074,7 +1125,7 @@ class BaseGpuCorr3dMM(CGpuKernelBase):
def c_code_cache_version(self): def c_code_cache_version(self):
# raise this whenever modifying the code below. # raise this whenever modifying the code below.
return (2,) return (7,)
def c_code_helper(self, bottom, weights, top, direction, sub, def c_code_helper(self, bottom, weights, top, direction, sub,
height=None, width=None, depth=None): height=None, width=None, depth=None):
...@@ -1105,26 +1156,26 @@ class BaseGpuCorr3dMM(CGpuKernelBase): ...@@ -1105,26 +1156,26 @@ class BaseGpuCorr3dMM(CGpuKernelBase):
sub sub
Dictionary of substitutions useable to help generating the C code. Dictionary of substitutions useable to help generating the C code.
height height
If self.subsample[0] != 1, a variable giving the height of the Required if self.subsample[0] != 1, a variable giving the height of
filters for direction="backprop weights" or the height of the input the filters for direction="backprop weights" or the height of the
images for direction="backprop inputs". input images for direction="backprop inputs".
If self.border_mode == 'half', a variable giving the height of the Required if self.border_mode == 'half', a variable giving the height
filters for direction="backprop weights". of the filters for direction="backprop weights".
Ignored otherwise. Not required otherwise, but if a value is given this will be checked.
width width
If self.subsample[1] != 1, a variable giving the width of the Required if self.subsample[1] != 1, a variable giving the width of
filters for direction="backprop weights" or the width of the the filters for direction="backprop weights" or the width of the
input images for direction="backprop inputs". input images for direction="backprop inputs".
If self.border_mode == 'half', a variable giving the width of the Required if self.border_mode == 'half', a variable giving the width
filters for direction="backprop weights". of the filters for direction="backprop weights".
Ignored otherwise. Not required otherwise, but if a value is given this will be checked.
depth depth
If self.subsample[2] != 1, a variable giving the depth of the Required if self.subsample[2] != 1, a variable giving the depth of
filters for direction="backprop weights" or the depth of the the filters for direction="backprop weights" or the depth of the
input images for direction="backprop inputs". input images for direction="backprop inputs".
If self.border_mode == 'half', a variable giving the depth of the Required if self.border_mode == 'half', a variable giving the depth
filters for direction="backprop weights". of the filters for direction="backprop weights".
Ignored otherwise. Not required otherwise, but if a value is given this will be checked.
""" """
dH, dW, dD = self.subsample dH, dW, dD = self.subsample
...@@ -1153,24 +1204,24 @@ class BaseGpuCorr3dMM(CGpuKernelBase): ...@@ -1153,24 +1204,24 @@ class BaseGpuCorr3dMM(CGpuKernelBase):
# When subsampling, we cannot unambiguously infer the height and width # When subsampling, we cannot unambiguously infer the height and width
# of bottom and weights from top, so we require them to be given. # of bottom and weights from top, so we require them to be given.
# Similarly, when pad="half", we cannot infer the weight size. # Similarly, when pad="half", we cannot infer the weight size.
if height:
height = '(*(npy_int*)(PyArray_DATA(%s)))' % height
else:
if ((direction != 0) and (dH != 1)) or ((direction == 1) and (padH == -1)): if ((direction != 0) and (dH != 1)) or ((direction == 1) and (padH == -1)):
if not height:
raise ValueError("height must be given for backprop with vertical sampling or pad='half'") raise ValueError("height must be given for backprop with vertical sampling or pad='half'")
height = '(*(npy_int*)(PyArray_DATA(%s)))' % height height = '-1'
if width:
width = '(*(npy_int*)(PyArray_DATA(%s)))' % width
else: else:
height = '0'
if ((direction != 0) and (dW != 1)) or ((direction == 1) and (padW == -1)): if ((direction != 0) and (dW != 1)) or ((direction == 1) and (padW == -1)):
if not width:
raise ValueError("width must be given for backprop with horizontal sampling or pad='half'") raise ValueError("width must be given for backprop with horizontal sampling or pad='half'")
width = '(*(npy_int*)(PyArray_DATA(%s)))' % width width = '-1'
if depth:
depth = '(*(npy_int*)(PyArray_DATA(%s)))' % depth
else: else:
width = '0'
if ((direction != 0) and (dD != 1)) or ((direction == 1) and (padD == -1)): if ((direction != 0) and (dD != 1)) or ((direction == 1) and (padD == -1)):
if not depth:
raise ValueError("depth must be given for backprop with horizontal sampling or pad='half'") raise ValueError("depth must be given for backprop with horizontal sampling or pad='half'")
depth = '(*(npy_int*)(PyArray_DATA(%s)))' % depth depth = '-1'
else:
depth = '0'
sync = "" sync = ""
if config.gpuarray.sync: if config.gpuarray.sync:
sync = """ sync = """
...@@ -1206,7 +1257,7 @@ class BaseGpuCorr3dMM(CGpuKernelBase): ...@@ -1206,7 +1257,7 @@ class BaseGpuCorr3dMM(CGpuKernelBase):
// Obtain or infer kernel height, width and depth // Obtain or infer kernel height, width and depth
// (we need to know it early to be able to handle auto-padding) // (we need to know it early to be able to handle auto-padding)
size_t kH, kW, kD; size_t kH, kW, kD, dil_kH, dil_kW, dil_kD;
if (direction != 1) { if (direction != 1) {
// weight is an input variable, we can just read its shape // weight is an input variable, we can just read its shape
kH = PyGpuArray_DIMS(weights)[2]; kH = PyGpuArray_DIMS(weights)[2];
...@@ -1214,8 +1265,8 @@ class BaseGpuCorr3dMM(CGpuKernelBase): ...@@ -1214,8 +1265,8 @@ class BaseGpuCorr3dMM(CGpuKernelBase):
kD = PyGpuArray_DIMS(weights)[4]; kD = PyGpuArray_DIMS(weights)[4];
} }
else { else {
if ((dH != 1) || (padH == -1)) { if (%(height)s != -1) {
// vertical subsampling or half padding, kernel height is specified // kernel height is specified (perhaps vertical subsampling or half padding)
kH = %(height)s; kH = %(height)s;
} }
else if (padH == -2) { else if (padH == -2) {
...@@ -1226,7 +1277,7 @@ class BaseGpuCorr3dMM(CGpuKernelBase): ...@@ -1226,7 +1277,7 @@ class BaseGpuCorr3dMM(CGpuKernelBase):
// explicit padding, we can infer the kernel height // explicit padding, we can infer the kernel height
kH = (PyGpuArray_DIMS(bottom)[2] + 2*padH - (PyGpuArray_DIMS(top)[2] - 1) * dH - 1) / dilH + 1 ; kH = (PyGpuArray_DIMS(bottom)[2] + 2*padH - (PyGpuArray_DIMS(top)[2] - 1) * dH - 1) / dilH + 1 ;
} }
if ((dW != 1) || (padW == -1)) { if (%(width)s != -1) {
kW = %(width)s; kW = %(width)s;
} }
else if (padW == -2) { else if (padW == -2) {
...@@ -1235,7 +1286,7 @@ class BaseGpuCorr3dMM(CGpuKernelBase): ...@@ -1235,7 +1286,7 @@ class BaseGpuCorr3dMM(CGpuKernelBase):
else { else {
kW = (PyGpuArray_DIMS(bottom)[3] + 2*padW - (PyGpuArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1; kW = (PyGpuArray_DIMS(bottom)[3] + 2*padW - (PyGpuArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
} }
if ((dD != 1) || (padD == -1)) { if (%(depth)s != -1) {
kD = %(depth)s; kD = %(depth)s;
} }
else if (padD == -2) { else if (padD == -2) {
...@@ -1247,9 +1298,9 @@ class BaseGpuCorr3dMM(CGpuKernelBase): ...@@ -1247,9 +1298,9 @@ class BaseGpuCorr3dMM(CGpuKernelBase):
} }
// Implicit dilated kernel size // Implicit dilated kernel size
size_t dil_kH = (kH - 1) * dilH + 1; dil_kH = (kH - 1) * dilH + 1;
size_t dil_kW = (kW - 1) * dilW + 1; dil_kW = (kW - 1) * dilW + 1;
size_t dil_kD = (kD - 1) * dilD + 1; dil_kD = (kD - 1) * dilD + 1;
// Auto-padding if requested // Auto-padding if requested
if (padH == -1) { // vertical half padding if (padH == -1) { // vertical half padding
...@@ -1284,7 +1335,9 @@ class BaseGpuCorr3dMM(CGpuKernelBase): ...@@ -1284,7 +1335,9 @@ class BaseGpuCorr3dMM(CGpuKernelBase):
} }
// Infer output shape and type // Infer output shape and type
size_t out_dim[5]; // The inferred shape can be negative.
long long out_dim[5];
size_t out_dim_size[5];
int out_typecode; int out_typecode;
PyGpuContextObject *out_context; PyGpuContextObject *out_context;
switch(direction) { switch(direction) {
...@@ -1298,6 +1351,22 @@ class BaseGpuCorr3dMM(CGpuKernelBase): ...@@ -1298,6 +1351,22 @@ class BaseGpuCorr3dMM(CGpuKernelBase):
out_dim[4] = (PyGpuArray_DIMS(bottom)[4] + 2*padD - ((PyGpuArray_DIMS(weights)[4]-1)*dilD + 1)) / dD + 1; out_dim[4] = (PyGpuArray_DIMS(bottom)[4] + 2*padD - ((PyGpuArray_DIMS(weights)[4]-1)*dilD + 1)) / dD + 1;
out_typecode = bottom->ga.typecode; out_typecode = bottom->ga.typecode;
out_context = bottom->context; out_context = bottom->context;
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0 || out_dim[4] <= 0)
{
PyErr_Format(PyExc_ValueError,
"GpuCorr3dMM: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n",
PyGpuArray_DIMS(bottom)[0], PyGpuArray_DIMS(bottom)[1],
PyGpuArray_DIMS(bottom)[2], PyGpuArray_DIMS(bottom)[3],
PyGpuArray_DIMS(bottom)[4],
PyGpuArray_DIMS(weights)[0], PyGpuArray_DIMS(weights)[1],
PyGpuArray_DIMS(weights)[2], PyGpuArray_DIMS(weights)[3],
PyGpuArray_DIMS(weights)[4],
out_dim[0], out_dim[1], out_dim[2], out_dim[3], out_dim[4]);
%(fail)s
}
break; break;
case 1: // backprop wrt. weights case 1: // backprop wrt. weights
// output is weights: (num_filters, num_channels, height, width, depth) // output is weights: (num_filters, num_channels, height, width, depth)
...@@ -1309,28 +1378,66 @@ class BaseGpuCorr3dMM(CGpuKernelBase): ...@@ -1309,28 +1378,66 @@ class BaseGpuCorr3dMM(CGpuKernelBase):
out_dim[4] = kD; out_dim[4] = kD;
out_typecode = top->ga.typecode; out_typecode = top->ga.typecode;
out_context = top->context; out_context = top->context;
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0 || out_dim[4] <= 0)
{
PyErr_Format(PyExc_ValueError,
"GpuCorr3dMM backprop wrt. weights: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n",
PyGpuArray_DIMS(bottom)[0], PyGpuArray_DIMS(bottom)[1],
PyGpuArray_DIMS(bottom)[2], PyGpuArray_DIMS(bottom)[3],
PyGpuArray_DIMS(bottom)[4],
out_dim[0], out_dim[1], out_dim[2], out_dim[3], out_dim[4],
PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3],
PyGpuArray_DIMS(top)[4]);
%(fail)s
}
break; break;
case 2: // backprop wrt. inputs case 2: // backprop wrt. inputs
// output is bottom: (batchsize, num_channels, height, width, depth) // output is bottom: (batchsize, num_channels, height, width, depth)
// height, width and depth: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad // height, width and depth: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
out_dim[0] = PyGpuArray_DIMS(top)[0]; out_dim[0] = PyGpuArray_DIMS(top)[0];
out_dim[1] = PyGpuArray_DIMS(weights)[1]; out_dim[1] = PyGpuArray_DIMS(weights)[1];
out_dim[2] = (dH != 1) ? %(height)s : (PyGpuArray_DIMS(top)[2] - 1) * dH + (PyGpuArray_DIMS(weights)[2]-1)*dilH + 1 - 2*padH; out_dim[2] = (%(height)s != -1) ? %(height)s : (PyGpuArray_DIMS(top)[2] - 1) * dH + (PyGpuArray_DIMS(weights)[2]-1)*dilH + 1 - 2*padH;
out_dim[3] = (dW != 1) ? %(width)s : (PyGpuArray_DIMS(top)[3] - 1) * dW + (PyGpuArray_DIMS(weights)[3]-1)*dilW + 1 - 2*padW; out_dim[3] = (%(width)s != -1) ? %(width)s : (PyGpuArray_DIMS(top)[3] - 1) * dW + (PyGpuArray_DIMS(weights)[3]-1)*dilW + 1 - 2*padW;
out_dim[4] = (dD != 1) ? %(depth)s : (PyGpuArray_DIMS(top)[4] - 1) * dD + (PyGpuArray_DIMS(weights)[4]-1)*dilD + 1 - 2*padD; out_dim[4] = (%(depth)s != -1) ? %(depth)s : (PyGpuArray_DIMS(top)[4] - 1) * dD + (PyGpuArray_DIMS(weights)[4]-1)*dilD + 1 - 2*padD;
out_typecode = top->ga.typecode; out_typecode = top->ga.typecode;
out_context = top->context; out_context = top->context;
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0 || out_dim[4] <= 0)
{
PyErr_Format(PyExc_ValueError,
"GpuCorr3dMM backprop wrt. inputs: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n",
out_dim[0], out_dim[1], out_dim[2], out_dim[3], out_dim[4],
PyGpuArray_DIMS(weights)[0], PyGpuArray_DIMS(weights)[1],
PyGpuArray_DIMS(weights)[2], PyGpuArray_DIMS(weights)[3],
PyGpuArray_DIMS(weights)[4],
PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3],
PyGpuArray_DIMS(top)[4]);
%(fail)s
}
break; break;
default: default:
PyErr_SetString(PyExc_ValueError, "BaseGpuCorr3dMM: direction must be 0, 1, or 2\\n"); PyErr_SetString(PyExc_ValueError, "BaseGpuCorr3dMM: direction must be 0, 1, or 2\\n");
%(fail)s %(fail)s
} }
out_dim_size[0] = (size_t)out_dim[0];
out_dim_size[1] = (size_t)out_dim[1];
out_dim_size[2] = (size_t)out_dim[2];
out_dim_size[3] = (size_t)out_dim[3];
out_dim_size[4] = (size_t)out_dim[4];
// Prepare output array // Prepare output array
if (theano_prep_output(&%(out)s, 5, out_dim, out_typecode, GA_C_ORDER, out_context) != 0) if (theano_prep_output(&%(out)s, 5, out_dim_size, out_typecode, GA_C_ORDER, out_context) != 0)
{ {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"BaseGpuCorrMM: Failed to allocate output of %%ld x %%ld x %%ld x %%ld x %%ld", "BaseGpuCorrMM: Failed to allocate output of %%lld x %%lld x %%lld x %%lld x %%lld",
out_dim[0], out_dim[1], out_dim[2], out_dim[3], out_dim[4]); out_dim[0], out_dim[1], out_dim[2], out_dim[3], out_dim[4]);
%(fail)s %(fail)s
} }
...@@ -1464,16 +1571,16 @@ class GpuCorr3dMM_gradWeights(BaseGpuCorr3dMM): ...@@ -1464,16 +1571,16 @@ class GpuCorr3dMM_gradWeights(BaseGpuCorr3dMM):
raise TypeError('img must be 5D tensor') raise TypeError('img must be 5D tensor')
if topgrad.type.ndim != 5: if topgrad.type.ndim != 5:
raise TypeError('topgrad must be 5D tensor') raise TypeError('topgrad must be 5D tensor')
if self.subsample != (1, 1, 1) or self.border_mode == "half":
if shape is None: if shape is None:
if self.subsample != (1, 1, 1) or self.border_mode == "half":
raise ValueError('shape must be given if subsample != (1, 1, 1)' raise ValueError('shape must be given if subsample != (1, 1, 1)'
' or border_mode == "half"') ' or border_mode == "half"')
height_width_depth = []
else:
height_width_depth = [shape[0], shape[1], shape[2]] height_width_depth = [shape[0], shape[1], shape[2]]
assert shape[0].ndim == 0 assert shape[0].ndim == 0
assert shape[1].ndim == 0 assert shape[1].ndim == 0
assert shape[2].ndim == 0 assert shape[2].ndim == 0
else:
height_width_depth = []
broadcastable = [topgrad.type.broadcastable[1], img.type.broadcastable[1], broadcastable = [topgrad.type.broadcastable[1], img.type.broadcastable[1],
False, False, False] False, False, False]
...@@ -1536,10 +1643,12 @@ class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM): ...@@ -1536,10 +1643,12 @@ class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM):
raise TypeError('kern must be 5D tensor') raise TypeError('kern must be 5D tensor')
if topgrad.type.ndim != 5: if topgrad.type.ndim != 5:
raise TypeError('topgrad must be 5D tensor') raise TypeError('topgrad must be 5D tensor')
if self.subsample != (1, 1, 1) and shape is None: if shape is None:
if self.subsample != (1, 1, 1):
raise ValueError('shape must be given if subsample != (1, 1, 1)') raise ValueError('shape must be given if subsample != (1, 1, 1)')
height_width_depth = [shape[0], shape[1], shape[2]] if self.subsample != (1, 1, 1) else [] height_width_depth = []
if height_width_depth: else:
height_width_depth = [shape[0], shape[1], shape[2]]
assert shape[0].ndim == 0 assert shape[0].ndim == 0
assert shape[1].ndim == 0 assert shape[1].ndim == 0
assert shape[2].ndim == 0 assert shape[2].ndim == 0
......
...@@ -425,9 +425,17 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom, ...@@ -425,9 +425,17 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
const size_t dil_kW = (kW - 1) * dilW + 1; const size_t dil_kW = (kW - 1) * dilW + 1;
const size_t dil_kD = (kD - 1) * dilD + 1; const size_t dil_kD = (kD - 1) * dilD + 1;
// top: (batchSize, nFilters, topHeight, topWidth, topDepth) // top: (batchSize, nFilters, topHeight, topWidth, topDepth)
const size_t topHeight = (bottomHeight + 2*padH - dil_kH) / dH + 1; const size_t topHeightNoDH = (bottomHeight + 2*padH - dil_kH);
const size_t topWidth = (bottomWidth + 2*padW - dil_kW) / dW + 1; const size_t topWidthNoDW = (bottomWidth + 2*padW - dil_kW);
const size_t topDepth = (bottomDepth + 2*padD - dil_kD) / dD + 1; const size_t topDepthNoDD = (bottomDepth + 2*padD - dil_kD);
// the above values might be negative so we need to use Python-like
// flooring integer division to be compatible with get_conv_output.
// note: this macro implements Python's // for negative x only
#define _CONV_FLOORDIV_X(x,y) ((x < 0) ? (- ((-x) / y) - (((-x) % y) == 0 ? 0 : 1)) : (x / y))
const size_t topHeight = _CONV_FLOORDIV_X(topHeightNoDH, dH) + 1;
const size_t topWidth = _CONV_FLOORDIV_X(topWidthNoDW, dW) + 1;
const size_t topDepth = _CONV_FLOORDIV_X(topDepthNoDD, dD) + 1;
#undef _CONV_FLOORDIV
if (batchSize != PyGpuArray_DIMS(top)[0] || if (batchSize != PyGpuArray_DIMS(top)[0] ||
nFilters != PyGpuArray_DIMS(top)[1] || nFilters != PyGpuArray_DIMS(top)[1] ||
topHeight != PyGpuArray_DIMS(top)[2] || topHeight != PyGpuArray_DIMS(top)[2] ||
...@@ -479,6 +487,17 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom, ...@@ -479,6 +487,17 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
PyGpuArrayObject *output; PyGpuArrayObject *output;
if (direction == 0) { // forward pass if (direction == 0) { // forward pass
output = top; output = top;
if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
err = GpuArray_memset(&output->ga, 0);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuCorr3dMM could not fill the output with zeros: %d", err);
Py_DECREF(col);
return NULL;
}
Py_DECREF(col);
return output;
}
// valid correlation: im3d2col, then gemm // valid correlation: im3d2col, then gemm
// Iterate over batch // Iterate over batch
for (size_t n = 0; n < batchSize; n++) { for (size_t n = 0; n < batchSize; n++) {
...@@ -530,6 +549,17 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom, ...@@ -530,6 +549,17 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
} }
else if (direction == 1) { // backprop wrt. weights else if (direction == 1) { // backprop wrt. weights
output = weight; output = weight;
if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
err = GpuArray_memset(&output->ga, 0);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuCorr3dMM grad wrt. weights could not fill the output with zeros: %d", err);
Py_DECREF(col);
return NULL;
}
Py_DECREF(col);
return output;
}
// valid convolution: im3col, then gemm // valid convolution: im3col, then gemm
// Iterate over batch // Iterate over batch
for (size_t n = 0; n < batchSize; n++) { for (size_t n = 0; n < batchSize; n++) {
...@@ -581,9 +611,29 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom, ...@@ -581,9 +611,29 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
return NULL; return NULL;
} }
} }
if (batchSize == 0) {
err = GpuArray_memset(&weight->ga, 0);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuCorr3dMM grad weights could not fill the output with zeros: %d", err);
Py_DECREF(col);
return NULL;
}
}
} }
else if (direction == 2) { // backprop wrt. inputs else if (direction == 2) { // backprop wrt. inputs
output = bottom; output = bottom;
if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
err = GpuArray_memset(&output->ga, 0);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuCorr3dMM grad wrt. inputs could not fill the output with zeros: %d", err);
Py_DECREF(col);
return NULL;
}
Py_DECREF(col);
return output;
}
// full convolution: gemm, then col2im3d // full convolution: gemm, then col2im3d
// Iterate over batch // Iterate over batch
for (size_t n = 0; n < batchSize; n++) { for (size_t n = 0; n < batchSize; n++) {
......
...@@ -360,8 +360,15 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom, ...@@ -360,8 +360,15 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
const size_t dil_kH = (kH - 1) * dilH + 1; const size_t dil_kH = (kH - 1) * dilH + 1;
const size_t dil_kW = (kW - 1) * dilW + 1; const size_t dil_kW = (kW - 1) * dilW + 1;
// top: (batchSize, nFilters, topHeight, topWidth) // top: (batchSize, nFilters, topHeight, topWidth)
const size_t topHeight = (bottomHeight + 2*padH - dil_kH) / dH + 1; const size_t topHeightNoDH = (bottomHeight + 2*padH - dil_kH);
const size_t topWidth = (bottomWidth + 2*padW - dil_kW) / dW + 1; const size_t topWidthNoDW = (bottomWidth + 2*padW - dil_kW);
// the above values might be negative so we need to use Python-like
// flooring integer division to be compatible with get_conv_output.
// note: this macro implements Python's // for negative x only
#define _CONV_FLOORDIV_X(x,y) ((x < 0) ? (- ((-x) / y) - (((-x) % y) == 0 ? 0 : 1)) : (x / y))
const size_t topHeight = _CONV_FLOORDIV_X(topHeightNoDH, dH) + 1;
const size_t topWidth = _CONV_FLOORDIV_X(topWidthNoDW, dW) + 1;
#undef _CONV_FLOORDIV
if (batchSize != PyGpuArray_DIMS(top)[0] || if (batchSize != PyGpuArray_DIMS(top)[0] ||
nFilters != PyGpuArray_DIMS(top)[1] || nFilters != PyGpuArray_DIMS(top)[1] ||
topHeight != PyGpuArray_DIMS(top)[2] || topHeight != PyGpuArray_DIMS(top)[2] ||
...@@ -411,6 +418,17 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom, ...@@ -411,6 +418,17 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
PyGpuArrayObject *output; PyGpuArrayObject *output;
if (direction == 0) { // forward pass if (direction == 0) { // forward pass
output = top; output = top;
if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
err = GpuArray_memset(&output->ga, 0);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuCorrMM could not fill the output with zeros: %d", err);
Py_DECREF(col);
return NULL;
}
Py_DECREF(col);
return output;
}
// valid correlation: im2col, then gemm // valid correlation: im2col, then gemm
// Iterate over batch // Iterate over batch
for (size_t n = 0; n < batchSize; n++) { for (size_t n = 0; n < batchSize; n++) {
...@@ -462,6 +480,17 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom, ...@@ -462,6 +480,17 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
} }
else if (direction == 1) { // backprop wrt. weights else if (direction == 1) { // backprop wrt. weights
output = weight; output = weight;
if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
err = GpuArray_memset(&output->ga, 0);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuCorrMM grad wrt. weights could not fill the output with zeros: %d", err);
Py_DECREF(col);
return NULL;
}
Py_DECREF(col);
return output;
}
// valid convolution: im2col, then gemm // valid convolution: im2col, then gemm
// Iterate over batch // Iterate over batch
for (size_t n = 0; n < batchSize; n++) { for (size_t n = 0; n < batchSize; n++) {
...@@ -516,6 +545,17 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom, ...@@ -516,6 +545,17 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
} }
else if (direction == 2) { // backprop wrt. inputs else if (direction == 2) { // backprop wrt. inputs
output = bottom; output = bottom;
if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
err = GpuArray_memset(&output->ga, 0);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuCorrMM grad wrt. inputs could not fill the output with zeros: %d", err);
Py_DECREF(col);
return NULL;
}
Py_DECREF(col);
return output;
}
// full convolution: gemm, then col2im // full convolution: gemm, then col2im
// Iterate over batch // Iterate over batch
for (size_t n = 0; n < batchSize; n++) { for (size_t n = 0; n < batchSize; n++) {
......
...@@ -24,7 +24,8 @@ from theano.tensor.nnet.abstract_conv import (AbstractConv2d, ...@@ -24,7 +24,8 @@ from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
AbstractConv3d, AbstractConv3d,
AbstractConv3d_gradWeights, AbstractConv3d_gradWeights,
AbstractConv3d_gradInputs, AbstractConv3d_gradInputs,
get_conv_output_shape) get_conv_output_shape,
assert_conv_shape)
from theano.tensor.signal.pool import ( from theano.tensor.signal.pool import (
Pool, MaxPoolGrad, AveragePoolGrad) Pool, MaxPoolGrad, AveragePoolGrad)
from . import pygpu from . import pygpu
...@@ -979,11 +980,12 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -979,11 +980,12 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
# that would be flipped by conv_mode='conv' in GpuDnnConvGradW. # that would be flipped by conv_mode='conv' in GpuDnnConvGradW.
kerns = kerns[:, :, ::-1, ::-1] kerns = kerns[:, :, ::-1, ::-1]
kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3)) kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3))
shape2 = shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1 out_shp = (shape_i(kerns, 1, fgraph),
shape3 = shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1 shape_i(img, 1, fgraph),
out = gpu_alloc_empty(ctx_name, dtype=img.dtype)( shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1,
shape_i(kerns, 1, fgraph), shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1)
shape_i(img, 1, fgraph), shape2, shape3) out_shp = assert_conv_shape(out_shp)
out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(*out_shp)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1), desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode='cross', precision=precision)(out.shape) conv_mode='cross', precision=precision)(out.shape)
conv = gpu_dnn_conv_gradW()(img, kerns, out, desc) conv = gpu_dnn_conv_gradW()(img, kerns, out, desc)
...@@ -997,11 +999,12 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -997,11 +999,12 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
img = gpu_contiguous(img) # cudnn v2 rc3 need contiguous data img = gpu_contiguous(img) # cudnn v2 rc3 need contiguous data
kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3)) kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3))
conv_mode = 'cross' if conv_mode == 'conv' else 'conv' conv_mode = 'cross' if conv_mode == 'conv' else 'conv'
shape2 = shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1 out_shp = (shape_i(img, 0, fgraph),
shape3 = shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1
out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(shape_i(img, 0, fgraph),
shape_i(kerns, 1, fgraph), shape_i(kerns, 1, fgraph),
shape2, shape3) shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1,
shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1)
out_shp = assert_conv_shape(out_shp)
out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(*out_shp)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1), desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode=conv_mode, precision=precision)(kerns.shape) conv_mode=conv_mode, precision=precision)(kerns.shape)
return gpu_dnn_conv_gradI()(kerns, img, out, desc) return gpu_dnn_conv_gradI()(kerns, img, out, desc)
...@@ -1021,6 +1024,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -1021,6 +1024,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
out_shp = get_conv_output_shape(ishape, kshape, out_shp = get_conv_output_shape(ishape, kshape,
desc_op.border_mode, desc_op.border_mode,
desc_op.subsample) desc_op.subsample)
out_shp = assert_conv_shape(out_shp)
out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(*out_shp) out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(*out_shp)
return gpu_dnn_conv(algo=algo)(img, kerns, out, desc) return gpu_dnn_conv(algo=algo)(img, kerns, out, desc)
...@@ -1094,12 +1098,13 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), ...@@ -1094,12 +1098,13 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1),
# that would be flipped by conv_mode='conv' in GpuDnnConvGradW. # that would be flipped by conv_mode='conv' in GpuDnnConvGradW.
kerns = kerns[:, :, ::-1, ::-1] kerns = kerns[:, :, ::-1, ::-1]
kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3, 4)) kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3, 4))
shape2 = shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1 out_shp = (shape_i(kerns, 1, fgraph),
shape3 = shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1 shape_i(img, 1, fgraph),
shape4 = shape_i(img, 4, fgraph) - shape_i(kerns, 4, fgraph) + 1 shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1,
out = gpu_alloc_empty(ctx_name, dtype=img.dtype)( shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1,
shape_i(kerns, 1, fgraph), shape_i(img, 4, fgraph) - shape_i(kerns, 4, fgraph) + 1)
shape_i(img, 1, fgraph), shape2, shape3, shape4) out_shp = assert_conv_shape(out_shp)
out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(*out_shp)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1, 1), desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1, 1),
conv_mode='cross', precision=precision)(out.shape) conv_mode='cross', precision=precision)(out.shape)
conv = gpu_dnn_conv_gradW()(img, kerns, out, desc) conv = gpu_dnn_conv_gradW()(img, kerns, out, desc)
...@@ -1113,12 +1118,13 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), ...@@ -1113,12 +1118,13 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1),
img = gpu_contiguous(img) # cudnn v2 rc3 need contiguous data img = gpu_contiguous(img) # cudnn v2 rc3 need contiguous data
kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3, 4)) kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3, 4))
conv_mode = 'cross' if conv_mode == 'conv' else 'conv' conv_mode = 'cross' if conv_mode == 'conv' else 'conv'
shape2 = shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1 out_shp = (shape_i(img, 0, fgraph),
shape3 = shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1
shape4 = shape_i(img, 4, fgraph) + shape_i(kerns, 4, fgraph) - 1
out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(shape_i(img, 0, fgraph),
shape_i(kerns, 1, fgraph), shape_i(kerns, 1, fgraph),
shape2, shape3, shape4) shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1,
shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1,
shape_i(img, 4, fgraph) + shape_i(kerns, 4, fgraph) - 1)
out_shp = assert_conv_shape(out_shp)
out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(*out_shp)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1, 1), desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1, 1),
conv_mode=conv_mode, precision=precision)(kerns.shape) conv_mode=conv_mode, precision=precision)(kerns.shape)
return gpu_dnn_conv_gradI()(kerns, img, out, desc) return gpu_dnn_conv_gradI()(kerns, img, out, desc)
...@@ -1138,6 +1144,7 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), ...@@ -1138,6 +1144,7 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1),
out_shp = get_conv_output_shape(ishape, kshape, out_shp = get_conv_output_shape(ishape, kshape,
desc_op.border_mode, desc_op.border_mode,
desc_op.subsample) desc_op.subsample)
out_shp = assert_conv_shape(out_shp)
out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(*out_shp) out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(*out_shp)
return gpu_dnn_conv(algo=algo)(img, kerns, out, desc) return gpu_dnn_conv(algo=algo)(img, kerns, out, desc)
......
...@@ -39,11 +39,6 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -39,11 +39,6 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
return 1; return 1;
} }
if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
return 1;
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1;
switch (input->ga.typecode) { switch (input->ga.typecode) {
case GA_DOUBLE: case GA_DOUBLE:
alpha_p = (void *)&alpha; alpha_p = (void *)&alpha;
...@@ -71,6 +66,20 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -71,6 +66,20 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
return 1; return 1;
#endif #endif
if (PyGpuArray_DIMS(input)[0] == 0 || PyGpuArray_DIMS(kerns)[0] == 0 || PyGpuArray_DIMS(kerns)[1] == 0) {
int err2 = GpuArray_memset(&(*output)->ga, 0);
if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConv could not fill the output with zeros: %d", err2);
return 1;
}
return 0;
}
if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
return 1;
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1;
if (c_set_tensorNd(*output, APPLY_SPECIFIC(output)) == -1) if (c_set_tensorNd(*output, APPLY_SPECIFIC(output)) == -1)
return 1; return 1;
......
...@@ -38,11 +38,6 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -38,11 +38,6 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
return 1; return 1;
} }
if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
return 1;
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1;
switch (im->ga.typecode) { switch (im->ga.typecode) {
case GA_DOUBLE: case GA_DOUBLE:
alpha_p = (void *)&alpha; alpha_p = (void *)&alpha;
...@@ -70,6 +65,20 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -70,6 +65,20 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
return 1; return 1;
#endif #endif
if (PyGpuArray_DIMS(im)[0] == 0 || PyGpuArray_DIMS(kerns)[0] == 0 || PyGpuArray_DIMS(kerns)[1] == 0) {
int err2 = GpuArray_memset(&(*input)->ga, 0);
if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConv grad wrt. inputs could not fill the output with zeros: %d", err2);
return 1;
}
return 0;
}
if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
return 1;
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1;
if (c_set_tensorNd(*input, APPLY_SPECIFIC(input)) == -1) if (c_set_tensorNd(*input, APPLY_SPECIFIC(input)) == -1)
return 1; return 1;
...@@ -77,6 +86,48 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -77,6 +86,48 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cuda_enter(c->ctx); cuda_enter(c->ctx);
int expected_output_dims[5] = {0};
err = cudnnGetConvolutionNdForwardOutputDim(desc, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
PyGpuArray_NDIM(im), expected_output_dims);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error computing convolution output dim: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
if (PyGpuArray_NDIM(im) == 4) {
if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
(PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
(PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
(PyGpuArray_DIMS(output)[3] != expected_output_dims[3])) {
PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ld"
" but received gradient with shape %ldx%ldx%ldx%ld",
expected_output_dims[0], expected_output_dims[1],
expected_output_dims[2], expected_output_dims[3],
PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3]);
cuda_exit(c->ctx);
return 1;
}
} else if (PyGpuArray_NDIM(im) == 5) {
if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
(PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
(PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
(PyGpuArray_DIMS(output)[3] != expected_output_dims[3]) ||
(PyGpuArray_DIMS(output)[4] != expected_output_dims[4])) {
PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ldx%ld"
" but received gradient with shape %ldx%ldx%ldx%ldx%ld",
expected_output_dims[0], expected_output_dims[1],
expected_output_dims[2], expected_output_dims[3],
expected_output_dims[4],
PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3],
PyGpuArray_DIMS(output)[4]);
cuda_exit(c->ctx);
return 1;
}
}
#ifdef CHOOSE_ALGO #ifdef CHOOSE_ALGO
#ifndef CHOOSE_ONCE #ifndef CHOOSE_ONCE
reuse_algo = 1; reuse_algo = 1;
......
...@@ -38,11 +38,6 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -38,11 +38,6 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
return 1; return 1;
} }
if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
return 1;
if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
return 1;
switch (input->ga.typecode) { switch (input->ga.typecode) {
case GA_DOUBLE: case GA_DOUBLE:
alpha_p = (void *)&alpha; alpha_p = (void *)&alpha;
...@@ -70,6 +65,20 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -70,6 +65,20 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
return 1; return 1;
#endif #endif
if (PyGpuArray_DIMS(input)[0] == 0 || PyGpuArray_DIMS(km)[0] == 0 || PyGpuArray_DIMS(km)[1] == 0) {
int err2 = GpuArray_memset(&(*kerns)->ga, 0);
if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConv grad wrt. weights could not fill the output with zeros: %d", err2);
return 1;
}
return 0;
}
if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
return 1;
if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
return 1;
if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1) if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1; return 1;
...@@ -77,6 +86,48 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -77,6 +86,48 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cuda_enter(c->ctx); cuda_enter(c->ctx);
int expected_output_dims[5] = {0};
err = cudnnGetConvolutionNdForwardOutputDim(desc, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
PyGpuArray_NDIM(input), expected_output_dims);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error computing convolution output dim: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
if (PyGpuArray_NDIM(input) == 4) {
if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
(PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
(PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
(PyGpuArray_DIMS(output)[3] != expected_output_dims[3])) {
PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%dx%ld"
" but received gradient with shape %ldx%ldx%dx%ld",
expected_output_dims[0], expected_output_dims[1],
expected_output_dims[2], expected_output_dims[3],
PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3]);
cuda_exit(c->ctx);
return 1;
}
} else if (PyGpuArray_NDIM(input) == 5) {
if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
(PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
(PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
(PyGpuArray_DIMS(output)[3] != expected_output_dims[3]) ||
(PyGpuArray_DIMS(output)[4] != expected_output_dims[4])) {
PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ldx%ld"
" but received gradient with shape %ldx%ldx%ldx%ldx%ld",
expected_output_dims[0], expected_output_dims[1],
expected_output_dims[2], expected_output_dims[3],
expected_output_dims[4],
PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3],
PyGpuArray_DIMS(output)[4]);
cuda_exit(c->ctx);
return 1;
}
}
#ifdef CHOOSE_ALGO #ifdef CHOOSE_ALGO
#ifndef CHOOSE_ONCE #ifndef CHOOSE_ONCE
reuse_algo = 1; reuse_algo = 1;
......
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
from nose.tools import assert_raises
import numpy import numpy
...@@ -49,6 +50,31 @@ class TestDnnConv2d(test_abstract_conv.BaseTestConv2d): ...@@ -49,6 +50,31 @@ class TestDnnConv2d(test_abstract_conv.BaseTestConv2d):
provide_shape=provide_shape, border_mode=b, provide_shape=provide_shape, border_mode=b,
filter_flip=flip, target_op=GpuDnnConvGradI) filter_flip=flip, target_op=GpuDnnConvGradI)
def tcase_gi(self, i, f, o, s, b, flip, provide_shape, fd=(1, 1), expect_error=False):
if not dnn_available(test_ctx_name):
raise SkipTest(dnn_available.msg)
if fd != (1, 1):
raise SkipTest("Doesn't have CUDNN implementation")
mode = mode_with_gpu
if not expect_error:
self.run_gradinput(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=True, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip, target_op=GpuDnnConvGradI,
filter_dilation=fd)
else:
assert_raises((RuntimeError, ValueError),
self.run_gradinput,
inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=False, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip, target_op=GpuDnnConvGradI,
ref=None,
filter_dilation=fd)
class TestDnnConv3d(test_abstract_conv.BaseTestConv3d): class TestDnnConv3d(test_abstract_conv.BaseTestConv3d):
@classmethod @classmethod
...@@ -82,6 +108,31 @@ class TestDnnConv3d(test_abstract_conv.BaseTestConv3d): ...@@ -82,6 +108,31 @@ class TestDnnConv3d(test_abstract_conv.BaseTestConv3d):
provide_shape=provide_shape, border_mode=b, provide_shape=provide_shape, border_mode=b,
filter_flip=flip, target_op=GpuDnnConvGradI) filter_flip=flip, target_op=GpuDnnConvGradI)
def tcase_gi(self, i, f, o, s, b, flip, provide_shape, fd=(1, 1, 1), expect_error=False):
if not dnn_available(test_ctx_name):
raise SkipTest(dnn_available.msg)
if fd != (1, 1, 1):
raise SkipTest("Doesn't have CUDNN implementation")
mode = mode_with_gpu
if not expect_error:
self.run_gradinput(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=True, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip, target_op=GpuDnnConvGradI,
filter_dilation=fd)
else:
assert_raises((RuntimeError, ValueError),
self.run_gradinput,
inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=False, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip, target_op=GpuDnnConvGradI,
ref=None,
filter_dilation=fd)
class TestCorrMMConv2d(test_abstract_conv.BaseTestConv2d): class TestCorrMMConv2d(test_abstract_conv.BaseTestConv2d):
@classmethod @classmethod
...@@ -115,6 +166,28 @@ class TestCorrMMConv2d(test_abstract_conv.BaseTestConv2d): ...@@ -115,6 +166,28 @@ class TestCorrMMConv2d(test_abstract_conv.BaseTestConv2d):
target_op=GpuCorrMM_gradInputs, target_op=GpuCorrMM_gradInputs,
filter_dilation=fd) filter_dilation=fd)
def tcase_gi(self, i, f, o, s, b, flip, provide_shape, fd=(1, 1), expect_error=False):
mode = self.mode
if not expect_error:
self.run_gradinput(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=True, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip,
target_op=GpuCorrMM_gradInputs,
filter_dilation=fd)
else:
assert_raises(ValueError,
self.run_gradinput,
inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=False, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip,
target_op=GpuCorrMM_gradInputs,
ref=None,
filter_dilation=fd)
class TestCorrMMConv3d(test_abstract_conv.BaseTestConv3d): class TestCorrMMConv3d(test_abstract_conv.BaseTestConv3d):
@classmethod @classmethod
...@@ -148,6 +221,28 @@ class TestCorrMMConv3d(test_abstract_conv.BaseTestConv3d): ...@@ -148,6 +221,28 @@ class TestCorrMMConv3d(test_abstract_conv.BaseTestConv3d):
target_op=GpuCorr3dMM_gradInputs, target_op=GpuCorr3dMM_gradInputs,
filter_dilation=fd) filter_dilation=fd)
def tcase_gi(self, i, f, o, s, b, flip, provide_shape, fd=(1, 1, 1), expect_error=False):
mode = self.mode
if not expect_error:
self.run_gradinput(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=True, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip,
target_op=GpuCorr3dMM_gradInputs,
filter_dilation=fd)
else:
assert_raises(ValueError,
self.run_gradinput,
inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=False, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip,
target_op=GpuCorr3dMM_gradInputs,
ref=None,
filter_dilation=fd)
class TestDnnConvTypes(test_abstract_conv.TestConvTypes): class TestDnnConvTypes(test_abstract_conv.TestConvTypes):
def setUp(self): def setUp(self):
......
...@@ -12,6 +12,7 @@ import theano.tensor as T ...@@ -12,6 +12,7 @@ import theano.tensor as T
import theano.tests.unittest_tools as utt import theano.tests.unittest_tools as utt
from theano.tensor.signal.pool import pool_2d, pool_3d from theano.tensor.signal.pool import pool_2d, pool_3d
from theano.tensor.signal.pool import Pool, MaxPoolGrad, AveragePoolGrad from theano.tensor.signal.pool import Pool, MaxPoolGrad, AveragePoolGrad
from theano.tensor.nnet.abstract_conv import get_conv_output_shape
from .. import dnn from .. import dnn
from ..basic_ops import GpuAllocEmpty from ..basic_ops import GpuAllocEmpty
...@@ -628,56 +629,50 @@ class TestDnnInferShapes(utt.InferShapeTester): ...@@ -628,56 +629,50 @@ class TestDnnInferShapes(utt.InferShapeTester):
[(1, 1, 1), (2, 2, 2)], [(1, 1, 1), (2, 2, 2)],
'none') 'none')
def _test_conv_gradw(self, img, kerns, out, img_val, kern_vals, border_mode, conv_mode, subsample): def _test_conv_gradw(self, img, topgrad, kerns, img_shape, kerns_shape, border_mode, conv_mode, subsample):
if not dnn.dnn_available(test_ctx_name): if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg) raise SkipTest(dnn.dnn_available.msg)
topgrad_shape = get_conv_output_shape(img_shape, kerns_shape,
border_mode, subsample)
img_val = numpy.asarray( img_val = numpy.asarray(
img_val, numpy.random.rand(*img_shape),
dtype=theano.config.floatX dtype=theano.config.floatX
) )
kern_vals = numpy.asarray( topgrad_vals = numpy.asarray(
kern_vals, numpy.random.rand(*topgrad_shape),
dtype=theano.config.floatX dtype=theano.config.floatX
) )
temp_img = img.dimshuffle(1, 0, 2, 3) kerns_vals = numpy.zeros(kerns_shape, dtype=theano.config.floatX)
temp_kerns = kerns kerns_shape = theano.shared(numpy.asarray(kerns_shape))
if conv_mode == 'conv':
temp_kerns = temp_kerns[:, :, ::-1, ::-1]
temp_kerns = temp_kerns.dimshuffle(1, 0, 2, 3)
shape = (
kern_vals.shape[1], img_val.shape[1],
img_val.shape[2] - kern_vals.shape[2] + 1,
img_val.shape[3] - kern_vals.shape[3] + 1
)
out_vals = numpy.zeros(shape, dtype=theano.config.floatX)
desc = dnn.GpuDnnConvDesc( desc = dnn.GpuDnnConvDesc(
border_mode=border_mode, border_mode=border_mode,
subsample=subsample, subsample=subsample,
conv_mode=conv_mode, conv_mode=conv_mode,
precision=set_precision(theano.config.floatX) precision=set_precision(theano.config.floatX)
)(out.shape) )(kerns_shape)
conv_grad_w = dnn.GpuDnnConvGradW()( conv_grad_w = dnn.GpuDnnConvGradW()(
temp_img, img,
temp_kerns, topgrad,
out, kerns,
desc, desc,
) )
self._compile_and_check( self._compile_and_check(
[temp_img, temp_kerns, out], [img, topgrad, kerns],
[conv_grad_w], [conv_grad_w],
[img_val, kern_vals, out_vals], [img_val, topgrad_vals, kerns_vals],
dnn.GpuDnnConvGradW dnn.GpuDnnConvGradW
) )
@parameterized.expand(product(border_modes, conv_modes), utt.custom_name_func) @parameterized.expand(product(border_modes, conv_modes), utt.custom_name_func)
def test_conv_gradw(self, border_mode, conv_mode): def test_conv_gradw(self, border_mode, conv_mode):
self._test_conv_gradw(T.tensor4('img'), self._test_conv_gradw(T.tensor4('img'),
T.tensor4('topgrad'),
T.tensor4('kerns'), T.tensor4('kerns'),
T.tensor4('out'), (5, 2, 6, 13),
numpy.random.rand(2, 5, 6, 8), (1, 2, 3, 7),
numpy.random.rand(2, 1, 5, 6),
border_mode, border_mode,
conv_mode, conv_mode,
(1, 1)) (1, 1))
......
...@@ -922,7 +922,7 @@ class BaseGpuCorrMM(GpuOp): ...@@ -922,7 +922,7 @@ class BaseGpuCorrMM(GpuOp):
def c_code_cache_version(self): def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files # raise this whenever modifying any of the support_code_files
return (0, 26) return (0, 30)
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of # REMEMBER TO RAISE c_code_cache_version when changing any of
...@@ -960,19 +960,19 @@ class BaseGpuCorrMM(GpuOp): ...@@ -960,19 +960,19 @@ class BaseGpuCorrMM(GpuOp):
sub sub
Dictionary of substitutions useable to help generating the C code. Dictionary of substitutions useable to help generating the C code.
height height
If self.subsample[0] != 1, a variable giving the height of the Required if self.subsample[0] != 1, a variable giving the height of
filters for direction="backprop weights" or the height of the input the filters for direction="backprop weights" or the height of the
images for direction="backprop inputs". input images for direction="backprop inputs".
If self.border_mode == 'half', a variable giving the height of the Required if self.border_mode == 'half', a variable giving the height
filters for direction="backprop weights". of the filters for direction="backprop weights".
Ignored otherwise. Not required otherwise, but if a value is given this will be checked.
width width
If self.subsample[1] != 1, a variable giving the width of the Required if self.subsample[1] != 1, a variable giving the width of
filters for direction="backprop weights" or the width of the the filters for direction="backprop weights" or the width of the
input images for direction="backprop inputs". input images for direction="backprop inputs".
If self.border_mode == 'half', a variable giving the width of the Required if self.border_mode == 'half', a variable giving the width
filters for direction="backprop weights". of the filters for direction="backprop weights".
Ignored otherwise. Not required otherwise, but if a value is given this will be checked.
""" """
dH, dW = self.subsample dH, dW = self.subsample
...@@ -1001,18 +1001,18 @@ class BaseGpuCorrMM(GpuOp): ...@@ -1001,18 +1001,18 @@ class BaseGpuCorrMM(GpuOp):
# When subsampling, we cannot unambiguously infer the height and width # When subsampling, we cannot unambiguously infer the height and width
# of bottom and weights from top, so we require them to be given. # of bottom and weights from top, so we require them to be given.
# Similarly, when pad="half", we cannot infer the weight size. # Similarly, when pad="half", we cannot infer the weight size.
if height:
height = '(*(npy_int*)(PyArray_DATA(%s)))' % height
else:
if ((direction != 0) and (dH != 1)) or ((direction == 1) and (padH == -1)): if ((direction != 0) and (dH != 1)) or ((direction == 1) and (padH == -1)):
if not height:
raise ValueError("height must be given for backprop with vertical sampling or pad='half'") raise ValueError("height must be given for backprop with vertical sampling or pad='half'")
height = '(*(npy_int*)(PyArray_DATA(%s)))' % height height = '-1'
if width:
width = '(*(npy_int*)(PyArray_DATA(%s)))' % width
else: else:
height = 'NULL'
if ((direction != 0) and (dW != 1)) or ((direction == 1) and (padW == -1)): if ((direction != 0) and (dW != 1)) or ((direction == 1) and (padW == -1)):
if not width:
raise ValueError("width must be given for backprop with horizontal sampling or pad='half'") raise ValueError("width must be given for backprop with horizontal sampling or pad='half'")
width = '(*(npy_int*)(PyArray_DATA(%s)))' % width width = '-1'
else:
width = 'NULL'
sub = sub.copy() sub = sub.copy()
sub.update(locals()) sub.update(locals())
...@@ -1035,15 +1035,15 @@ class BaseGpuCorrMM(GpuOp): ...@@ -1035,15 +1035,15 @@ class BaseGpuCorrMM(GpuOp):
// Obtain or infer kernel width and height // Obtain or infer kernel width and height
// (we need to know it early to be able to handle auto-padding) // (we need to know it early to be able to handle auto-padding)
int kH, kW; int kH, kW, dil_kH, dil_kW;
if (direction != 1) { if (direction != 1) {
// weight is an input variable, we can just read its shape // weight is an input variable, we can just read its shape
kH = CudaNdarray_HOST_DIMS(weights)[2]; kH = CudaNdarray_HOST_DIMS(weights)[2];
kW = CudaNdarray_HOST_DIMS(weights)[3]; kW = CudaNdarray_HOST_DIMS(weights)[3];
} }
else { else {
if ((dH != 1) || (padH == -1)) { if (%(height)s != -1) {
// vertical subsampling or half padding, kernel height is specified // kernel height is specified (perhaps vertical subsampling or half padding)
kH = %(height)s; kH = %(height)s;
} }
else if (padH == -2) { else if (padH == -2) {
...@@ -1054,7 +1054,7 @@ class BaseGpuCorrMM(GpuOp): ...@@ -1054,7 +1054,7 @@ class BaseGpuCorrMM(GpuOp):
// explicit padding, we can infer the kernel height // explicit padding, we can infer the kernel height
kH = (CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH - (CudaNdarray_HOST_DIMS(top)[2] - 1)*dH - 1) / dilH + 1 ; kH = (CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH - (CudaNdarray_HOST_DIMS(top)[2] - 1)*dH - 1) / dilH + 1 ;
} }
if ((dW != 1) || (padW == -1)) { if (%(width)s != -1) {
kW = %(width)s; kW = %(width)s;
} }
else if (padW == -2) { else if (padW == -2) {
...@@ -1066,8 +1066,8 @@ class BaseGpuCorrMM(GpuOp): ...@@ -1066,8 +1066,8 @@ class BaseGpuCorrMM(GpuOp):
} }
// Implicit dilated kernel size // Implicit dilated kernel size
int dil_kH = (kH - 1) * dilH + 1; dil_kH = (kH - 1) * dilH + 1;
int dil_kW = (kW - 1) * dilW + 1; dil_kW = (kW - 1) * dilW + 1;
// Auto-padding if requested // Auto-padding if requested
if (padH == -1) { // vertical half padding if (padH == -1) { // vertical half padding
...@@ -1101,6 +1101,20 @@ class BaseGpuCorrMM(GpuOp): ...@@ -1101,6 +1101,20 @@ class BaseGpuCorrMM(GpuOp):
out_dim[1] = CudaNdarray_HOST_DIMS(weights)[0]; out_dim[1] = CudaNdarray_HOST_DIMS(weights)[0];
out_dim[2] = (CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH - ((CudaNdarray_HOST_DIMS(weights)[2]-1)*dilH + 1)) / dH + 1; out_dim[2] = (CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH - ((CudaNdarray_HOST_DIMS(weights)[2]-1)*dilH + 1)) / dH + 1;
out_dim[3] = (CudaNdarray_HOST_DIMS(bottom)[3] + 2*padW - ((CudaNdarray_HOST_DIMS(weights)[3]-1)*dilW + 1)) / dW + 1; out_dim[3] = (CudaNdarray_HOST_DIMS(bottom)[3] + 2*padW - ((CudaNdarray_HOST_DIMS(weights)[3]-1)*dilW + 1)) / dW + 1;
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
{
PyErr_Format(PyExc_ValueError,
"GpuCorrMM: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
CudaNdarray_HOST_DIMS(bottom)[0], CudaNdarray_HOST_DIMS(bottom)[1],
CudaNdarray_HOST_DIMS(bottom)[2], CudaNdarray_HOST_DIMS(bottom)[3],
CudaNdarray_HOST_DIMS(weights)[0], CudaNdarray_HOST_DIMS(weights)[1],
CudaNdarray_HOST_DIMS(weights)[2], CudaNdarray_HOST_DIMS(weights)[3],
out_dim[0], out_dim[1], out_dim[2], out_dim[3]);
%(fail)s
}
break; break;
case 1: // backprop wrt. weights case 1: // backprop wrt. weights
// output is weights: (num_filters, num_channels, height, width) // output is weights: (num_filters, num_channels, height, width)
...@@ -1109,14 +1123,42 @@ class BaseGpuCorrMM(GpuOp): ...@@ -1109,14 +1123,42 @@ class BaseGpuCorrMM(GpuOp):
out_dim[1] = CudaNdarray_HOST_DIMS(bottom)[1]; out_dim[1] = CudaNdarray_HOST_DIMS(bottom)[1];
out_dim[2] = kH; // already inferred further above out_dim[2] = kH; // already inferred further above
out_dim[3] = kW; // how convenient out_dim[3] = kW; // how convenient
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
{
PyErr_Format(PyExc_ValueError,
"GpuCorrMM backprop wrt. weights: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
CudaNdarray_HOST_DIMS(bottom)[0], CudaNdarray_HOST_DIMS(bottom)[1],
CudaNdarray_HOST_DIMS(bottom)[2], CudaNdarray_HOST_DIMS(bottom)[3],
out_dim[0], out_dim[1], out_dim[2], out_dim[3],
CudaNdarray_HOST_DIMS(top)[0], CudaNdarray_HOST_DIMS(top)[1],
CudaNdarray_HOST_DIMS(top)[2], CudaNdarray_HOST_DIMS(top)[3]);
%(fail)s
}
break; break;
case 2: // backprop wrt. inputs case 2: // backprop wrt. inputs
// output is bottom: (batchsize, num_channels, height, width) // output is bottom: (batchsize, num_channels, height, width)
// height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad // height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
out_dim[0] = CudaNdarray_HOST_DIMS(top)[0]; out_dim[0] = CudaNdarray_HOST_DIMS(top)[0];
out_dim[1] = CudaNdarray_HOST_DIMS(weights)[1]; out_dim[1] = CudaNdarray_HOST_DIMS(weights)[1];
out_dim[2] = (dH != 1) ? %(height)s : (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH + (CudaNdarray_HOST_DIMS(weights)[2]-1)*dilH + 1 - 2*padH; out_dim[2] = (%(height)s != -1) ? %(height)s : (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH + (CudaNdarray_HOST_DIMS(weights)[2]-1)*dilH + 1 - 2*padH;
out_dim[3] = (dW != 1) ? %(width)s : (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW + (CudaNdarray_HOST_DIMS(weights)[3]-1)*dilW + 1 - 2*padW; out_dim[3] = (%(width)s != -1) ? %(width)s : (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW + (CudaNdarray_HOST_DIMS(weights)[3]-1)*dilW + 1 - 2*padW;
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
{
PyErr_Format(PyExc_ValueError,
"GpuCorrMM backprop wrt. inputs: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weight shape: %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
out_dim[0], out_dim[1], out_dim[2], out_dim[3],
CudaNdarray_HOST_DIMS(weights)[0], CudaNdarray_HOST_DIMS(weights)[1],
CudaNdarray_HOST_DIMS(weights)[2], CudaNdarray_HOST_DIMS(weights)[3],
CudaNdarray_HOST_DIMS(top)[0], CudaNdarray_HOST_DIMS(top)[1],
CudaNdarray_HOST_DIMS(top)[2], CudaNdarray_HOST_DIMS(top)[3]);
%(fail)s
}
break; break;
default: default:
PyErr_SetString(PyExc_ValueError, "BaseGpuCorrMM: direction must be 0, 1, or 2\\n"); PyErr_SetString(PyExc_ValueError, "BaseGpuCorrMM: direction must be 0, 1, or 2\\n");
...@@ -1274,15 +1316,15 @@ class GpuCorrMM_gradWeights(BaseGpuCorrMM): ...@@ -1274,15 +1316,15 @@ class GpuCorrMM_gradWeights(BaseGpuCorrMM):
raise TypeError('img must be 4D tensor') raise TypeError('img must be 4D tensor')
if topgrad.type.ndim != 4: if topgrad.type.ndim != 4:
raise TypeError('topgrad must be 4D tensor') raise TypeError('topgrad must be 4D tensor')
if self.subsample != (1, 1) or self.border_mode == "half":
if shape is None: if shape is None:
if self.subsample != (1, 1) or self.border_mode == "half":
raise ValueError('shape must be given if subsample != (1, 1)' raise ValueError('shape must be given if subsample != (1, 1)'
' or border_mode == "half"') ' or border_mode == "half"')
height_width = []
else:
height_width = [shape[0], shape[1]] height_width = [shape[0], shape[1]]
assert shape[0].ndim == 0 assert shape[0].ndim == 0
assert shape[1].ndim == 0 assert shape[1].ndim == 0
else:
height_width = []
broadcastable = [topgrad.type.broadcastable[1], img.type.broadcastable[1], broadcastable = [topgrad.type.broadcastable[1], img.type.broadcastable[1],
False, False] False, False]
...@@ -1343,10 +1385,12 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM): ...@@ -1343,10 +1385,12 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM):
raise TypeError('kern must be 4D tensor') raise TypeError('kern must be 4D tensor')
if topgrad.type.ndim != 4: if topgrad.type.ndim != 4:
raise TypeError('topgrad must be 4D tensor') raise TypeError('topgrad must be 4D tensor')
if self.subsample != (1, 1) and shape is None: if shape is None:
if self.subsample != (1, 1):
raise ValueError('shape must be given if subsample != (1, 1)') raise ValueError('shape must be given if subsample != (1, 1)')
height_width = [shape[0], shape[1]] if self.subsample != (1, 1) else [] height_width = []
if height_width: else:
height_width = [shape[0], shape[1]]
assert shape[0].ndim == 0 assert shape[0].ndim == 0
assert shape[1].ndim == 0 assert shape[1].ndim == 0
...@@ -1469,7 +1513,7 @@ class BaseGpuCorr3dMM(GpuOp): ...@@ -1469,7 +1513,7 @@ class BaseGpuCorr3dMM(GpuOp):
def c_code_cache_version(self): def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files # raise this whenever modifying any of the support_code_files
return (0, 25) return (0, 29)
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of # REMEMBER TO RAISE c_code_cache_version when changing any of
...@@ -1510,26 +1554,26 @@ class BaseGpuCorr3dMM(GpuOp): ...@@ -1510,26 +1554,26 @@ class BaseGpuCorr3dMM(GpuOp):
sub sub
Dictionary of substitutions useable to help generating the C code. Dictionary of substitutions useable to help generating the C code.
height height
If self.subsample[0] != 1, a variable giving the height Required if self.subsample[0] != 1, a variable giving the height of
of the filters for direction="backprop weights" or the height of the the filters for direction="backprop weights" or the height of the
input images for direction="backprop inputs". input images for direction="backprop inputs".
If self.pad == 'half', a variable giving the height of the filters Required if self.border_mode == 'half', a variable giving the height
for direction="backprop weights". of the filters for direction="backprop weights".
Ignored otherwise. Not required otherwise, but if a value is given this will be checked.
width width
If self.subsample[1] != 1, a variable giving the width Required if self.subsample[1] != 1, a variable giving the width of
of the filters for direction="backprop weights" or the width of the the filters for direction="backprop weights" or the width of the
input images for direction="backprop inputs". input images for direction="backprop inputs".
If self.pad == 'half', a variable giving the width of the filters Required if self.border_mode == 'half', a variable giving the width
for direction="backprop weights". of the filters for direction="backprop weights".
Ignored otherwise. Not required otherwise, but if a value is given this will be checked.
depth depth
If self.subsample[2] != 1, a variable giving the depth Required if self.subsample[2] != 1, a variable giving the depth of
of the filters for direction="backprop weights" or the depth of the the filters for direction="backprop weights" or the depth of the
input images for direction="backprop inputs". input images for direction="backprop inputs".
If self.pad == 'half', a variable giving the depth of the filters Required if self.border_mode == 'half', a variable giving the depth
for direction="backprop weights". of the filters for direction="backprop weights".
Ignored otherwise. Not required otherwise, but if a value is given this will be checked.
""" """
dH, dW, dD = self.subsample dH, dW, dD = self.subsample
...@@ -1558,24 +1602,24 @@ class BaseGpuCorr3dMM(GpuOp): ...@@ -1558,24 +1602,24 @@ class BaseGpuCorr3dMM(GpuOp):
# When subsampling, we cannot unambiguously infer the height and width # When subsampling, we cannot unambiguously infer the height and width
# of bottom and weights from top, so we require them to be given. # of bottom and weights from top, so we require them to be given.
# Similarly, when pad="half", we cannot infer the weight size. # Similarly, when pad="half", we cannot infer the weight size.
if height:
height = '(*(npy_int*)(PyArray_DATA(%s)))' % height
else:
if ((direction != 0) and (dH != 1)) or ((direction == 1) and (padH == -1)): if ((direction != 0) and (dH != 1)) or ((direction == 1) and (padH == -1)):
if not height:
raise ValueError("height must be given for backprop with vertical sampling or pad='half'") raise ValueError("height must be given for backprop with vertical sampling or pad='half'")
height = '(*(npy_int*)(PyArray_DATA(%s)))' % height height = '-1'
if width:
width = '(*(npy_int*)(PyArray_DATA(%s)))' % width
else: else:
height = 'NULL'
if ((direction != 0) and (dW != 1)) or ((direction == 1) and (padW == -1)): if ((direction != 0) and (dW != 1)) or ((direction == 1) and (padW == -1)):
if not width:
raise ValueError("width must be given for backprop with horizontal sampling or pad='half'") raise ValueError("width must be given for backprop with horizontal sampling or pad='half'")
width = '(*(npy_int*)(PyArray_DATA(%s)))' % width width = '-1'
if depth:
depth = '(*(npy_int*)(PyArray_DATA(%s)))' % depth
else: else:
width = 'NULL'
if ((direction != 0) and (dD != 1)) or ((direction == 1) and (padD == -1)): if ((direction != 0) and (dD != 1)) or ((direction == 1) and (padD == -1)):
if not depth:
raise ValueError("depth must be given for backprop with horizontal sampling or pad='half'") raise ValueError("depth must be given for backprop with horizontal sampling or pad='half'")
depth = '(*(npy_int*)(PyArray_DATA(%s)))' % depth depth = '-1'
else:
depth = 'NULL'
sub = sub.copy() sub = sub.copy()
sub.update(locals()) sub.update(locals())
...@@ -1601,7 +1645,7 @@ class BaseGpuCorr3dMM(GpuOp): ...@@ -1601,7 +1645,7 @@ class BaseGpuCorr3dMM(GpuOp):
// Obtain or infer kernel width and height // Obtain or infer kernel width and height
// (we need to know it early to be able to handle auto-padding) // (we need to know it early to be able to handle auto-padding)
int kH, kW, kD; int kH, kW, kD, dil_kH, dil_kW, dil_kD;
if (direction != 1) if (direction != 1)
{ {
// weight is an input variable, we can just read its shape // weight is an input variable, we can just read its shape
...@@ -1611,9 +1655,9 @@ class BaseGpuCorr3dMM(GpuOp): ...@@ -1611,9 +1655,9 @@ class BaseGpuCorr3dMM(GpuOp):
} }
else else
{ {
if ((dH != 1) || (padH == -1)) if (%(height)s != -1)
{ {
// vertical subsampling or half padding, kernel height is specified // kernel height is specified (perhaps vertical subsampling or half padding)
kH = %(height)s; kH = %(height)s;
} }
else if (padH == -2) else if (padH == -2)
...@@ -1626,7 +1670,7 @@ class BaseGpuCorr3dMM(GpuOp): ...@@ -1626,7 +1670,7 @@ class BaseGpuCorr3dMM(GpuOp):
// explicit padding, we can infer the kernel height // explicit padding, we can infer the kernel height
kH = (CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH - (CudaNdarray_HOST_DIMS(top)[2] - 1)*dH - 1) / dilH + 1 ; kH = (CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH - (CudaNdarray_HOST_DIMS(top)[2] - 1)*dH - 1) / dilH + 1 ;
} }
if ((dW != 1) || (padW == -1)) if (%(width)s != -1)
{ {
kW = %(width)s; kW = %(width)s;
} }
...@@ -1638,7 +1682,7 @@ class BaseGpuCorr3dMM(GpuOp): ...@@ -1638,7 +1682,7 @@ class BaseGpuCorr3dMM(GpuOp):
{ {
kW = (CudaNdarray_HOST_DIMS(bottom)[3] + 2*padW - (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW - 1) / dilW + 1; kW = (CudaNdarray_HOST_DIMS(bottom)[3] + 2*padW - (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
} }
if ((dD != 1) || (padD == -1)) if (%(depth)s != -1)
{ {
kD = %(depth)s; kD = %(depth)s;
} }
...@@ -1653,9 +1697,9 @@ class BaseGpuCorr3dMM(GpuOp): ...@@ -1653,9 +1697,9 @@ class BaseGpuCorr3dMM(GpuOp):
} }
// Implicit dilated kernel size // Implicit dilated kernel size
int dil_kH = (kH - 1) * dilH + 1; dil_kH = (kH - 1) * dilH + 1;
int dil_kW = (kW - 1) * dilW + 1; dil_kW = (kW - 1) * dilW + 1;
int dil_kD = (kD - 1) * dilD + 1; dil_kD = (kD - 1) * dilD + 1;
// Auto-padding if requested // Auto-padding if requested
if (padH == -1) if (padH == -1)
...@@ -1707,6 +1751,22 @@ class BaseGpuCorr3dMM(GpuOp): ...@@ -1707,6 +1751,22 @@ class BaseGpuCorr3dMM(GpuOp):
out_dim[2] = (CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH - ((CudaNdarray_HOST_DIMS(weights)[2]-1)*dilH + 1)) / dH + 1; out_dim[2] = (CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH - ((CudaNdarray_HOST_DIMS(weights)[2]-1)*dilH + 1)) / dH + 1;
out_dim[3] = (CudaNdarray_HOST_DIMS(bottom)[3] + 2*padW - ((CudaNdarray_HOST_DIMS(weights)[3]-1)*dilW + 1)) / dW + 1; out_dim[3] = (CudaNdarray_HOST_DIMS(bottom)[3] + 2*padW - ((CudaNdarray_HOST_DIMS(weights)[3]-1)*dilW + 1)) / dW + 1;
out_dim[4] = (CudaNdarray_HOST_DIMS(bottom)[4] + 2*padD - ((CudaNdarray_HOST_DIMS(weights)[4]-1)*dilD + 1)) / dD + 1; out_dim[4] = (CudaNdarray_HOST_DIMS(bottom)[4] + 2*padD - ((CudaNdarray_HOST_DIMS(weights)[4]-1)*dilD + 1)) / dD + 1;
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0 || out_dim[4] <= 0)
{
PyErr_Format(PyExc_ValueError,
"GpuCorr3dMM: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n",
CudaNdarray_HOST_DIMS(bottom)[0], CudaNdarray_HOST_DIMS(bottom)[1],
CudaNdarray_HOST_DIMS(bottom)[2], CudaNdarray_HOST_DIMS(bottom)[3],
CudaNdarray_HOST_DIMS(bottom)[4],
CudaNdarray_HOST_DIMS(weights)[0], CudaNdarray_HOST_DIMS(weights)[1],
CudaNdarray_HOST_DIMS(weights)[2], CudaNdarray_HOST_DIMS(weights)[3],
CudaNdarray_HOST_DIMS(weights)[4],
out_dim[0], out_dim[1], out_dim[2], out_dim[3], out_dim[4]);
%(fail)s
}
break; break;
case 1: // backprop wrt. weights case 1: // backprop wrt. weights
// output is weights: (num_filters, num_channels, height, width, depth) // output is weights: (num_filters, num_channels, height, width, depth)
...@@ -1716,23 +1776,53 @@ class BaseGpuCorr3dMM(GpuOp): ...@@ -1716,23 +1776,53 @@ class BaseGpuCorr3dMM(GpuOp):
out_dim[2] = kH; // already inferred further above out_dim[2] = kH; // already inferred further above
out_dim[3] = kW; // how convenient out_dim[3] = kW; // how convenient
out_dim[4] = kD; out_dim[4] = kD;
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0 || out_dim[4] <= 0)
{
PyErr_Format(PyExc_ValueError,
"GpuCorr3dMM backprop wrt. weights: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n",
CudaNdarray_HOST_DIMS(bottom)[0], CudaNdarray_HOST_DIMS(bottom)[1],
CudaNdarray_HOST_DIMS(bottom)[2], CudaNdarray_HOST_DIMS(bottom)[3],
CudaNdarray_HOST_DIMS(bottom)[4],
out_dim[0], out_dim[1], out_dim[2], out_dim[3], out_dim[4],
CudaNdarray_HOST_DIMS(top)[0], CudaNdarray_HOST_DIMS(top)[1],
CudaNdarray_HOST_DIMS(top)[2], CudaNdarray_HOST_DIMS(top)[3],
CudaNdarray_HOST_DIMS(top)[4]);
%(fail)s
}
break; break;
case 2: // backprop wrt. inputs case 2: // backprop wrt. inputs
// output is bottom: (batchsize, num_channels, height, width, depth) // output is bottom: (batchsize, num_channels, height, width, depth)
// height, width and depth: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad // height, width and depth: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
out_dim[0] = CudaNdarray_HOST_DIMS(top)[0]; out_dim[0] = CudaNdarray_HOST_DIMS(top)[0];
out_dim[1] = CudaNdarray_HOST_DIMS(weights)[1]; out_dim[1] = CudaNdarray_HOST_DIMS(weights)[1];
out_dim[2] = (dH != 1) ? %(height)s : (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH + (CudaNdarray_HOST_DIMS(weights)[2]-1)*dilH + 1 - 2*padH; out_dim[2] = (%(height)s != -1) ? %(height)s : (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH + (CudaNdarray_HOST_DIMS(weights)[2]-1)*dilH + 1 - 2*padH;
out_dim[3] = (dW != 1) ? %(width)s : (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW + (CudaNdarray_HOST_DIMS(weights)[3]-1)*dilW + 1 - 2*padW; out_dim[3] = (%(width)s != -1) ? %(width)s : (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW + (CudaNdarray_HOST_DIMS(weights)[3]-1)*dilW + 1 - 2*padW;
out_dim[4] = (dD != 1) ? %(depth)s : (CudaNdarray_HOST_DIMS(top)[4] - 1) * dD + (CudaNdarray_HOST_DIMS(weights)[4]-1)*dilD + 1 - 2*padD; out_dim[4] = (%(depth)s != -1) ? %(depth)s : (CudaNdarray_HOST_DIMS(top)[4] - 1) * dD + (CudaNdarray_HOST_DIMS(weights)[4]-1)*dilD + 1 - 2*padD;
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0 || out_dim[4] <= 0)
{
PyErr_Format(PyExc_ValueError,
"GpuCorr3dMM backprop wrt. inputs: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n",
out_dim[0], out_dim[1], out_dim[2], out_dim[3], out_dim[4],
CudaNdarray_HOST_DIMS(weights)[0], CudaNdarray_HOST_DIMS(weights)[1],
CudaNdarray_HOST_DIMS(weights)[2], CudaNdarray_HOST_DIMS(weights)[3],
CudaNdarray_HOST_DIMS(weights)[4],
CudaNdarray_HOST_DIMS(top)[0], CudaNdarray_HOST_DIMS(top)[1],
CudaNdarray_HOST_DIMS(top)[2], CudaNdarray_HOST_DIMS(top)[3],
CudaNdarray_HOST_DIMS(top)[4]);
%(fail)s
}
break; break;
default: default:
PyErr_SetString(PyExc_ValueError, "BaseGpuCorr3dMM: direction must be 0, 1, or 2\\n"); PyErr_SetString(PyExc_ValueError, "BaseGpuCorr3dMM: direction must be 0, 1, or 2\\n");
%(fail)s %(fail)s
} }
// Prepare output array // Prepare output array
if (!(%(out)s if (!(%(out)s
&& %(out)s->nd == 5 && %(out)s->nd == 5
...@@ -1876,16 +1966,16 @@ class GpuCorr3dMM_gradWeights(BaseGpuCorr3dMM): ...@@ -1876,16 +1966,16 @@ class GpuCorr3dMM_gradWeights(BaseGpuCorr3dMM):
raise TypeError('img must be 5D tensor') raise TypeError('img must be 5D tensor')
if topgrad.type.ndim != 5: if topgrad.type.ndim != 5:
raise TypeError('topgrad must be 5D tensor') raise TypeError('topgrad must be 5D tensor')
if self.subsample != (1, 1, 1) or self.border_mode == "half":
if shape is None: if shape is None:
if self.subsample != (1, 1, 1) or self.border_mode == "half":
raise ValueError('shape must be given if subsample != (1, 1, 1)' raise ValueError('shape must be given if subsample != (1, 1, 1)'
' or border_mode == "half"') ' or border_mode == "half"')
height_width_depth = []
else:
height_width_depth = [shape[0], shape[1], shape[2]] height_width_depth = [shape[0], shape[1], shape[2]]
assert shape[0].ndim == 0 assert shape[0].ndim == 0
assert shape[1].ndim == 0 assert shape[1].ndim == 0
assert shape[2].ndim == 0 assert shape[2].ndim == 0
else:
height_width_depth = []
broadcastable = [topgrad.type.broadcastable[1], img.type.broadcastable[1], broadcastable = [topgrad.type.broadcastable[1], img.type.broadcastable[1],
False, False, False] False, False, False]
...@@ -1943,10 +2033,12 @@ class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM): ...@@ -1943,10 +2033,12 @@ class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM):
raise TypeError('kern must be 5D tensor') raise TypeError('kern must be 5D tensor')
if topgrad.type.ndim != 5: if topgrad.type.ndim != 5:
raise TypeError('topgrad must be 5D tensor') raise TypeError('topgrad must be 5D tensor')
if self.subsample != (1, 1, 1) and shape is None: if shape is None:
if self.subsample != (1, 1, 1):
raise ValueError('shape must be given if subsample != (1, 1, 1)') raise ValueError('shape must be given if subsample != (1, 1, 1)')
height_width_depth = [shape[0], shape[1], shape[2]] if self.subsample != (1, 1, 1) else [] height_width_depth = []
if height_width_depth: else:
height_width_depth = [shape[0], shape[1], shape[2]]
assert shape[0].ndim == 0 assert shape[0].ndim == 0
assert shape[1].ndim == 0 assert shape[1].ndim == 0
assert shape[2].ndim == 0 assert shape[2].ndim == 0
......
...@@ -429,9 +429,17 @@ CudaNdarray* corr3dMM(CudaNdarray *const bottom, ...@@ -429,9 +429,17 @@ CudaNdarray* corr3dMM(CudaNdarray *const bottom,
const int dil_kW = (kW - 1) * dilW + 1; const int dil_kW = (kW - 1) * dilW + 1;
const int dil_kD = (kD - 1) * dilD + 1; const int dil_kD = (kD - 1) * dilD + 1;
// top: (batchSize, nFilters, topHeight, topWidth, topDepth) // top: (batchSize, nFilters, topHeight, topWidth, topDepth)
const int topHeight = int((bottomHeight + 2*padH - dil_kH) / dH) + 1; const int topHeightNoDH = (bottomHeight + 2*padH - dil_kH);
const int topWidth = int((bottomWidth + 2*padW - dil_kW) / dW) + 1; const int topWidthNoDW = (bottomWidth + 2*padW - dil_kW);
const int topDepth = int((bottomDepth + 2*padD - dil_kD) / dD) + 1; const int topDepthNoDD = (bottomDepth + 2*padD - dil_kD);
// the above values might be negative so we need to use Python-like
// flooring integer division to be compatible with get_conv_output.
// note: this macro implements Python's // for negative x only
#define _CONV_FLOORDIV_X(x,y) ((x < 0) ? (- ((-x) / y) - (((-x) % y) == 0 ? 0 : 1)) : (x / y))
const int topHeight = _CONV_FLOORDIV_X(topHeightNoDH, dH) + 1;
const int topWidth = _CONV_FLOORDIV_X(topWidthNoDW, dW) + 1;
const int topDepth = _CONV_FLOORDIV_X(topDepthNoDD, dD) + 1;
#undef _CONV_FLOORDIV
if (batchSize != CudaNdarray_HOST_DIMS(top)[0] || if (batchSize != CudaNdarray_HOST_DIMS(top)[0] ||
nFilters != CudaNdarray_HOST_DIMS(top)[1] || nFilters != CudaNdarray_HOST_DIMS(top)[1] ||
topHeight != CudaNdarray_HOST_DIMS(top)[2] || topHeight != CudaNdarray_HOST_DIMS(top)[2] ||
...@@ -478,6 +486,19 @@ CudaNdarray* corr3dMM(CudaNdarray *const bottom, ...@@ -478,6 +486,19 @@ CudaNdarray* corr3dMM(CudaNdarray *const bottom,
if (direction == 0) if (direction == 0)
{ // forward pass { // forward pass
output = top; output = top;
if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
cudaError_t err = cudaMemset(output->devdata, 0,
CudaNdarray_SIZE(output) * sizeof(real));
if (err != cudaSuccess) {
PyErr_Format(PyExc_RuntimeError,
"GpuCorr3dMM could not fill the output with zeros: %s",
cudaGetErrorString(err));
Py_DECREF(col);
return NULL;
}
Py_DECREF(col);
return output;
}
// valid correlation: im2col, then gemm // valid correlation: im2col, then gemm
// Iterate over batch // Iterate over batch
for (int n = 0; n < batchSize; n++) for (int n = 0; n < batchSize; n++)
...@@ -527,6 +548,19 @@ CudaNdarray* corr3dMM(CudaNdarray *const bottom, ...@@ -527,6 +548,19 @@ CudaNdarray* corr3dMM(CudaNdarray *const bottom,
{ {
// backprop wrt. weights // backprop wrt. weights
output = weight; output = weight;
if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
cudaError_t err = cudaMemset(output->devdata, 0,
CudaNdarray_SIZE(output) * sizeof(real));
if (err != cudaSuccess) {
PyErr_Format(PyExc_RuntimeError,
"GpuCorr3dMM grad wrt. weights could not fill the output with zeros: %s",
cudaGetErrorString(err));
Py_DECREF(col);
return NULL;
}
Py_DECREF(col);
return output;
}
// valid convolution: im2col, then gemm // valid convolution: im2col, then gemm
// Iterate over batch // Iterate over batch
for (int n = 0; n < batchSize; n++) for (int n = 0; n < batchSize; n++)
...@@ -578,6 +612,19 @@ CudaNdarray* corr3dMM(CudaNdarray *const bottom, ...@@ -578,6 +612,19 @@ CudaNdarray* corr3dMM(CudaNdarray *const bottom,
{ {
// backprop wrt. inputs // backprop wrt. inputs
output = bottom; output = bottom;
if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
cudaError_t err = cudaMemset(output->devdata, 0,
CudaNdarray_SIZE(output) * sizeof(real));
if (err != cudaSuccess) {
PyErr_Format(PyExc_RuntimeError,
"GpuCorr3dMM grad wrt. inputs could not fill the output with zeros: %s",
cudaGetErrorString(err));
Py_DECREF(col);
return NULL;
}
Py_DECREF(col);
return output;
}
// full convolution: gemm, then col2im3d // full convolution: gemm, then col2im3d
// Iterate over batch // Iterate over batch
for (int n = 0; n < batchSize; n++) for (int n = 0; n < batchSize; n++)
......
...@@ -333,8 +333,15 @@ CudaNdarray* corrMM(CudaNdarray *const bottom, ...@@ -333,8 +333,15 @@ CudaNdarray* corrMM(CudaNdarray *const bottom,
const int dil_kH = (kH - 1) * dilH + 1; const int dil_kH = (kH - 1) * dilH + 1;
const int dil_kW = (kW - 1) * dilW + 1; const int dil_kW = (kW - 1) * dilW + 1;
// top: (batchSize, nFilters, topHeight, topWidth) // top: (batchSize, nFilters, topHeight, topWidth)
const int topHeight = (bottomHeight + 2*padH - dil_kH) / dH + 1; const int topHeightNoDH = (bottomHeight + 2*padH - dil_kH);
const int topWidth = (bottomWidth + 2*padW - dil_kW) / dW + 1; const int topWidthNoDW = (bottomWidth + 2*padW - dil_kW);
// the above values might be negative so we need to use Python-like
// flooring integer division to be compatible with get_conv_output.
// note: this macro implements Python's // for negative x only
#define _CONV_FLOORDIV_X(x,y) ((x < 0) ? (- ((-x) / y) - (((-x) % y) == 0 ? 0 : 1)) : (x / y))
const int topHeight = _CONV_FLOORDIV_X(topHeightNoDH, dH) + 1;
const int topWidth = _CONV_FLOORDIV_X(topWidthNoDW, dW) + 1;
#undef _CONV_FLOORDIV
if (batchSize != CudaNdarray_HOST_DIMS(top)[0] || if (batchSize != CudaNdarray_HOST_DIMS(top)[0] ||
nFilters != CudaNdarray_HOST_DIMS(top)[1] || nFilters != CudaNdarray_HOST_DIMS(top)[1] ||
topHeight != CudaNdarray_HOST_DIMS(top)[2] || topHeight != CudaNdarray_HOST_DIMS(top)[2] ||
...@@ -377,6 +384,19 @@ CudaNdarray* corrMM(CudaNdarray *const bottom, ...@@ -377,6 +384,19 @@ CudaNdarray* corrMM(CudaNdarray *const bottom,
CudaNdarray *output; CudaNdarray *output;
if (direction == 0) { // forward pass if (direction == 0) { // forward pass
output = top; output = top;
if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
cudaError_t err = cudaMemset(output->devdata, 0,
CudaNdarray_SIZE(output) * sizeof(real));
if (err != cudaSuccess) {
PyErr_Format(PyExc_RuntimeError,
"GpuCorrMM could not fill the output with zeros: %s",
cudaGetErrorString(err));
Py_DECREF(col);
return NULL;
}
Py_DECREF(col);
return output;
}
// valid correlation: im2col, then gemm // valid correlation: im2col, then gemm
// Iterate over batch // Iterate over batch
for (int n = 0; n < batchSize; n++) { for (int n = 0; n < batchSize; n++) {
...@@ -445,6 +465,19 @@ CudaNdarray* corrMM(CudaNdarray *const bottom, ...@@ -445,6 +465,19 @@ CudaNdarray* corrMM(CudaNdarray *const bottom,
} }
else if (direction == 1) { // backprop wrt. weights else if (direction == 1) { // backprop wrt. weights
output = weight; output = weight;
if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
cudaError_t err = cudaMemset(output->devdata, 0,
CudaNdarray_SIZE(output) * sizeof(real));
if (err != cudaSuccess) {
PyErr_Format(PyExc_RuntimeError,
"GpuCorrMM grad wrt. weights could not fill the output with zeros: %s",
cudaGetErrorString(err));
Py_DECREF(col);
return NULL;
}
Py_DECREF(col);
return output;
}
// valid convolution: im2col, then gemm // valid convolution: im2col, then gemm
// Iterate over batch // Iterate over batch
for (int n = 0; n < batchSize; n++) { for (int n = 0; n < batchSize; n++) {
...@@ -513,6 +546,19 @@ CudaNdarray* corrMM(CudaNdarray *const bottom, ...@@ -513,6 +546,19 @@ CudaNdarray* corrMM(CudaNdarray *const bottom,
} }
else if (direction == 2) { // backprop wrt. inputs else if (direction == 2) { // backprop wrt. inputs
output = bottom; output = bottom;
if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
cudaError_t err = cudaMemset(output->devdata, 0,
CudaNdarray_SIZE(output) * sizeof(real));
if (err != cudaSuccess) {
PyErr_Format(PyExc_RuntimeError,
"GpuCorrMM grad wrt. inputs could not fill the output with zeros: %s",
cudaGetErrorString(err));
Py_DECREF(col);
return NULL;
}
Py_DECREF(col);
return output;
}
// full convolution: gemm, then col2im // full convolution: gemm, then col2im
// Iterate over batch // Iterate over batch
for (int n = 0; n < batchSize; n++) { for (int n = 0; n < batchSize; n++) {
......
...@@ -14,7 +14,8 @@ from theano.gof.type import CDataType ...@@ -14,7 +14,8 @@ from theano.gof.type import CDataType
from theano.compile import optdb from theano.compile import optdb
from theano.compile.ops import shape_i from theano.compile.ops import shape_i
from theano.tensor.nnet import LogSoftmax, SoftmaxGrad from theano.tensor.nnet import LogSoftmax, SoftmaxGrad
from theano.tensor.nnet.abstract_conv import get_conv_output_shape from theano.tensor.nnet.abstract_conv import (get_conv_output_shape,
assert_conv_shape)
from theano.tensor.signal.pool import ( from theano.tensor.signal.pool import (
Pool, MaxPoolGrad, AveragePoolGrad) Pool, MaxPoolGrad, AveragePoolGrad)
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
...@@ -1132,10 +1133,12 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -1132,10 +1133,12 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
# that would be flipped by conv_mode='conv' in GpuDnnConvGradW. # that would be flipped by conv_mode='conv' in GpuDnnConvGradW.
kerns = kerns[:, :, ::-1, ::-1] kerns = kerns[:, :, ::-1, ::-1]
kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3)) kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3))
shape2 = shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1 out_shp = (shape_i(kerns, 1, fgraph),
shape3 = shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1 shape_i(img, 1, fgraph),
out = gpu_alloc_empty(shape_i(kerns, 1, fgraph), shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1,
shape_i(img, 1, fgraph), shape2, shape3) shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1)
out_shp = assert_conv_shape(out_shp)
out = gpu_alloc_empty(*out_shp)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1), desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode='cross', precision=precision)(img.shape, conv_mode='cross', precision=precision)(img.shape,
out.shape) out.shape)
...@@ -1149,10 +1152,12 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -1149,10 +1152,12 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
img = gpu_contiguous(img) img = gpu_contiguous(img)
kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3)) kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3))
conv_mode = 'cross' if conv_mode == 'conv' else 'conv' conv_mode = 'cross' if conv_mode == 'conv' else 'conv'
shape2 = shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1 out_shp = (shape_i(img, 0, fgraph),
shape3 = shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1 shape_i(kerns, 1, fgraph),
out = gpu_alloc_empty(shape_i(img, 0, fgraph), shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1,
shape_i(kerns, 1, fgraph), shape2, shape3) shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1)
out_shp = assert_conv_shape(out_shp)
out = gpu_alloc_empty(*out_shp)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1), desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode=conv_mode, precision=precision)(out.shape, conv_mode=conv_mode, precision=precision)(out.shape,
kerns.shape) kerns.shape)
...@@ -1170,6 +1175,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -1170,6 +1175,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
out_shp = GpuDnnConv.get_out_shape(img.shape, kerns.shape, out_shp = GpuDnnConv.get_out_shape(img.shape, kerns.shape,
desc_op.border_mode, desc_op.border_mode,
desc_op.subsample) desc_op.subsample)
out_shp = assert_conv_shape(out_shp)
out = gpu_alloc_empty(*out_shp) out = gpu_alloc_empty(*out_shp)
return GpuDnnConv(algo=algo)(img, kerns, out, desc) return GpuDnnConv(algo=algo)(img, kerns, out, desc)
...@@ -1248,11 +1254,13 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), ...@@ -1248,11 +1254,13 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1),
# that would be flipped by conv_mode='conv' in GpuDnnConvGradW. # that would be flipped by conv_mode='conv' in GpuDnnConvGradW.
kerns = kerns[:, :, ::-1, ::-1, ::-1] kerns = kerns[:, :, ::-1, ::-1, ::-1]
kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3, 4)) kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3, 4))
shape2 = shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1 out_shp = (shape_i(kerns, 1, fgraph),
shape3 = shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1 shape_i(img, 1, fgraph),
shape4 = shape_i(img, 4, fgraph) - shape_i(kerns, 4, fgraph) + 1 shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1,
out = gpu_alloc_empty(shape_i(kerns, 1, fgraph), shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1,
shape_i(img, 1, fgraph), shape2, shape3, shape4) shape_i(img, 4, fgraph) - shape_i(kerns, 4, fgraph) + 1)
out_shp = assert_conv_shape(out_shp)
out = gpu_alloc_empty(*out_shp)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1, 1), desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1, 1),
conv_mode='cross', precision=precision)(img.shape, conv_mode='cross', precision=precision)(img.shape,
out.shape) out.shape)
...@@ -1271,6 +1279,7 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), ...@@ -1271,6 +1279,7 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1),
out_shp = GpuDnnConv3d.get_out_shape(img.shape, kerns.shape, out_shp = GpuDnnConv3d.get_out_shape(img.shape, kerns.shape,
desc_op.border_mode, desc_op.border_mode,
desc_op.subsample) desc_op.subsample)
out_shp = assert_conv_shape(out_shp)
out = gpu_alloc_empty(*out_shp) out = gpu_alloc_empty(*out_shp)
return GpuDnnConv3d(algo=algo)(img, kerns, out, desc) return GpuDnnConv3d(algo=algo)(img, kerns, out, desc)
......
...@@ -12,11 +12,6 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns, ...@@ -12,11 +12,6 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
return 1; return 1;
} }
if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
return 1;
if (c_set_filterNd(kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1;
int nb_dim = CudaNdarray_NDIM(input); int nb_dim = CudaNdarray_NDIM(input);
#ifdef CONV_INPLACE #ifdef CONV_INPLACE
...@@ -30,6 +25,22 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns, ...@@ -30,6 +25,22 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
return 1; return 1;
#endif #endif
if (CudaNdarray_DIMS(input)[0] == 0 || CudaNdarray_DIMS(kerns)[0] == 0 || CudaNdarray_DIMS(kerns)[1] == 0) {
cudaError_t err2 = cudaMemset((*output)->devdata, 0,
CudaNdarray_SIZE(*output) * sizeof(real));
if (err2 != cudaSuccess) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConv could not fill the output with zeros: %s",
cudaGetErrorString(err2));
return 1;
}
return 0;
}
if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
return 1;
if (c_set_filterNd(kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1;
if (c_set_tensorNd(*output, APPLY_SPECIFIC(output)) == -1) if (c_set_tensorNd(*output, APPLY_SPECIFIC(output)) == -1)
return 1; return 1;
......
...@@ -12,11 +12,6 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output, ...@@ -12,11 +12,6 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
return 1; return 1;
} }
if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
return 1;
if (c_set_filterNd(kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1;
int nb_dim = CudaNdarray_NDIM(output); int nb_dim = CudaNdarray_NDIM(output);
#ifdef CONV_INPLACE #ifdef CONV_INPLACE
...@@ -30,9 +25,64 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output, ...@@ -30,9 +25,64 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
return 1; return 1;
#endif #endif
if (CudaNdarray_DIMS(im)[0] == 0 || CudaNdarray_DIMS(kerns)[0] == 0 || CudaNdarray_DIMS(kerns)[1] == 0) {
cudaError_t err2 = cudaMemset((*input)->devdata, 0,
CudaNdarray_SIZE(*input) * sizeof(real));
if (err2 != cudaSuccess) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConv grad wrt. inputs could not fill the output with zeros: %s",
cudaGetErrorString(err2));
return 1;
}
return 0;
}
if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
return 1;
if (c_set_filterNd(kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1;
if (c_set_tensorNd(*input, APPLY_SPECIFIC(input)) == -1) if (c_set_tensorNd(*input, APPLY_SPECIFIC(input)) == -1)
return 1; return 1;
int expected_output_dims[5] = {0};
err = cudnnGetConvolutionNdForwardOutputDim(desc, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
nb_dim, expected_output_dims);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error computing convolution output dim: %s",
cudnnGetErrorString(err));
return 1;
}
if (nb_dim == 4) {
if ((CudaNdarray_HOST_DIMS(output)[0] != expected_output_dims[0]) ||
(CudaNdarray_HOST_DIMS(output)[1] != expected_output_dims[1]) ||
(CudaNdarray_HOST_DIMS(output)[2] != expected_output_dims[2]) ||
(CudaNdarray_HOST_DIMS(output)[3] != expected_output_dims[3])) {
PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ld"
" but received gradient with shape %ldx%ldx%ldx%ld",
(long int)expected_output_dims[0], (long int)expected_output_dims[1],
(long int)expected_output_dims[2], (long int)expected_output_dims[3],
(long int)CudaNdarray_HOST_DIMS(output)[0], (long int)CudaNdarray_HOST_DIMS(output)[1],
(long int)CudaNdarray_HOST_DIMS(output)[2], (long int)CudaNdarray_HOST_DIMS(output)[3]);
return 1;
}
} else if (nb_dim == 5) {
if ((CudaNdarray_HOST_DIMS(output)[0] != expected_output_dims[0]) ||
(CudaNdarray_HOST_DIMS(output)[1] != expected_output_dims[1]) ||
(CudaNdarray_HOST_DIMS(output)[2] != expected_output_dims[2]) ||
(CudaNdarray_HOST_DIMS(output)[3] != expected_output_dims[3]) ||
(CudaNdarray_HOST_DIMS(output)[4] != expected_output_dims[4])) {
PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ldx%ld"
" but received gradient with shape %ldx%ldx%ldx%ldx%ld",
(long int)expected_output_dims[0], (long int)expected_output_dims[1],
(long int)expected_output_dims[2], (long int)expected_output_dims[3],
(long int)expected_output_dims[4],
(long int)CudaNdarray_HOST_DIMS(output)[0], (long int)CudaNdarray_HOST_DIMS(output)[1],
(long int)CudaNdarray_HOST_DIMS(output)[2], (long int)CudaNdarray_HOST_DIMS(output)[3],
(long int)CudaNdarray_HOST_DIMS(output)[4]);
return 1;
}
}
{ {
size_t worksize; size_t worksize;
void *workspace; void *workspace;
......
...@@ -12,11 +12,6 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output, ...@@ -12,11 +12,6 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
return 1; return 1;
} }
if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
return 1;
if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
return 1;
int nb_dim = CudaNdarray_NDIM(output); int nb_dim = CudaNdarray_NDIM(output);
#ifdef CONV_INPLACE #ifdef CONV_INPLACE
...@@ -30,9 +25,64 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output, ...@@ -30,9 +25,64 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
return 1; return 1;
#endif #endif
if (CudaNdarray_DIMS(input)[0] == 0 || CudaNdarray_DIMS(km)[0] == 0 || CudaNdarray_DIMS(km)[1] == 0) {
cudaError_t err2 = cudaMemset((*kerns)->devdata, 0,
CudaNdarray_SIZE(*kerns) * sizeof(real));
if (err2 != cudaSuccess) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConv grad wrt. weights could not fill the output with zeros: %s",
cudaGetErrorString(err2));
return 1;
}
return 0;
}
if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
return 1;
if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
return 1;
if (c_set_filterNd(*kerns, APPLY_SPECIFIC(kerns)) == -1) if (c_set_filterNd(*kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1; return 1;
int expected_output_dims[5] = {0};
err = cudnnGetConvolutionNdForwardOutputDim(desc, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
nb_dim, expected_output_dims);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error computing convolution output dim: %s",
cudnnGetErrorString(err));
return 1;
}
if (nb_dim == 4) {
if ((CudaNdarray_HOST_DIMS(output)[0] != expected_output_dims[0]) ||
(CudaNdarray_HOST_DIMS(output)[1] != expected_output_dims[1]) ||
(CudaNdarray_HOST_DIMS(output)[2] != expected_output_dims[2]) ||
(CudaNdarray_HOST_DIMS(output)[3] != expected_output_dims[3])) {
PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%dx%ld"
" but received gradient with shape %ldx%ldx%dx%ld",
(long int)expected_output_dims[0], (long int)expected_output_dims[1],
(long int)expected_output_dims[2], (long int)expected_output_dims[3],
(long int)CudaNdarray_HOST_DIMS(output)[0], (long int)CudaNdarray_HOST_DIMS(output)[1],
(long int)CudaNdarray_HOST_DIMS(output)[2], (long int)CudaNdarray_HOST_DIMS(output)[3]);
return 1;
}
} else if (nb_dim == 5) {
if ((CudaNdarray_HOST_DIMS(output)[0] != expected_output_dims[0]) ||
(CudaNdarray_HOST_DIMS(output)[1] != expected_output_dims[1]) ||
(CudaNdarray_HOST_DIMS(output)[2] != expected_output_dims[2]) ||
(CudaNdarray_HOST_DIMS(output)[3] != expected_output_dims[3]) ||
(CudaNdarray_HOST_DIMS(output)[4] != expected_output_dims[4])) {
PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ldx%ld"
" but received gradient with shape %ldx%ldx%ldx%ldx%ld",
(long int)expected_output_dims[0], (long int)expected_output_dims[1],
(long int)expected_output_dims[2], (long int)expected_output_dims[3],
(long int)expected_output_dims[4],
(long int)CudaNdarray_HOST_DIMS(output)[0], (long int)CudaNdarray_HOST_DIMS(output)[1],
(long int)CudaNdarray_HOST_DIMS(output)[2], (long int)CudaNdarray_HOST_DIMS(output)[3],
(long int)CudaNdarray_HOST_DIMS(output)[4]);
return 1;
}
}
{ {
size_t worksize; size_t worksize;
void *workspace; void *workspace;
......
...@@ -13,6 +13,7 @@ from theano.sandbox.cuda.blas import ( ...@@ -13,6 +13,7 @@ from theano.sandbox.cuda.blas import (
GpuCorrMM, GpuCorrMM_gradWeights, GpuCorrMM_gradInputs, GpuCorrMM, GpuCorrMM_gradWeights, GpuCorrMM_gradInputs,
GpuCorr3dMM, GpuCorr3dMM_gradWeights, GpuCorr3dMM_gradInputs) GpuCorr3dMM, GpuCorr3dMM_gradWeights, GpuCorr3dMM_gradInputs)
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
from nose.tools import assert_raises
import theano.sandbox.cuda as cuda import theano.sandbox.cuda as cuda
if not cuda.cuda_available: if not cuda.cuda_available:
...@@ -57,6 +58,31 @@ class TestDnnConv2d(test_abstract_conv.BaseTestConv2d): ...@@ -57,6 +58,31 @@ class TestDnnConv2d(test_abstract_conv.BaseTestConv2d):
filter_flip=flip, target_op=GpuDnnConvGradI, filter_flip=flip, target_op=GpuDnnConvGradI,
filter_dilation=fd) filter_dilation=fd)
def tcase_gi(self, i, f, o, s, b, flip, provide_shape, fd=(1, 1), expect_error=False):
if fd != (1, 1):
raise SkipTest("No dilation implementation for cuDNN ConvOp.")
if not dnn_available():
raise SkipTest(cuda.dnn.dnn_available.msg)
mode = mode_with_gpu
if not expect_error:
self.run_gradinput(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=True, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip, target_op=GpuDnnConvGradI,
filter_dilation=fd)
else:
assert_raises((RuntimeError, ValueError),
self.run_gradinput,
inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=False, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip, target_op=GpuDnnConvGradI,
ref=None,
filter_dilation=fd)
class TestDnnConv3d(test_abstract_conv.BaseTestConv3d): class TestDnnConv3d(test_abstract_conv.BaseTestConv3d):
@classmethod @classmethod
...@@ -91,6 +117,31 @@ class TestDnnConv3d(test_abstract_conv.BaseTestConv3d): ...@@ -91,6 +117,31 @@ class TestDnnConv3d(test_abstract_conv.BaseTestConv3d):
filter_flip=flip, target_op=GpuDnnConv3dGradI, filter_flip=flip, target_op=GpuDnnConv3dGradI,
filter_dilation=fd) filter_dilation=fd)
def tcase_gi(self, i, f, o, s, b, flip, provide_shape, fd=(1, 1, 1), expect_error=False):
if fd != (1, 1, 1):
raise SkipTest("No dilation implementation for cuDNN ConvOp.")
if not dnn_available():
raise SkipTest(cuda.dnn.dnn_available.msg)
mode = mode_with_gpu
if not expect_error:
self.run_gradinput(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=True, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip, target_op=GpuDnnConvGradI,
filter_dilation=fd)
else:
assert_raises((RuntimeError, ValueError),
self.run_gradinput,
inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=False, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip, target_op=GpuDnnConvGradI,
ref=None,
filter_dilation=fd)
class TestCorrMMConv2d(test_abstract_conv.BaseTestConv2d): class TestCorrMMConv2d(test_abstract_conv.BaseTestConv2d):
@classmethod @classmethod
...@@ -124,6 +175,28 @@ class TestCorrMMConv2d(test_abstract_conv.BaseTestConv2d): ...@@ -124,6 +175,28 @@ class TestCorrMMConv2d(test_abstract_conv.BaseTestConv2d):
target_op=GpuCorrMM_gradInputs, target_op=GpuCorrMM_gradInputs,
filter_dilation=fd) filter_dilation=fd)
def tcase_gi(self, i, f, o, s, b, flip, provide_shape, fd=(1, 1), expect_error=False):
mode = self.mode
if not expect_error:
self.run_gradinput(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=True, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip,
target_op=GpuCorrMM_gradInputs,
filter_dilation=fd)
else:
assert_raises(ValueError,
self.run_gradinput,
inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=False, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip,
target_op=GpuCorrMM_gradInputs,
ref=None,
filter_dilation=fd)
class TestCorrMMConv3d(test_abstract_conv.BaseTestConv3d): class TestCorrMMConv3d(test_abstract_conv.BaseTestConv3d):
@classmethod @classmethod
...@@ -157,6 +230,28 @@ class TestCorrMMConv3d(test_abstract_conv.BaseTestConv3d): ...@@ -157,6 +230,28 @@ class TestCorrMMConv3d(test_abstract_conv.BaseTestConv3d):
target_op=GpuCorr3dMM_gradInputs, target_op=GpuCorr3dMM_gradInputs,
filter_dilation=fd) filter_dilation=fd)
def tcase_gi(self, i, f, o, s, b, flip, provide_shape, fd=(1, 1, 1), expect_error=False):
mode = self.mode
if not expect_error:
self.run_gradinput(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=True, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip,
target_op=GpuCorr3dMM_gradInputs,
filter_dilation=fd)
else:
assert_raises(ValueError,
self.run_gradinput,
inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=False, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip,
target_op=GpuCorr3dMM_gradInputs,
ref=None,
filter_dilation=fd)
class TestDnnConvTypes(test_abstract_conv.TestConvTypes): class TestDnnConvTypes(test_abstract_conv.TestConvTypes):
def setUp(self): def setUp(self):
......
...@@ -4,6 +4,7 @@ import os ...@@ -4,6 +4,7 @@ import os
import sys import sys
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
from nose_parameterized import parameterized
from itertools import chain, product from itertools import chain, product
import six.moves.cPickle as pickle import six.moves.cPickle as pickle
from six import StringIO from six import StringIO
...@@ -16,6 +17,7 @@ import theano.tensor as T ...@@ -16,6 +17,7 @@ import theano.tensor as T
import theano.tests.unittest_tools as utt import theano.tests.unittest_tools as utt
from theano.tensor.signal.pool import pool_2d, pool_3d from theano.tensor.signal.pool import pool_2d, pool_3d
from theano.tensor.signal.pool import Pool, MaxPoolGrad, AveragePoolGrad from theano.tensor.signal.pool import Pool, MaxPoolGrad, AveragePoolGrad
from theano.tensor.nnet.abstract_conv import get_conv_output_shape
import theano.sandbox.cuda.dnn as dnn import theano.sandbox.cuda.dnn as dnn
from theano.sandbox.cuda.basic_ops import GpuAllocEmpty, gpu_alloc_empty from theano.sandbox.cuda.basic_ops import GpuAllocEmpty, gpu_alloc_empty
from theano.sandbox.cuda import float32_shared_constructor as shared from theano.sandbox.cuda import float32_shared_constructor as shared
...@@ -979,99 +981,105 @@ class TestDnnInferShapes(utt.InferShapeTester): ...@@ -979,99 +981,105 @@ class TestDnnInferShapes(utt.InferShapeTester):
dnn.GpuDnnConv3d dnn.GpuDnnConv3d
) )
def test_conv_gradw(self): def _test_conv_gradw(self, img, topgrad, kerns, img_shape, kerns_shape, border_mode, conv_mode, subsample):
if not dnn.dnn_available(): if not dnn.dnn_available():
raise SkipTest(dnn.dnn_available.msg) raise SkipTest(dnn.dnn_available.msg)
img = T.ftensor4('img')
kerns = T.ftensor4('kerns') topgrad_shape = get_conv_output_shape(img_shape, kerns_shape,
out = T.ftensor4('out') border_mode, subsample)
img_val = numpy.asarray( img_val = numpy.asarray(
numpy.random.rand(2, 5, 6, 8), numpy.random.rand(*img_shape),
dtype='float32' dtype=theano.config.floatX
) )
kern_vals = numpy.asarray( topgrad_vals = numpy.asarray(
numpy.random.rand(2, 1, 5, 6), numpy.random.rand(*topgrad_shape),
dtype='float32' dtype=theano.config.floatX
) )
for params in product( kerns_vals = numpy.zeros(kerns_shape, dtype=theano.config.floatX)
['valid', 'full', 'half'], kerns_shape = theano.shared(numpy.asarray(kerns_shape))
[(1, 1)], # strides besides (1, 1) topgrad_shape = theano.shared(numpy.asarray(topgrad_shape))
['conv', 'cross']
):
temp_img = img.dimshuffle(1, 0, 2, 3)
temp_kerns = kerns
if params[2] == 'conv':
temp_kerns = temp_kerns[:, :, ::-1, ::-1]
temp_kerns = temp_kerns.dimshuffle(1, 0, 2, 3)
shape = (
kern_vals.shape[1], img_val.shape[1],
img_val.shape[2] - kern_vals.shape[2] + 1,
img_val.shape[3] - kern_vals.shape[3] + 1
)
out_vals = numpy.zeros(shape, dtype='float32')
desc = dnn.GpuDnnConvDesc( desc = dnn.GpuDnnConvDesc(
border_mode=params[0], border_mode=border_mode,
subsample=params[1], subsample=subsample,
conv_mode=params[2] conv_mode=conv_mode
)(temp_img.shape, out.shape) )(topgrad_shape, kerns_shape)
conv_grad_w = dnn.GpuDnnConvGradW()( conv_grad_w = dnn.GpuDnnConvGradW()(
temp_img, img,
temp_kerns, topgrad,
out, kerns,
desc, desc,
) )
self._compile_and_check( self._compile_and_check(
[temp_img, temp_kerns, out], [img, topgrad, kerns],
[conv_grad_w], [conv_grad_w],
[img_val, kern_vals, out_vals], [img_val, topgrad_vals, kerns_vals],
dnn.GpuDnnConvGradW dnn.GpuDnnConvGradW
) )
def test_conv3d_gradw(self): border_modes = ['valid', 'full', 'half']
conv_modes = ['conv', 'cross']
@parameterized.expand(product(border_modes, conv_modes), utt.custom_name_func)
def test_conv_gradw(self, border_mode, conv_mode):
self._test_conv_gradw(T.tensor4('img'),
T.tensor4('topgrad'),
T.tensor4('kerns'),
(5, 2, 6, 13),
(1, 2, 3, 7),
border_mode,
conv_mode,
(1, 1))
def _test_conv3d_gradw(self, img, topgrad, kerns, img_shape, kerns_shape, border_mode, conv_mode, subsample):
if not (cuda.dnn.dnn_available() and dnn.version() >= (2000, 2000)): if not (cuda.dnn.dnn_available() and dnn.version() >= (2000, 2000)):
raise SkipTest('"cuDNN 3D convolution requires cuDNN v2') raise SkipTest('"cuDNN 3D convolution requires cuDNN v2')
img = T.ftensor5('img')
kerns = T.ftensor5('kerns') topgrad_shape = get_conv_output_shape(img_shape, kerns_shape,
out = T.ftensor5('out') border_mode, subsample)
img_val = numpy.asarray( img_val = numpy.asarray(
numpy.random.rand(9, 2, 4, 8, 13), numpy.random.rand(*img_shape),
dtype='float32' dtype=theano.config.floatX
) )
kern_vals = numpy.asarray( topgrad_vals = numpy.asarray(
numpy.random.rand(11, 2, 3, 1, 4), numpy.random.rand(*topgrad_shape),
dtype='float32' dtype=theano.config.floatX
) )
for params in product( kerns_vals = numpy.zeros(kerns_shape, dtype=theano.config.floatX)
['valid', 'full', 'half'], kerns_shape = theano.shared(numpy.asarray(kerns_shape))
[(1, 1, 1), (2, 2, 2)], topgrad_shape = theano.shared(numpy.asarray(topgrad_shape))
['conv', 'cross']
):
out_vals = numpy.zeros(
dnn.GpuDnnConv3d.get_out_shape(img_val.shape, kern_vals.shape,
border_mode=params[0],
subsample=params[1]),
dtype='float32')
desc = dnn.GpuDnnConvDesc( desc = dnn.GpuDnnConvDesc(
border_mode=params[0], border_mode=border_mode,
subsample=params[1], subsample=subsample,
conv_mode=params[2] conv_mode=conv_mode
)(img.shape, out.shape) )(topgrad_shape, kerns_shape)
conv_grad_w = dnn.GpuDnnConv3dGradW()( conv_grad_w = dnn.GpuDnnConv3dGradW()(
img, img,
out, topgrad,
kerns, kerns,
desc, desc,
) )
self._compile_and_check( self._compile_and_check(
[img, out, kerns], [img, topgrad, kerns],
[conv_grad_w], [conv_grad_w],
[img_val, out_vals, kern_vals], [img_val, topgrad_vals, kerns_vals],
dnn.GpuDnnConv3dGradW dnn.GpuDnnConv3dGradW
) )
@parameterized.expand(product(border_modes, conv_modes), utt.custom_name_func)
def test_conv3d_gradw(self, border_mode, conv_mode):
self._test_conv3d_gradw(T.tensor5('img'),
T.tensor5('topgrad'),
T.tensor5('kerns'),
(5, 2, 6, 13, 21),
(1, 2, 3, 7, 9),
border_mode,
conv_mode,
(1, 1, 1))
def test_conv_gradi(self): def test_conv_gradi(self):
if not dnn.dnn_available(): if not dnn.dnn_available():
raise SkipTest(dnn.dnn_available.msg) raise SkipTest(dnn.dnn_available.msg)
......
...@@ -11,6 +11,7 @@ import theano ...@@ -11,6 +11,7 @@ import theano
from theano.tensor import as_tensor_variable, patternbroadcast from theano.tensor import as_tensor_variable, patternbroadcast
from theano.tensor import get_scalar_constant_value, NotScalarConstantError from theano.tensor import get_scalar_constant_value, NotScalarConstantError
from theano.tensor.opt import Assert
from theano.gof import Apply, Op from theano.gof import Apply, Op
from six.moves import xrange from six.moves import xrange
...@@ -51,11 +52,11 @@ def get_conv_output_shape(image_shape, kernel_shape, ...@@ -51,11 +52,11 @@ def get_conv_output_shape(image_shape, kernel_shape,
or numeric). If it is a string, it must be 'valid', 'half' or 'full'. or numeric). If it is a string, it must be 'valid', 'half' or 'full'.
If it is a tuple, its two (or three) elements respectively correspond If it is a tuple, its two (or three) elements respectively correspond
to the padding on height and width (and possibly depth) axis. to the padding on height and width (and possibly depth) axis.
subsample: tuple of int (symbolic or numeric). Its or three elements subsample: tuple of int (symbolic or numeric). Its two or three elements
espectively correspond to the subsampling on height and width (and espectively correspond to the subsampling on height and width (and
possibly depth) axis. possibly depth) axis.
filter_dilation: tuple of int (symbolic or numeric). Its two elements filter_dilation: tuple of int (symbolic or numeric). Its two or three
correspond respectively to the dilation on height and width axis. elements correspond respectively to the dilation on height and width axis.
Returns Returns
------- -------
...@@ -137,6 +138,374 @@ def get_conv_shape_1axis(image_shape, kernel_shape, border_mode, ...@@ -137,6 +138,374 @@ def get_conv_shape_1axis(image_shape, kernel_shape, border_mode,
return out_shp return out_shp
def get_conv_gradweights_shape(image_shape, top_shape,
border_mode, subsample,
filter_dilation=None):
"""
This function tries to compute the kernel shape of convolution gradWeights.
The weights shape can only be computed exactly when subsample is 1 and
border_mode is not 'half'. If subsample is not 1 or border_mode is 'half',
this function will return None.
Parameters
----------
image_shape: tuple of int corresponding to the input image shape. Its
four (or five) elements must correspond respectively to: batch size,
number of output channels, height and width of the image. None where
undefined.
top_shape: tuple of int (symbolic or numeric) corresponding to the top
image shape. Its four (or five) element must correspond respectively
to: batch size, number of output channels, height and width (and
possibly depth) of the image. None where undefined.
border_mode: string, int (symbolic or numeric) or tuple of int (symbolic
or numeric). If it is a string, it must be 'valid', 'half' or 'full'.
If it is a tuple, its two (or three) elements respectively correspond
to the padding on height and width (and possibly depth) axis.
subsample: tuple of int (symbolic or numeric). Its two or three elements
respectively correspond to the subsampling on height and width (and
possibly depth) axis.
filter_dilation: tuple of int (symbolic or numeric). Its two or three
elements correspond respectively to the dilation on height and
width axis.
Returns
-------
kernel_shape: tuple of int (symbolic or numeric) corresponding to the
kernel shape. Its four (or five) elements correspond respectively
to: number of output channels, number of input channels, height and
width (and possibly depth) of the kernel. None where undefined.
"""
nkern, imshp = image_shape[1], image_shape[2:]
nchan, topshp = top_shape[1], top_shape[2:]
if filter_dilation is None:
filter_dilation = numpy.ones(len(subsample), dtype='int')
if isinstance(border_mode, tuple):
out_shp = tuple(get_conv_gradweights_shape_1axis(
imshp[i], topshp[i], border_mode[i],
subsample[i], filter_dilation[i]) for i in range(len(subsample)))
else:
out_shp = tuple(get_conv_gradweights_shape_1axis(
imshp[i], topshp[i], border_mode,
subsample[i], filter_dilation[i]) for i in range(len(subsample)))
return (nchan, nkern) + out_shp
def get_conv_gradweights_shape_1axis(image_shape, top_shape, border_mode,
subsample, dilation):
"""
This function tries to compute the image shape of convolution gradWeights.
The weights shape can only be computed exactly when subsample is 1 and
border_mode is not 'half'. If subsample is not 1 or border_mode is 'half',
this function will return None.
Parameters
----------
image_shape: int or None. Corresponds to the input image shape on a
given axis. None if undefined.
top_shape: int or None. Corresponds to the top shape on a given axis.
None if undefined.
border_mode: string or int. If it is a string, it must be
'valid', 'half' or 'full'. If it is an integer, it must correspond to
the padding on the considered axis.
subsample: int. It must correspond to the subsampling on the
considered axis.
dilation: int. It must correspond to the dilation on the
considered axis.
Returns
-------
kernel_shape: int or None. Corresponds to the kernel shape on a given
axis. None if undefined.
"""
if None in [image_shape, top_shape, border_mode,
subsample, dilation]:
return None
if subsample != 1 or border_mode == "half":
return None
if border_mode == "full":
kernel_shape = top_shape - image_shape
elif border_mode == "valid":
kernel_shape = image_shape - top_shape
else:
if border_mode < 0:
raise ValueError("border_mode must be >= 0")
kernel_shape = (image_shape + 2 * border_mode - top_shape)
if dilation > 1:
kernel_shape = kernel_shape / dilation
return kernel_shape + 1
def get_conv_gradinputs_shape(kernel_shape, top_shape,
border_mode, subsample,
filter_dilation=None):
"""
This function tries to compute the image shape of convolution gradInputs.
The image shape can only be computed exactly when subsample is 1.
If subsample for a dimension is not 1, this function will return None for
that dimension.
Parameters
----------
kernel_shape: tuple of int (symbolic or numeric) corresponding to the
kernel shape. Its four (or five) elements must correspond respectively
to: number of output channels, number of input channels, height and
width (and possibly depth) of the kernel. None where undefined.
top_shape: tuple of int (symbolic or numeric) corresponding to the top
image shape. Its four (or five) element must correspond respectively
to: batch size, number of output channels, height and width (and
possibly depth) of the image. None where undefined.
border_mode: string, int (symbolic or numeric) or tuple of int (symbolic
or numeric). If it is a string, it must be 'valid', 'half' or 'full'.
If it is a tuple, its two (or three) elements respectively correspond
to the padding on height and width (and possibly depth) axis.
subsample: tuple of int (symbolic or numeric). Its two or three elements
respectively correspond to the subsampling on height and width (and
possibly depth) axis.
filter_dilation: tuple of int (symbolic or numeric). Its two or three
elements correspond respectively to the dilation on height and
width axis.
Returns
-------
image_shape: tuple of int corresponding to the input image shape. Its
four element must correspond respectively to: batch size, number of
output channels, height and width of the image. None where undefined.
"""
bsize, topshp = top_shape[0], top_shape[2:]
nkern, kshp = kernel_shape[1], kernel_shape[2:]
if filter_dilation is None:
filter_dilation = numpy.ones(len(subsample), dtype='int')
if isinstance(border_mode, tuple):
out_shp = tuple(get_conv_gradinputs_shape_1axis(
kshp[i], topshp[i], border_mode[i],
subsample[i], filter_dilation[i]) for i in range(len(subsample)))
else:
out_shp = tuple(get_conv_gradinputs_shape_1axis(
kshp[i], topshp[i], border_mode,
subsample[i], filter_dilation[i]) for i in range(len(subsample)))
return (bsize, nkern) + out_shp
def get_conv_gradinputs_shape_1axis(kernel_shape, top_shape, border_mode,
                                    subsample, dilation):
    """
    Compute the image shape of convolution gradInputs along a single axis.

    The image shape can only be computed exactly when subsample is 1;
    otherwise None is returned.

    Parameters
    ----------
    kernel_shape: int or None. Kernel size on this axis, None if undefined.
    top_shape: int or None. Top (output) size on this axis, None if
        undefined.
    border_mode: string or int. One of 'valid', 'half' or 'full', or an
        integer giving the padding on this axis.
    subsample: int. Subsampling factor on this axis.
    dilation: int. Dilation factor on this axis.

    Returns
    -------
    image_shape: int or None. Input image size on this axis, or None if it
        cannot be determined.
    """
    args = (kernel_shape, top_shape, border_mode, subsample, dilation)
    if any(arg is None for arg in args):
        return None
    # With subsampling, several image sizes map to the same top size,
    # so the image shape is ambiguous.
    if subsample != 1:
        return None

    # Size of the kernel once implicit dilation is taken into account.
    dil_kernel_shape = (kernel_shape - 1) * dilation + 1

    if border_mode == "half":
        pad = dil_kernel_shape // 2
    elif border_mode == "full":
        pad = dil_kernel_shape - 1
    elif border_mode == "valid":
        pad = 0
    else:
        pad = border_mode
    if pad < 0:
        raise ValueError("border_mode must be >= 0")

    # image_shape = (top_shape - 1) * s - 2 * pad + dil_kernel_shape + a
    # where 0 <= a < subsample; subsample == 1 was checked above, so s = 1
    # and a = 0. For symbolic shapes, skip the "- 2 * pad" term when it is
    # known to be zero to keep the graph small.
    unpadded = top_shape + dil_kernel_shape - 1
    if pad == 0:
        return unpadded
    return unpadded - 2 * pad
def check_conv_gradinputs_shape(image_shape, kernel_shape, output_shape,
                                border_mode, subsample,
                                filter_dilation=None):
    """
    Check that the given convolution shapes are consistent.

    Parameters
    ----------
    image_shape: tuple of int (symbolic or numeric) corresponding to the
        input image shape. Its four (or five) elements must correspond
        respectively to: batch size, number of input channels, height and
        width (and possibly depth) of the image. None where undefined.
    kernel_shape: tuple of int (symbolic or numeric) corresponding to the
        kernel shape. Its four (or five) elements must correspond
        respectively to: number of output channels, number of input
        channels, height and width (and possibly depth) of the kernel.
        None where undefined.
    output_shape: tuple of int (symbolic or numeric) corresponding to the
        output shape. Its four (or five) elements must correspond
        respectively to: batch size, number of output channels, height and
        width (and possibly depth) of the output. None where undefined.
    border_mode: string, int (symbolic or numeric) or tuple of int
        (symbolic or numeric). If it is a string, it must be 'valid',
        'half' or 'full'. If it is a tuple, its two (or three) elements
        respectively correspond to the padding on height and width (and
        possibly depth) axis.
    subsample: tuple of int (symbolic or numeric). Its two or three
        elements respectively correspond to the subsampling on height and
        width (and possibly depth) axis.
    filter_dilation: tuple of int (symbolic or numeric). Its two or three
        elements correspond respectively to the dilation on height and
        width axis.

    Returns
    -------
    Returns False if a convolution with the given input shape, kernel
    shape and parameters would not have produced the given output shape.
    Returns True in all other cases: if the given output shape matches the
    computed output shape, but also if the shape could not be checked
    because the shape contains symbolic values.
    """
    image_shape = tuple(image_shape)
    kernel_shape = tuple(kernel_shape)
    output_shape = tuple(output_shape)

    # All three shapes must have the same rank, and the convolution
    # parameters must cover exactly the non-batch, non-channel axes.
    ndim = len(image_shape)
    if not (ndim == len(kernel_shape) == len(output_shape)):
        return False
    if len(subsample) != ndim - 2:
        return False
    if filter_dilation is not None and len(filter_dilation) != ndim - 2:
        return False

    # Predict the output shape from the input shape and parameters.
    expected_output_shape = get_conv_output_shape(
        image_shape, kernel_shape, border_mode, subsample, filter_dilation)

    def _dim_matches(given, computed):
        # Accept a dimension pair unless both resolve to distinct constants.
        if given is None or computed is None:
            return True
        try:
            given_value = get_scalar_constant_value(given)
            computed_value = get_scalar_constant_value(computed)
        except NotScalarConstantError:
            # symbolic dimension: no definite answer, accept for now
            return True
        return int(given_value) == int(computed_value)

    return all(_dim_matches(given, computed)
               for given, computed in zip(output_shape,
                                          expected_output_shape))
def assert_conv_shape(shape):
    """Add Assert nodes that check if shape is a valid convolution shape.

    The first two dimensions should be larger than or equal to zero. The
    convolution dimensions should be larger than zero.

    Parameters
    ----------
    shape: tuple of int (symbolic or numeric) corresponding to the input,
        output or kernel shape of a convolution. For input and output, the
        first elements should be the batch size and number of channels.
        For kernels, the first and second elements should contain the
        number of input and output channels. The remaining dimensions are
        the convolution dimensions.

    Returns
    -------
    Returns a tuple similar to the given `shape`. For constant elements in
    `shape`, the function checks the value and raises a `ValueError` if
    the dimension is invalid. The elements that are not constant are
    wrapped in an `Assert` op that checks the dimension at run time.
    """
    checked = []
    for dim, size in enumerate(shape):
        # batch size / channel counts may be zero; conv dims must be > 0
        batch_or_channel = dim < 2
        try:
            const_size = get_scalar_constant_value(size)
            if batch_or_channel:
                if const_size < 0:
                    raise ValueError('The convolution would produce an invalid shape (dim[%d]: %d < 0).' % (dim, const_size))
            else:
                if const_size <= 0:
                    raise ValueError('The convolution would produce an invalid shape (dim[%d]: %d <= 0).' % (dim, const_size))
            checked.append(size)
        except NotScalarConstantError:
            # symbolic dimension: defer the check to run time
            if batch_or_channel:
                check = Assert('The convolution would produce an invalid shape (dim[%d] < 0).' % dim)
                checked.append(check(size, theano.tensor.ge(size, 0)))
            else:
                check = Assert('The convolution would produce an invalid shape (dim[%d] <= 0).' % dim)
                checked.append(check(size, theano.tensor.gt(size, 0)))
    return tuple(checked)
def assert_shape(x, expected_shape, msg='Unexpected shape.'):
    """Wrap `x` in an `Assert` to check its shape at run time.

    Parameters
    ----------
    x : Tensor
        x will be wrapped in an `Assert`.
    expected_shape : tuple or list
        The expected shape of `x`. The size of a dimension can be None,
        which means it will not be checked.
    msg : str
        The error message of the `Assert`.

    Returns
    -------
    Tensor
        `x` wrapped in an `Assert`. At execution time, this will throw an
        AssertionError if the shape of `x` does not match `expected_shape`.
        If `expected_shape` is None or contains only Nones, the function
        will return `x` directly.
    """
    if expected_shape is None:
        return x

    actual_shape = x.shape
    checks = []
    for axis in range(x.ndim):
        wanted = expected_shape[axis]
        if wanted is not None:
            # only pin down the dimensions the caller specified
            checks.append(theano.tensor.eq(actual_shape[axis], wanted))

    if not checks:
        return x
    return Assert(msg)(x, *checks)
def conv2d(input, def conv2d(input,
filters, filters,
input_shape=None, input_shape=None,
...@@ -782,7 +1151,7 @@ def conv3d_grad_wrt_weights(input, ...@@ -782,7 +1151,7 @@ def conv3d_grad_wrt_weights(input,
filter_flip=filter_flip, filter_flip=filter_flip,
filter_dilation=filter_dilation) filter_dilation=filter_dilation)
return gradWeight_op(input, output_grad, filter_shape[:-3]) return gradWeight_op(input, output_grad, filter_shape[-3:])
def bilinear_kernel_2D(ratio, normalize=True): def bilinear_kernel_2D(ratio, normalize=True):
...@@ -1209,6 +1578,13 @@ class AbstractConv(BaseAbstractConv): ...@@ -1209,6 +1578,13 @@ class AbstractConv(BaseAbstractConv):
if kern.type.ndim != 2 + self.convdim: if kern.type.ndim != 2 + self.convdim:
raise TypeError('kern must be %dD tensor' % (2 + self.convdim)) raise TypeError('kern must be %dD tensor' % (2 + self.convdim))
img = assert_shape(img, self.imshp,
'AbstractConv shape mismatch: shape of '
'image does not match given imshp.')
kern = assert_shape(kern, self.kshp,
'AbstractConv shape mismatch: shape of '
'filters does not match given kshp.')
broadcastable = [img.broadcastable[0], broadcastable = [img.broadcastable[0],
kern.broadcastable[0]] + ([False] * self.convdim) kern.broadcastable[0]] + ([False] * self.convdim)
output = img.type.clone(broadcastable=broadcastable)() output = img.type.clone(broadcastable=broadcastable)()
...@@ -1420,6 +1796,10 @@ class AbstractConv_gradWeights(BaseAbstractConv): ...@@ -1420,6 +1796,10 @@ class AbstractConv_gradWeights(BaseAbstractConv):
if topgrad.type.ndim != 2 + self.convdim: if topgrad.type.ndim != 2 + self.convdim:
raise TypeError('topgrad must be %dD tensor' % (2 + self.convdim)) raise TypeError('topgrad must be %dD tensor' % (2 + self.convdim))
img = assert_shape(img, self.imshp,
'AbstractConv_gradWeights shape mismatch: shape of '
'image does not match given imshp.')
shape = as_tensor_variable(shape) shape = as_tensor_variable(shape)
broadcastable = [topgrad.broadcastable[1], broadcastable = [topgrad.broadcastable[1],
img.broadcastable[1]] + ([False] * self.convdim) img.broadcastable[1]] + ([False] * self.convdim)
...@@ -1655,6 +2035,10 @@ class AbstractConv_gradInputs(BaseAbstractConv): ...@@ -1655,6 +2035,10 @@ class AbstractConv_gradInputs(BaseAbstractConv):
if topgrad.type.ndim != 2 + self.convdim: if topgrad.type.ndim != 2 + self.convdim:
raise TypeError('topgrad must be %dD tensor' % (2 + self.convdim)) raise TypeError('topgrad must be %dD tensor' % (2 + self.convdim))
kern = assert_shape(kern, self.kshp,
'AbstractConv_gradInputs shape mismatch: shape of '
'filters does not match given kshp.')
shape = as_tensor_variable(shape) shape = as_tensor_variable(shape)
broadcastable = [topgrad.type.broadcastable[0], broadcastable = [topgrad.type.broadcastable[0],
kern.type.broadcastable[1]] + ([False] * self.convdim) kern.type.broadcastable[1]] + ([False] * self.convdim)
...@@ -1675,6 +2059,21 @@ class AbstractConv_gradInputs(BaseAbstractConv): ...@@ -1675,6 +2059,21 @@ class AbstractConv_gradInputs(BaseAbstractConv):
'"valid", "full", "half", an integer or a tuple of' '"valid", "full", "half", an integer or a tuple of'
' integers'.format(mode)) ' integers'.format(mode))
imshp = self.imshp[:] if self.imshp is not None else [None] * (2 + self.convdim)
fallback_imshp = ([topgrad.shape[0], kern.shape[1]] +
[shape[i] for i in range(self.convdim)])
imshp = [fallback_imshp[i] if imshp[i] is None else imshp[i]
for i in range(2 + self.convdim)]
expected_topgrad_shape = get_conv_output_shape(
imshp, kern.shape,
self.border_mode, self.subsample, self.filter_dilation)
if not tuple(expected_topgrad_shape) == tuple(topgrad.shape):
raise ValueError(
'invalid input_shape for gradInputs: the given input_shape '
'would produce an output of shape {}, but the given topgrad '
'has shape {}'.format(tuple(expected_topgrad_shape),
tuple(topgrad.shape)))
dil_kernshp = tuple((kern.shape[i + 2] - 1) * self.filter_dilation[i] + 1 dil_kernshp = tuple((kern.shape[i + 2] - 1) * self.filter_dilation[i] + 1
for i in range(self.convdim)) for i in range(self.convdim))
pad = (0,) * self.convdim pad = (0,) * self.convdim
......
...@@ -123,7 +123,7 @@ class BaseCorrMM(gof.OpenMPOp): ...@@ -123,7 +123,7 @@ class BaseCorrMM(gof.OpenMPOp):
def c_code_cache_version(self): def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files # raise this whenever modifying any of the support_code_files
return (1, self.openmp, blas_header_version()) return (5, self.openmp, blas_header_version())
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of # REMEMBER TO RAISE c_code_cache_version when changing any of
...@@ -234,17 +234,17 @@ class BaseCorrMM(gof.OpenMPOp): ...@@ -234,17 +234,17 @@ class BaseCorrMM(gof.OpenMPOp):
# When subsampling, we cannot unambiguously infer the height and width # When subsampling, we cannot unambiguously infer the height and width
# of bottom and weights from top, so we require them to be given. # of bottom and weights from top, so we require them to be given.
# Similarly, when border_mode="half", we cannot infer the weight size. # Similarly, when border_mode="half", we cannot infer the weight size.
if ((direction != 0) and (dH != 1)) or ((direction == 1) and (padH == -1)): if height:
if not height:
raise ValueError("height must be given for backprop with vertical sampling or border_mode='half'")
height = '(*(npy_int64 *)(PyArray_DATA(%s)))' % height height = '(*(npy_int64 *)(PyArray_DATA(%s)))' % height
else: else:
if ((direction != 0) and (dH != 1)) or ((direction == 1) and (padH == -1)):
raise ValueError("height must be given for backprop with vertical sampling or border_mode='half'")
height = '-1' height = '-1'
if ((direction != 0) and (dW != 1)) or ((direction == 1) and (padW == -1)): if width:
if not width:
raise ValueError("width must be given for backprop with horizontal sampling or border_mode='half'")
width = '(*(npy_int64 *)(PyArray_DATA(%s)))' % width width = '(*(npy_int64 *)(PyArray_DATA(%s)))' % width
else: else:
if ((direction != 0) and (dW != 1)) or ((direction == 1) and (padW == -1)):
raise ValueError("width must be given for backprop with horizontal sampling or border_mode='half'")
width = '-1' width = '-1'
sub = sub.copy() sub = sub.copy()
sub.update(locals()) sub.update(locals())
...@@ -268,15 +268,15 @@ class BaseCorrMM(gof.OpenMPOp): ...@@ -268,15 +268,15 @@ class BaseCorrMM(gof.OpenMPOp):
// Obtain or infer kernel width and height // Obtain or infer kernel width and height
// (we need to know it early to be able to handle auto-padding) // (we need to know it early to be able to handle auto-padding)
int kH, kW; int kH, kW, dil_kH, dil_kW;
if (direction != 1) { if (direction != 1) {
// weight is an input variable, we can just read its shape // weight is an input variable, we can just read its shape
kH = PyArray_DIMS(weights)[2]; kH = PyArray_DIMS(weights)[2];
kW = PyArray_DIMS(weights)[3]; kW = PyArray_DIMS(weights)[3];
} }
else { else {
if ((dH != 1) || (padH == -1)) { if (%(height)s != -1) {
// vertical subsampling or half padding, kernel height is specified // kernel height is specified (perhaps vertical subsampling or half padding)
kH = %(height)s; kH = %(height)s;
} }
else if (padH == -2) { else if (padH == -2) {
...@@ -287,7 +287,8 @@ class BaseCorrMM(gof.OpenMPOp): ...@@ -287,7 +287,8 @@ class BaseCorrMM(gof.OpenMPOp):
// explicit padding, we can infer the kernel height // explicit padding, we can infer the kernel height
kH = (PyArray_DIMS(bottom)[2] + 2*padH - (PyArray_DIMS(top)[2] - 1) * dH - 1) / dilH +1; kH = (PyArray_DIMS(bottom)[2] + 2*padH - (PyArray_DIMS(top)[2] - 1) * dH - 1) / dilH +1;
} }
if ((dW != 1) || (padW == -1)) { if (%(width)s != -1) {
// kernel width is specified (perhaps horizontal subsampling or half padding)
kW = %(width)s; kW = %(width)s;
} }
else if (padW == -2) { else if (padW == -2) {
...@@ -299,8 +300,8 @@ class BaseCorrMM(gof.OpenMPOp): ...@@ -299,8 +300,8 @@ class BaseCorrMM(gof.OpenMPOp):
} }
// Implicit dilated kernel size // Implicit dilated kernel size
int dil_kH = (kH - 1) * dilH + 1; dil_kH = (kH - 1) * dilH + 1;
int dil_kW = (kW - 1) * dilW + 1; dil_kW = (kW - 1) * dilW + 1;
// Auto-padding if requested // Auto-padding if requested
if (padH == -1) { // vertical half padding if (padH == -1) { // vertical half padding
...@@ -334,6 +335,21 @@ class BaseCorrMM(gof.OpenMPOp): ...@@ -334,6 +335,21 @@ class BaseCorrMM(gof.OpenMPOp):
out_dim[1] = (npy_intp)PyArray_DIMS(weights)[0]; out_dim[1] = (npy_intp)PyArray_DIMS(weights)[0];
out_dim[2] = (npy_intp)((PyArray_DIMS(bottom)[2] + 2*padH - ((PyArray_DIMS(weights)[2]-1)*dilH + 1)) / dH + 1); out_dim[2] = (npy_intp)((PyArray_DIMS(bottom)[2] + 2*padH - ((PyArray_DIMS(weights)[2]-1)*dilH + 1)) / dH + 1);
out_dim[3] = (npy_intp)((PyArray_DIMS(bottom)[3] + 2*padW - ((PyArray_DIMS(weights)[3]-1)*dilW + 1)) / dW + 1); out_dim[3] = (npy_intp)((PyArray_DIMS(bottom)[3] + 2*padW - ((PyArray_DIMS(weights)[3]-1)*dilW + 1)) / dW + 1);
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
{
PyErr_Format(PyExc_ValueError,
"CorrMM: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
(long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1],
(long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3],
(long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1],
(long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3],
(long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
(long int)out_dim[3]);
%(fail)s
}
break; break;
case 1: // backprop wrt. weights case 1: // backprop wrt. weights
// output is weights: (num_filters, num_channels, height, width) // output is weights: (num_filters, num_channels, height, width)
...@@ -342,14 +358,44 @@ class BaseCorrMM(gof.OpenMPOp): ...@@ -342,14 +358,44 @@ class BaseCorrMM(gof.OpenMPOp):
out_dim[1] = (npy_intp)PyArray_DIMS(bottom)[1]; out_dim[1] = (npy_intp)PyArray_DIMS(bottom)[1];
out_dim[2] = (npy_intp)kH; // already inferred further above out_dim[2] = (npy_intp)kH; // already inferred further above
out_dim[3] = (npy_intp)kW; // how convenient out_dim[3] = (npy_intp)kW; // how convenient
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
{
PyErr_Format(PyExc_ValueError,
"CorrMM backprop wrt. weights: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
(long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1],
(long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3],
(long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
(long int)out_dim[3],
(long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1],
(long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3]);
%(fail)s
}
break; break;
case 2: // backprop wrt. inputs case 2: // backprop wrt. inputs
// output is bottom: (batchsize, num_channels, height, width) // output is bottom: (batchsize, num_channels, height, width)
// height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad // height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
out_dim[0] = (npy_intp)PyArray_DIMS(top)[0]; out_dim[0] = (npy_intp)PyArray_DIMS(top)[0];
out_dim[1] = (npy_intp)PyArray_DIMS(weights)[1]; out_dim[1] = (npy_intp)PyArray_DIMS(weights)[1];
out_dim[2] = (npy_intp)((dH != 1) ? %(height)s : (PyArray_DIMS(top)[2] - 1) * dH + (PyArray_DIMS(weights)[2]-1)*dilH + 1 - 2*padH); out_dim[2] = (npy_intp)((%(height)s != -1) ? %(height)s : (PyArray_DIMS(top)[2] - 1) * dH + (PyArray_DIMS(weights)[2]-1)*dilH + 1 - 2*padH);
out_dim[3] = (npy_intp)((dW != 1) ? %(width)s : (PyArray_DIMS(top)[3] - 1) * dW + (PyArray_DIMS(weights)[3]-1)*dilW + 1 - 2*padW); out_dim[3] = (npy_intp)((%(width)s != -1) ? %(width)s : (PyArray_DIMS(top)[3] - 1) * dW + (PyArray_DIMS(weights)[3]-1)*dilW + 1 - 2*padW);
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
{
PyErr_Format(PyExc_ValueError,
"CorrMM backprop wrt. inputs: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
(long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
(long int)out_dim[3],
(long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1],
(long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3],
(long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1],
(long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3]);
%(fail)s
}
break; break;
default: default:
PyErr_SetString(PyExc_ValueError, "BaseCorrMM: direction must be 0, 1, or 2\\n"); PyErr_SetString(PyExc_ValueError, "BaseCorrMM: direction must be 0, 1, or 2\\n");
...@@ -491,13 +537,13 @@ class CorrMM_gradWeights(BaseCorrMM): ...@@ -491,13 +537,13 @@ class CorrMM_gradWeights(BaseCorrMM):
raise TypeError('img must be 4D tensor') raise TypeError('img must be 4D tensor')
if topgrad.type.ndim != 4: if topgrad.type.ndim != 4:
raise TypeError('topgrad must be 4D tensor') raise TypeError('topgrad must be 4D tensor')
if self.subsample != (1, 1) or self.border_mode == "half":
if shape is None: if shape is None:
if self.subsample != (1, 1) or self.border_mode == "half":
raise ValueError('shape must be given if subsample != (1, 1)' raise ValueError('shape must be given if subsample != (1, 1)'
' or border_mode == "half"') ' or border_mode == "half"')
height_width = [as_tensor_variable(shape[0]).astype('int64'), as_tensor_variable(shape[1]).astype('int64')]
else:
height_width = [] height_width = []
else:
height_width = [as_tensor_variable(shape[0]).astype('int64'), as_tensor_variable(shape[1]).astype('int64')]
broadcastable = [topgrad.type.broadcastable[1], img.type.broadcastable[1], broadcastable = [topgrad.type.broadcastable[1], img.type.broadcastable[1],
False, False] False, False]
...@@ -588,9 +634,13 @@ class CorrMM_gradInputs(BaseCorrMM): ...@@ -588,9 +634,13 @@ class CorrMM_gradInputs(BaseCorrMM):
raise TypeError('kern must be 4D tensor') raise TypeError('kern must be 4D tensor')
if topgrad.type.ndim != 4: if topgrad.type.ndim != 4:
raise TypeError('topgrad must be 4D tensor') raise TypeError('topgrad must be 4D tensor')
if self.subsample != (1, 1) and shape is None: if shape is None:
if self.subsample != (1, 1):
raise ValueError('shape must be given if subsample != (1, 1)') raise ValueError('shape must be given if subsample != (1, 1)')
height_width = [as_tensor_variable(shape[0]).astype('int64'), as_tensor_variable(shape[1]).astype('int64')] if self.subsample != (1, 1) else [] height_width = []
else:
height_width = [as_tensor_variable(shape[0]).astype('int64'),
as_tensor_variable(shape[1]).astype('int64')]
broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[1], broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[1],
False, False] False, False]
......
...@@ -123,7 +123,7 @@ class BaseCorr3dMM(gof.OpenMPOp): ...@@ -123,7 +123,7 @@ class BaseCorr3dMM(gof.OpenMPOp):
def c_code_cache_version(self): def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files # raise this whenever modifying any of the support_code_files
return (1, self.openmp, blas_header_version()) return (5, self.openmp, blas_header_version())
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of # REMEMBER TO RAISE c_code_cache_version when changing any of
...@@ -241,23 +241,23 @@ class BaseCorr3dMM(gof.OpenMPOp): ...@@ -241,23 +241,23 @@ class BaseCorr3dMM(gof.OpenMPOp):
# When subsampling, we cannot unambiguously infer the height and width # When subsampling, we cannot unambiguously infer the height and width
# of bottom and weights from top, so we require them to be given. # of bottom and weights from top, so we require them to be given.
# Similarly, when border_mode="half", we cannot infer the weight size. # Similarly, when border_mode="half", we cannot infer the weight size.
if ((direction != 0) and (dH != 1)) or ((direction == 1) and (padH == -1)): if height:
if not height:
raise ValueError("height must be given for backprop with vertical sampling or border_mode='half'")
height = '(*(npy_int64 *)(PyArray_DATA(%s)))' % height height = '(*(npy_int64 *)(PyArray_DATA(%s)))' % height
else: else:
if ((direction != 0) and (dH != 1)) or ((direction == 1) and (padH == -1)):
raise ValueError("height must be given for backprop with vertical sampling or border_mode='half'")
height = '-1' height = '-1'
if ((direction != 0) and (dW != 1)) or ((direction == 1) and (padW == -1)): if width:
if not width:
raise ValueError("width must be given for backprop with horizontal sampling or border_mode='half'")
width = '(*(npy_int64 *)(PyArray_DATA(%s)))' % width width = '(*(npy_int64 *)(PyArray_DATA(%s)))' % width
else: else:
if ((direction != 0) and (dW != 1)) or ((direction == 1) and (padW == -1)):
raise ValueError("width must be given for backprop with horizontal sampling or border_mode='half'")
width = '-1' width = '-1'
if ((direction != 0) and (dD != 1)) or ((direction == 1) and (padD == -1)): if depth:
if not depth:
raise ValueError("depth must be given for backprop with depth sampling or border_mode='half'")
depth = '(*(npy_int64 *)(PyArray_DATA(%s)))' % depth depth = '(*(npy_int64 *)(PyArray_DATA(%s)))' % depth
else: else:
if ((direction != 0) and (dD != 1)) or ((direction == 1) and (padD == -1)):
raise ValueError("depth must be given for backprop with depth sampling or border_mode='half'")
depth = '-1' depth = '-1'
sub = sub.copy() sub = sub.copy()
sub.update(locals()) sub.update(locals())
...@@ -284,7 +284,7 @@ class BaseCorr3dMM(gof.OpenMPOp): ...@@ -284,7 +284,7 @@ class BaseCorr3dMM(gof.OpenMPOp):
// Obtain or infer kernel width, height and depth // Obtain or infer kernel width, height and depth
// (we need to know it early to be able to handle auto-padding) // (we need to know it early to be able to handle auto-padding)
int kH, kW, kD; int kH, kW, kD, dil_kH, dil_kW, dil_kD;
if (direction != 1) { if (direction != 1) {
// weight is an input variable, we can just read its shape // weight is an input variable, we can just read its shape
kH = PyArray_DIMS(weights)[2]; kH = PyArray_DIMS(weights)[2];
...@@ -292,8 +292,8 @@ class BaseCorr3dMM(gof.OpenMPOp): ...@@ -292,8 +292,8 @@ class BaseCorr3dMM(gof.OpenMPOp):
kD = PyArray_DIMS(weights)[4]; kD = PyArray_DIMS(weights)[4];
} }
else { else {
if ((dH != 1) || (padH == -1)) { if (%(height)s != -1) {
// vertical subsampling or half padding, kernel height is specified // kernel height is specified (perhaps vertical subsampling or half padding)
kH = %(height)s; kH = %(height)s;
} }
else if (padH == -2) { else if (padH == -2) {
...@@ -304,7 +304,7 @@ class BaseCorr3dMM(gof.OpenMPOp): ...@@ -304,7 +304,7 @@ class BaseCorr3dMM(gof.OpenMPOp):
// explicit padding, we can infer the kernel height // explicit padding, we can infer the kernel height
kH = (PyArray_DIMS(bottom)[2] + 2*padH - (PyArray_DIMS(top)[2] - 1) * dH - 1) / dilH +1; kH = (PyArray_DIMS(bottom)[2] + 2*padH - (PyArray_DIMS(top)[2] - 1) * dH - 1) / dilH +1;
} }
if ((dW != 1) || (padW == -1)) { if (%(width)s != -1) {
kW = %(width)s; kW = %(width)s;
} }
else if (padW == -2) { else if (padW == -2) {
...@@ -313,7 +313,7 @@ class BaseCorr3dMM(gof.OpenMPOp): ...@@ -313,7 +313,7 @@ class BaseCorr3dMM(gof.OpenMPOp):
else { else {
kW = (PyArray_DIMS(bottom)[3] + 2*padW - (PyArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1; kW = (PyArray_DIMS(bottom)[3] + 2*padW - (PyArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
} }
if ((dD != 1) || (padD == -1)) { if (%(depth)s != -1) {
kD = %(depth)s; kD = %(depth)s;
} }
else if (padD == -2) { else if (padD == -2) {
...@@ -325,9 +325,9 @@ class BaseCorr3dMM(gof.OpenMPOp): ...@@ -325,9 +325,9 @@ class BaseCorr3dMM(gof.OpenMPOp):
} }
// Implicit dilated kernel size // Implicit dilated kernel size
int dil_kH = (kH - 1) * dilH + 1; dil_kH = (kH - 1) * dilH + 1;
int dil_kW = (kW - 1) * dilW + 1; dil_kW = (kW - 1) * dilW + 1;
int dil_kD = (kD - 1) * dilD + 1; dil_kD = (kD - 1) * dilD + 1;
// Auto-padding if requested // Auto-padding if requested
if (padH == -1) { // vertical half padding if (padH == -1) { // vertical half padding
...@@ -372,6 +372,23 @@ class BaseCorr3dMM(gof.OpenMPOp): ...@@ -372,6 +372,23 @@ class BaseCorr3dMM(gof.OpenMPOp):
out_dim[2] = (npy_intp)((PyArray_DIMS(bottom)[2] + 2*padH - ((PyArray_DIMS(weights)[2]-1)*dilH + 1)) / dH + 1); out_dim[2] = (npy_intp)((PyArray_DIMS(bottom)[2] + 2*padH - ((PyArray_DIMS(weights)[2]-1)*dilH + 1)) / dH + 1);
out_dim[3] = (npy_intp)((PyArray_DIMS(bottom)[3] + 2*padW - ((PyArray_DIMS(weights)[3]-1)*dilW + 1)) / dW + 1); out_dim[3] = (npy_intp)((PyArray_DIMS(bottom)[3] + 2*padW - ((PyArray_DIMS(weights)[3]-1)*dilW + 1)) / dW + 1);
out_dim[4] = (npy_intp)((PyArray_DIMS(bottom)[4] + 2*padD - ((PyArray_DIMS(weights)[4]-1)*dilD + 1)) / dD + 1); out_dim[4] = (npy_intp)((PyArray_DIMS(bottom)[4] + 2*padD - ((PyArray_DIMS(weights)[4]-1)*dilD + 1)) / dD + 1);
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0 || out_dim[4] <= 0)
{
PyErr_Format(PyExc_ValueError,
"Corr3dMM: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n",
(long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1],
(long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3],
(long int)PyArray_DIMS(bottom)[4],
(long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1],
(long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3],
(long int)PyArray_DIMS(weights)[4],
(long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
(long int)out_dim[3], (long int)out_dim[4]);
%(fail)s
}
break; break;
case 1: // backprop wrt. weights case 1: // backprop wrt. weights
// output is weights: (num_filters, num_channels, height, width, depth) // output is weights: (num_filters, num_channels, height, width, depth)
...@@ -381,15 +398,49 @@ class BaseCorr3dMM(gof.OpenMPOp): ...@@ -381,15 +398,49 @@ class BaseCorr3dMM(gof.OpenMPOp):
out_dim[2] = (npy_intp)kH; // already inferred further above out_dim[2] = (npy_intp)kH; // already inferred further above
out_dim[3] = (npy_intp)kW; // how convenient out_dim[3] = (npy_intp)kW; // how convenient
out_dim[4] = (npy_intp)kD; out_dim[4] = (npy_intp)kD;
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0 || out_dim[4] <= 0)
{
PyErr_Format(PyExc_ValueError,
"Corr3dMM backprop wrt. weights: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n",
(long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1],
(long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3],
(long int)PyArray_DIMS(bottom)[4],
(long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
(long int)out_dim[3], (long int)out_dim[4],
(long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1],
(long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3],
(long int)PyArray_DIMS(top)[4]);
%(fail)s
}
break; break;
case 2: // backprop wrt. inputs case 2: // backprop wrt. inputs
// output is bottom: (batchsize, num_channels, height, width, depth) // output is bottom: (batchsize, num_channels, height, width, depth)
// height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad // height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
out_dim[0] = (npy_intp)PyArray_DIMS(top)[0]; out_dim[0] = (npy_intp)PyArray_DIMS(top)[0];
out_dim[1] = (npy_intp)PyArray_DIMS(weights)[1]; out_dim[1] = (npy_intp)PyArray_DIMS(weights)[1];
out_dim[2] = (npy_intp)((dH != 1) ? %(height)s : (PyArray_DIMS(top)[2] - 1) * dH + (PyArray_DIMS(weights)[2]-1)*dilH + 1 - 2*padH); out_dim[2] = (npy_intp)((%(height)s != -1) ? %(height)s : (PyArray_DIMS(top)[2] - 1) * dH + (PyArray_DIMS(weights)[2]-1)*dilH + 1 - 2*padH);
out_dim[3] = (npy_intp)((dW != 1) ? %(width)s : (PyArray_DIMS(top)[3] - 1) * dW + (PyArray_DIMS(weights)[3]-1)*dilW + 1 - 2*padW); out_dim[3] = (npy_intp)((%(width)s != -1) ? %(width)s : (PyArray_DIMS(top)[3] - 1) * dW + (PyArray_DIMS(weights)[3]-1)*dilW + 1 - 2*padW);
out_dim[4] = (npy_intp)((dD != 1) ? %(depth)s : (PyArray_DIMS(top)[4] - 1) * dD + (PyArray_DIMS(weights)[4]-1)*dilD + 1 - 2*padD); out_dim[4] = (npy_intp)((%(depth)s != -1) ? %(depth)s : (PyArray_DIMS(top)[4] - 1) * dD + (PyArray_DIMS(weights)[4]-1)*dilD + 1 - 2*padD);
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0 || out_dim[4] <= 0)
{
PyErr_Format(PyExc_ValueError,
"Corr3dMM backprop wrt. inputs: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n",
(long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
(long int)out_dim[3], (long int)out_dim[4],
(long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1],
(long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3],
(long int)PyArray_DIMS(weights)[4],
(long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1],
(long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3],
(long int)PyArray_DIMS(top)[4]);
%(fail)s
}
break; break;
default: default:
PyErr_SetString(PyExc_ValueError, "BaseCorr3dMM: direction must be 0, 1, or 2\\n"); PyErr_SetString(PyExc_ValueError, "BaseCorr3dMM: direction must be 0, 1, or 2\\n");
...@@ -533,15 +584,15 @@ class Corr3dMM_gradWeights(BaseCorr3dMM): ...@@ -533,15 +584,15 @@ class Corr3dMM_gradWeights(BaseCorr3dMM):
raise TypeError('img must be 5D tensor') raise TypeError('img must be 5D tensor')
if topgrad.type.ndim != 5: if topgrad.type.ndim != 5:
raise TypeError('topgrad must be 5D tensor') raise TypeError('topgrad must be 5D tensor')
if self.subsample != (1, 1, 1) or self.border_mode == "half":
if shape is None: if shape is None:
if self.subsample != (1, 1, 1) or self.border_mode == "half":
raise ValueError('shape must be given if subsample != (1, 1, 1)' raise ValueError('shape must be given if subsample != (1, 1, 1)'
' or border_mode == "half"') ' or border_mode == "half"')
height_width_depth = []
else:
height_width_depth = [as_tensor_variable(shape[0]).astype('int64'), height_width_depth = [as_tensor_variable(shape[0]).astype('int64'),
as_tensor_variable(shape[1]).astype('int64'), as_tensor_variable(shape[1]).astype('int64'),
as_tensor_variable(shape[2]).astype('int64')] as_tensor_variable(shape[2]).astype('int64')]
else:
height_width_depth = []
broadcastable = [topgrad.type.broadcastable[1], img.type.broadcastable[1], broadcastable = [topgrad.type.broadcastable[1], img.type.broadcastable[1],
False, False, False] False, False, False]
...@@ -638,14 +689,14 @@ class Corr3dMM_gradInputs(BaseCorr3dMM): ...@@ -638,14 +689,14 @@ class Corr3dMM_gradInputs(BaseCorr3dMM):
raise TypeError('kern must be 5D tensor') raise TypeError('kern must be 5D tensor')
if topgrad.type.ndim != 5: if topgrad.type.ndim != 5:
raise TypeError('topgrad must be 5D tensor') raise TypeError('topgrad must be 5D tensor')
if self.subsample != (1, 1, 1) and shape is None: if shape is None:
raise ValueError('shape must be given if subsample != (1, 1, 1)')
if self.subsample != (1, 1, 1): if self.subsample != (1, 1, 1):
raise ValueError('shape must be given if subsample != (1, 1, 1)')
height_width_depth = []
else:
height_width_depth = [as_tensor_variable(shape[0]).astype('int64'), height_width_depth = [as_tensor_variable(shape[0]).astype('int64'),
as_tensor_variable(shape[1]).astype('int64'), as_tensor_variable(shape[1]).astype('int64'),
as_tensor_variable(shape[2]).astype('int64')] as_tensor_variable(shape[2]).astype('int64')]
else:
height_width_depth = []
broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[1], broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[1],
False, False, False] False, False, False]
......
...@@ -188,9 +188,17 @@ PyArrayObject* corr3dMM(PyArrayObject* bottom, ...@@ -188,9 +188,17 @@ PyArrayObject* corr3dMM(PyArrayObject* bottom,
const int dil_kW = (kW - 1) * dilW + 1; const int dil_kW = (kW - 1) * dilW + 1;
const int dil_kD = (kD - 1) * dilD + 1; const int dil_kD = (kD - 1) * dilD + 1;
// top: (batchSize, nFilters, topHeight, topWidth, topDepth) // top: (batchSize, nFilters, topHeight, topWidth, topDepth)
const int topHeight = (bottomHeight + 2*padH - dil_kH) / dH + 1; const int topHeightNoDH = (bottomHeight + 2*padH - dil_kH);
const int topWidth = (bottomWidth + 2*padW - dil_kW) / dW + 1; const int topWidthNoDW = (bottomWidth + 2*padW - dil_kW);
const int topDepth = (bottomDepth + 2*padD - dil_kD) / dD + 1; const int topDepthNoDD = (bottomDepth + 2*padD - dil_kD);
// the above values might be negative so we need to use Python-like
// flooring integer division to be compatible with get_conv_output.
// note: this macro implements Python's // for negative x only
#define _CONV_FLOORDIV_X(x,y) ((x < 0) ? (- ((-x) / y) - (((-x) %% y) == 0 ? 0 : 1)) : (x / y))
const int topHeight = _CONV_FLOORDIV_X(topHeightNoDH, dH) + 1;
const int topWidth = _CONV_FLOORDIV_X(topWidthNoDW, dW) + 1;
const int topDepth = _CONV_FLOORDIV_X(topDepthNoDD, dD) + 1;
#undef _CONV_FLOORDIV
if (batchSize != PyArray_DIMS(top)[0] || if (batchSize != PyArray_DIMS(top)[0] ||
nFilters != PyArray_DIMS(top)[1] || nFilters != PyArray_DIMS(top)[1] ||
topHeight != PyArray_DIMS(top)[2] || topHeight != PyArray_DIMS(top)[2] ||
...@@ -245,7 +253,23 @@ PyArrayObject* corr3dMM(PyArrayObject* bottom, ...@@ -245,7 +253,23 @@ PyArrayObject* corr3dMM(PyArrayObject* bottom,
char Trans = 'T'; char Trans = 'T';
PyArrayObject *output; PyArrayObject *output;
if (direction == 0) { // forward pass if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
switch(direction) {
case 0:
output = top;
break;
case 1:
output = weight;
break;
case 2:
output = bottom;
break;
default:
return NULL;
}
PyArray_FILLWBYTE(output, 0);
}
else if (direction == 0) { // forward pass
output = top; output = top;
// valid correlation: im3d2col, then gemm // valid correlation: im3d2col, then gemm
// Iterate over batch // Iterate over batch
......
...@@ -164,8 +164,15 @@ PyArrayObject* corrMM(PyArrayObject* bottom, ...@@ -164,8 +164,15 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
const int dil_kH = (kH - 1) * dilH + 1; const int dil_kH = (kH - 1) * dilH + 1;
const int dil_kW = (kW - 1) * dilW + 1; const int dil_kW = (kW - 1) * dilW + 1;
// top: (batchSize, nFilters, topHeight, topWidth) // top: (batchSize, nFilters, topHeight, topWidth)
const int topHeight = (bottomHeight + 2*padH - dil_kH) / dH + 1; const int topHeightNoDH = (bottomHeight + 2*padH - dil_kH);
const int topWidth = (bottomWidth + 2*padW - dil_kW) / dW + 1; const int topWidthNoDW = (bottomWidth + 2*padW - dil_kW);
// the above values might be negative so we need to use Python-like
// flooring integer division to be compatible with get_conv_output.
// note: this macro implements Python's // for negative x only
#define _CONV_FLOORDIV_X(x,y) ((x < 0) ? (- ((-x) / y) - (((-x) %% y) == 0 ? 0 : 1)) : (x / y))
const int topHeight = _CONV_FLOORDIV_X(topHeightNoDH, dH) + 1;
const int topWidth = _CONV_FLOORDIV_X(topWidthNoDW, dW) + 1;
#undef _CONV_FLOORDIV
if (batchSize != PyArray_DIMS(top)[0] || if (batchSize != PyArray_DIMS(top)[0] ||
nFilters != PyArray_DIMS(top)[1] || nFilters != PyArray_DIMS(top)[1] ||
topHeight != PyArray_DIMS(top)[2] || topHeight != PyArray_DIMS(top)[2] ||
...@@ -219,7 +226,23 @@ PyArrayObject* corrMM(PyArrayObject* bottom, ...@@ -219,7 +226,23 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
char Trans = 'T'; char Trans = 'T';
PyArrayObject *output; PyArrayObject *output;
if (direction == 0) { // forward pass if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
switch(direction) {
case 0:
output = top;
break;
case 1:
output = weight;
break;
case 2:
output = bottom;
break;
default:
return NULL;
}
PyArray_FILLWBYTE(output, 0);
}
else if (direction == 0) { // forward pass
output = top; output = top;
// valid correlation: im2col, then gemm // valid correlation: im2col, then gemm
// Iterate over batch // Iterate over batch
......
...@@ -10,7 +10,12 @@ from theano import tensor ...@@ -10,7 +10,12 @@ from theano import tensor
from theano.gof.opt import check_stack_trace from theano.gof.opt import check_stack_trace
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
from theano.tensor.nnet import corr, corr3d, abstract_conv as conv from theano.tensor.nnet import corr, corr3d, abstract_conv as conv
from theano.tensor.nnet.abstract_conv import get_conv_output_shape from theano.tensor.nnet.abstract_conv import (get_conv_output_shape,
get_conv_gradweights_shape,
get_conv_gradinputs_shape,
check_conv_gradinputs_shape,
assert_conv_shape,
assert_shape)
from theano.tensor.nnet.abstract_conv import AbstractConv2d from theano.tensor.nnet.abstract_conv import AbstractConv2d
from theano.tensor.nnet.abstract_conv import AbstractConv2d_gradInputs from theano.tensor.nnet.abstract_conv import AbstractConv2d_gradInputs
from theano.tensor.nnet.abstract_conv import AbstractConv2d_gradWeights from theano.tensor.nnet.abstract_conv import AbstractConv2d_gradWeights
...@@ -133,6 +138,198 @@ class TestGetConvOutShape(unittest.TestCase): ...@@ -133,6 +138,198 @@ class TestGetConvOutShape(unittest.TestCase):
self.assertTrue(test4_params == (3, 4, 6, 4, 10)) self.assertTrue(test4_params == (3, 4, 6, 4, 10))
class TestConvGradInputsShape(unittest.TestCase):
def test_check_shape(self):
for i in range(1, 20):
for k in range(1, 10):
for b in ('valid', 'half', 'full', (0, 2)):
for s in (1, 2, 3):
for d in (1, 2, 3):
image_shape = (59, 61, i, i)
kernel_shape = (67, 61, k, k)
# compute the output that these inputs and parameters would produce
computed_shape = get_conv_output_shape(
image_shape, kernel_shape, b, (s, s), (d, d))
# this should be accepted
self.assertTrue(check_conv_gradinputs_shape(
image_shape, kernel_shape, computed_shape, b, (s, s), (d, d)))
# one or more None should also be accepted
trial_shape = (None, None, computed_shape[2], None)
self.assertTrue(check_conv_gradinputs_shape(
image_shape, kernel_shape, trial_shape, b, (s, s), (d, d)))
# the batch size and number of filters are important
trial_shape = (1, 1, computed_shape[2], computed_shape[3])
self.assertFalse(check_conv_gradinputs_shape(
image_shape, kernel_shape, trial_shape, b, (s, s), (d, d)))
# outputs that are too large or too small should be rejected
for o in (-3, -2, -1, 1, 2, 3):
trial_shape = (computed_shape[0], computed_shape[1],
computed_shape[2] + o, computed_shape[3] + o)
self.assertFalse(check_conv_gradinputs_shape(
image_shape, kernel_shape, trial_shape, b, (s, s), (d, d)))
def test_get_shape(self):
for i in range(1, 20):
for k in range(1, 10):
for b in ('valid', 'half', 'full', (0, 2)):
for d in (1, 2, 3):
image_shape = (59, 61, i, i)
kernel_shape = (67, 61, k, k)
# compute the output that these inputs and parameters would produce
output_shape = get_conv_output_shape(
image_shape, kernel_shape, b, (1, 1), (d, d))
# compute the image_shape given this output_shape
computed_image_shape = get_conv_gradinputs_shape(
kernel_shape, output_shape, b, (1, 1), (d, d))
self.assertEqual(computed_image_shape, image_shape)
# if subsample > 1, the shape should be None
computed_image_shape = get_conv_gradinputs_shape(
kernel_shape, output_shape, b, (2, 3), (d, d))
image_shape_with_None = image_shape[:2] + (None, None)
self.assertEqual(computed_image_shape, image_shape_with_None)
# compute the kernel_shape given this output_shape
computed_kernel_shape = get_conv_gradweights_shape(
image_shape, output_shape, b, (1, 1), (d, d))
# if border_mode == 'half', the shape should be None
if b == 'half':
kernel_shape_with_None = kernel_shape[:2] + (None, None)
self.assertEqual(computed_kernel_shape, kernel_shape_with_None)
else:
self.assertEqual(computed_kernel_shape, kernel_shape)
# if subsample > 1, the shape should be None
computed_kernel_shape = get_conv_gradweights_shape(
kernel_shape, output_shape, b, (2, 3), (d, d))
kernel_shape_with_None = kernel_shape[:2] + (None, None)
self.assertEqual(computed_kernel_shape, kernel_shape_with_None)
class TestAssertConvShape(unittest.TestCase):
def test_basic(self):
shape = tuple(tensor.iscalar() for i in range(4))
f = theano.function(shape, assert_conv_shape(shape))
self.assertEqual([1, 2, 3, 4], f(1, 2, 3, 4))
self.assertEqual([0, 0, 1, 1], f(0, 0, 1, 1))
assert_raises(AssertionError, f, 3, 3, 3, 0)
assert_raises(AssertionError, f, 3, 3, 0, 3)
assert_raises(AssertionError, f, 3, 3, -1, 3)
assert_raises(AssertionError, f, 3, -1, 3, 3)
assert_raises(AssertionError, f, -1, 3, 3, 3)
class TestAssertShape(unittest.TestCase):
def test_basic(self):
x = tensor.tensor4()
s1 = tensor.iscalar()
s2 = tensor.iscalar()
expected_shape = [None, s1, s2, None]
f = theano.function([x, s1, s2], assert_shape(x, expected_shape))
v = numpy.zeros((3, 5, 7, 11), dtype='float32')
self.assertEqual(0, numpy.sum(f(v, 5, 7)))
assert_raises(AssertionError, f, v, 5, 0)
assert_raises(AssertionError, f, v, 5, 9)
assert_raises(AssertionError, f, v, 0, 7)
assert_raises(AssertionError, f, v, 7, 7)
def test_shape_check_conv2d(self):
input = tensor.tensor4()
filters = tensor.tensor4()
out = conv.conv2d(input, filters,
input_shape=(3, 5, 7, 11),
filter_shape=(7, 5, 3, 3))
f = theano.function([input, filters], out)
# mismatched input_shape
assert_raises(AssertionError, f,
numpy.zeros((3, 5, 9, 11), dtype='float32'),
numpy.zeros((7, 5, 3, 3), dtype='float32'))
# mismatched filter_shape
assert_raises(AssertionError, f,
numpy.zeros((3, 5, 7, 11), dtype='float32'),
numpy.zeros((7, 5, 2, 2), dtype='float32'))
def test_shape_check_conv3d(self):
input = tensor.tensor5()
filters = tensor.tensor5()
out = conv.conv3d(input, filters,
input_shape=(3, 5, 7, 11, 13),
filter_shape=(7, 5, 3, 3, 3))
f = theano.function([input, filters], out)
# mismatched input_shape
assert_raises(AssertionError, f,
numpy.zeros((3, 5, 9, 11, 13), dtype='float32'),
numpy.zeros((7, 5, 3, 3, 3), dtype='float32'))
# mismatched filter_shape
assert_raises(AssertionError, f,
numpy.zeros((3, 5, 7, 11, 13), dtype='float32'),
numpy.zeros((7, 5, 2, 2, 2), dtype='float32'))
def test_shape_check_conv2d_grad_wrt_inputs(self):
output_grad = tensor.tensor4()
filters = tensor.tensor4()
out = conv.conv2d_grad_wrt_inputs(output_grad, filters,
input_shape=(None, None, 7, 11),
filter_shape=(7, 5, 3, 3))
f = theano.function([output_grad, filters], out)
# mismatched filter_shape
assert_raises(AssertionError, f,
numpy.zeros((3, 6, 5, 9), dtype='float32'),
numpy.zeros((7, 6, 3, 3), dtype='float32'))
def test_shape_check_conv3d_grad_wrt_inputs(self):
output_grad = tensor.tensor5()
filters = tensor.tensor5()
out = conv.conv3d_grad_wrt_inputs(output_grad, filters,
input_shape=(None, None, 7, 11, 13),
filter_shape=(7, 5, 3, 3, 3))
f = theano.function([output_grad, filters], out)
# mismatched filter_shape
assert_raises(AssertionError, f,
numpy.zeros((3, 6, 5, 9, 11), dtype='float32'),
numpy.zeros((7, 6, 3, 3, 3), dtype='float32'))
def test_shape_check_conv2d_grad_wrt_weights(self):
input = tensor.tensor4()
output_grad = tensor.tensor4()
out = conv.conv2d_grad_wrt_weights(input, output_grad,
filter_shape=(None, None, 3, 3),
input_shape=(3, 5, 7, 11))
f = theano.function([input, output_grad], out)
# mismatched filter_shape
assert_raises(AssertionError, f,
numpy.zeros((3, 6, 7, 11), dtype='float32'),
numpy.zeros((3, 7, 5, 9), dtype='float32'))
def test_shape_check_conv3d_grad_wrt_weights(self):
input = tensor.tensor5()
output_grad = tensor.tensor5()
out = conv.conv3d_grad_wrt_weights(input, output_grad,
filter_shape=(None, None, 3, 3, 3),
input_shape=(3, 5, 7, 11, 13))
f = theano.function([input, output_grad], out)
# mismatched filter_shape
assert_raises(AssertionError, f,
numpy.zeros((3, 6, 7, 11, 13), dtype='float32'),
numpy.zeros((3, 7, 5, 9, 11), dtype='float32'))
class BaseTestConv(object): class BaseTestConv(object):
def get_output_shape(self, inputs_shape, filters_shape, def get_output_shape(self, inputs_shape, filters_shape,
subsample, border_mode, filter_dilation): subsample, border_mode, filter_dilation):
...@@ -211,7 +408,7 @@ class BaseTestConv(object): ...@@ -211,7 +408,7 @@ class BaseTestConv(object):
res_ref = numpy.array(f_ref()) res_ref = numpy.array(f_ref())
res = numpy.array(f()) res = numpy.array(f())
utt.assert_allclose(res_ref, res) utt.assert_allclose(res_ref, res)
if verify_grad: if verify_grad and inputs_val.size > 0 and filters_val.size > 0 and res.size > 0:
utt.verify_grad(conv_op(border_mode=border_mode, utt.verify_grad(conv_op(border_mode=border_mode,
imshp=imshp, kshp=kshp, imshp=imshp, kshp=kshp,
subsample=subsample, subsample=subsample,
...@@ -277,7 +474,7 @@ class BaseTestConv(object): ...@@ -277,7 +474,7 @@ class BaseTestConv(object):
filter_dilation=filter_dilation) filter_dilation=filter_dilation)
return conv_op(inputs_val, output_val, filters_shape[2:]) return conv_op(inputs_val, output_val, filters_shape[2:])
if verify_grad: if verify_grad and inputs_val.size > 0 and output_val.size > 0 and res.size > 0:
utt.verify_grad(abstract_conv_gradweight, utt.verify_grad(abstract_conv_gradweight,
[inputs_val, output_val], [inputs_val, output_val],
mode=mode, eps=1) mode=mode, eps=1)
...@@ -314,10 +511,15 @@ class BaseTestConv(object): ...@@ -314,10 +511,15 @@ class BaseTestConv(object):
imshp=imshp, kshp=kshp, imshp=imshp, kshp=kshp,
filter_dilation=filter_dilation) filter_dilation=filter_dilation)
c = c(filters, output, inputs_shape[2:]) c = c(filters, output, inputs_shape[2:])
f = theano.function([], c, mode=mode)
# ref is set to None for the inconsistent-shape tests.
# The reference function also raises an exception, which would
# mask the exception generated by the target implementation.
if ref is not None:
c_ref = ref(filters, output, inputs_shape, c_ref = ref(filters, output, inputs_shape,
border_mode=border_mode, subsample=subsample, border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode, filter_dilation=filter_dilation) conv_mode=conv_mode, filter_dilation=filter_dilation)
f = theano.function([], c, mode=mode)
f_ref = theano.function([], c_ref, mode='FAST_RUN') f_ref = theano.function([], c_ref, mode='FAST_RUN')
if target_op is not None: if target_op is not None:
...@@ -326,8 +528,10 @@ class BaseTestConv(object): ...@@ -326,8 +528,10 @@ class BaseTestConv(object):
if check_trace: if check_trace:
assert_true(check_stack_trace(f, ops_to_check=target_op)) assert_true(check_stack_trace(f, ops_to_check=target_op))
res_ref = numpy.array(f_ref())
res = numpy.array(f()) res = numpy.array(f())
if ref is not None:
res_ref = numpy.array(f_ref())
utt.assert_allclose(res_ref, res) utt.assert_allclose(res_ref, res)
def abstract_conv_gradinputs(filters_val, output_val): def abstract_conv_gradinputs(filters_val, output_val):
...@@ -336,7 +540,7 @@ class BaseTestConv(object): ...@@ -336,7 +540,7 @@ class BaseTestConv(object):
filter_dilation=filter_dilation) filter_dilation=filter_dilation)
return conv_op(filters_val, output_val, inputs_shape[2:]) return conv_op(filters_val, output_val, inputs_shape[2:])
if verify_grad: if verify_grad and filters_val.size > 0 and output_val.size > 0 and res.size > 0:
utt.verify_grad(abstract_conv_gradinputs, utt.verify_grad(abstract_conv_gradinputs,
[filters_val, output_val], [filters_val, output_val],
mode=mode, eps=1) mode=mode, eps=1)
...@@ -351,6 +555,7 @@ class BaseTestConv(object): ...@@ -351,6 +555,7 @@ class BaseTestConv(object):
for (i, f) in zip(self.inputs_shapes, self.filters_shapes): for (i, f) in zip(self.inputs_shapes, self.filters_shapes):
for provide_shape in self.provide_shape: for provide_shape in self.provide_shape:
yield (self.tcase, i, f, ds, db, dflip, provide_shape) yield (self.tcase, i, f, ds, db, dflip, provide_shape)
if min(i) > 0 and min(f) > 0:
for fd in self.filters_dilations: for fd in self.filters_dilations:
for s in self.subsamples: for s in self.subsamples:
for b in self.border_modes: for b in self.border_modes:
...@@ -365,12 +570,15 @@ class BaseTestConv2d(BaseTestConv): ...@@ -365,12 +570,15 @@ class BaseTestConv2d(BaseTestConv):
def setup_class(cls): def setup_class(cls):
# This tests can run even when theano.config.blas.ldflags is empty. # This tests can run even when theano.config.blas.ldflags is empty.
cls.inputs_shapes = [(8, 1, 6, 6), (8, 1, 8, 8), (2, 1, 7, 7), cls.inputs_shapes = [(8, 1, 6, 6), (8, 1, 8, 8), (2, 1, 7, 7),
(6, 1, 10, 11), (2, 1, 6, 5), (1, 5, 9, 9)] (6, 1, 10, 11), (2, 1, 6, 5), (1, 5, 9, 9),
(0, 1, 6, 6), (1, 0, 6, 6), (1, 1, 6, 6)]
cls.filters_shapes = [(5, 1, 2, 2), (4, 1, 3, 3), (2, 1, 3, 3), cls.filters_shapes = [(5, 1, 2, 2), (4, 1, 3, 3), (2, 1, 3, 3),
(1, 1, 2, 3), (4, 1, 1, 3), (4, 5, 3, 2)] (1, 1, 2, 3), (4, 1, 1, 3), (4, 5, 3, 2),
(1, 1, 2, 2), (1, 0, 2, 2), (0, 1, 2, 2)]
cls.subsamples = [(1, 1), (2, 2), (2, 4)] cls.subsamples = [(1, 1), (2, 2), (2, 4)]
cls.default_subsamples = (1, 1) cls.default_subsamples = (1, 1)
cls.filters_dilations = [(1, 1), (1, 2), (2, 1)] cls.filters_dilations = [(1, 1), (1, 2), (2, 1)]
cls.default_filters_dilations = (1, 1)
cls.border_modes = ["valid", "half", "full", (0, 0), (1, 1), (5, 5), (5, 2)] cls.border_modes = ["valid", "half", "full", (0, 0), (1, 1), (5, 5), (5, 2)]
cls.default_border_mode = (0, 0) cls.default_border_mode = (0, 0)
cls.filter_flip = [True, False] cls.filter_flip = [True, False]
...@@ -379,6 +587,62 @@ class BaseTestConv2d(BaseTestConv): ...@@ -379,6 +587,62 @@ class BaseTestConv2d(BaseTestConv):
cls.default_provide_shape = True cls.default_provide_shape = True
cls.shared = staticmethod(theano.compile.shared) cls.shared = staticmethod(theano.compile.shared)
def test_gradinput_arbitrary_output_shapes(self):
# this computes the grad wrt inputs for an output shape
# that the forward convolution would not produce
input_shape = (2, 1, 7, 7)
filter_shape = (2, 1, 3, 3)
for output_shape in [(2, 2, 8, 8), (2, 2, 9, 9), (2, 2, 12, 12)]:
for border_mode in ["valid", "half", "full"]:
computed_shape = get_conv_output_shape(
input_shape, filter_shape, border_mode, self.default_subsamples, self.default_filters_dilations)
# is this a valid combination?
if tuple(computed_shape) == output_shape:
yield (self.tcase_gi,
input_shape,
filter_shape,
output_shape,
self.default_subsamples,
border_mode,
True,
True,
self.default_filters_dilations,
False)
else:
# expect an error
yield (self.tcase_gi,
input_shape,
filter_shape,
output_shape,
self.default_subsamples,
border_mode,
True,
True,
self.default_filters_dilations,
True)
def test_gradinput_impossible_output_shapes(self):
def run_for_output_offsets(image_shape, kernel_shape, s, border_mode, d):
# outputs that are too large or too small should be rejected
for o in (-3, -1, 1, 2):
output_shape = (1, 1, computed_shape[2] + o, computed_shape[3] + o)
# expect an error
self.tcase_gi(image_shape, kernel_shape, output_shape,
(s, s), border_mode, True, True, (d, d), True)
for (i, k) in ((1, 1), (1, 2), (2, 1), (4, 2), (4, 3), (7, 3), (9, 5)):
for border_mode in ('valid', 'half', 'full', (0, 2)):
for (s, d) in ((1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (1, 3)):
image_shape = (1, 1, i, i)
kernel_shape = (1, 1, k, k)
# compute the output that these inputs and parameters would produce
computed_shape = get_conv_output_shape(
image_shape, kernel_shape, border_mode, (s, s), (d, d))
yield (run_for_output_offsets,
image_shape, kernel_shape, s, border_mode, d)
def run_fwd(self, inputs_shape, filters_shape, def run_fwd(self, inputs_shape, filters_shape,
conv_fn=conv.conv2d, conv_op=conv.AbstractConv2d, conv_fn=conv.conv2d, conv_op=conv.AbstractConv2d,
ref=conv2d_corr, **kwargs): ref=conv2d_corr, **kwargs):
...@@ -438,6 +702,26 @@ class TestCorrConv2d(BaseTestConv2d): ...@@ -438,6 +702,26 @@ class TestCorrConv2d(BaseTestConv2d):
filter_flip=flip, target_op=CorrMM_gradInputs, filter_flip=flip, target_op=CorrMM_gradInputs,
check_trace=True, filter_dilation=fd) check_trace=True, filter_dilation=fd)
def tcase_gi(self, i, f, o, s, b, flip, provide_shape, fd=(1, 1), expect_error=False):
# This tests can run even when theano.config.blas.ldflags is empty.
if (not theano.config.cxx or
theano.config.mode == "FAST_COMPILE"):
raise SkipTest("Need blas to test conv2d")
if not expect_error:
self.run_gradinput(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s, verify_grad=True,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip, target_op=CorrMM_gradInputs,
check_trace=True, filter_dilation=fd)
else:
assert_raises(ValueError,
self.run_gradinput,
inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s, verify_grad=False,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip, target_op=CorrMM_gradInputs,
ref=None, check_trace=True, filter_dilation=fd)
class TestAbstractConvNoOptim(BaseTestConv2d): class TestAbstractConvNoOptim(BaseTestConv2d):
@classmethod @classmethod
...@@ -477,6 +761,25 @@ class TestAbstractConvNoOptim(BaseTestConv2d): ...@@ -477,6 +761,25 @@ class TestAbstractConvNoOptim(BaseTestConv2d):
check_trace=True, filter_dilation=fd, check_trace=True, filter_dilation=fd,
mode=mode) mode=mode)
def tcase_gi(self, i, f, o, s, b, flip, provide_shape, fd=(1, 1), expect_error=False):
mode = theano.Mode(optimizer=None)
if not expect_error:
self.run_gradinput(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s, verify_grad=True,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip, target_op=None,
check_trace=True, filter_dilation=fd,
mode=mode)
else:
assert_raises(ValueError,
self.run_gradinput,
inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s, verify_grad=False,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip, target_op=None,
check_trace=True, filter_dilation=fd,
ref=None, mode=mode)
class TestCpuConv2d(BaseTestConv2d): class TestCpuConv2d(BaseTestConv2d):
@classmethod @classmethod
...@@ -592,16 +895,47 @@ class TestCpuConv2d(BaseTestConv2d): ...@@ -592,16 +895,47 @@ class TestCpuConv2d(BaseTestConv2d):
check_trace=True, check_trace=True,
filter_dilation=fd) filter_dilation=fd)
def tcase_gi(self, i, f, o, s, b, flip, provide_shape, fd=(1, 1), expect_error=False):
if fd != (1, 1):
raise SkipTest("No dilation implementation for basic cpu ConvOp.")
mode = self.mode
if not flip:
return
if b not in ((0, 0), 'valid', 'full'):
return
if (not provide_shape) and (s != (1, 1)) and (b == 'full'):
return
if ((s[0] not in (1, 2)) or (s[1] not in (1, 2))) and (b == 'full'):
return
if not expect_error:
self.run_gradinput(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=False, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip,
target_op=(ConvOp, ConvTransp3D),
check_trace=True,
filter_dilation=fd)
else:
# we do not check for inconsistent shapes,
# because this older implementation does not check that
raise SkipTest('Inconsistent shapes are not tested for old cpu ConvOp.')
class BaseTestConv3d(BaseTestConv): class BaseTestConv3d(BaseTestConv):
@classmethod @classmethod
def setup_class(cls): def setup_class(cls):
# This tests can run even when theano.config.blas.ldflags is empty. # This tests can run even when theano.config.blas.ldflags is empty.
cls.inputs_shapes = [(2, 1, 5, 5, 5), (1, 2, 7, 5, 6)] cls.inputs_shapes = [(2, 1, 5, 5, 5), (1, 2, 7, 5, 6),
cls.filters_shapes = [(2, 1, 2, 2, 2), (1, 2, 2, 1, 3)] (0, 1, 5, 5, 5), (1, 0, 5, 5, 5), (1, 1, 5, 5, 5)]
cls.filters_shapes = [(2, 1, 2, 2, 2), (1, 2, 2, 1, 3),
(1, 1, 2, 2, 2), (1, 0, 2, 2, 2), (0, 1, 2, 2, 2)]
cls.subsamples = [(1, 1, 1), (2, 2, 2), (1, 2, 3)] cls.subsamples = [(1, 1, 1), (2, 2, 2), (1, 2, 3)]
cls.default_subsamples = (1, 1, 1) cls.default_subsamples = (1, 1, 1)
cls.filters_dilations = [(1, 1, 1), (1, 2, 1), (2, 1, 2)] cls.filters_dilations = [(1, 1, 1), (1, 2, 1), (2, 1, 2)]
cls.default_filters_dilations = (1, 1, 1)
cls.border_modes = ["valid", "half", "full", (0, 0, 0), (2, 2, 3)] cls.border_modes = ["valid", "half", "full", (0, 0, 0), (2, 2, 3)]
cls.default_border_mode = (0, 0, 0) cls.default_border_mode = (0, 0, 0)
cls.filter_flip = [True, False] cls.filter_flip = [True, False]
...@@ -610,6 +944,64 @@ class BaseTestConv3d(BaseTestConv): ...@@ -610,6 +944,64 @@ class BaseTestConv3d(BaseTestConv):
cls.default_provide_shape = True cls.default_provide_shape = True
cls.shared = staticmethod(theano.compile.shared) cls.shared = staticmethod(theano.compile.shared)
def test_gradinput_arbitrary_output_shapes(self):
# this computes the grad wrt inputs for an output shape
# that the forward convolution would not produce
input_shape = (2, 1, 7, 7, 7)
filter_shape = (1, 1, 3, 3, 3)
for output_shape in [(2, 1, 8, 8, 8), (2, 1, 9, 9, 9), (2, 1, 12, 12, 12)]:
for border_mode in ["valid", "half", "full"]:
# compute the output that these inputs and parameters would produce
computed_shape = get_conv_output_shape(
input_shape, filter_shape, border_mode, self.default_subsamples, self.default_filters_dilations)
# is this a valid combination?
if tuple(computed_shape) == output_shape:
yield (self.tcase_gi,
input_shape,
filter_shape,
output_shape,
self.default_subsamples,
border_mode,
True,
True,
self.default_filters_dilations,
False)
else:
# expect an error
yield (self.tcase_gi,
input_shape,
filter_shape,
output_shape,
self.default_subsamples,
border_mode,
True,
True,
self.default_filters_dilations,
True)
def test_gradinput_impossible_output_shapes(self):
def run_for_output_offsets(image_shape, kernel_shape, s, border_mode, d):
# outputs that are too large or too small should be rejected
for o in (-3, -1, 1, 2):
output_shape = (1, 1, computed_shape[2] + o,
computed_shape[3] + o, computed_shape[4] + o)
# expect an error
self.tcase_gi(image_shape, kernel_shape, output_shape,
(s, s), border_mode, True, True, (d, d), True)
for (i, k) in ((1, 1), (1, 2), (2, 1), (4, 2), (4, 3), (7, 3), (9, 5)):
for border_mode in ('valid', 'half', 'full', (0, 2, 1)):
for (s, d) in ((1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (1, 3)):
image_shape = (1, 1, i, i, i)
kernel_shape = (1, 1, k, k, k)
# compute the output that these inputs and parameters would produce
computed_shape = get_conv_output_shape(
image_shape, kernel_shape, border_mode, (s, s, s), (d, d, d))
yield (run_for_output_offsets,
image_shape, kernel_shape, s, border_mode, d)
def run_fwd(self, inputs_shape, filters_shape, def run_fwd(self, inputs_shape, filters_shape,
conv_fn=conv.conv3d, conv_op=conv.AbstractConv3d, conv_fn=conv.conv3d, conv_op=conv.AbstractConv3d,
ref=conv3d_corr, **kwargs): ref=conv3d_corr, **kwargs):
...@@ -669,6 +1061,26 @@ class TestCorrConv3d(BaseTestConv3d): ...@@ -669,6 +1061,26 @@ class TestCorrConv3d(BaseTestConv3d):
filter_flip=flip, target_op=Corr3dMM_gradInputs, filter_flip=flip, target_op=Corr3dMM_gradInputs,
check_trace=True, filter_dilation=fd) check_trace=True, filter_dilation=fd)
def tcase_gi(self, i, f, o, s, b, flip, provide_shape, fd=(1, 1, 1), expect_error=False):
# This test can run even when theano.config.blas.ldflags is empty.
if (not theano.config.cxx or
theano.config.mode == "FAST_COMPILE"):
raise SkipTest("Need blas to test conv3d")
if not expect_error:
self.run_gradinput(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s, verify_grad=True,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip, target_op=Corr3dMM_gradInputs,
check_trace=True, filter_dilation=fd)
else:
assert_raises(ValueError,
self.run_gradinput,
inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s, verify_grad=False,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip, target_op=Corr3dMM_gradInputs,
ref=None, check_trace=True, filter_dilation=fd)
class TestCpuConv3d(BaseTestConv3d): class TestCpuConv3d(BaseTestConv3d):
@classmethod @classmethod
...@@ -687,6 +1099,9 @@ class TestCpuConv3d(BaseTestConv3d): ...@@ -687,6 +1099,9 @@ class TestCpuConv3d(BaseTestConv3d):
raise SkipTest("No dilation implementation for basic cpu Conv3D.") raise SkipTest("No dilation implementation for basic cpu Conv3D.")
if not theano.config.cxx: if not theano.config.cxx:
raise SkipTest("Need cxx to test conv2d") raise SkipTest("Need cxx to test conv2d")
if min(i) == 0 or min(f) == 0:
raise SkipTest('Not tested for old cpu Conv3D.')
mode = self.mode mode = self.mode
o = self.get_output_shape(i, f, s, b, fd) o = self.get_output_shape(i, f, s, b, fd)
fwd_OK = True fwd_OK = True
...@@ -770,6 +1185,30 @@ class TestCpuConv3d(BaseTestConv3d): ...@@ -770,6 +1185,30 @@ class TestCpuConv3d(BaseTestConv3d):
check_trace=True, check_trace=True,
filter_dilation=fd) filter_dilation=fd)
def tcase_gi(self, i, f, o, s, b, flip, provide_shape, fd=(1, 1, 1), expect_error=False):
if fd != (1, 1, 1):
raise SkipTest("No dilation implementation for basic cpu Conv3D.")
mode = self.mode
if min(i) == 0 or min(f) == 0 or min(o) == 0:
raise SkipTest('Not tested for old cpu Conv3D.')
if b not in ((0, 0, 0), 'valid'):
return
if not expect_error:
self.run_gradinput(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=False, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip,
target_op=ConvTransp3D,
check_trace=True,
filter_dilation=fd)
else:
# we do not check for inconsistent shapes,
# because this older implementation does not check that
raise SkipTest('Inconsistent shapes are not tested for old cpu Conv3D.')
def test_constant_shapes(): def test_constant_shapes():
# Check that the `imshp` and `kshp` parameters of the AbstractConv Ops # Check that the `imshp` and `kshp` parameters of the AbstractConv Ops
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论