提交 c072d669 authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #5267 from gvtulder/f-abstractconv-differences

Minor inconsistency in AbstractConv_gradInput implementations
差异被折叠。
......@@ -425,9 +425,17 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
const size_t dil_kW = (kW - 1) * dilW + 1;
const size_t dil_kD = (kD - 1) * dilD + 1;
// top: (batchSize, nFilters, topHeight, topWidth, topDepth)
const size_t topHeight = (bottomHeight + 2*padH - dil_kH) / dH + 1;
const size_t topWidth = (bottomWidth + 2*padW - dil_kW) / dW + 1;
const size_t topDepth = (bottomDepth + 2*padD - dil_kD) / dD + 1;
const size_t topHeightNoDH = (bottomHeight + 2*padH - dil_kH);
const size_t topWidthNoDW = (bottomWidth + 2*padW - dil_kW);
const size_t topDepthNoDD = (bottomDepth + 2*padD - dil_kD);
// the above values might be negative so we need to use Python-like
// flooring integer division to be compatible with get_conv_output.
// note: this macro implements Python's // for negative x only
#define _CONV_FLOORDIV_X(x,y) ((x < 0) ? (- ((-x) / y) - (((-x) % y) == 0 ? 0 : 1)) : (x / y))
const size_t topHeight = _CONV_FLOORDIV_X(topHeightNoDH, dH) + 1;
const size_t topWidth = _CONV_FLOORDIV_X(topWidthNoDW, dW) + 1;
const size_t topDepth = _CONV_FLOORDIV_X(topDepthNoDD, dD) + 1;
#undef _CONV_FLOORDIV
if (batchSize != PyGpuArray_DIMS(top)[0] ||
nFilters != PyGpuArray_DIMS(top)[1] ||
topHeight != PyGpuArray_DIMS(top)[2] ||
......@@ -479,6 +487,17 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
PyGpuArrayObject *output;
if (direction == 0) { // forward pass
output = top;
if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
err = GpuArray_memset(&output->ga, 0);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuCorr3dMM could not fill the output with zeros: %d", err);
Py_DECREF(col);
return NULL;
}
Py_DECREF(col);
return output;
}
// valid correlation: im3d2col, then gemm
// Iterate over batch
for (size_t n = 0; n < batchSize; n++) {
......@@ -530,6 +549,17 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
}
else if (direction == 1) { // backprop wrt. weights
output = weight;
if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
err = GpuArray_memset(&output->ga, 0);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuCorr3dMM grad wrt. weights could not fill the output with zeros: %d", err);
Py_DECREF(col);
return NULL;
}
Py_DECREF(col);
return output;
}
// valid convolution: im3col, then gemm
// Iterate over batch
for (size_t n = 0; n < batchSize; n++) {
......@@ -581,9 +611,29 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
return NULL;
}
}
if (batchSize == 0) {
err = GpuArray_memset(&weight->ga, 0);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuCorr3dMM grad weights could not fill the output with zeros: %d", err);
Py_DECREF(col);
return NULL;
}
}
}
else if (direction == 2) { // backprop wrt. inputs
output = bottom;
if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
err = GpuArray_memset(&output->ga, 0);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuCorr3dMM grad wrt. inputs could not fill the output with zeros: %d", err);
Py_DECREF(col);
return NULL;
}
Py_DECREF(col);
return output;
}
// full convolution: gemm, then col2im3d
// Iterate over batch
for (size_t n = 0; n < batchSize; n++) {
......
......@@ -360,8 +360,15 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
const size_t dil_kH = (kH - 1) * dilH + 1;
const size_t dil_kW = (kW - 1) * dilW + 1;
// top: (batchSize, nFilters, topHeight, topWidth)
const size_t topHeight = (bottomHeight + 2*padH - dil_kH) / dH + 1;
const size_t topWidth = (bottomWidth + 2*padW - dil_kW) / dW + 1;
const size_t topHeightNoDH = (bottomHeight + 2*padH - dil_kH);
const size_t topWidthNoDW = (bottomWidth + 2*padW - dil_kW);
// the above values might be negative so we need to use Python-like
// flooring integer division to be compatible with get_conv_output.
// note: this macro implements Python's // for negative x only
#define _CONV_FLOORDIV_X(x,y) ((x < 0) ? (- ((-x) / y) - (((-x) % y) == 0 ? 0 : 1)) : (x / y))
const size_t topHeight = _CONV_FLOORDIV_X(topHeightNoDH, dH) + 1;
const size_t topWidth = _CONV_FLOORDIV_X(topWidthNoDW, dW) + 1;
#undef _CONV_FLOORDIV
if (batchSize != PyGpuArray_DIMS(top)[0] ||
nFilters != PyGpuArray_DIMS(top)[1] ||
topHeight != PyGpuArray_DIMS(top)[2] ||
......@@ -411,6 +418,17 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
PyGpuArrayObject *output;
if (direction == 0) { // forward pass
output = top;
if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
err = GpuArray_memset(&output->ga, 0);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuCorrMM could not fill the output with zeros: %d", err);
Py_DECREF(col);
return NULL;
}
Py_DECREF(col);
return output;
}
// valid correlation: im2col, then gemm
// Iterate over batch
for (size_t n = 0; n < batchSize; n++) {
......@@ -462,6 +480,17 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
}
else if (direction == 1) { // backprop wrt. weights
output = weight;
if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
err = GpuArray_memset(&output->ga, 0);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuCorrMM grad wrt. weights could not fill the output with zeros: %d", err);
Py_DECREF(col);
return NULL;
}
Py_DECREF(col);
return output;
}
// valid convolution: im2col, then gemm
// Iterate over batch
for (size_t n = 0; n < batchSize; n++) {
......@@ -516,6 +545,17 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
}
else if (direction == 2) { // backprop wrt. inputs
output = bottom;
if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
err = GpuArray_memset(&output->ga, 0);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuCorrMM grad wrt. inputs could not fill the output with zeros: %d", err);
Py_DECREF(col);
return NULL;
}
Py_DECREF(col);
return output;
}
// full convolution: gemm, then col2im
// Iterate over batch
for (size_t n = 0; n < batchSize; n++) {
......
......@@ -24,7 +24,8 @@ from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
AbstractConv3d,
AbstractConv3d_gradWeights,
AbstractConv3d_gradInputs,
get_conv_output_shape)
get_conv_output_shape,
assert_conv_shape)
from theano.tensor.signal.pool import (
Pool, MaxPoolGrad, AveragePoolGrad)
from . import pygpu
......@@ -979,11 +980,12 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
# that would be flipped by conv_mode='conv' in GpuDnnConvGradW.
kerns = kerns[:, :, ::-1, ::-1]
kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3))
shape2 = shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1
shape3 = shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1
out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(
shape_i(kerns, 1, fgraph),
shape_i(img, 1, fgraph), shape2, shape3)
out_shp = (shape_i(kerns, 1, fgraph),
shape_i(img, 1, fgraph),
shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1,
shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1)
out_shp = assert_conv_shape(out_shp)
out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(*out_shp)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode='cross', precision=precision)(out.shape)
conv = gpu_dnn_conv_gradW()(img, kerns, out, desc)
......@@ -997,11 +999,12 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
img = gpu_contiguous(img) # cudnn v2 rc3 need contiguous data
kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3))
conv_mode = 'cross' if conv_mode == 'conv' else 'conv'
shape2 = shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1
shape3 = shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1
out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(shape_i(img, 0, fgraph),
out_shp = (shape_i(img, 0, fgraph),
shape_i(kerns, 1, fgraph),
shape2, shape3)
shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1,
shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1)
out_shp = assert_conv_shape(out_shp)
out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(*out_shp)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode=conv_mode, precision=precision)(kerns.shape)
return gpu_dnn_conv_gradI()(kerns, img, out, desc)
......@@ -1021,6 +1024,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
out_shp = get_conv_output_shape(ishape, kshape,
desc_op.border_mode,
desc_op.subsample)
out_shp = assert_conv_shape(out_shp)
out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(*out_shp)
return gpu_dnn_conv(algo=algo)(img, kerns, out, desc)
......@@ -1094,12 +1098,13 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1),
# that would be flipped by conv_mode='conv' in GpuDnnConvGradW.
kerns = kerns[:, :, ::-1, ::-1]
kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3, 4))
shape2 = shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1
shape3 = shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1
shape4 = shape_i(img, 4, fgraph) - shape_i(kerns, 4, fgraph) + 1
out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(
shape_i(kerns, 1, fgraph),
shape_i(img, 1, fgraph), shape2, shape3, shape4)
out_shp = (shape_i(kerns, 1, fgraph),
shape_i(img, 1, fgraph),
shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1,
shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1,
shape_i(img, 4, fgraph) - shape_i(kerns, 4, fgraph) + 1)
out_shp = assert_conv_shape(out_shp)
out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(*out_shp)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1, 1),
conv_mode='cross', precision=precision)(out.shape)
conv = gpu_dnn_conv_gradW()(img, kerns, out, desc)
......@@ -1113,12 +1118,13 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1),
img = gpu_contiguous(img) # cudnn v2 rc3 need contiguous data
kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3, 4))
conv_mode = 'cross' if conv_mode == 'conv' else 'conv'
shape2 = shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1
shape3 = shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1
shape4 = shape_i(img, 4, fgraph) + shape_i(kerns, 4, fgraph) - 1
out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(shape_i(img, 0, fgraph),
out_shp = (shape_i(img, 0, fgraph),
shape_i(kerns, 1, fgraph),
shape2, shape3, shape4)
shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1,
shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1,
shape_i(img, 4, fgraph) + shape_i(kerns, 4, fgraph) - 1)
out_shp = assert_conv_shape(out_shp)
out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(*out_shp)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1, 1),
conv_mode=conv_mode, precision=precision)(kerns.shape)
return gpu_dnn_conv_gradI()(kerns, img, out, desc)
......@@ -1138,6 +1144,7 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1),
out_shp = get_conv_output_shape(ishape, kshape,
desc_op.border_mode,
desc_op.subsample)
out_shp = assert_conv_shape(out_shp)
out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(*out_shp)
return gpu_dnn_conv(algo=algo)(img, kerns, out, desc)
......
......@@ -39,11 +39,6 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
return 1;
}
if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
return 1;
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1;
switch (input->ga.typecode) {
case GA_DOUBLE:
alpha_p = (void *)&alpha;
......@@ -71,6 +66,20 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
return 1;
#endif
if (PyGpuArray_DIMS(input)[0] == 0 || PyGpuArray_DIMS(kerns)[0] == 0 || PyGpuArray_DIMS(kerns)[1] == 0) {
int err2 = GpuArray_memset(&(*output)->ga, 0);
if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConv could not fill the output with zeros: %d", err2);
return 1;
}
return 0;
}
if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
return 1;
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1;
if (c_set_tensorNd(*output, APPLY_SPECIFIC(output)) == -1)
return 1;
......
......@@ -38,11 +38,6 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
return 1;
}
if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
return 1;
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1;
switch (im->ga.typecode) {
case GA_DOUBLE:
alpha_p = (void *)&alpha;
......@@ -70,6 +65,20 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
return 1;
#endif
if (PyGpuArray_DIMS(im)[0] == 0 || PyGpuArray_DIMS(kerns)[0] == 0 || PyGpuArray_DIMS(kerns)[1] == 0) {
int err2 = GpuArray_memset(&(*input)->ga, 0);
if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConv grad wrt. inputs could not fill the output with zeros: %d", err2);
return 1;
}
return 0;
}
if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
return 1;
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1;
if (c_set_tensorNd(*input, APPLY_SPECIFIC(input)) == -1)
return 1;
......@@ -77,6 +86,48 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cuda_enter(c->ctx);
// Sanity-check the received output gradient against the output shape
// cuDNN derives from the input/kernel descriptors, so an impossible
// convolution is reported instead of silently computing garbage.
int expected_output_dims[5] = {0};
err = cudnnGetConvolutionNdForwardOutputDim(desc, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
                                            PyGpuArray_NDIM(im), expected_output_dims);
if (err != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_RuntimeError, "error computing convolution output dim: %s",
               cudnnGetErrorString(err));
  cuda_exit(c->ctx);
  return 1;
}
if (PyGpuArray_NDIM(im) == 4) {
  if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
      (PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
      (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
      (PyGpuArray_DIMS(output)[3] != expected_output_dims[3])) {
    // expected_output_dims is int and PyGpuArray_DIMS() yields size_t:
    // use %d / %zu so every specifier matches its argument type
    // (%ld on int is undefined behavior and breaks on LLP64 platforms).
    PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %dx%dx%dx%d"
                 " but received gradient with shape %zux%zux%zux%zu",
                 expected_output_dims[0], expected_output_dims[1],
                 expected_output_dims[2], expected_output_dims[3],
                 PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
                 PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3]);
    cuda_exit(c->ctx);
    return 1;
  }
} else if (PyGpuArray_NDIM(im) == 5) {
  if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
      (PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
      (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
      (PyGpuArray_DIMS(output)[3] != expected_output_dims[3]) ||
      (PyGpuArray_DIMS(output)[4] != expected_output_dims[4])) {
    PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %dx%dx%dx%dx%d"
                 " but received gradient with shape %zux%zux%zux%zux%zu",
                 expected_output_dims[0], expected_output_dims[1],
                 expected_output_dims[2], expected_output_dims[3],
                 expected_output_dims[4],
                 PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
                 PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3],
                 PyGpuArray_DIMS(output)[4]);
    cuda_exit(c->ctx);
    return 1;
  }
}
#ifdef CHOOSE_ALGO
#ifndef CHOOSE_ONCE
reuse_algo = 1;
......
......@@ -38,11 +38,6 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
return 1;
}
if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
return 1;
if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
return 1;
switch (input->ga.typecode) {
case GA_DOUBLE:
alpha_p = (void *)&alpha;
......@@ -70,6 +65,20 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
return 1;
#endif
if (PyGpuArray_DIMS(input)[0] == 0 || PyGpuArray_DIMS(km)[0] == 0 || PyGpuArray_DIMS(km)[1] == 0) {
int err2 = GpuArray_memset(&(*kerns)->ga, 0);
if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConv grad wrt. weights could not fill the output with zeros: %d", err2);
return 1;
}
return 0;
}
if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
return 1;
if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
return 1;
if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1;
......@@ -77,6 +86,48 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cuda_enter(c->ctx);
// Sanity-check the received output gradient against the output shape
// cuDNN derives from the input/kernel descriptors, so an impossible
// convolution is reported instead of silently computing garbage.
int expected_output_dims[5] = {0};
err = cudnnGetConvolutionNdForwardOutputDim(desc, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
                                            PyGpuArray_NDIM(input), expected_output_dims);
if (err != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_RuntimeError, "error computing convolution output dim: %s",
               cudnnGetErrorString(err));
  cuda_exit(c->ctx);
  return 1;
}
if (PyGpuArray_NDIM(input) == 4) {
  if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
      (PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
      (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
      (PyGpuArray_DIMS(output)[3] != expected_output_dims[3])) {
    // expected_output_dims is int and PyGpuArray_DIMS() yields size_t:
    // use %d / %zu so every specifier matches its argument type (the
    // previous mixed %ld/%d string was both inconsistent and UB).
    PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %dx%dx%dx%d"
                 " but received gradient with shape %zux%zux%zux%zu",
                 expected_output_dims[0], expected_output_dims[1],
                 expected_output_dims[2], expected_output_dims[3],
                 PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
                 PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3]);
    cuda_exit(c->ctx);
    return 1;
  }
} else if (PyGpuArray_NDIM(input) == 5) {
  if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
      (PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
      (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
      (PyGpuArray_DIMS(output)[3] != expected_output_dims[3]) ||
      (PyGpuArray_DIMS(output)[4] != expected_output_dims[4])) {
    PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %dx%dx%dx%dx%d"
                 " but received gradient with shape %zux%zux%zux%zux%zu",
                 expected_output_dims[0], expected_output_dims[1],
                 expected_output_dims[2], expected_output_dims[3],
                 expected_output_dims[4],
                 PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
                 PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3],
                 PyGpuArray_DIMS(output)[4]);
    cuda_exit(c->ctx);
    return 1;
  }
}
#ifdef CHOOSE_ALGO
#ifndef CHOOSE_ONCE
reuse_algo = 1;
......
from __future__ import absolute_import, print_function, division
from nose.plugins.skip import SkipTest
from nose.tools import assert_raises
import numpy
......@@ -49,6 +50,31 @@ class TestDnnConv2d(test_abstract_conv.BaseTestConv2d):
provide_shape=provide_shape, border_mode=b,
filter_flip=flip, target_op=GpuDnnConvGradI)
def tcase_gi(self, i, f, o, s, b, flip, provide_shape, fd=(1, 1), expect_error=False):
    """Run one gradient-wrt-inputs case against GpuDnnConvGradI.

    If ``expect_error`` is set, the case is expected to raise a
    RuntimeError or ValueError instead of succeeding.
    """
    if not dnn_available(test_ctx_name):
        raise SkipTest(dnn_available.msg)
    if fd != (1, 1):
        raise SkipTest("Doesn't have CUDNN implementation")
    # Keyword arguments shared by the success and the failure path.
    common = dict(inputs_shape=i, filters_shape=f,
                  output_shape=o, subsample=s,
                  mode=mode_with_gpu,
                  provide_shape=provide_shape, border_mode=b,
                  filter_flip=flip, target_op=GpuDnnConvGradI,
                  filter_dilation=fd)
    if expect_error:
        assert_raises((RuntimeError, ValueError),
                      self.run_gradinput,
                      verify_grad=False, ref=None, **common)
    else:
        self.run_gradinput(verify_grad=True, **common)
class TestDnnConv3d(test_abstract_conv.BaseTestConv3d):
@classmethod
......@@ -82,6 +108,31 @@ class TestDnnConv3d(test_abstract_conv.BaseTestConv3d):
provide_shape=provide_shape, border_mode=b,
filter_flip=flip, target_op=GpuDnnConvGradI)
def tcase_gi(self, i, f, o, s, b, flip, provide_shape, fd=(1, 1, 1), expect_error=False):
    """Run one 3d gradient-wrt-inputs case against GpuDnnConvGradI.

    If ``expect_error`` is set, the case is expected to raise a
    RuntimeError or ValueError instead of succeeding.
    """
    if not dnn_available(test_ctx_name):
        raise SkipTest(dnn_available.msg)
    if fd != (1, 1, 1):
        raise SkipTest("Doesn't have CUDNN implementation")
    # Keyword arguments shared by the success and the failure path.
    common = dict(inputs_shape=i, filters_shape=f,
                  output_shape=o, subsample=s,
                  mode=mode_with_gpu,
                  provide_shape=provide_shape, border_mode=b,
                  filter_flip=flip, target_op=GpuDnnConvGradI,
                  filter_dilation=fd)
    if expect_error:
        assert_raises((RuntimeError, ValueError),
                      self.run_gradinput,
                      verify_grad=False, ref=None, **common)
    else:
        self.run_gradinput(verify_grad=True, **common)
class TestCorrMMConv2d(test_abstract_conv.BaseTestConv2d):
@classmethod
......@@ -115,6 +166,28 @@ class TestCorrMMConv2d(test_abstract_conv.BaseTestConv2d):
target_op=GpuCorrMM_gradInputs,
filter_dilation=fd)
def tcase_gi(self, i, f, o, s, b, flip, provide_shape, fd=(1, 1), expect_error=False):
    """Run one gradient-wrt-inputs case against GpuCorrMM_gradInputs.

    If ``expect_error`` is set, the case is expected to raise a
    ValueError instead of succeeding.
    """
    # Keyword arguments shared by the success and the failure path.
    common = dict(inputs_shape=i, filters_shape=f,
                  output_shape=o, subsample=s,
                  mode=self.mode,
                  provide_shape=provide_shape, border_mode=b,
                  filter_flip=flip,
                  target_op=GpuCorrMM_gradInputs,
                  filter_dilation=fd)
    if expect_error:
        assert_raises(ValueError,
                      self.run_gradinput,
                      verify_grad=False, ref=None, **common)
    else:
        self.run_gradinput(verify_grad=True, **common)
class TestCorrMMConv3d(test_abstract_conv.BaseTestConv3d):
@classmethod
......@@ -148,6 +221,28 @@ class TestCorrMMConv3d(test_abstract_conv.BaseTestConv3d):
target_op=GpuCorr3dMM_gradInputs,
filter_dilation=fd)
def tcase_gi(self, i, f, o, s, b, flip, provide_shape, fd=(1, 1, 1), expect_error=False):
    """Run one 3d gradient-wrt-inputs case against GpuCorr3dMM_gradInputs.

    If ``expect_error`` is set, the case is expected to raise a
    ValueError instead of succeeding.
    """
    # Keyword arguments shared by the success and the failure path.
    common = dict(inputs_shape=i, filters_shape=f,
                  output_shape=o, subsample=s,
                  mode=self.mode,
                  provide_shape=provide_shape, border_mode=b,
                  filter_flip=flip,
                  target_op=GpuCorr3dMM_gradInputs,
                  filter_dilation=fd)
    if expect_error:
        assert_raises(ValueError,
                      self.run_gradinput,
                      verify_grad=False, ref=None, **common)
    else:
        self.run_gradinput(verify_grad=True, **common)
class TestDnnConvTypes(test_abstract_conv.TestConvTypes):
def setUp(self):
......
......@@ -12,6 +12,7 @@ import theano.tensor as T
import theano.tests.unittest_tools as utt
from theano.tensor.signal.pool import pool_2d, pool_3d
from theano.tensor.signal.pool import Pool, MaxPoolGrad, AveragePoolGrad
from theano.tensor.nnet.abstract_conv import get_conv_output_shape
from .. import dnn
from ..basic_ops import GpuAllocEmpty
......@@ -628,56 +629,50 @@ class TestDnnInferShapes(utt.InferShapeTester):
[(1, 1, 1), (2, 2, 2)],
'none')
def _test_conv_gradw(self, img, kerns, out, img_val, kern_vals, border_mode, conv_mode, subsample):
def _test_conv_gradw(self, img, topgrad, kerns, img_shape, kerns_shape, border_mode, conv_mode, subsample):
if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg)
topgrad_shape = get_conv_output_shape(img_shape, kerns_shape,
border_mode, subsample)
img_val = numpy.asarray(
img_val,
numpy.random.rand(*img_shape),
dtype=theano.config.floatX
)
kern_vals = numpy.asarray(
kern_vals,
topgrad_vals = numpy.asarray(
numpy.random.rand(*topgrad_shape),
dtype=theano.config.floatX
)
temp_img = img.dimshuffle(1, 0, 2, 3)
temp_kerns = kerns
if conv_mode == 'conv':
temp_kerns = temp_kerns[:, :, ::-1, ::-1]
temp_kerns = temp_kerns.dimshuffle(1, 0, 2, 3)
shape = (
kern_vals.shape[1], img_val.shape[1],
img_val.shape[2] - kern_vals.shape[2] + 1,
img_val.shape[3] - kern_vals.shape[3] + 1
)
out_vals = numpy.zeros(shape, dtype=theano.config.floatX)
kerns_vals = numpy.zeros(kerns_shape, dtype=theano.config.floatX)
kerns_shape = theano.shared(numpy.asarray(kerns_shape))
desc = dnn.GpuDnnConvDesc(
border_mode=border_mode,
subsample=subsample,
conv_mode=conv_mode,
precision=set_precision(theano.config.floatX)
)(out.shape)
)(kerns_shape)
conv_grad_w = dnn.GpuDnnConvGradW()(
temp_img,
temp_kerns,
out,
img,
topgrad,
kerns,
desc,
)
self._compile_and_check(
[temp_img, temp_kerns, out],
[img, topgrad, kerns],
[conv_grad_w],
[img_val, kern_vals, out_vals],
[img_val, topgrad_vals, kerns_vals],
dnn.GpuDnnConvGradW
)
@parameterized.expand(product(border_modes, conv_modes), utt.custom_name_func)
def test_conv_gradw(self, border_mode, conv_mode):
self._test_conv_gradw(T.tensor4('img'),
T.tensor4('topgrad'),
T.tensor4('kerns'),
T.tensor4('out'),
numpy.random.rand(2, 5, 6, 8),
numpy.random.rand(2, 1, 5, 6),
(5, 2, 6, 13),
(1, 2, 3, 7),
border_mode,
conv_mode,
(1, 1))
......
......@@ -429,9 +429,17 @@ CudaNdarray* corr3dMM(CudaNdarray *const bottom,
const int dil_kW = (kW - 1) * dilW + 1;
const int dil_kD = (kD - 1) * dilD + 1;
// top: (batchSize, nFilters, topHeight, topWidth, topDepth)
const int topHeight = int((bottomHeight + 2*padH - dil_kH) / dH) + 1;
const int topWidth = int((bottomWidth + 2*padW - dil_kW) / dW) + 1;
const int topDepth = int((bottomDepth + 2*padD - dil_kD) / dD) + 1;
const int topHeightNoDH = (bottomHeight + 2*padH - dil_kH);
const int topWidthNoDW = (bottomWidth + 2*padW - dil_kW);
const int topDepthNoDD = (bottomDepth + 2*padD - dil_kD);
// the above values might be negative so we need to use Python-like
// flooring integer division to be compatible with get_conv_output.
// note: this macro implements Python's // for negative x only
#define _CONV_FLOORDIV_X(x,y) ((x < 0) ? (- ((-x) / y) - (((-x) % y) == 0 ? 0 : 1)) : (x / y))
const int topHeight = _CONV_FLOORDIV_X(topHeightNoDH, dH) + 1;
const int topWidth = _CONV_FLOORDIV_X(topWidthNoDW, dW) + 1;
const int topDepth = _CONV_FLOORDIV_X(topDepthNoDD, dD) + 1;
#undef _CONV_FLOORDIV
if (batchSize != CudaNdarray_HOST_DIMS(top)[0] ||
nFilters != CudaNdarray_HOST_DIMS(top)[1] ||
topHeight != CudaNdarray_HOST_DIMS(top)[2] ||
......@@ -478,6 +486,19 @@ CudaNdarray* corr3dMM(CudaNdarray *const bottom,
if (direction == 0)
{ // forward pass
output = top;
if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
cudaError_t err = cudaMemset(output->devdata, 0,
CudaNdarray_SIZE(output) * sizeof(real));
if (err != cudaSuccess) {
PyErr_Format(PyExc_RuntimeError,
"GpuCorr3dMM could not fill the output with zeros: %s",
cudaGetErrorString(err));
Py_DECREF(col);
return NULL;
}
Py_DECREF(col);
return output;
}
// valid correlation: im2col, then gemm
// Iterate over batch
for (int n = 0; n < batchSize; n++)
......@@ -527,6 +548,19 @@ CudaNdarray* corr3dMM(CudaNdarray *const bottom,
{
// backprop wrt. weights
output = weight;
if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
cudaError_t err = cudaMemset(output->devdata, 0,
CudaNdarray_SIZE(output) * sizeof(real));
if (err != cudaSuccess) {
PyErr_Format(PyExc_RuntimeError,
"GpuCorr3dMM grad wrt. weights could not fill the output with zeros: %s",
cudaGetErrorString(err));
Py_DECREF(col);
return NULL;
}
Py_DECREF(col);
return output;
}
// valid convolution: im2col, then gemm
// Iterate over batch
for (int n = 0; n < batchSize; n++)
......@@ -578,6 +612,19 @@ CudaNdarray* corr3dMM(CudaNdarray *const bottom,
{
// backprop wrt. inputs
output = bottom;
if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
cudaError_t err = cudaMemset(output->devdata, 0,
CudaNdarray_SIZE(output) * sizeof(real));
if (err != cudaSuccess) {
PyErr_Format(PyExc_RuntimeError,
"GpuCorr3dMM grad wrt. inputs could not fill the output with zeros: %s",
cudaGetErrorString(err));
Py_DECREF(col);
return NULL;
}
Py_DECREF(col);
return output;
}
// full convolution: gemm, then col2im3d
// Iterate over batch
for (int n = 0; n < batchSize; n++)
......
......@@ -333,8 +333,15 @@ CudaNdarray* corrMM(CudaNdarray *const bottom,
const int dil_kH = (kH - 1) * dilH + 1;
const int dil_kW = (kW - 1) * dilW + 1;
// top: (batchSize, nFilters, topHeight, topWidth)
const int topHeight = (bottomHeight + 2*padH - dil_kH) / dH + 1;
const int topWidth = (bottomWidth + 2*padW - dil_kW) / dW + 1;
const int topHeightNoDH = (bottomHeight + 2*padH - dil_kH);
const int topWidthNoDW = (bottomWidth + 2*padW - dil_kW);
// the above values might be negative so we need to use Python-like
// flooring integer division to be compatible with get_conv_output.
// note: this macro implements Python's // for negative x only
#define _CONV_FLOORDIV_X(x,y) ((x < 0) ? (- ((-x) / y) - (((-x) % y) == 0 ? 0 : 1)) : (x / y))
const int topHeight = _CONV_FLOORDIV_X(topHeightNoDH, dH) + 1;
const int topWidth = _CONV_FLOORDIV_X(topWidthNoDW, dW) + 1;
#undef _CONV_FLOORDIV
if (batchSize != CudaNdarray_HOST_DIMS(top)[0] ||
nFilters != CudaNdarray_HOST_DIMS(top)[1] ||
topHeight != CudaNdarray_HOST_DIMS(top)[2] ||
......@@ -377,6 +384,19 @@ CudaNdarray* corrMM(CudaNdarray *const bottom,
CudaNdarray *output;
if (direction == 0) { // forward pass
output = top;
if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
cudaError_t err = cudaMemset(output->devdata, 0,
CudaNdarray_SIZE(output) * sizeof(real));
if (err != cudaSuccess) {
PyErr_Format(PyExc_RuntimeError,
"GpuCorrMM could not fill the output with zeros: %s",
cudaGetErrorString(err));
Py_DECREF(col);
return NULL;
}
Py_DECREF(col);
return output;
}
// valid correlation: im2col, then gemm
// Iterate over batch
for (int n = 0; n < batchSize; n++) {
......@@ -445,6 +465,19 @@ CudaNdarray* corrMM(CudaNdarray *const bottom,
}
else if (direction == 1) { // backprop wrt. weights
output = weight;
if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
cudaError_t err = cudaMemset(output->devdata, 0,
CudaNdarray_SIZE(output) * sizeof(real));
if (err != cudaSuccess) {
PyErr_Format(PyExc_RuntimeError,
"GpuCorrMM grad wrt. weights could not fill the output with zeros: %s",
cudaGetErrorString(err));
Py_DECREF(col);
return NULL;
}
Py_DECREF(col);
return output;
}
// valid convolution: im2col, then gemm
// Iterate over batch
for (int n = 0; n < batchSize; n++) {
......@@ -513,6 +546,19 @@ CudaNdarray* corrMM(CudaNdarray *const bottom,
}
else if (direction == 2) { // backprop wrt. inputs
output = bottom;
if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
cudaError_t err = cudaMemset(output->devdata, 0,
CudaNdarray_SIZE(output) * sizeof(real));
if (err != cudaSuccess) {
PyErr_Format(PyExc_RuntimeError,
"GpuCorrMM grad wrt. inputs could not fill the output with zeros: %s",
cudaGetErrorString(err));
Py_DECREF(col);
return NULL;
}
Py_DECREF(col);
return output;
}
// full convolution: gemm, then col2im
// Iterate over batch
for (int n = 0; n < batchSize; n++) {
......
......@@ -14,7 +14,8 @@ from theano.gof.type import CDataType
from theano.compile import optdb
from theano.compile.ops import shape_i
from theano.tensor.nnet import LogSoftmax, SoftmaxGrad
from theano.tensor.nnet.abstract_conv import get_conv_output_shape
from theano.tensor.nnet.abstract_conv import (get_conv_output_shape,
assert_conv_shape)
from theano.tensor.signal.pool import (
Pool, MaxPoolGrad, AveragePoolGrad)
from theano.sandbox.cuda.type import CudaNdarrayType
......@@ -1132,10 +1133,12 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
# that would be flipped by conv_mode='conv' in GpuDnnConvGradW.
kerns = kerns[:, :, ::-1, ::-1]
kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3))
shape2 = shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1
shape3 = shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1
out = gpu_alloc_empty(shape_i(kerns, 1, fgraph),
shape_i(img, 1, fgraph), shape2, shape3)
out_shp = (shape_i(kerns, 1, fgraph),
shape_i(img, 1, fgraph),
shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1,
shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1)
out_shp = assert_conv_shape(out_shp)
out = gpu_alloc_empty(*out_shp)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode='cross', precision=precision)(img.shape,
out.shape)
......@@ -1149,10 +1152,12 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
img = gpu_contiguous(img)
kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3))
conv_mode = 'cross' if conv_mode == 'conv' else 'conv'
shape2 = shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1
shape3 = shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1
out = gpu_alloc_empty(shape_i(img, 0, fgraph),
shape_i(kerns, 1, fgraph), shape2, shape3)
out_shp = (shape_i(img, 0, fgraph),
shape_i(kerns, 1, fgraph),
shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1,
shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1)
out_shp = assert_conv_shape(out_shp)
out = gpu_alloc_empty(*out_shp)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode=conv_mode, precision=precision)(out.shape,
kerns.shape)
......@@ -1170,6 +1175,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
out_shp = GpuDnnConv.get_out_shape(img.shape, kerns.shape,
desc_op.border_mode,
desc_op.subsample)
out_shp = assert_conv_shape(out_shp)
out = gpu_alloc_empty(*out_shp)
return GpuDnnConv(algo=algo)(img, kerns, out, desc)
......@@ -1248,11 +1254,13 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1),
# that would be flipped by conv_mode='conv' in GpuDnnConvGradW.
kerns = kerns[:, :, ::-1, ::-1, ::-1]
kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3, 4))
shape2 = shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1
shape3 = shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1
shape4 = shape_i(img, 4, fgraph) - shape_i(kerns, 4, fgraph) + 1
out = gpu_alloc_empty(shape_i(kerns, 1, fgraph),
shape_i(img, 1, fgraph), shape2, shape3, shape4)
out_shp = (shape_i(kerns, 1, fgraph),
shape_i(img, 1, fgraph),
shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1,
shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1,
shape_i(img, 4, fgraph) - shape_i(kerns, 4, fgraph) + 1)
out_shp = assert_conv_shape(out_shp)
out = gpu_alloc_empty(*out_shp)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1, 1),
conv_mode='cross', precision=precision)(img.shape,
out.shape)
......@@ -1271,6 +1279,7 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1),
out_shp = GpuDnnConv3d.get_out_shape(img.shape, kerns.shape,
desc_op.border_mode,
desc_op.subsample)
out_shp = assert_conv_shape(out_shp)
out = gpu_alloc_empty(*out_shp)
return GpuDnnConv3d(algo=algo)(img, kerns, out, desc)
......
......@@ -12,11 +12,6 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
return 1;
}
if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
return 1;
if (c_set_filterNd(kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1;
int nb_dim = CudaNdarray_NDIM(input);
#ifdef CONV_INPLACE
......@@ -30,6 +25,22 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
return 1;
#endif
if (CudaNdarray_DIMS(input)[0] == 0 || CudaNdarray_DIMS(kerns)[0] == 0 || CudaNdarray_DIMS(kerns)[1] == 0) {
cudaError_t err2 = cudaMemset((*output)->devdata, 0,
CudaNdarray_SIZE(*output) * sizeof(real));
if (err2 != cudaSuccess) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConv could not fill the output with zeros: %s",
cudaGetErrorString(err2));
return 1;
}
return 0;
}
if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
return 1;
if (c_set_filterNd(kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1;
if (c_set_tensorNd(*output, APPLY_SPECIFIC(output)) == -1)
return 1;
......
......@@ -12,11 +12,6 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
return 1;
}
if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
return 1;
if (c_set_filterNd(kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1;
int nb_dim = CudaNdarray_NDIM(output);
#ifdef CONV_INPLACE
......@@ -30,9 +25,64 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
return 1;
#endif
if (CudaNdarray_DIMS(im)[0] == 0 || CudaNdarray_DIMS(kerns)[0] == 0 || CudaNdarray_DIMS(kerns)[1] == 0) {
cudaError_t err2 = cudaMemset((*input)->devdata, 0,
CudaNdarray_SIZE(*input) * sizeof(real));
if (err2 != cudaSuccess) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConv grad wrt. inputs could not fill the output with zeros: %s",
cudaGetErrorString(err2));
return 1;
}
return 0;
}
if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
return 1;
if (c_set_filterNd(kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1;
if (c_set_tensorNd(*input, APPLY_SPECIFIC(input)) == -1)
return 1;
int expected_output_dims[5] = {0};
err = cudnnGetConvolutionNdForwardOutputDim(desc, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
nb_dim, expected_output_dims);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error computing convolution output dim: %s",
cudnnGetErrorString(err));
return 1;
}
if (nb_dim == 4) {
if ((CudaNdarray_HOST_DIMS(output)[0] != expected_output_dims[0]) ||
(CudaNdarray_HOST_DIMS(output)[1] != expected_output_dims[1]) ||
(CudaNdarray_HOST_DIMS(output)[2] != expected_output_dims[2]) ||
(CudaNdarray_HOST_DIMS(output)[3] != expected_output_dims[3])) {
PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ld"
" but received gradient with shape %ldx%ldx%ldx%ld",
(long int)expected_output_dims[0], (long int)expected_output_dims[1],
(long int)expected_output_dims[2], (long int)expected_output_dims[3],
(long int)CudaNdarray_HOST_DIMS(output)[0], (long int)CudaNdarray_HOST_DIMS(output)[1],
(long int)CudaNdarray_HOST_DIMS(output)[2], (long int)CudaNdarray_HOST_DIMS(output)[3]);
return 1;
}
} else if (nb_dim == 5) {
if ((CudaNdarray_HOST_DIMS(output)[0] != expected_output_dims[0]) ||
(CudaNdarray_HOST_DIMS(output)[1] != expected_output_dims[1]) ||
(CudaNdarray_HOST_DIMS(output)[2] != expected_output_dims[2]) ||
(CudaNdarray_HOST_DIMS(output)[3] != expected_output_dims[3]) ||
(CudaNdarray_HOST_DIMS(output)[4] != expected_output_dims[4])) {
PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ldx%ld"
" but received gradient with shape %ldx%ldx%ldx%ldx%ld",
(long int)expected_output_dims[0], (long int)expected_output_dims[1],
(long int)expected_output_dims[2], (long int)expected_output_dims[3],
(long int)expected_output_dims[4],
(long int)CudaNdarray_HOST_DIMS(output)[0], (long int)CudaNdarray_HOST_DIMS(output)[1],
(long int)CudaNdarray_HOST_DIMS(output)[2], (long int)CudaNdarray_HOST_DIMS(output)[3],
(long int)CudaNdarray_HOST_DIMS(output)[4]);
return 1;
}
}
{
size_t worksize;
void *workspace;
......
......@@ -12,11 +12,6 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
return 1;
}
if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
return 1;
if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
return 1;
int nb_dim = CudaNdarray_NDIM(output);
#ifdef CONV_INPLACE
......@@ -30,9 +25,64 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
return 1;
#endif
if (CudaNdarray_DIMS(input)[0] == 0 || CudaNdarray_DIMS(km)[0] == 0 || CudaNdarray_DIMS(km)[1] == 0) {
cudaError_t err2 = cudaMemset((*kerns)->devdata, 0,
CudaNdarray_SIZE(*kerns) * sizeof(real));
if (err2 != cudaSuccess) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConv grad wrt. weights could not fill the output with zeros: %s",
cudaGetErrorString(err2));
return 1;
}
return 0;
}
if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
return 1;
if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
return 1;
if (c_set_filterNd(*kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1;
int expected_output_dims[5] = {0};
err = cudnnGetConvolutionNdForwardOutputDim(desc, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
nb_dim, expected_output_dims);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error computing convolution output dim: %s",
cudnnGetErrorString(err));
return 1;
}
if (nb_dim == 4) {
  if ((CudaNdarray_HOST_DIMS(output)[0] != expected_output_dims[0]) ||
      (CudaNdarray_HOST_DIMS(output)[1] != expected_output_dims[1]) ||
      (CudaNdarray_HOST_DIMS(output)[2] != expected_output_dims[2]) ||
      (CudaNdarray_HOST_DIMS(output)[3] != expected_output_dims[3])) {
    /* Every argument below is cast to long int, so every conversion
       specifier must be %ld.  The original string used %d for the third
       field of both shapes, which is undefined behavior with a long int
       argument on LP64 platforms, and was inconsistent with the identical
       check in the grad-wrt-inputs (conv_gi) implementation. */
    PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ld"
                 " but received gradient with shape %ldx%ldx%ldx%ld",
                 (long int)expected_output_dims[0], (long int)expected_output_dims[1],
                 (long int)expected_output_dims[2], (long int)expected_output_dims[3],
                 (long int)CudaNdarray_HOST_DIMS(output)[0], (long int)CudaNdarray_HOST_DIMS(output)[1],
                 (long int)CudaNdarray_HOST_DIMS(output)[2], (long int)CudaNdarray_HOST_DIMS(output)[3]);
    return 1;
  }
} else if (nb_dim == 5) {
if ((CudaNdarray_HOST_DIMS(output)[0] != expected_output_dims[0]) ||
(CudaNdarray_HOST_DIMS(output)[1] != expected_output_dims[1]) ||
(CudaNdarray_HOST_DIMS(output)[2] != expected_output_dims[2]) ||
(CudaNdarray_HOST_DIMS(output)[3] != expected_output_dims[3]) ||
(CudaNdarray_HOST_DIMS(output)[4] != expected_output_dims[4])) {
PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ldx%ld"
" but received gradient with shape %ldx%ldx%ldx%ldx%ld",
(long int)expected_output_dims[0], (long int)expected_output_dims[1],
(long int)expected_output_dims[2], (long int)expected_output_dims[3],
(long int)expected_output_dims[4],
(long int)CudaNdarray_HOST_DIMS(output)[0], (long int)CudaNdarray_HOST_DIMS(output)[1],
(long int)CudaNdarray_HOST_DIMS(output)[2], (long int)CudaNdarray_HOST_DIMS(output)[3],
(long int)CudaNdarray_HOST_DIMS(output)[4]);
return 1;
}
}
{
size_t worksize;
void *workspace;
......
......@@ -13,6 +13,7 @@ from theano.sandbox.cuda.blas import (
GpuCorrMM, GpuCorrMM_gradWeights, GpuCorrMM_gradInputs,
GpuCorr3dMM, GpuCorr3dMM_gradWeights, GpuCorr3dMM_gradInputs)
from nose.plugins.skip import SkipTest
from nose.tools import assert_raises
import theano.sandbox.cuda as cuda
if not cuda.cuda_available:
......@@ -57,6 +58,31 @@ class TestDnnConv2d(test_abstract_conv.BaseTestConv2d):
filter_flip=flip, target_op=GpuDnnConvGradI,
filter_dilation=fd)
def tcase_gi(self, i, f, o, s, b, flip, provide_shape, fd=(1, 1), expect_error=False):
    """Run one gradient-wrt-inputs test case against GpuDnnConvGradI.

    When `expect_error` is set, the case must raise RuntimeError or
    ValueError instead of producing a result.
    """
    if fd != (1, 1):
        raise SkipTest("No dilation implementation for cuDNN ConvOp.")
    if not dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    # Keyword arguments shared by both the passing and the failing path.
    common = dict(inputs_shape=i, filters_shape=f,
                  output_shape=o, subsample=s,
                  mode=mode_with_gpu,
                  provide_shape=provide_shape, border_mode=b,
                  filter_flip=flip, target_op=GpuDnnConvGradI,
                  filter_dilation=fd)
    if expect_error:
        # Invalid shape combinations must be rejected, not silently computed.
        assert_raises((RuntimeError, ValueError),
                      self.run_gradinput,
                      verify_grad=False, ref=None, **common)
    else:
        self.run_gradinput(verify_grad=True, **common)
class TestDnnConv3d(test_abstract_conv.BaseTestConv3d):
@classmethod
......@@ -91,6 +117,31 @@ class TestDnnConv3d(test_abstract_conv.BaseTestConv3d):
filter_flip=flip, target_op=GpuDnnConv3dGradI,
filter_dilation=fd)
def tcase_gi(self, i, f, o, s, b, flip, provide_shape, fd=(1, 1, 1), expect_error=False):
    """Run one gradient-wrt-inputs test case for the 3d cuDNN convolution.

    Fix: the original passed the 2d op (GpuDnnConvGradI) as `target_op`
    inside this 3d test class; the other tcases of this class check for
    GpuDnnConv3dGradI, and so must this one.
    """
    if fd != (1, 1, 1):
        raise SkipTest("No dilation implementation for cuDNN ConvOp.")
    if not dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    mode = mode_with_gpu
    if not expect_error:
        self.run_gradinput(inputs_shape=i, filters_shape=f,
                           output_shape=o, subsample=s,
                           verify_grad=True, mode=mode,
                           provide_shape=provide_shape, border_mode=b,
                           filter_flip=flip, target_op=GpuDnnConv3dGradI,
                           filter_dilation=fd)
    else:
        # Invalid shape combinations must raise instead of computing garbage.
        assert_raises((RuntimeError, ValueError),
                      self.run_gradinput,
                      inputs_shape=i, filters_shape=f,
                      output_shape=o, subsample=s,
                      verify_grad=False, mode=mode,
                      provide_shape=provide_shape, border_mode=b,
                      filter_flip=flip, target_op=GpuDnnConv3dGradI,
                      ref=None,
                      filter_dilation=fd)
class TestCorrMMConv2d(test_abstract_conv.BaseTestConv2d):
@classmethod
......@@ -124,6 +175,28 @@ class TestCorrMMConv2d(test_abstract_conv.BaseTestConv2d):
target_op=GpuCorrMM_gradInputs,
filter_dilation=fd)
def tcase_gi(self, i, f, o, s, b, flip, provide_shape, fd=(1, 1), expect_error=False):
    """Run one gradient-wrt-inputs test case against GpuCorrMM_gradInputs.

    When `expect_error` is set, the case must raise ValueError.
    """
    # Keyword arguments shared by both the passing and the failing path.
    kwargs = dict(inputs_shape=i, filters_shape=f,
                  output_shape=o, subsample=s,
                  mode=self.mode,
                  provide_shape=provide_shape, border_mode=b,
                  filter_flip=flip,
                  target_op=GpuCorrMM_gradInputs,
                  filter_dilation=fd)
    if expect_error:
        # Impossible output shapes must be rejected with ValueError.
        assert_raises(ValueError, self.run_gradinput,
                      verify_grad=False, ref=None, **kwargs)
    else:
        self.run_gradinput(verify_grad=True, **kwargs)
class TestCorrMMConv3d(test_abstract_conv.BaseTestConv3d):
@classmethod
......@@ -157,6 +230,28 @@ class TestCorrMMConv3d(test_abstract_conv.BaseTestConv3d):
target_op=GpuCorr3dMM_gradInputs,
filter_dilation=fd)
def tcase_gi(self, i, f, o, s, b, flip, provide_shape, fd=(1, 1, 1), expect_error=False):
    """Run one gradient-wrt-inputs test case against GpuCorr3dMM_gradInputs.

    When `expect_error` is set, the case must raise ValueError.
    """
    # Keyword arguments shared by both the passing and the failing path.
    kwargs = dict(inputs_shape=i, filters_shape=f,
                  output_shape=o, subsample=s,
                  mode=self.mode,
                  provide_shape=provide_shape, border_mode=b,
                  filter_flip=flip,
                  target_op=GpuCorr3dMM_gradInputs,
                  filter_dilation=fd)
    if expect_error:
        # Impossible output shapes must be rejected with ValueError.
        assert_raises(ValueError, self.run_gradinput,
                      verify_grad=False, ref=None, **kwargs)
    else:
        self.run_gradinput(verify_grad=True, **kwargs)
class TestDnnConvTypes(test_abstract_conv.TestConvTypes):
def setUp(self):
......
......@@ -4,6 +4,7 @@ import os
import sys
from nose.plugins.skip import SkipTest
from nose_parameterized import parameterized
from itertools import chain, product
import six.moves.cPickle as pickle
from six import StringIO
......@@ -16,6 +17,7 @@ import theano.tensor as T
import theano.tests.unittest_tools as utt
from theano.tensor.signal.pool import pool_2d, pool_3d
from theano.tensor.signal.pool import Pool, MaxPoolGrad, AveragePoolGrad
from theano.tensor.nnet.abstract_conv import get_conv_output_shape
import theano.sandbox.cuda.dnn as dnn
from theano.sandbox.cuda.basic_ops import GpuAllocEmpty, gpu_alloc_empty
from theano.sandbox.cuda import float32_shared_constructor as shared
......@@ -979,99 +981,105 @@ class TestDnnInferShapes(utt.InferShapeTester):
dnn.GpuDnnConv3d
)
def test_conv_gradw(self):
def _test_conv_gradw(self, img, topgrad, kerns, img_shape, kerns_shape, border_mode, conv_mode, subsample):
if not dnn.dnn_available():
raise SkipTest(dnn.dnn_available.msg)
img = T.ftensor4('img')
kerns = T.ftensor4('kerns')
out = T.ftensor4('out')
topgrad_shape = get_conv_output_shape(img_shape, kerns_shape,
border_mode, subsample)
img_val = numpy.asarray(
numpy.random.rand(2, 5, 6, 8),
dtype='float32'
numpy.random.rand(*img_shape),
dtype=theano.config.floatX
)
kern_vals = numpy.asarray(
numpy.random.rand(2, 1, 5, 6),
dtype='float32'
topgrad_vals = numpy.asarray(
numpy.random.rand(*topgrad_shape),
dtype=theano.config.floatX
)
for params in product(
['valid', 'full', 'half'],
[(1, 1)], # strides besides (1, 1)
['conv', 'cross']
):
temp_img = img.dimshuffle(1, 0, 2, 3)
temp_kerns = kerns
if params[2] == 'conv':
temp_kerns = temp_kerns[:, :, ::-1, ::-1]
temp_kerns = temp_kerns.dimshuffle(1, 0, 2, 3)
shape = (
kern_vals.shape[1], img_val.shape[1],
img_val.shape[2] - kern_vals.shape[2] + 1,
img_val.shape[3] - kern_vals.shape[3] + 1
)
out_vals = numpy.zeros(shape, dtype='float32')
kerns_vals = numpy.zeros(kerns_shape, dtype=theano.config.floatX)
kerns_shape = theano.shared(numpy.asarray(kerns_shape))
topgrad_shape = theano.shared(numpy.asarray(topgrad_shape))
desc = dnn.GpuDnnConvDesc(
border_mode=params[0],
subsample=params[1],
conv_mode=params[2]
)(temp_img.shape, out.shape)
border_mode=border_mode,
subsample=subsample,
conv_mode=conv_mode
)(topgrad_shape, kerns_shape)
conv_grad_w = dnn.GpuDnnConvGradW()(
temp_img,
temp_kerns,
out,
img,
topgrad,
kerns,
desc,
)
self._compile_and_check(
[temp_img, temp_kerns, out],
[img, topgrad, kerns],
[conv_grad_w],
[img_val, kern_vals, out_vals],
[img_val, topgrad_vals, kerns_vals],
dnn.GpuDnnConvGradW
)
def test_conv3d_gradw(self):
border_modes = ['valid', 'full', 'half']
conv_modes = ['conv', 'cross']
@parameterized.expand(product(border_modes, conv_modes), utt.custom_name_func)
def test_conv_gradw(self, border_mode, conv_mode):
self._test_conv_gradw(T.tensor4('img'),
T.tensor4('topgrad'),
T.tensor4('kerns'),
(5, 2, 6, 13),
(1, 2, 3, 7),
border_mode,
conv_mode,
(1, 1))
def _test_conv3d_gradw(self, img, topgrad, kerns, img_shape, kerns_shape, border_mode, conv_mode, subsample):
if not (cuda.dnn.dnn_available() and dnn.version() >= (2000, 2000)):
raise SkipTest('"cuDNN 3D convolution requires cuDNN v2')
img = T.ftensor5('img')
kerns = T.ftensor5('kerns')
out = T.ftensor5('out')
topgrad_shape = get_conv_output_shape(img_shape, kerns_shape,
border_mode, subsample)
img_val = numpy.asarray(
numpy.random.rand(9, 2, 4, 8, 13),
dtype='float32'
numpy.random.rand(*img_shape),
dtype=theano.config.floatX
)
kern_vals = numpy.asarray(
numpy.random.rand(11, 2, 3, 1, 4),
dtype='float32'
topgrad_vals = numpy.asarray(
numpy.random.rand(*topgrad_shape),
dtype=theano.config.floatX
)
for params in product(
['valid', 'full', 'half'],
[(1, 1, 1), (2, 2, 2)],
['conv', 'cross']
):
out_vals = numpy.zeros(
dnn.GpuDnnConv3d.get_out_shape(img_val.shape, kern_vals.shape,
border_mode=params[0],
subsample=params[1]),
dtype='float32')
kerns_vals = numpy.zeros(kerns_shape, dtype=theano.config.floatX)
kerns_shape = theano.shared(numpy.asarray(kerns_shape))
topgrad_shape = theano.shared(numpy.asarray(topgrad_shape))
desc = dnn.GpuDnnConvDesc(
border_mode=params[0],
subsample=params[1],
conv_mode=params[2]
)(img.shape, out.shape)
border_mode=border_mode,
subsample=subsample,
conv_mode=conv_mode
)(topgrad_shape, kerns_shape)
conv_grad_w = dnn.GpuDnnConv3dGradW()(
img,
out,
topgrad,
kerns,
desc,
)
self._compile_and_check(
[img, out, kerns],
[img, topgrad, kerns],
[conv_grad_w],
[img_val, out_vals, kern_vals],
[img_val, topgrad_vals, kerns_vals],
dnn.GpuDnnConv3dGradW
)
@parameterized.expand(product(border_modes, conv_modes), utt.custom_name_func)
def test_conv3d_gradw(self, border_mode, conv_mode):
self._test_conv3d_gradw(T.tensor5('img'),
T.tensor5('topgrad'),
T.tensor5('kerns'),
(5, 2, 6, 13, 21),
(1, 2, 3, 7, 9),
border_mode,
conv_mode,
(1, 1, 1))
def test_conv_gradi(self):
if not dnn.dnn_available():
raise SkipTest(dnn.dnn_available.msg)
......
......@@ -123,7 +123,7 @@ class BaseCorrMM(gof.OpenMPOp):
def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files
return (1, self.openmp, blas_header_version())
return (5, self.openmp, blas_header_version())
def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of
......@@ -234,17 +234,17 @@ class BaseCorrMM(gof.OpenMPOp):
# When subsampling, we cannot unambiguously infer the height and width
# of bottom and weights from top, so we require them to be given.
# Similarly, when border_mode="half", we cannot infer the weight size.
if ((direction != 0) and (dH != 1)) or ((direction == 1) and (padH == -1)):
if not height:
raise ValueError("height must be given for backprop with vertical sampling or border_mode='half'")
if height:
height = '(*(npy_int64 *)(PyArray_DATA(%s)))' % height
else:
if ((direction != 0) and (dH != 1)) or ((direction == 1) and (padH == -1)):
raise ValueError("height must be given for backprop with vertical sampling or border_mode='half'")
height = '-1'
if ((direction != 0) and (dW != 1)) or ((direction == 1) and (padW == -1)):
if not width:
raise ValueError("width must be given for backprop with horizontal sampling or border_mode='half'")
if width:
width = '(*(npy_int64 *)(PyArray_DATA(%s)))' % width
else:
if ((direction != 0) and (dW != 1)) or ((direction == 1) and (padW == -1)):
raise ValueError("width must be given for backprop with horizontal sampling or border_mode='half'")
width = '-1'
sub = sub.copy()
sub.update(locals())
......@@ -268,15 +268,15 @@ class BaseCorrMM(gof.OpenMPOp):
// Obtain or infer kernel width and height
// (we need to know it early to be able to handle auto-padding)
int kH, kW;
int kH, kW, dil_kH, dil_kW;
if (direction != 1) {
// weight is an input variable, we can just read its shape
kH = PyArray_DIMS(weights)[2];
kW = PyArray_DIMS(weights)[3];
}
else {
if ((dH != 1) || (padH == -1)) {
// vertical subsampling or half padding, kernel height is specified
if (%(height)s != -1) {
// kernel height is specified (perhaps vertical subsampling or half padding)
kH = %(height)s;
}
else if (padH == -2) {
......@@ -287,7 +287,8 @@ class BaseCorrMM(gof.OpenMPOp):
// explicit padding, we can infer the kernel height
kH = (PyArray_DIMS(bottom)[2] + 2*padH - (PyArray_DIMS(top)[2] - 1) * dH - 1) / dilH +1;
}
if ((dW != 1) || (padW == -1)) {
if (%(width)s != -1) {
// kernel width is specified (perhaps horizontal subsampling or half padding)
kW = %(width)s;
}
else if (padW == -2) {
......@@ -299,8 +300,8 @@ class BaseCorrMM(gof.OpenMPOp):
}
// Implicit dilated kernel size
int dil_kH = (kH - 1) * dilH + 1;
int dil_kW = (kW - 1) * dilW + 1;
dil_kH = (kH - 1) * dilH + 1;
dil_kW = (kW - 1) * dilW + 1;
// Auto-padding if requested
if (padH == -1) { // vertical half padding
......@@ -334,6 +335,21 @@ class BaseCorrMM(gof.OpenMPOp):
out_dim[1] = (npy_intp)PyArray_DIMS(weights)[0];
out_dim[2] = (npy_intp)((PyArray_DIMS(bottom)[2] + 2*padH - ((PyArray_DIMS(weights)[2]-1)*dilH + 1)) / dH + 1);
out_dim[3] = (npy_intp)((PyArray_DIMS(bottom)[3] + 2*padW - ((PyArray_DIMS(weights)[3]-1)*dilW + 1)) / dW + 1);
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
{
PyErr_Format(PyExc_ValueError,
"CorrMM: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
(long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1],
(long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3],
(long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1],
(long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3],
(long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
(long int)out_dim[3]);
%(fail)s
}
break;
case 1: // backprop wrt. weights
// output is weights: (num_filters, num_channels, height, width)
......@@ -342,14 +358,44 @@ class BaseCorrMM(gof.OpenMPOp):
out_dim[1] = (npy_intp)PyArray_DIMS(bottom)[1];
out_dim[2] = (npy_intp)kH; // already inferred further above
out_dim[3] = (npy_intp)kW; // how convenient
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
{
PyErr_Format(PyExc_ValueError,
"CorrMM backprop wrt. weights: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
(long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1],
(long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3],
(long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
(long int)out_dim[3],
(long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1],
(long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3]);
%(fail)s
}
break;
case 2: // backprop wrt. inputs
// output is bottom: (batchsize, num_channels, height, width)
// height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
out_dim[0] = (npy_intp)PyArray_DIMS(top)[0];
out_dim[1] = (npy_intp)PyArray_DIMS(weights)[1];
out_dim[2] = (npy_intp)((dH != 1) ? %(height)s : (PyArray_DIMS(top)[2] - 1) * dH + (PyArray_DIMS(weights)[2]-1)*dilH + 1 - 2*padH);
out_dim[3] = (npy_intp)((dW != 1) ? %(width)s : (PyArray_DIMS(top)[3] - 1) * dW + (PyArray_DIMS(weights)[3]-1)*dilW + 1 - 2*padW);
out_dim[2] = (npy_intp)((%(height)s != -1) ? %(height)s : (PyArray_DIMS(top)[2] - 1) * dH + (PyArray_DIMS(weights)[2]-1)*dilH + 1 - 2*padH);
out_dim[3] = (npy_intp)((%(width)s != -1) ? %(width)s : (PyArray_DIMS(top)[3] - 1) * dW + (PyArray_DIMS(weights)[3]-1)*dilW + 1 - 2*padW);
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
{
PyErr_Format(PyExc_ValueError,
"CorrMM backprop wrt. inputs: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
(long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
(long int)out_dim[3],
(long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1],
(long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3],
(long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1],
(long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3]);
%(fail)s
}
break;
default:
PyErr_SetString(PyExc_ValueError, "BaseCorrMM: direction must be 0, 1, or 2\\n");
......@@ -491,13 +537,13 @@ class CorrMM_gradWeights(BaseCorrMM):
raise TypeError('img must be 4D tensor')
if topgrad.type.ndim != 4:
raise TypeError('topgrad must be 4D tensor')
if self.subsample != (1, 1) or self.border_mode == "half":
if shape is None:
if self.subsample != (1, 1) or self.border_mode == "half":
raise ValueError('shape must be given if subsample != (1, 1)'
' or border_mode == "half"')
height_width = [as_tensor_variable(shape[0]).astype('int64'), as_tensor_variable(shape[1]).astype('int64')]
else:
height_width = []
else:
height_width = [as_tensor_variable(shape[0]).astype('int64'), as_tensor_variable(shape[1]).astype('int64')]
broadcastable = [topgrad.type.broadcastable[1], img.type.broadcastable[1],
False, False]
......@@ -588,9 +634,13 @@ class CorrMM_gradInputs(BaseCorrMM):
raise TypeError('kern must be 4D tensor')
if topgrad.type.ndim != 4:
raise TypeError('topgrad must be 4D tensor')
if self.subsample != (1, 1) and shape is None:
if shape is None:
if self.subsample != (1, 1):
raise ValueError('shape must be given if subsample != (1, 1)')
height_width = [as_tensor_variable(shape[0]).astype('int64'), as_tensor_variable(shape[1]).astype('int64')] if self.subsample != (1, 1) else []
height_width = []
else:
height_width = [as_tensor_variable(shape[0]).astype('int64'),
as_tensor_variable(shape[1]).astype('int64')]
broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[1],
False, False]
......
......@@ -188,9 +188,17 @@ PyArrayObject* corr3dMM(PyArrayObject* bottom,
const int dil_kW = (kW - 1) * dilW + 1;
const int dil_kD = (kD - 1) * dilD + 1;
// top: (batchSize, nFilters, topHeight, topWidth, topDepth)
const int topHeight = (bottomHeight + 2*padH - dil_kH) / dH + 1;
const int topWidth = (bottomWidth + 2*padW - dil_kW) / dW + 1;
const int topDepth = (bottomDepth + 2*padD - dil_kD) / dD + 1;
const int topHeightNoDH = (bottomHeight + 2*padH - dil_kH);
const int topWidthNoDW = (bottomWidth + 2*padW - dil_kW);
const int topDepthNoDD = (bottomDepth + 2*padD - dil_kD);
// the above values might be negative so we need to use Python-like
// flooring integer division to be compatible with get_conv_output.
// note: this macro implements Python's // for negative x only
// (%% is used because this code goes through Python %-substitution)
#define _CONV_FLOORDIV_X(x,y) ((x < 0) ? (- ((-x) / y) - (((-x) %% y) == 0 ? 0 : 1)) : (x / y))
const int topHeight = _CONV_FLOORDIV_X(topHeightNoDH, dH) + 1;
const int topWidth = _CONV_FLOORDIV_X(topWidthNoDW, dW) + 1;
const int topDepth = _CONV_FLOORDIV_X(topDepthNoDD, dD) + 1;
// fix: the original `#undef _CONV_FLOORDIV` named a macro that was never
// defined, so _CONV_FLOORDIV_X leaked past its intended scope
#undef _CONV_FLOORDIV_X
if (batchSize != PyArray_DIMS(top)[0] ||
nFilters != PyArray_DIMS(top)[1] ||
topHeight != PyArray_DIMS(top)[2] ||
......@@ -245,7 +253,23 @@ PyArrayObject* corr3dMM(PyArrayObject* bottom,
char Trans = 'T';
PyArrayObject *output;
if (direction == 0) { // forward pass
if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
switch(direction) {
case 0:
output = top;
break;
case 1:
output = weight;
break;
case 2:
output = bottom;
break;
default:
return NULL;
}
PyArray_FILLWBYTE(output, 0);
}
else if (direction == 0) { // forward pass
output = top;
// valid correlation: im3d2col, then gemm
// Iterate over batch
......
......@@ -164,8 +164,15 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
const int dil_kH = (kH - 1) * dilH + 1;
const int dil_kW = (kW - 1) * dilW + 1;
// top: (batchSize, nFilters, topHeight, topWidth)
const int topHeight = (bottomHeight + 2*padH - dil_kH) / dH + 1;
const int topWidth = (bottomWidth + 2*padW - dil_kW) / dW + 1;
const int topHeightNoDH = (bottomHeight + 2*padH - dil_kH);
const int topWidthNoDW = (bottomWidth + 2*padW - dil_kW);
// the above values might be negative so we need to use Python-like
// flooring integer division to be compatible with get_conv_output.
// note: this macro implements Python's // for negative x only
// (%% is used because this code goes through Python %-substitution)
#define _CONV_FLOORDIV_X(x,y) ((x < 0) ? (- ((-x) / y) - (((-x) %% y) == 0 ? 0 : 1)) : (x / y))
const int topHeight = _CONV_FLOORDIV_X(topHeightNoDH, dH) + 1;
const int topWidth = _CONV_FLOORDIV_X(topWidthNoDW, dW) + 1;
// fix: the original `#undef _CONV_FLOORDIV` named a macro that was never
// defined, so _CONV_FLOORDIV_X leaked past its intended scope
#undef _CONV_FLOORDIV_X
if (batchSize != PyArray_DIMS(top)[0] ||
nFilters != PyArray_DIMS(top)[1] ||
topHeight != PyArray_DIMS(top)[2] ||
......@@ -219,7 +226,23 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
char Trans = 'T';
PyArrayObject *output;
if (direction == 0) { // forward pass
if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
switch(direction) {
case 0:
output = top;
break;
case 1:
output = weight;
break;
case 2:
output = bottom;
break;
default:
return NULL;
}
PyArray_FILLWBYTE(output, 0);
}
else if (direction == 0) { // forward pass
output = top;
// valid correlation: im2col, then gemm
// Iterate over batch
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论