CorrMM should use the height/width if they are given.

30dd8bdc · Gijs van Tulder · 676cf29d · 30dd8bdc · 30dd8bdc · 30dd8bdc
--- a/theano/gpuarray/blas.py
+++ b/theano/gpuarray/blas.py
@@ -528,7 +528,7 @@ class BaseGpuCorrMM(CGpuKernelBase):

    def c_code_cache_version(self):
        # Raise this whenever modifying the code below.
-        return (3,)
+        return (4,)

    def c_code_helper(self, bottom, weights, top, direction, sub, height=None, width=None):
        """
@@ -650,8 +650,8 @@ class BaseGpuCorrMM(CGpuKernelBase):
        kW = PyGpuArray_DIMS(weights)[3];
    }
    else {
-        if ((dH != 1) || (padH == -1)) {
-            // vertical subsampling or half padding, kernel height is specified
+        if (%(height)s != -1) {
+            // kernel height is specified (perhaps vertical subsampling or half padding)
            kH = %(height)s;
        }
        else if (padH == -2) {
@@ -662,7 +662,7 @@ class BaseGpuCorrMM(CGpuKernelBase):
            // explicit padding, we can infer the kernel height
            kH = (PyGpuArray_DIMS(bottom)[2] + 2*padH - (PyGpuArray_DIMS(top)[2] - 1) * dH - 1) / dilH + 1 ;
        }
-        if ((dW != 1) || (padW == -1)) {
+        if (%(width)s != -1) {
            kW = %(width)s;
        }
        else if (padW == -2) {
@@ -671,15 +671,6 @@ class BaseGpuCorrMM(CGpuKernelBase):
        else {
            kW = (PyGpuArray_DIMS(bottom)[3] + 2*padW - (PyGpuArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
        }
-        if ((%(height)s != -1 && %(height)s != kH) ||
-            (%(width)s != -1 && %(width)s != kW))
-        {
-            PyErr_Format(PyExc_ValueError,
-                         "BaseGpuCorrMM: computed kernel shape %%lldx%%lld "
-                         "does not match given shape %%lldx%%lld",
-                         (long long)kH, (long long)kW, (long long)%(height)s, (long long)%(width)s);
-            %(fail)s
-        }
    }

    // Implicit dilated kernel size
@@ -738,20 +729,10 @@ class BaseGpuCorrMM(CGpuKernelBase):
        // height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
        out_dim[0] = PyGpuArray_DIMS(top)[0];
        out_dim[1] = PyGpuArray_DIMS(weights)[1];
-        out_dim[2] = (dH != 1) ? %(height)s : (PyGpuArray_DIMS(top)[2] - 1) * dH + (PyGpuArray_DIMS(weights)[2]-1)*dilH + 1 - 2*padH;
-        out_dim[3] = (dW != 1) ? %(width)s : (PyGpuArray_DIMS(top)[3] - 1) * dW + (PyGpuArray_DIMS(weights)[3]-1)*dilW + 1 - 2*padW;
+        out_dim[2] = (%(height)s != -1) ? %(height)s : (PyGpuArray_DIMS(top)[2] - 1) * dH + (PyGpuArray_DIMS(weights)[2]-1)*dilH + 1 - 2*padH;
+        out_dim[3] = (%(width)s != -1) ? %(width)s : (PyGpuArray_DIMS(top)[3] - 1) * dW + (PyGpuArray_DIMS(weights)[3]-1)*dilW + 1 - 2*padW;
        out_typecode = top->ga.typecode;
        out_context = top->context;
-        if ((%(height)s != -1 && %(height)s != out_dim[2]) ||
-            (%(width)s != -1 && %(width)s != out_dim[3]))
-        {
-            PyErr_Format(PyExc_ValueError,
-                         "BaseGpuCorrMM: computed output shape %%lldx%%lld "
-                         "does not match given shape %%lldx%%lld",
-                         (long long)out_dim[2], (long long)out_dim[3],
-                         (long long)%(height)s, (long long)%(width)s);
-            %(fail)s
-        }
        break;
    default:
        PyErr_SetString(PyExc_ValueError, "BaseGpuCorrMM: direction must be 0, 1, or 2\\n");
@@ -1105,7 +1086,7 @@ class BaseGpuCorr3dMM(CGpuKernelBase):

    def c_code_cache_version(self):
        # raise this whenever modifying the code below.
-        return (3,)
+        return (4,)

    def c_code_helper(self, bottom, weights, top, direction, sub,
                      height=None, width=None, depth=None):
@@ -1245,8 +1226,8 @@ class BaseGpuCorr3dMM(CGpuKernelBase):
        kD = PyGpuArray_DIMS(weights)[4];
    }
    else {
-        if ((dH != 1) || (padH == -1)) {
-            // vertical subsampling or half padding, kernel height is specified
+        if (%(height)s != -1) {
+            // kernel height is specified (perhaps vertical subsampling or half padding)
            kH = %(height)s;
        }
        else if (padH == -2) {
@@ -1257,7 +1238,7 @@ class BaseGpuCorr3dMM(CGpuKernelBase):
            // explicit padding, we can infer the kernel height
            kH = (PyGpuArray_DIMS(bottom)[2] + 2*padH - (PyGpuArray_DIMS(top)[2] - 1) * dH - 1) / dilH + 1 ;
        }
-        if ((dW != 1) || (padW == -1)) {
+        if (%(width)s != -1) {
            kW = %(width)s;
        }
        else if (padW == -2) {
@@ -1266,7 +1247,7 @@ class BaseGpuCorr3dMM(CGpuKernelBase):
        else {
            kW = (PyGpuArray_DIMS(bottom)[3] + 2*padW - (PyGpuArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
        }
-        if ((dD != 1) || (padD == -1)) {
+        if (%(depth)s != -1) {
            kD = %(depth)s;
        }
        else if (padD == -2) {
@@ -1275,17 +1256,6 @@ class BaseGpuCorr3dMM(CGpuKernelBase):
        else {
            kD = (PyGpuArray_DIMS(bottom)[4] + 2*padD - (PyGpuArray_DIMS(top)[4] - 1) * dD - 1) / dilD + 1;
        }
-        if ((%(height)s != -1 && %(height)s != kH) ||
-            (%(width)s != -1 && %(width)s != kW) ||
-            (%(depth)s != -1 && %(depth)s != kD))
-        {
-            PyErr_Format(PyExc_ValueError,
-                         "BaseGpuCorr3dMM: computed kernel shape %%lldx%%lldx%%lld "
-                         "does not match given shape %%lldx%%lldx%%lld",
-                         (long long)kH, (long long)kW, (long long)kD,
-                         (long long)%(height)s, (long long)%(width)s, (long long)%(depth)s);
-            %(fail)s
-        }
    }

    // Implicit dilated kernel size
@@ -1357,22 +1327,11 @@ class BaseGpuCorr3dMM(CGpuKernelBase):
        // height, width and depth: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
        out_dim[0] = PyGpuArray_DIMS(top)[0];
        out_dim[1] = PyGpuArray_DIMS(weights)[1];
-        out_dim[2] = (dH != 1) ? %(height)s : (PyGpuArray_DIMS(top)[2] - 1) * dH + (PyGpuArray_DIMS(weights)[2]-1)*dilH + 1 - 2*padH;
-        out_dim[3] = (dW != 1) ? %(width)s : (PyGpuArray_DIMS(top)[3] - 1) * dW + (PyGpuArray_DIMS(weights)[3]-1)*dilW + 1 - 2*padW;
-        out_dim[4] = (dD != 1) ? %(depth)s : (PyGpuArray_DIMS(top)[4] - 1) * dD + (PyGpuArray_DIMS(weights)[4]-1)*dilD + 1 - 2*padD;
+        out_dim[2] = (%(height)s != -1) ? %(height)s : (PyGpuArray_DIMS(top)[2] - 1) * dH + (PyGpuArray_DIMS(weights)[2]-1)*dilH + 1 - 2*padH;
+        out_dim[3] = (%(width)s != -1) ? %(width)s : (PyGpuArray_DIMS(top)[3] - 1) * dW + (PyGpuArray_DIMS(weights)[3]-1)*dilW + 1 - 2*padW;
+        out_dim[4] = (%(depth)s != -1) ? %(depth)s : (PyGpuArray_DIMS(top)[4] - 1) * dD + (PyGpuArray_DIMS(weights)[4]-1)*dilD + 1 - 2*padD;
        out_typecode = top->ga.typecode;
        out_context = top->context;
-        if ((%(height)s != -1 && %(height)s != out_dim[2]) ||
-            (%(width)s != -1 && %(width)s != out_dim[3]) ||
-            (%(depth)s != -1 && %(depth)s != out_dim[4]))
-        {
-            PyErr_Format(PyExc_ValueError,
-                         "BaseGpuCorr3dMM: computed output shape %%lldx%%lldx%%lld "
-                         "does not match given shape %%lldx%%lldx%%lld",
-                         (long long)out_dim[2], (long long)out_dim[3], (long long)out_dim[4],
-                         (long long)%(height)s, (long long)%(width)s, (long long)%(depth)s);
-            %(fail)s
-        }
        break;
    default:
        PyErr_SetString(PyExc_ValueError, "BaseGpuCorr3dMM: direction must be 0, 1, or 2\\n");

--- a/theano/gpuarray/corr3d_gemm.c
+++ b/theano/gpuarray/corr3d_gemm.c
@@ -425,9 +425,17 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
    const size_t dil_kW = (kW - 1) * dilW + 1;
    const size_t dil_kD = (kD - 1) * dilD + 1;
    // top: (batchSize, nFilters, topHeight, topWidth, topDepth)
-    const size_t topHeight = (bottomHeight + 2*padH - dil_kH) / dH + 1;
-    const size_t topWidth  = (bottomWidth + 2*padW - dil_kW) / dW + 1;
-    const size_t topDepth  = (bottomDepth + 2*padD - dil_kD) / dD + 1;
+    const long long topHeightNoDH = (long long)bottomHeight + 2*(long long)padH - (long long)dil_kH;
+    const long long topWidthNoDW  = (long long)bottomWidth + 2*(long long)padW - (long long)dil_kW;
+    const long long topDepthNoDD  = (long long)bottomDepth + 2*(long long)padD - (long long)dil_kD;
+    // The values above are computed as signed integers because they can be negative
+    // (an unsigned type would wrap and make the x < 0 test below always false).
+    // Python-like flooring division keeps us compatible with get_conv_output; y must be > 0.
+#define _CONV_FLOORDIV_X(x,y) (((x) < 0) ? (- ((-(x)) / (long long)(y)) - (((-(x)) % (long long)(y)) == 0 ? 0 : 1)) : ((x) / (long long)(y)))
+    const size_t topHeight = _CONV_FLOORDIV_X(topHeightNoDH, dH) + 1;
+    const size_t topWidth  = _CONV_FLOORDIV_X(topWidthNoDW, dW) + 1;
+    const size_t topDepth  = _CONV_FLOORDIV_X(topDepthNoDD, dD) + 1;
+#undef _CONV_FLOORDIV_X
    if (batchSize != PyGpuArray_DIMS(top)[0] ||
            nFilters != PyGpuArray_DIMS(top)[1] ||
            topHeight != PyGpuArray_DIMS(top)[2] ||

--- a/theano/gpuarray/corr_gemm.c
+++ b/theano/gpuarray/corr_gemm.c
@@ -360,8 +360,15 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
    const size_t dil_kH = (kH - 1) * dilH + 1;
    const size_t dil_kW = (kW - 1) * dilW + 1;
    // top: (batchSize, nFilters, topHeight, topWidth)
-    const size_t topHeight = (bottomHeight + 2*padH - dil_kH) / dH + 1;
-    const size_t topWidth  = (bottomWidth + 2*padW - dil_kW) / dW + 1;
+    const long long topHeightNoDH = (long long)bottomHeight + 2*(long long)padH - (long long)dil_kH;
+    const long long topWidthNoDW  = (long long)bottomWidth + 2*(long long)padW - (long long)dil_kW;
+    // The values above are computed as signed integers because they can be negative
+    // (an unsigned type would wrap and make the x < 0 test below always false).
+    // Python-like flooring division keeps us compatible with get_conv_output; y must be > 0.
+#define _CONV_FLOORDIV_X(x,y) (((x) < 0) ? (- ((-(x)) / (long long)(y)) - (((-(x)) % (long long)(y)) == 0 ? 0 : 1)) : ((x) / (long long)(y)))
+    const size_t topHeight = _CONV_FLOORDIV_X(topHeightNoDH, dH) + 1;
+    const size_t topWidth  = _CONV_FLOORDIV_X(topWidthNoDW, dW) + 1;
+#undef _CONV_FLOORDIV_X
    if (batchSize != PyGpuArray_DIMS(top)[0] ||
            nFilters != PyGpuArray_DIMS(top)[1] ||
            topHeight != PyGpuArray_DIMS(top)[2] ||

--- a/theano/sandbox/cuda/blas.py
+++ b/theano/sandbox/cuda/blas.py
@@ -922,7 +922,7 @@ class BaseGpuCorrMM(GpuOp):

    def c_code_cache_version(self):
        # raise this whenever modifying any of the support_code_files
-        return (0, 27)
+        return (0, 28)

    def c_support_code_apply(self, node, nodename):
        # REMEMBER TO RAISE c_code_cache_version when changing any of
@@ -1042,8 +1042,8 @@ class BaseGpuCorrMM(GpuOp):
        kW = CudaNdarray_HOST_DIMS(weights)[3];
    }
    else {
-        if ((dH != 1) || (padH == -1)) {
-            // vertical subsampling or half padding, kernel height is specified
+        if (%(height)s != -1) {
+            // kernel height is specified (perhaps vertical subsampling or half padding)
            kH = %(height)s;
        }
        else if (padH == -2) {
@@ -1054,7 +1054,7 @@ class BaseGpuCorrMM(GpuOp):
            // explicit padding, we can infer the kernel height
            kH = (CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH - (CudaNdarray_HOST_DIMS(top)[2] - 1)*dH - 1) / dilH + 1 ;
        }
-        if ((dW != 1) || (padW == -1)) {
+        if (%(width)s != -1) {
            kW = %(width)s;
        }
        else if (padW == -2) {
@@ -1063,15 +1063,6 @@ class BaseGpuCorrMM(GpuOp):
        else {
            kW = (CudaNdarray_HOST_DIMS(bottom)[3] + 2*padW - (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
        }
-        if ((%(height)s != -1 && %(height)s != kH) ||
-            (%(width)s != -1 && %(width)s != kW))
-        {
-            PyErr_Format(PyExc_ValueError,
-                         "BaseGpuCorrMM: computed kernel shape %%dx%%d "
-                         "does not match given shape %%dx%%d",
-                         kH, kW, %(height)s, %(width)s);
-            %(fail)s
-        }
    }

    // Implicit dilated kernel size
@@ -1124,18 +1115,8 @@ class BaseGpuCorrMM(GpuOp):
        // height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
        out_dim[0] = CudaNdarray_HOST_DIMS(top)[0];
        out_dim[1] = CudaNdarray_HOST_DIMS(weights)[1];
-        out_dim[2] = (dH != 1) ? %(height)s : (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH + (CudaNdarray_HOST_DIMS(weights)[2]-1)*dilH + 1 - 2*padH;
-        out_dim[3] = (dW != 1) ? %(width)s : (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW + (CudaNdarray_HOST_DIMS(weights)[3]-1)*dilW + 1 - 2*padW;
-        if ((%(height)s != -1 && %(height)s != out_dim[2]) ||
-            (%(width)s != -1 && %(width)s != out_dim[3]))
-        {
-            PyErr_Format(PyExc_ValueError,
-                         "BaseGpuCorrMM: computed output shape %%dx%%d "
-                         "does not match given shape %%dx%%d",
-                         out_dim[2], out_dim[3],
-                         %(height)s, %(width)s);
-            %(fail)s
-        }
+        out_dim[2] = (%(height)s != -1) ? %(height)s : (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH + (CudaNdarray_HOST_DIMS(weights)[2]-1)*dilH + 1 - 2*padH;
+        out_dim[3] = (%(width)s != -1) ? %(width)s : (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW + (CudaNdarray_HOST_DIMS(weights)[3]-1)*dilW + 1 - 2*padW;
        break;
    default:
        PyErr_SetString(PyExc_ValueError, "BaseGpuCorrMM: direction must be 0, 1, or 2\\n");
@@ -1500,7 +1481,7 @@ class BaseGpuCorr3dMM(GpuOp):

    def c_code_cache_version(self):
        # raise this whenever modifying any of the support_code_files
-        return (0, 26)
+        return (0, 27)

    def c_support_code_apply(self, node, nodename):
        # REMEMBER TO RAISE c_code_cache_version when changing any of
@@ -1642,9 +1623,9 @@ class BaseGpuCorr3dMM(GpuOp):
    }
    else
    {
-      if ((dH != 1) || (padH == -1))
+      if (%(height)s != -1)
      {
-         // vertical subsampling or half padding, kernel height is specified
+         // kernel height is specified (perhaps vertical subsampling or half padding)
         kH = %(height)s;
      }
      else if (padH == -2)
@@ -1657,7 +1638,7 @@ class BaseGpuCorr3dMM(GpuOp):
        // explicit padding, we can infer the kernel height
        kH = (CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH - (CudaNdarray_HOST_DIMS(top)[2] - 1)*dH - 1) / dilH + 1 ;
      }
-      if ((dW != 1) || (padW == -1))
+      if (%(width)s != -1)
      {
        kW = %(width)s;
      }
@@ -1669,7 +1650,7 @@ class BaseGpuCorr3dMM(GpuOp):
      {
        kW = (CudaNdarray_HOST_DIMS(bottom)[3] + 2*padW - (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
      }
-      if ((dD != 1) || (padD == -1))
+      if (%(depth)s != -1)
      {
        kD = %(depth)s;
      }
@@ -1681,16 +1662,6 @@ class BaseGpuCorr3dMM(GpuOp):
      {
        kD = (CudaNdarray_HOST_DIMS(bottom)[4] + 2*padD - (CudaNdarray_HOST_DIMS(top)[4] - 1) * dD - 1) / dilD+ 1;
      }
-      if ((%(height)s != -1 && %(height)s != kH) ||
-          (%(width)s != -1 && %(width)s != kW) ||
-          (%(depth)s != -1 && %(depth)s != kD))
-      {
-        PyErr_Format(PyExc_ValueError,
-                     "BaseGpuCorr3dMM: computed kernel shape %%dx%%dx%%d "
-                     "does not match given shape %%dx%%dx%%d",
-                     kH, kW, kD, %(height)s, %(width)s, %(depth)s);
-        %(fail)s
-      }
    }

    // Implicit dilated kernel size
@@ -1763,20 +1734,9 @@ class BaseGpuCorr3dMM(GpuOp):
        // height, width and depth: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
        out_dim[0] = CudaNdarray_HOST_DIMS(top)[0];
        out_dim[1] = CudaNdarray_HOST_DIMS(weights)[1];
-        out_dim[2] = (dH != 1) ? %(height)s : (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH + (CudaNdarray_HOST_DIMS(weights)[2]-1)*dilH + 1 - 2*padH;
-        out_dim[3] = (dW != 1) ? %(width)s : (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW + (CudaNdarray_HOST_DIMS(weights)[3]-1)*dilW + 1 - 2*padW;
-        out_dim[4] = (dD != 1) ? %(depth)s : (CudaNdarray_HOST_DIMS(top)[4] - 1) * dD + (CudaNdarray_HOST_DIMS(weights)[4]-1)*dilD + 1 - 2*padD;
-        if ((%(height)s != -1 && %(height)s != out_dim[2]) ||
-            (%(width)s != -1 && %(width)s != out_dim[3]) ||
-            (%(depth)s != -1 && %(depth)s != out_dim[4]))
-        {
-            PyErr_Format(PyExc_ValueError,
-                         "BaseGpuCorr3dMM: computed output shape %%dx%%dx%%d "
-                         "does not match given shape %%dx%%dx%%d",
-                         out_dim[2], out_dim[3], out_dim[4],
-                         %(height)s, %(width)s, %(depth)s);
-            %(fail)s
-        }
+        out_dim[2] = (%(height)s != -1) ? %(height)s : (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH + (CudaNdarray_HOST_DIMS(weights)[2]-1)*dilH + 1 - 2*padH;
+        out_dim[3] = (%(width)s != -1) ? %(width)s : (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW + (CudaNdarray_HOST_DIMS(weights)[3]-1)*dilW + 1 - 2*padW;
+        out_dim[4] = (%(depth)s != -1) ? %(depth)s : (CudaNdarray_HOST_DIMS(top)[4] - 1) * dD + (CudaNdarray_HOST_DIMS(weights)[4]-1)*dilD + 1 - 2*padD;
        break;
    default:
        PyErr_SetString(PyExc_ValueError, "BaseGpuCorr3dMM: direction must be 0, 1, or 2\\n");

--- a/theano/sandbox/cuda/corr3d_gemm.cu
+++ b/theano/sandbox/cuda/corr3d_gemm.cu
@@ -429,9 +429,17 @@ CudaNdarray* corr3dMM(CudaNdarray *const bottom,
    const int dil_kW = (kW - 1) * dilW + 1;
    const int dil_kD = (kD - 1) * dilD + 1;
    // top: (batchSize, nFilters, topHeight, topWidth, topDepth)
-    const int topHeight = int((bottomHeight + 2*padH - dil_kH) / dH) + 1;
-    const int topWidth  = int((bottomWidth + 2*padW - dil_kW) / dW) + 1;
-    const int topDepth  = int((bottomDepth + 2*padD - dil_kD) / dD) + 1;
+    const int topHeightNoDH = (bottomHeight + 2*padH - dil_kH);
+    const int topWidthNoDW  = (bottomWidth + 2*padW - dil_kW);
+    const int topDepthNoDD  = (bottomDepth + 2*padD - dil_kD);
+    // the above values might be negative so we need to use Python-like
+    // flooring integer division to be compatible with get_conv_output.
+    // note: the macro floors correctly for a negative x only if y is positive
+#define _CONV_FLOORDIV_X(x,y) (((x) < 0) ? (- ((-(x)) / (y)) - (((-(x)) % (y)) == 0 ? 0 : 1)) : ((x) / (y)))
+    const int topHeight = _CONV_FLOORDIV_X(topHeightNoDH, dH) + 1;
+    const int topWidth  = _CONV_FLOORDIV_X(topWidthNoDW, dW) + 1;
+    const int topDepth  = _CONV_FLOORDIV_X(topDepthNoDD, dD) + 1;
+#undef _CONV_FLOORDIV_X
    if (batchSize != CudaNdarray_HOST_DIMS(top)[0] ||
        nFilters != CudaNdarray_HOST_DIMS(top)[1] ||
        topHeight != CudaNdarray_HOST_DIMS(top)[2] ||

--- a/theano/sandbox/cuda/corr_gemm.cu
+++ b/theano/sandbox/cuda/corr_gemm.cu
@@ -333,8 +333,15 @@ CudaNdarray* corrMM(CudaNdarray *const bottom,
    const int dil_kH = (kH - 1) * dilH + 1;
    const int dil_kW = (kW - 1) * dilW + 1;
    // top: (batchSize, nFilters, topHeight, topWidth)
-    const int topHeight = (bottomHeight + 2*padH - dil_kH) / dH + 1;
-    const int topWidth  = (bottomWidth + 2*padW - dil_kW) / dW + 1;
+    const int topHeightNoDH = (bottomHeight + 2*padH - dil_kH);
+    const int topWidthNoDW  = (bottomWidth + 2*padW - dil_kW);
+    // the above values might be negative so we need to use Python-like
+    // flooring integer division to be compatible with get_conv_output.
+    // note: the macro floors correctly for a negative x only if y is positive
+#define _CONV_FLOORDIV_X(x,y) (((x) < 0) ? (- ((-(x)) / (y)) - (((-(x)) % (y)) == 0 ? 0 : 1)) : ((x) / (y)))
+    const int topHeight = _CONV_FLOORDIV_X(topHeightNoDH, dH) + 1;
+    const int topWidth  = _CONV_FLOORDIV_X(topWidthNoDW, dW) + 1;
+#undef _CONV_FLOORDIV_X
    if (batchSize != CudaNdarray_HOST_DIMS(top)[0] ||
            nFilters != CudaNdarray_HOST_DIMS(top)[1] ||
            topHeight != CudaNdarray_HOST_DIMS(top)[2] ||

--- a/theano/tensor/nnet/corr.py
+++ b/theano/tensor/nnet/corr.py
@@ -123,7 +123,7 @@ class BaseCorrMM(gof.OpenMPOp):

    def c_code_cache_version(self):
        # raise this whenever modifying any of the support_code_files
-        return (2, self.openmp, blas_header_version())
+        return (3, self.openmp, blas_header_version())

    def c_support_code_apply(self, node, nodename):
        # REMEMBER TO RAISE c_code_cache_version when changing any of
@@ -275,8 +275,8 @@ class BaseCorrMM(gof.OpenMPOp):
        kW = PyArray_DIMS(weights)[3];
    }
    else {
-        if ((dH != 1) || (padH == -1)) {
-            // vertical subsampling or half padding, kernel height is specified
+        if (%(height)s != -1) {
+            // kernel height is specified (perhaps vertical subsampling or half padding)
            kH = %(height)s;
        }
        else if (padH == -2) {
@@ -287,7 +287,8 @@ class BaseCorrMM(gof.OpenMPOp):
            // explicit padding, we can infer the kernel height
            kH = (PyArray_DIMS(bottom)[2] + 2*padH - (PyArray_DIMS(top)[2] - 1) * dH - 1) / dilH +1;
        }
-        if ((dW != 1) || (padW == -1)) {
+        if (%(width)s != -1) {
+            // kernel width is specified (perhaps horizontal subsampling or half padding)
            kW = %(width)s;
        }
        else if (padW == -2) {
@@ -296,15 +297,6 @@ class BaseCorrMM(gof.OpenMPOp):
        else {
            kW = (PyArray_DIMS(bottom)[3] + 2*padW - (PyArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
        }
-        if ((%(height)s != -1 && %(height)s != kH) ||
-            (%(width)s != -1 && %(width)s != kW))
-        {
-            PyErr_Format(PyExc_ValueError,
-                         "BaseCorrMM: computed kernel shape %%lldx%%lld "
-                         "does not match given shape %%lldx%%lld",
-                         (long long)kH, (long long)kW, (long long)%(height)s, (long long)%(width)s);
-            %(fail)s
-        }
    }

    // Implicit dilated kernel size
@@ -357,18 +349,8 @@ class BaseCorrMM(gof.OpenMPOp):
        // height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
        out_dim[0] = (npy_intp)PyArray_DIMS(top)[0];
        out_dim[1] = (npy_intp)PyArray_DIMS(weights)[1];
-        out_dim[2] = (npy_intp)((dH != 1) ? %(height)s : (PyArray_DIMS(top)[2] - 1) * dH + (PyArray_DIMS(weights)[2]-1)*dilH + 1 - 2*padH);
-        out_dim[3] = (npy_intp)((dW != 1) ? %(width)s : (PyArray_DIMS(top)[3] - 1) * dW + (PyArray_DIMS(weights)[3]-1)*dilW + 1 - 2*padW);
-        if ((%(height)s != -1 && %(height)s != out_dim[2]) ||
-            (%(width)s != -1 && %(width)s != out_dim[3]))
-        {
-            PyErr_Format(PyExc_ValueError,
-                         "BaseCorrMM: computed output shape %%lldx%%lld "
-                         "does not match given shape %%lldx%%lld",
-                         (long long)out_dim[2], (long long)out_dim[3],
-                         (long long)%(height)s, (long long)%(width)s);
-            %(fail)s
-        }
+        out_dim[2] = (npy_intp)((%(height)s != -1) ? %(height)s : (PyArray_DIMS(top)[2] - 1) * dH + (PyArray_DIMS(weights)[2]-1)*dilH + 1 - 2*padH);
+        out_dim[3] = (npy_intp)((%(width)s != -1) ? %(width)s : (PyArray_DIMS(top)[3] - 1) * dW + (PyArray_DIMS(weights)[3]-1)*dilW + 1 - 2*padW);
        break;
    default:
        PyErr_SetString(PyExc_ValueError, "BaseCorrMM: direction must be 0, 1, or 2\\n");

--- a/theano/tensor/nnet/corr3d.py
+++ b/theano/tensor/nnet/corr3d.py
@@ -123,7 +123,7 @@ class BaseCorr3dMM(gof.OpenMPOp):

    def c_code_cache_version(self):
        # raise this whenever modifying any of the support_code_files
-        return (2, self.openmp, blas_header_version())
+        return (3, self.openmp, blas_header_version())

    def c_support_code_apply(self, node, nodename):
        # REMEMBER TO RAISE c_code_cache_version when changing any of
@@ -292,8 +292,8 @@ class BaseCorr3dMM(gof.OpenMPOp):
        kD = PyArray_DIMS(weights)[4];
    }
    else {
-        if ((dH != 1) || (padH == -1)) {
-            // vertical subsampling or half padding, kernel height is specified
+        if (%(height)s != -1) {
+            // kernel height is specified (perhaps vertical subsampling or half padding)
            kH = %(height)s;
        }
        else if (padH == -2) {
@@ -304,7 +304,7 @@ class BaseCorr3dMM(gof.OpenMPOp):
            // explicit padding, we can infer the kernel height
            kH = (PyArray_DIMS(bottom)[2] + 2*padH - (PyArray_DIMS(top)[2] - 1) * dH - 1) / dilH +1;
        }
-        if ((dW != 1) || (padW == -1)) {
+        if (%(width)s != -1) {
            kW = %(width)s;
        }
        else if (padW == -2) {
@@ -313,7 +313,7 @@ class BaseCorr3dMM(gof.OpenMPOp):
        else {
            kW = (PyArray_DIMS(bottom)[3] + 2*padW - (PyArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
        }
-        if ((dD != 1) || (padD == -1)) {
+        if (%(depth)s != -1) {
            kD = %(depth)s;
        }
        else if (padD == -2) {
@@ -322,17 +322,6 @@ class BaseCorr3dMM(gof.OpenMPOp):
        else {
            kD = (PyArray_DIMS(bottom)[4] + 2*padD - (PyArray_DIMS(top)[4] - 1) * dD - 1) / dilD + 1;
        }
-        if ((%(height)s != -1 && %(height)s != kH) ||
-            (%(width)s != -1 && %(width)s != kW) ||
-            (%(depth)s != -1 && %(depth)s != kD))
-        {
-            PyErr_Format(PyExc_ValueError,
-                         "BaseCorr3dMM: computed kernel shape %%lldx%%lldx%%lld "
-                         "does not match given shape %%lldx%%lldx%%lld",
-                         (long long)kH, (long long)kW, (long long)kD,
-                         (long long)%(height)s, (long long)%(width)s, (long long)%(depth)s);
-            %(fail)s
-        }
    }

    // Implicit dilated kernel size
@@ -398,20 +387,9 @@ class BaseCorr3dMM(gof.OpenMPOp):
        // height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
        out_dim[0] = (npy_intp)PyArray_DIMS(top)[0];
        out_dim[1] = (npy_intp)PyArray_DIMS(weights)[1];
-        out_dim[2] = (npy_intp)((dH != 1) ? %(height)s : (PyArray_DIMS(top)[2] - 1) * dH + (PyArray_DIMS(weights)[2]-1)*dilH + 1 - 2*padH);
-        out_dim[3] = (npy_intp)((dW != 1) ? %(width)s : (PyArray_DIMS(top)[3] - 1) * dW + (PyArray_DIMS(weights)[3]-1)*dilW + 1 - 2*padW);
-        out_dim[4] = (npy_intp)((dD != 1) ? %(depth)s : (PyArray_DIMS(top)[4] - 1) * dD + (PyArray_DIMS(weights)[4]-1)*dilD + 1 - 2*padD);
-        if ((%(height)s != -1 && %(height)s != out_dim[2]) ||
-            (%(width)s != -1 && %(width)s != out_dim[3]) ||
-            (%(depth)s != -1 && %(depth)s != out_dim[4]))
-        {
-            PyErr_Format(PyExc_ValueError,
-                         "BaseCorr3dMM: computed output shape %%lldx%%lldx%%lld "
-                         "does not match given shape %%lldx%%lldx%%lld",
-                         (long long)out_dim[2], (long long)out_dim[3], (long long)out_dim[4],
-                         (long long)%(height)s, (long long)%(width)s, (long long)%(depth)s);
-            %(fail)s
-        }
+        out_dim[2] = (npy_intp)((%(height)s != -1) ? %(height)s : (PyArray_DIMS(top)[2] - 1) * dH + (PyArray_DIMS(weights)[2]-1)*dilH + 1 - 2*padH);
+        out_dim[3] = (npy_intp)((%(width)s != -1) ? %(width)s : (PyArray_DIMS(top)[3] - 1) * dW + (PyArray_DIMS(weights)[3]-1)*dilW + 1 - 2*padW);
+        out_dim[4] = (npy_intp)((%(depth)s != -1) ? %(depth)s : (PyArray_DIMS(top)[4] - 1) * dD + (PyArray_DIMS(weights)[4]-1)*dilD + 1 - 2*padD);
        break;
    default:
        PyErr_SetString(PyExc_ValueError, "BaseCorr3dMM: direction must be 0, 1, or 2\\n");

--- a/theano/tensor/nnet/corr3d_gemm.c
+++ b/theano/tensor/nnet/corr3d_gemm.c
@@ -188,9 +188,17 @@ PyArrayObject* corr3dMM(PyArrayObject* bottom,
    const int dil_kW = (kW - 1) * dilW + 1;
    const int dil_kD = (kD - 1) * dilD + 1;
    // top: (batchSize, nFilters, topHeight, topWidth, topDepth)
-    const int topHeight = (bottomHeight + 2*padH - dil_kH) / dH + 1;
-    const int topWidth  = (bottomWidth + 2*padW - dil_kW) / dW + 1;
-    const int topDepth  = (bottomDepth + 2*padD - dil_kD) / dD + 1;
+    const int topHeightNoDH = (bottomHeight + 2*padH - dil_kH);
+    const int topWidthNoDW  = (bottomWidth + 2*padW - dil_kW);
+    const int topDepthNoDD  = (bottomDepth + 2*padD - dil_kD);
+    // the above values might be negative so we need to use Python-like
+    // flooring integer division to be compatible with get_conv_output.
+    // note: the macro floors correctly for a negative x only if y is positive
+#define _CONV_FLOORDIV_X(x,y) (((x) < 0) ? (- ((-(x)) / (y)) - (((-(x)) %% (y)) == 0 ? 0 : 1)) : ((x) / (y)))
+    const int topHeight = _CONV_FLOORDIV_X(topHeightNoDH, dH) + 1;
+    const int topWidth  = _CONV_FLOORDIV_X(topWidthNoDW, dW) + 1;
+    const int topDepth  = _CONV_FLOORDIV_X(topDepthNoDD, dD) + 1;
+#undef _CONV_FLOORDIV_X
    if (batchSize != PyArray_DIMS(top)[0] ||
            nFilters != PyArray_DIMS(top)[1] ||
            topHeight != PyArray_DIMS(top)[2] ||

--- a/theano/tensor/nnet/corr_gemm.c
+++ b/theano/tensor/nnet/corr_gemm.c
@@ -164,8 +164,15 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
    const int dil_kH = (kH - 1) * dilH + 1;
    const int dil_kW = (kW - 1) * dilW + 1;
    // top: (batchSize, nFilters, topHeight, topWidth)
-    const int topHeight = (bottomHeight + 2*padH - dil_kH) / dH + 1;
-    const int topWidth  = (bottomWidth + 2*padW - dil_kW) / dW + 1;
+    const int topHeightNoDH = (bottomHeight + 2*padH - dil_kH);
+    const int topWidthNoDW  = (bottomWidth + 2*padW - dil_kW);
+    // the above values might be negative so we need to use Python-like
+    // flooring integer division to be compatible with get_conv_output.
+    // note: the macro floors correctly for a negative x only if y is positive
+#define _CONV_FLOORDIV_X(x,y) (((x) < 0) ? (- ((-(x)) / (y)) - (((-(x)) %% (y)) == 0 ? 0 : 1)) : ((x) / (y)))
+    const int topHeight = _CONV_FLOORDIV_X(topHeightNoDH, dH) + 1;
+    const int topWidth  = _CONV_FLOORDIV_X(topWidthNoDW, dW) + 1;
+#undef _CONV_FLOORDIV_X
    if (batchSize != PyArray_DIMS(top)[0] ||
            nFilters != PyArray_DIMS(top)[1] ||
            topHeight != PyArray_DIMS(top)[2] ||