Unverified 提交 a0ee9a44 authored 作者: Jesse Grabowski's avatar Jesse Grabowski 提交者: GitHub

Remove deprecated AbstractConv Ops and tests (#1817)

* Remove deprecated AbstractConv Ops and tests * Remove tensor/conv from test CI * remove conv.rst
上级 482e6cc2
......@@ -84,10 +84,10 @@ jobs:
part:
- [ "*rest", "tests --ignore=tests/scan --ignore=tests/tensor --ignore=tests/xtensor --ignore=tests/link/numba" ]
- [ "scan", "tests/scan" ]
- [ "tensor *rest", "tests/tensor --ignore=tests/tensor/test_basic.py --ignore=tests/tensor/test_elemwise.py --ignore=tests/tensor/test_math.py --ignore=tests/tensor/test_math_scipy.py --ignore=tests/tensor/test_blas.py --ignore=tests/tensor/signal --ignore=tests/tensor/conv --ignore=tests/tensor/rewriting --ignore=tests/tensor/linalg --ignore=tests/tensor/test_nlinalg.py --ignore=tests/tensor/test_slinalg.py --ignore=tests/tensor/test_pad.py" ]
- [ "tensor *rest", "tests/tensor --ignore=tests/tensor/test_basic.py --ignore=tests/tensor/test_elemwise.py --ignore=tests/tensor/test_math.py --ignore=tests/tensor/test_math_scipy.py --ignore=tests/tensor/test_blas.py --ignore=tests/tensor/signal --ignore=tests/tensor/rewriting --ignore=tests/tensor/linalg --ignore=tests/tensor/test_nlinalg.py --ignore=tests/tensor/test_slinalg.py --ignore=tests/tensor/test_pad.py" ]
- [ "tensor basic+elemwise", "tests/tensor/test_basic.py tests/tensor/test_elemwise.py" ]
- [ "tensor math", "tests/tensor/test_math.py" ]
- [ "tensor scipy+blas+conv+pad", "tests/tensor/test_math_scipy.py tests/tensor/test_blas.py tests/tensor/signal tests/tensor/conv tests/tensor/test_pad.py" ]
- [ "tensor scipy+blas+pad", "tests/tensor/test_math_scipy.py tests/tensor/test_blas.py tests/tensor/signal tests/tensor/test_pad.py" ]
- [ "tensor rewriting", "tests/tensor/rewriting" ]
- [ "tensor linalg", "tests/tensor/linalg tests/tensor/test_nlinalg.py tests/tensor/test_slinalg.py" ]
exclude:
......
=========================================
:mod:`tensor.conv` -- Tensor Convolutions
=========================================
.. module:: tensor.conv
:platform: Unix, Windows
:synopsis: Tensor Convolutions
.. moduleauthor:: LISA, PyMC Developers, PyTensor Developers
.. automodule:: pytensor.tensor.conv
:members:
......@@ -270,15 +270,6 @@ def add_basic_configvars():
in_c_key=False,
)
config.add(
"conv__assert_shape",
"If True, AbstractConv* ops will verify that user-provided"
" shapes match the runtime shapes (debugging option,"
" may slow down compilation)",
BoolParam(False),
in_c_key=False,
)
config.add(
"print_global_stats",
"Print some global statistics (time spent) at the end",
......
......@@ -71,7 +71,6 @@ class PyTensorConfigParser:
pickle_test_value: bool
cast_policy: str
device: str
conv__assert_shape: bool
print_global_stats: bool
unpickle_function: bool
# add_compile_configvars
......
from .abstract_conv import (
bilinear_upsampling,
causal_conv1d,
conv2d,
conv2d_transpose,
conv3d,
frac_bilinear_upsampling,
separable_conv2d,
separable_conv3d,
)
This source diff could not be displayed because it is too large. You can view the blob instead.
// This uses a lot of code from Caffe (http://caffe.berkeleyvision.org/);
// sources are clearly marked. Below we reproduce the original license of
// the Caffe software.
/*
Copyright (c) 2014, The Regents of the University of California (Regents)
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/util/im2col.cpp)
// Loops for fast unfold + copy
// Unfold a 3-D volume into a column matrix ("im2col" extended to 3-D) so
// that a valid correlation reduces to a single GEMM call.
// data_im  : C-contiguous input of shape (channels, height, width, depth).
// data_col : output of shape
//            (channels*kernel_h*kernel_w*kernel_d,
//             height_col*width_col*depth_col);
//            taps that fall into the implicit zero padding are written as 0.
// NOTE: tokens such as %(float_type)s and the doubled %% are Python
// string-template substitutions; this source is %%-formatted before it is
// handed to the C compiler.
void im3d2col(const %(float_type)s* data_im, const int channels,
const int height, const int width, const int depth,
const int kernel_h, const int kernel_w, const int kernel_d,
const int dilation_h, const int dilation_w, const int dilation_d,
const int pad_h, const int pad_w, const int pad_d,
const int stride_h, const int stride_w, const int stride_d,
%(float_type)s* data_col) {
// Implicit dilated kernel size
int dil_kernel_h = (kernel_h - 1) * dilation_h + 1;
int dil_kernel_w = (kernel_w - 1) * dilation_w + 1;
int dil_kernel_d = (kernel_d - 1) * dilation_d + 1;
// Output spatial extent of a valid correlation on the padded input.
int height_col = (height + 2 * pad_h - dil_kernel_h) / stride_h + 1;
int width_col = (width + 2 * pad_w - dil_kernel_w) / stride_w + 1;
int depth_col = (depth + 2 * pad_d - dil_kernel_d) / stride_d + 1;
int channels_col = channels * kernel_h * kernel_w * kernel_d;
for (int c = 0; c < channels_col; ++c) {
// Decompose the row index c into (channel, kernel offsets);
// the depth axis varies fastest.
int d_offset = c %% kernel_d;
int w_offset = (c / kernel_d) %% kernel_w;
int h_offset = (c / kernel_w / kernel_d) %% kernel_h;
int c_im = c / kernel_h / kernel_w / kernel_d;
for (int h = 0; h < height_col; ++h) {
int h_pad = h * stride_h - pad_h + h_offset * dilation_h;
for (int w = 0; w < width_col; ++w) {
int w_pad = w * stride_w - pad_w + w_offset * dilation_w;
for (int d = 0; d < depth_col; ++d) {
int d_pad = d * stride_d - pad_d + d_offset * dilation_d;
// Copy the input value, or 0 when the tap lies in the padding region.
if (h_pad >= 0 && h_pad < height
&& w_pad >= 0 && w_pad < width
&& d_pad >= 0 && d_pad < depth)
data_col[(npy_intp)((c * height_col + h) * width_col + w) * depth_col + d] =
data_im[(npy_intp)((c_im * height + h_pad) * width + w_pad) * depth + d_pad];
else
data_col[(npy_intp)((c * height_col + h) * width_col + w) * depth_col + d] = 0.;
}
}
}
}
}
// Unlike the Caffe and PyTensor GPU versions, the data_im array is set to zero
// before the col2im call rather than doing it here. So, the result is just
// accumulated into data_im.
// Scatter a column matrix produced by im3d2col back into a 3-D volume.
// Unlike the Caffe and PyTensor GPU versions, data_im must be zeroed by the
// caller before this is invoked: values are accumulated (+=) into data_im,
// and taps that fall inside the implicit zero padding are simply skipped.
// data_col : input of shape (channels*patch_h*patch_w*patch_d,
//            height_col*width_col*depth_col).
// data_im  : C-contiguous output of shape (channels, height, width, depth).
// NOTE: %(float_type)s / %% are Python template substitutions applied before
// compilation.
// Fix over previous revision: removed the unused local `num_kernels`.
void col2im3d(const %(float_type)s* data_col, const int channels,
              const int height, const int width, const int depth,
              const int patch_h, const int patch_w, const int patch_d,
              const int dilation_h, const int dilation_w, const int dilation_d,
              const int pad_h, const int pad_w, const int pad_d,
              const int stride_h, const int stride_w, const int stride_d,
              %(float_type)s* data_im) {
  // Implicit dilated patch extent.
  int dil_patch_h = (patch_h - 1) * dilation_h + 1;
  int dil_patch_w = (patch_w - 1) * dilation_w + 1;
  int dil_patch_d = (patch_d - 1) * dilation_d + 1;
  // Spatial extent of the column matrix (must mirror im3d2col exactly).
  int height_col = (height + 2 * pad_h - dil_patch_h) / stride_h + 1;
  int width_col = (width + 2 * pad_w - dil_patch_w) / stride_w + 1;
  int depth_col = (depth + 2 * pad_d - dil_patch_d) / stride_d + 1;
  int channels_col = channels * patch_h * patch_w * patch_d;
  for (int c = 0; c < channels_col; ++c) {
    // Decompose row index c into (channel, patch offsets); depth varies fastest.
    int d_offset = c %% patch_d;
    int w_offset = (c / patch_d) %% patch_w;
    int h_offset = (c / patch_w / patch_d) %% patch_h;
    int c_im = c / patch_h / patch_w / patch_d;
    for (int h = 0; h < height_col; ++h) {
      int h_pad = h * stride_h - pad_h + h_offset * dilation_h;
      for (int w = 0; w < width_col; ++w) {
        int w_pad = w * stride_w - pad_w + w_offset * dilation_w;
        for (int d = 0; d < depth_col; ++d) {
          int d_pad = d * stride_d - pad_d + d_offset * dilation_d;
          // Accumulate only taps that land inside the (unpadded) volume.
          if (h_pad >= 0 && h_pad < height
              && w_pad >= 0 && w_pad < width
              && d_pad >= 0 && d_pad < depth)
            data_im[(npy_intp)((c_im * height + h_pad) * width + w_pad) * depth + d_pad] +=
                data_col[(npy_intp)((c * height_col + h) * width_col + w) * depth_col + d];
        }
      }
    }
  }
}
// PyTensor op code
// GPU version authors: Arjun Jain, Frederic Bastien, Jan Schlueter
// Reference code: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
// and https://github.com/torch/cunn/blob/master/SpatialConvolutionMM.cu
// CPU version author: Jesse Livezey
// CPU version adapted from GPU version
// Compute a batched 3-D correlation, or one of its two gradients, via the
// im3d2col + GEMM strategy, with optional grouped filters.
//   bottom: (batchSize, nChannels, bottomHeight, bottomWidth, bottomDepth)
//   weight: (nFilters, nChannels / numgroups, kH, kW, kD)
//   top:    (batchSize, nFilters, topHeight, topWidth, topDepth)
// direction selects which array is computed from the other two:
//   0 -> forward pass (writes top)
//   1 -> gradient wrt weights (writes weight)
//   2 -> gradient wrt inputs (writes bottom)
// dH/dW/dD are strides, dilH/dilW/dilD dilations, padH/padW/padD paddings,
// numgroups the number of filter groups.
// Returns the array that was written (an alias of bottom, weight or top),
// or NULL with a Python exception set on error.
// NOTE: %(...)s tokens and doubled %% are Python template substitutions;
// this source is %%-formatted before compilation.
// Fixes over previous revision:
//   * the direction==1 memset now zeroes all M_ * K_ * numgroups weight
//     elements (previously only M_ * K_, leaving groups > 0 uninitialized
//     when numgroups > 1);
//   * #undef now names the macro that was actually defined
//     (_CONV_FLOORDIV_X, not _CONV_FLOORDIV);
//   * error paths release the references taken by PyArray_GETCONTIGUOUS
//     (and the temporary `col` buffer where already allocated);
//   * an invalid `direction` now sets an exception before returning NULL;
//   * removed the unused local `group_bottom_stride`.
PyArrayObject* corr3dMM(PyArrayObject* bottom,
                        PyArrayObject* weight,
                        PyArrayObject* top,
                        const int direction,
                        const int dH = 1,
                        const int dW = 1,
                        const int dD = 1,
                        const int dilH = 1,
                        const int dilW = 1,
                        const int dilD = 1,
                        const int padH = 0,
                        const int padW = 0,
                        const int padD = 0,
                        const int numgroups=1)
{
    // ---- Validate ranks and dtypes (before taking any new references) ----
    if (PyArray_NDIM(bottom) != 5)
    {
        PyErr_SetString(PyExc_ValueError, "Corr3dMM requires bottom of 5D");
        return NULL;
    }
    if (PyArray_TYPE(bottom) != %(float_typenum)s)
    {
        PyErr_SetString(PyExc_ValueError, "Corr3dMM received bottom with wrong type.");
        return NULL;
    }
    if (PyArray_NDIM(weight) != 5)
    {
        PyErr_SetString(PyExc_ValueError, "Corr3dMM requires weight of 5D");
        return NULL;
    }
    if (PyArray_TYPE(weight) != %(float_typenum)s)
    {
        PyErr_SetString(PyExc_ValueError, "Corr3dMM received weight with wrong type.");
        return NULL;
    }
    if (PyArray_NDIM(top) != 5)
    {
        PyErr_SetString(PyExc_ValueError, "Corr3dMM requires top of 5D");
        return NULL;
    }
    if (PyArray_TYPE(top) != %(float_typenum)s)
    {
        PyErr_SetString(PyExc_ValueError, "Corr3dMM received top with wrong type.");
        return NULL;
    }
    // Ensure data is contiguous. These return NEW references, released at
    // the bottom of this function and on every error path below.
    // NOTE(review): PyArray_GETCONTIGUOUS can return NULL on allocation
    // failure; that is not checked here (matches the historical behaviour).
    bottom = PyArray_GETCONTIGUOUS(bottom);
    weight = PyArray_GETCONTIGUOUS(weight);
    top = PyArray_GETCONTIGUOUS(top);
    // Extract some shape information for later and check shape consistency
    // bottom: (batchSize, nChannels, bottomHeight, bottomWidth, bottomDepth)
    const int batchSize = PyArray_DIMS(bottom)[0];
    const int nChannels = PyArray_DIMS(bottom)[1];
    const int bottomHeight = PyArray_DIMS(bottom)[2];
    const int bottomWidth = PyArray_DIMS(bottom)[3];
    const int bottomDepth = PyArray_DIMS(bottom)[4];
    // weights: (nFilters, nChannels / numgroups, rows, columns, slices)
    const int nFilters = PyArray_DIMS(weight)[0];
    const int kH = PyArray_DIMS(weight)[2];
    const int kW = PyArray_DIMS(weight)[3];
    const int kD = PyArray_DIMS(weight)[4];
    if (nChannels != PyArray_DIMS(weight)[1] * numgroups) {
        PyErr_SetString(PyExc_ValueError,
                "Corr3dMM images and kernel must have the same stack size\n");
        Py_DECREF(bottom);
        Py_DECREF(weight);
        Py_DECREF(top);
        return NULL;
    }
    if ((nFilters %% numgroups) != 0) {
        PyErr_SetString(PyExc_ValueError,
                "CorrMM the number of filters must be divisible by the number of groups\n");
        Py_DECREF(bottom);
        Py_DECREF(weight);
        Py_DECREF(top);
        return NULL;
    }
    // implicit dilated filter
    const int dil_kH = (kH - 1) * dilH + 1;
    const int dil_kW = (kW - 1) * dilW + 1;
    const int dil_kD = (kD - 1) * dilD + 1;
    // top: (batchSize, nFilters, topHeight, topWidth, topDepth)
    const int topHeightNoDH = (bottomHeight + 2*padH - dil_kH);
    const int topWidthNoDW = (bottomWidth + 2*padW - dil_kW);
    const int topDepthNoDD = (bottomDepth + 2*padD - dil_kD);
    // the above values might be negative so we need to use Python-like
    // flooring integer division to be compatible with get_conv_output.
    // note: this macro implements Python's // for negative x only
    #define _CONV_FLOORDIV_X(x,y) ((x < 0) ? (- ((-x) / y) - (((-x) %% y) == 0 ? 0 : 1)) : (x / y))
    const int topHeight = _CONV_FLOORDIV_X(topHeightNoDH, dH) + 1;
    const int topWidth = _CONV_FLOORDIV_X(topWidthNoDW, dW) + 1;
    const int topDepth = _CONV_FLOORDIV_X(topDepthNoDD, dD) + 1;
    // undef the name that was actually defined above
    #undef _CONV_FLOORDIV_X
    if (batchSize != PyArray_DIMS(top)[0] ||
            nFilters != PyArray_DIMS(top)[1] ||
            topHeight != PyArray_DIMS(top)[2] ||
            topWidth != PyArray_DIMS(top)[3] ||
            topDepth != PyArray_DIMS(top)[4]) {
        PyErr_Format(PyExc_ValueError,
                "Corr3dMM shape inconsistency:\n"
                "  bottom shape: %%d %%d %%d %%d %%d\n"
                "  weight shape: %%d %%d %%d %%d %%d\n"
                "  top shape: %%ld %%ld %%ld %%ld %%ld (expected %%d %%d %%d %%d %%d)\n",
                batchSize, nChannels, bottomHeight, bottomWidth, bottomDepth,
                nFilters, nChannels / numgroups, kH, kW, kD,
                PyArray_DIMS(top)[0], PyArray_DIMS(top)[1],
                PyArray_DIMS(top)[2], PyArray_DIMS(top)[3], PyArray_DIMS(top)[4],
                batchSize, nFilters, topHeight, topWidth, topDepth);
        Py_DECREF(bottom);
        Py_DECREF(weight);
        Py_DECREF(top);
        return NULL;
    }
    // Create temporary columns: one im3d2col buffer per OpenMP thread.
    int max_threads = %(omp_get_max_threads)s;
    if (batchSize < max_threads) {
        max_threads = batchSize;
    }
    npy_intp col_dim[3];
    col_dim[0] = (npy_intp)max_threads;
    col_dim[1] = (npy_intp)(nChannels * kW * kH * kD);
    col_dim[2] = (npy_intp)(topHeight * topWidth * topDepth);
    //Change to PyArray_ZEROS which is faster than PyArray_EMPTY.
    PyArrayObject* col = (PyArrayObject*)PyArray_ZEROS(3,
            col_dim,
            PyArray_TYPE(top),
            0);
    if (NULL == col) {
        PyErr_Format(PyExc_RuntimeError,
                "Corr3dMM failed to allocate working memory of"
                " %%ld x %%ld x %%ld\n",
                col_dim[0], col_dim[1], col_dim[2]);
        Py_DECREF(bottom);
        Py_DECREF(weight);
        Py_DECREF(top);
        return NULL;
    }
    // Define some useful variables (strides are converted from bytes to
    // element counts by dividing by the element size %(n_bytes)f).
    const int batch_bottom_stride = PyArray_STRIDES(bottom)[0]/%(n_bytes)f;
    const int batch_top_stride = PyArray_STRIDES(top)[0]/%(n_bytes)f;
    const int group_top_stride = (PyArray_STRIDES(top)[1] * nFilters / numgroups)/%(n_bytes)f;
    const int K_ = col_dim[1] / numgroups;   // rows of one group's col block
    const int N_ = col_dim[2];               // output spatial size
    const int col_stride = (K_ * N_ * numgroups);
    const int group_col_stride = (K_ * N_);
    const int group_weight_stride = (PyArray_STRIDES(weight)[0] * nFilters / numgroups)/%(n_bytes)f;
    const int M_ = nFilters / numgroups;     // filters per group
    const %(c_float_type)s one = 1.0;
    const %(c_float_type)s zero = 0.0;
    char NTrans = 'N';
    char Trans = 'T';
    PyArrayObject *output;
    if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
        // Degenerate sizes: the destination is simply filled with zeros.
        switch(direction) {
        case 0:
            output = top;
            break;
        case 1:
            output = weight;
            break;
        case 2:
            output = bottom;
            break;
        default:
            PyErr_SetString(PyExc_ValueError,
                    "Corr3dMM: direction must be 0, 1 or 2\n");
            Py_DECREF(col);
            Py_DECREF(bottom);
            Py_DECREF(weight);
            Py_DECREF(top);
            return NULL;
        }
        PyArray_FILLWBYTE(output, 0);
    }
    else if (direction == 0) {  // forward pass
        output = top;
        // valid correlation: im3d2col, then gemm
        // Iterate over batch
        int blas_threads_saved = %(blas_get_num_threads)s;
        // Always forcing gemm to one thread when OpenMP is enabled for best and stable performance.
        %(blas_set_num_threads)s(1);
        %(omp_flags)s
        for (int n = 0; n < batchSize; ++n) {
            int tid = %(omp_get_thread_num)s;
            // First, im3d2col
            im3d2col((%(float_type)s*)PyArray_DATA(bottom) + n * batch_bottom_stride,
                     nChannels, bottomHeight, bottomWidth, bottomDepth,
                     kH, kW, kD, dilH, dilW, dilD, padH, padW, padD, dH, dW, dD,
                     (%(float_type)s*)PyArray_DATA(col)+ tid * col_stride);
            for ( int g = 0; g < numgroups; ++g){
                // Second, gemm
                %(gemm)s(&NTrans, &NTrans,
                         &N_, &M_, &K_,
                         &one,
                         (%(float_type)s*)PyArray_DATA(col)+ tid * col_stride + g * group_col_stride, &N_,
                         (%(float_type)s*)PyArray_DATA(weight) + g * group_weight_stride, &K_,
                         &zero,
                         (%(float_type)s*)PyArray_DATA(top) + n * batch_top_stride + g * group_top_stride, &N_);
            }
        }
        // Restore to previous blas threads
        %(blas_set_num_threads)s(blas_threads_saved);
    }
    else if (direction == 1) {  // backprop wrt. weights
        output = weight;
        // One per-thread accumulator so the batch loop can run in parallel.
        npy_intp weight_dim[2];
        weight_dim[0] = (npy_intp)max_threads;
        weight_dim[1] = (npy_intp)(M_ * K_ * numgroups);
        PyArrayObject* local_weight = (PyArrayObject*)PyArray_ZEROS(2,
                weight_dim, PyArray_TYPE(weight), 0);
        if (NULL == local_weight)
        {
            PyErr_Format(PyExc_RuntimeError,
                    "Corr3dMM failed to allocate weight memory of %%ld x %%ld\n",
                    weight_dim[0], weight_dim[1]);
            Py_DECREF(col);
            Py_DECREF(bottom);
            Py_DECREF(weight);
            Py_DECREF(top);
            return NULL;
        }
        // valid convolution: im2col, then gemm
        // Iterate over batch
        int blas_threads_saved = %(blas_get_num_threads)s;
        // Always forcing gemm to one thread when OpenMP is enabled for best and stable performance.
        %(blas_set_num_threads)s(1);
        // OMP for batch-level parallelization
        %(omp_flags)s
        for (int n = 0; n < batchSize; ++n) {
            int tid = %(omp_get_thread_num)s;
            // First, im2col
            im3d2col((%(float_type)s*)PyArray_DATA(bottom) + n * batch_bottom_stride,
                     nChannels, bottomHeight, bottomWidth, bottomDepth,
                     kH, kW, kD, dilH, dilW, dilD, padH, padW, padD, dH, dW, dD,
                     (%(float_type)s*)PyArray_DATA(col)+ tid * col_stride);
            for ( int g = 0; g < numgroups; ++g){
                // Second, gemm
                // Note that we accumulate into local_weight. We do so by setting
                // beta = 0 for the first iteration and beta = 1 for subsequent
                // ones. (This is faster than zeroing it before the loop.)
                %(gemm)s(&Trans, &NTrans,
                         &K_, &M_, &N_,
                         &one,
                         (%(float_type)s*)PyArray_DATA(col) + tid * col_stride + g * group_col_stride, &N_,
                         (%(float_type)s*)PyArray_DATA(top) + n * batch_top_stride + g * group_top_stride, &N_,
                         (n == 0) ? &zero : &one,
                         (%(float_type)s*)PyArray_DATA(local_weight) + g * group_weight_stride +
                             tid * weight_dim[1], &K_);
            }
        }
        // Restore to previous blas threads
        %(blas_set_num_threads)s(blas_threads_saved);
        // Aggregate the per-thread accumulators into weight. The destination
        // must be zeroed over ALL numgroups groups (M_ * K_ * numgroups
        // elements) -- previously only M_ * K_ elements were cleared, which
        // left groups > 0 uninitialized when numgroups > 1.
        memset((%(float_type)s*)PyArray_DATA(weight), 0, M_ * K_ * numgroups * sizeof(%(float_type)s));
        /*
         * Put index "j" into outer loop to get the
         * correct result when openmp is used.
         */
        %(omp_flags)s
        for(int j = 0; j < weight_dim[1]; ++j){
            for(int i = 0; i < max_threads; ++i){
                ((%(float_type)s*)PyArray_DATA(weight))[j] +=
                        *((%(float_type)s*)PyArray_DATA(local_weight) +
                          i * weight_dim[1] + j);
            }
        }
        Py_DECREF(local_weight);
    }
    else if (direction == 2) {  // backprop wrt. inputs
        output = bottom;
        // bottom is set to zero here rather than inside of col2im
        PyArray_FILLWBYTE(bottom, 0);
        // full convolution: gemm, then col2im3d
        // Iterate over batch
        int blas_threads_saved = %(blas_get_num_threads)s;
        // Always forcing gemm to one thread when OpenMP is enabled for best and stable performance.
        %(blas_set_num_threads)s(1);
        %(omp_flags)s
        for (int n = 0; n < batchSize; ++n) {
            int tid = %(omp_get_thread_num)s;
            for ( int g = 0; g < numgroups; ++g){
                // gemm into columns
                %(gemm)s(&NTrans, &Trans,
                         &N_, &K_, &M_,
                         &one,
                         (%(float_type)s*)PyArray_DATA(top) + n * batch_top_stride + g * group_top_stride, &N_,
                         (%(float_type)s*)PyArray_DATA(weight) + g * group_weight_stride, &K_,
                         &zero,
                         (%(float_type)s*)PyArray_DATA(col) + tid * col_stride + g * group_col_stride, &N_);
            }
            // col2im back to the data
            col2im3d((%(float_type)s*)PyArray_DATA(col) + tid * col_stride, nChannels,
                     bottomHeight, bottomWidth, bottomDepth,
                     kH, kW, kD, dilH, dilW, dilD, padH, padW, padD, dH, dW, dD,
                     (%(float_type)s*)PyArray_DATA(bottom) + n * batch_bottom_stride);
        }
        // Restore to previous blas threads
        %(blas_set_num_threads)s(blas_threads_saved);
    }
    // Free temporary columns
    Py_DECREF(col);
    // decref from contiguous check
    Py_DECREF(bottom);
    Py_DECREF(weight);
    Py_DECREF(top);
    // Note that we don't change the refcount of the output matrix here. Output
    // (re)allocation and refcounting is done in BaseCorr3dMM.c_code_helper();
    // in here output is just aliased to one of bottom, weights, or top.
    return output;
}
// This uses a lot of code from Caffe (http://caffe.berkeleyvision.org/);
// sources are clearly marked. Below we reproduce the original license of
// the Caffe software.
/*
Copyright (c) 2014, The Regents of the University of California (Regents)
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/util/im2col.cpp)
// Loops for fast unfold + copy
// Unfold a 2-D image into a column matrix ("im2col") so that a valid
// correlation reduces to a single GEMM call.
// data_im  : C-contiguous input of shape (channels, height, width).
// data_col : output of shape (channels*kernel_h*kernel_w,
//            height_col*width_col); taps that fall into the (possibly
//            asymmetric: _hl/_hr left/right, _wl/_wr) zero padding are
//            written as 0.
// NOTE: %(float_type)s and the doubled %% are Python template substitutions;
// this source is %%-formatted before compilation.
void im2col(const %(float_type)s* data_im, const int channels,
const int height, const int width, const int kernel_h, const int kernel_w,
const int dilation_h, const int dilation_w,
const int pad_hl, const int pad_hr, const int pad_wl, const int pad_wr,
const int stride_h, const int stride_w,
%(float_type)s* data_col) {
// Implicit dilated kernel size
int dil_kernel_h = (kernel_h - 1) * dilation_h + 1;
int dil_kernel_w = (kernel_w - 1) * dilation_w + 1;
// Output spatial extent of a valid correlation on the padded input.
int height_col = (height + pad_hl + pad_hr - dil_kernel_h) / stride_h + 1;
int width_col = (width + pad_wl + pad_wr - dil_kernel_w) / stride_w + 1;
int channels_col = channels * kernel_h * kernel_w;
for (int c = 0; c < channels_col; ++c) {
// Decompose row index c into (channel, kernel offsets); width varies fastest.
int w_offset = c %% kernel_w;
int h_offset = (c / kernel_w) %% kernel_h;
int c_im = c / kernel_h / kernel_w;
for (int h = 0; h < height_col; ++h) {
int h_pad = h * stride_h - pad_hl + h_offset * dilation_h;
for (int w = 0; w < width_col; ++w) {
int w_pad = w * stride_w - pad_wl + w_offset * dilation_w;
// Copy the input value, or 0 when the tap lies in the padding region.
if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
data_col[(npy_intp)(c * height_col + h) * width_col + w] =
data_im[(npy_intp)(c_im * height + h_pad) * width + w_pad];
else
data_col[(npy_intp)(c * height_col + h) * width_col + w] = 0.;
}
}
}
}
// Unlike the Caffe and PyTensor GPU versions, the data_im array is set to zero
// before the col2im call rather than doing it here. So, the result is just
// accumulated into data_im.
// Scatter a column matrix produced by im2col back into a 2-D image.
// Unlike the Caffe and PyTensor GPU versions, data_im must be zeroed by the
// caller before this is invoked: values are accumulated (+=) into data_im,
// and taps that fall into the (possibly asymmetric) padding are skipped.
// data_col : input of shape (channels*patch_h*patch_w, height_col*width_col).
// data_im  : C-contiguous output of shape (channels, height, width).
// NOTE: %(float_type)s / %% are Python template substitutions applied before
// compilation.
// Fix over previous revision: removed the unused local `num_kernels`.
void col2im(const %(float_type)s* data_col, const int channels,
            const int height, const int width, const int patch_h, const int patch_w,
            const int dilation_h, const int dilation_w,
            const int pad_hl, const int pad_hr, const int pad_wl, const int pad_wr,
            const int stride_h, const int stride_w,
            %(float_type)s* data_im) {
  // Implicit dilated patch extent.
  int dil_patch_h = (patch_h - 1) * dilation_h + 1;
  int dil_patch_w = (patch_w - 1) * dilation_w + 1;
  // Spatial extent of the column matrix (must mirror im2col exactly).
  int height_col = (height + pad_hl + pad_hr - dil_patch_h) / stride_h + 1;
  int width_col = (width + pad_wl + pad_wr - dil_patch_w) / stride_w + 1;
  int channels_col = channels * patch_h * patch_w;
  for (int c = 0; c < channels_col; ++c) {
    // Decompose row index c into (channel, patch offsets); width varies fastest.
    int w_offset = c %% patch_w;
    int h_offset = (c / patch_w) %% patch_h;
    int c_im = c / patch_h / patch_w;
    for (int h = 0; h < height_col; ++h) {
      int h_pad = h * stride_h - pad_hl + h_offset * dilation_h;
      for (int w = 0; w < width_col; ++w) {
        int w_pad = w * stride_w - pad_wl + w_offset * dilation_w;
        // Accumulate only taps that land inside the (unpadded) image.
        if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
          data_im[(npy_intp)(c_im * height + h_pad) * width + w_pad] +=
              data_col[(npy_intp)(c * height_col + h) * width_col + w];
      }
    }
  }
}
// PyTensor op code
// GPU version authors: Arjun Jain, Frederic Bastien, Jan Schlueter
// Reference code: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
// and https://github.com/torch/cunn/blob/master/SpatialConvolutionMM.cu
// CPU version author: Jesse Livezey
// CPU version adapted from GPU version
PyArrayObject* corrMM(PyArrayObject* bottom,
PyArrayObject* weight,
PyArrayObject* top,
const int direction,
const int dH = 1,
const int dW = 1,
const int dilH = 1,
const int dilW = 1,
const int padH_l = 0,
const int padH_r = 0,
const int padW_l = 0,
const int padW_r = 0,
const int numgroups = 1,
const int unshared = 0)
{
if (PyArray_NDIM(bottom) != 4)
{
PyErr_SetString(PyExc_ValueError, "CorrMM requires bottom of 4D");
return NULL;
}
if (PyArray_TYPE(bottom) != %(float_typenum)s)
{
PyErr_SetString(PyExc_ValueError, "CorrMM received bottom with wrong type.");
return NULL;
}
if (PyArray_NDIM(weight) != (unshared ? 6 : 4))
{
PyErr_Format(PyExc_ValueError, "CorrMM requires weight of %%dD", unshared ? 6 : 4);
return NULL;
}
if (PyArray_TYPE(weight) != %(float_typenum)s)
{
PyErr_SetString(PyExc_ValueError, "CorrMM received weight with wrong type.");
return NULL;
}
if (PyArray_NDIM(top) != 4)
{
PyErr_SetString(PyExc_ValueError, "CorrMM requires top of 4D");
return NULL;
}
if (PyArray_TYPE(top) != %(float_typenum)s)
{
PyErr_SetString(PyExc_ValueError, "CorrMM received top with wrong type.");
return NULL;
}
// Ensure data is contiguous
bottom = PyArray_GETCONTIGUOUS(bottom);
weight = PyArray_GETCONTIGUOUS(weight);
top = PyArray_GETCONTIGUOUS(top);
// Extract some shape information for later and check shape consistency
// bottom: (batchSize, nChannels, bottomHeight, bottomWidth)
const int batchSize = PyArray_DIMS(bottom)[0];
const int nChannels = PyArray_DIMS(bottom)[1];
const int bottomHeight = PyArray_DIMS(bottom)[2];
const int bottomWidth = PyArray_DIMS(bottom)[3];
// normal weights: (nFilters, nChannels, rows, columns)
// unshared weights: (nFilters, topHeight, topWidth, nChannels, rows, columns)
const int nFilters = PyArray_DIMS(weight)[0];
const int kH = PyArray_DIMS(weight)[unshared ? 4 : 2];
const int kW = PyArray_DIMS(weight)[unshared ? 5 : 3];
if (nChannels != PyArray_DIMS(weight)[unshared ? 3 : 1] * numgroups) {
PyErr_SetString(PyExc_ValueError,
"CorrMM images and kernel must have the same stack size\n");
return NULL;
}
if ((nFilters %% numgroups) != 0) {
PyErr_SetString(PyExc_ValueError,
"CorrMM the number of filters must be divisible by the number of groups\n");
return NULL;
}
// implicit dilated filter
const int dil_kH = (kH - 1) * dilH + 1;
const int dil_kW = (kW - 1) * dilW + 1;
// top: (batchSize, nFilters, topHeight, topWidth)
const int topHeightNoDH = (bottomHeight + padH_l + padH_r - dil_kH);
const int topWidthNoDW = (bottomWidth + padW_l + padW_r - dil_kW);
// the above values might be negative so we need to use Python-like
// flooring integer division to be compatible with get_conv_output.
// note: this macro implements Python's // for negative x only
#define _CONV_FLOORDIV_X(x,y) ((x < 0) ? (- ((-x) / y) - (((-x) %% y) == 0 ? 0 : 1)) : (x / y))
const int topHeight = _CONV_FLOORDIV_X(topHeightNoDH, dH) + 1;
const int topWidth = _CONV_FLOORDIV_X(topWidthNoDW, dW) + 1;
#undef _CONV_FLOORDIV
if (unshared) {
if (topHeight != PyArray_DIMS(weight)[1] ||
topWidth != PyArray_DIMS(weight)[2]) {
PyErr_Format(PyExc_ValueError,
"CorrMM regions in kernel must match output regions:\n"
" bottom shape: %%d %%d %%d %%d\n"
" weight shape: %%d %%ld %%ld %%d %%d %%d"
" (expected %%d %%d %%d %%d %%d %%d)\n"
" top shape(calculated): %%d %%d %%d %%d\n",
batchSize, nChannels, bottomHeight, bottomWidth,
nFilters, PyArray_DIMS(weight)[1],
PyArray_DIMS(weight)[2], nChannels / numgroups, kH, kW,
nFilters, topHeight, topWidth, nChannels / numgroups, kH, kW,
batchSize, nFilters, topHeight, topWidth);
return NULL;
}
if (batchSize != PyArray_DIMS(top)[0] ||
nFilters != PyArray_DIMS(top)[1] ||
topHeight != PyArray_DIMS(top)[2] ||
topWidth != PyArray_DIMS(top)[3]) {
PyErr_Format(PyExc_ValueError,
"CorrMM shape inconsistency:\n"
" bottom shape: %%d %%d %%d %%d\n"
" weight shape: %%d %%d %%d %%d %%d %%d\n"
" top shape: %%ld %%ld %%ld %%ld (expected %%d %%d %%d %%d)\n",
batchSize, nChannels, bottomHeight, bottomWidth,
nFilters, topHeight, topWidth, nChannels / numgroups, kH, kW,
PyArray_DIMS(top)[0], PyArray_DIMS(top)[1],
PyArray_DIMS(top)[2], PyArray_DIMS(top)[3],
batchSize, nFilters, topHeight, topWidth);
return NULL;
}
}
else {
if (batchSize != PyArray_DIMS(top)[0] ||
nFilters != PyArray_DIMS(top)[1] ||
topHeight != PyArray_DIMS(top)[2] ||
topWidth != PyArray_DIMS(top)[3]) {
PyErr_Format(PyExc_ValueError,
"CorrMM shape inconsistency:\n"
" bottom shape: %%d %%d %%d %%d\n"
" weight shape: %%d %%d %%d %%d\n"
" top shape: %%ld %%ld %%ld %%ld (expected %%d %%d %%d %%d)\n",
batchSize, nChannels, bottomHeight, bottomWidth,
nFilters, nChannels / numgroups, kH, kW,
PyArray_DIMS(top)[0], PyArray_DIMS(top)[1],
PyArray_DIMS(top)[2], PyArray_DIMS(top)[3],
batchSize, nFilters, topHeight, topWidth);
return NULL;
}
}
// Create temporary columns
int max_threads = %(omp_get_max_threads)s;
if (batchSize < max_threads) {
max_threads = batchSize;
}
npy_intp col_dim[3];
col_dim[0] = (npy_intp)max_threads;
col_dim[1] = (npy_intp)(nChannels * kW * kH);
col_dim[2] = (npy_intp)(topHeight * topWidth);
//Change to PyArray_ZEROS which is faster than PyArray_EMPTY.
PyArrayObject* col = (PyArrayObject*)PyArray_ZEROS(3,
col_dim,
PyArray_TYPE(top),
0);
if (NULL == col) {
PyErr_Format(PyExc_RuntimeError,
"CorrMM failed to allocate working memory of"
" %%ld x %%ld x %%ld\n",
col_dim[0], col_dim[1], col_dim[2]);
return NULL;
}
// Define some useful variables
const int batch_bottom_stride = PyArray_STRIDES(bottom)[0]/%(n_bytes)f;
const int group_bottom_stride = (PyArray_STRIDES(bottom)[1] * nChannels / numgroups)/%(n_bytes)f;
const int batch_top_stride = PyArray_STRIDES(top)[0]/%(n_bytes)f;
const int group_top_stride = (PyArray_STRIDES(top)[1] * nFilters / numgroups)/%(n_bytes)f;
const int K_ = col_dim[1] / numgroups;
const int N_ = col_dim[2];
const int col_stride = (K_ * N_ * numgroups);
const int group_col_stride = (K_ * N_);
const int group_weight_stride = (PyArray_STRIDES(weight)[0] * nFilters / numgroups)/%(n_bytes)f;
const int M_ = nFilters / numgroups;
const int one_int = 1;
const %(c_float_type)s one = 1.0;
const %(c_float_type)s zero = 0.0;
const int ldw = (K_ * N_);
char NTrans = 'N';
char Trans = 'T';
PyArrayObject *output;
if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
switch(direction) {
case 0:
output = top;
break;
case 1:
output = weight;
break;
case 2:
output = bottom;
break;
default:
return NULL;
}
PyArray_FILLWBYTE(output, 0);
}
else if (direction == 0) { // forward pass
output = top;
// valid correlation: im2col, then gemm
// Iterate over batch
int blas_threads_saved = %(blas_get_num_threads)s;
// Always forcing gemm to one thread when OpenMP is enabled for best and stable performance.
%(blas_set_num_threads)s(1);
%(omp_flags)s
for (int n = 0; n < batchSize; ++n) {
int tid = %(omp_get_thread_num)s;
// First, im2col
im2col((%(float_type)s*)PyArray_DATA(bottom) + n * batch_bottom_stride, nChannels,
bottomHeight,bottomWidth, kH, kW, dilH, dilW, padH_l, padH_r, padW_l, padW_r, dH, dW,
(%(float_type)s*)PyArray_DATA(col)+ tid * col_stride);
// Second, gemm
if (unshared) {
for (int g = 0; g < numgroups; ++g) {
for (int reg = 0; reg < N_; ++reg) {
%(gemv)s(&Trans, &K_, &M_,
&one,
(%(float_type)s*)PyArray_DATA(weight) + g * group_weight_stride + reg * K_, &ldw,
(%(float_type)s*)PyArray_DATA(col) + tid * col_stride + g * group_col_stride + reg, &N_,
&zero,
(%(float_type)s*)PyArray_DATA(top) + n * batch_top_stride + g * group_top_stride + reg, &N_);
}
}
}
else {
for ( int g = 0; g < numgroups; ++g){
// Second, gemm
%(gemm)s(&NTrans, &NTrans,
&N_, &M_, &K_,
&one,
(%(float_type)s*)PyArray_DATA(col) + tid * col_stride + g * group_col_stride, &N_,
(%(float_type)s*)PyArray_DATA(weight) + g * group_weight_stride, &K_,
&zero,
(%(float_type)s*)PyArray_DATA(top) + n * batch_top_stride + g * group_top_stride, &N_);
}
}
}
// Restore to previous blas threads
%(blas_set_num_threads)s(blas_threads_saved);
/*
// Original caffe code for comparison
// Note that this code was translated from the PyTensor GPU code,
// not the Caffe CPU code.
// https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
// Note that this is for grouped convolution; we can ignore groups here,
// but the group-related offsets help explain what M_, N_ and K_ are
int weight_offset = M_ * K_;
int col_offset = K_ * N_;
int top_offset = M_ * N_;
for (int n = 0; n < num_; ++n) {
// First, im2col
im2col_gpu(bottom_data + bottom[i]->offset(n), channels_, height_,
width_, kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_,
col_data);
// Second, innerproduct with groups
for (int g = 0; g < group_; ++g) {
caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, N_, K_,
(Dtype)1., weight + weight_offset * g, col_data + col_offset * g,
(Dtype)0., top_data + (*top)[i]->offset(n) + top_offset * g);
== (see https://github.com/BVLC/caffe/blob/master/src/caffe/util/math_functions.cu#L16)
cublasSgemm(CUBLAS_OP_N, CUBLAS_OP_N,
N_, M_, K_,
1.,
col_data + col_offset * g, N_,
weight + weight_offset * g, K_,
0.,
top_data + (*top)[i]->offset(n) + top_offset * g, N_);
}
}
*/
}
else if (direction == 1) { // backprop wrt. weights
output = weight;
npy_intp weight_dim[2];
weight_dim[0] = (npy_intp)max_threads;
if (unshared)
weight_dim[1] = (npy_intp)(M_ * N_ * K_ * numgroups);
else
weight_dim[1] = (npy_intp)(M_ * K_ * numgroups);
PyArrayObject* local_weight = (PyArrayObject*)PyArray_ZEROS(2,
weight_dim, PyArray_TYPE(weight), 0);
if (NULL == local_weight)
{
PyErr_Format(PyExc_RuntimeError,
"CorrMM failed to allocate weight memory of %%ld x %%ld\n",
weight_dim[0], weight_dim[1]);
return NULL;
}
// valid convolution: im2col, then gemm
// Iterate over batch
int blas_threads_saved = %(blas_get_num_threads)s;
// Always forcing gemm to one thread when OpenMP is enabled for best and stable performance.
%(blas_set_num_threads)s(1);
// OMP for batch-level paralization
%(omp_flags)s
for (int n = 0; n < batchSize; ++n) {
int tid = %(omp_get_thread_num)s;
// First, im2col
im2col((%(float_type)s*)PyArray_DATA(bottom) + n * batch_bottom_stride,
nChannels, bottomHeight,bottomWidth, kH, kW, dilH, dilW, padH_l, padH_r, padW_l, padW_r, dH, dW,
(%(float_type)s*)PyArray_DATA(col)+ tid * col_stride);
// Second, gemm
// Note that we accumulate into weight. We do so by setting beta = 0
// for the first iteration and beta = 1 for subsequent ones. (This
// is faster than setting weight to all zeros before the loop.)
if (unshared) {
for (int g = 0; g < numgroups; ++g) {
for (int reg = 0; reg < N_; ++reg) {
%(gemm)s(&Trans, &NTrans,
&K_, &M_, &one_int,
&one,
(%(float_type)s*)PyArray_DATA(col) + tid * col_stride + g * group_col_stride + reg, &N_,
(%(float_type)s*)PyArray_DATA(top) + g * group_top_stride + n * batch_top_stride + reg, &N_,
(n == 0) ? &zero : &one,
(%(float_type)s*)PyArray_DATA(local_weight) + g * group_weight_stride + reg * K_ +
tid * weight_dim[1], &ldw);
}
}
}
else {
for(int g = 0; g < numgroups; ++g){
// Second, gemm
// Note that we accumulate into weight. We do so by setting beta = 0
// for the first iteration and beta = 1 for subsequent ones. (This
// is faster than setting weight to all zeros before the loop.)
%(gemm)s(&Trans, &NTrans,
&K_, &M_, &N_,
&one,
(%(float_type)s*)PyArray_DATA(col) + tid * col_stride + g * group_col_stride, &N_,
(%(float_type)s*)PyArray_DATA(top) + g * group_top_stride + n * batch_top_stride, &N_,
(n == 0) ? &zero : &one,
(%(float_type)s*)PyArray_DATA(local_weight) + g * group_weight_stride +
tid * weight_dim[1], &K_);
}
}
}
// Restore to previous blas threads
%(blas_set_num_threads)s(blas_threads_saved);
//aggregate weights
memset((%(float_type)s*)PyArray_DATA(weight), 0, weight_dim[1]*sizeof(%(float_type)s));
/*
* Put index "j" into outer loop to get the
* correct result when openmp is used.
*/
%(omp_flags)s
for(int j = 0; j < weight_dim[1]; ++j){
for(int i = 0; i < max_threads; ++i){
((%(float_type)s*)PyArray_DATA(weight))[j] +=
*((%(float_type)s*)PyArray_DATA(local_weight) +
i * weight_dim[1] + j);
}
}
Py_DECREF(local_weight);
/*
// Original caffe code for comparison
// Note that this code was translated from the PyTensor GPU code,
// not the Caffe CPU code.
// https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
// Note that this is for grouped convolution; we can ignore groups
for (int n = 0; n < num_; ++n) {
// Since we saved memory in the forward pass by not storing all col
// data, we will need to recompute them.
im2col_gpu(bottom_data + (*bottom)[i]->offset(n), channels_, height_,
width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
stride_h_, stride_w_, col_data);
// gradient w.r.t. weight. Note that we will accumulate diffs.
for (int g = 0; g < group_; ++g) {
caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasTrans, M_, K_, N_,
(Dtype)1., top_diff + top[i]->offset(n) + top_offset * g,
col_data + col_offset * g, (Dtype)1.,
weight_diff + weight_offset * g);
== (see https://github.com/BVLC/caffe/blob/master/src/caffe/util/math_functions.cu#L16)
cublasSgemm(CUBLAS_OP_T, CUBLAS_OP_N, K_, M_, N_,
1.0,
col_data + col_offset * g, N_,
top_diff + top[i]->offset(n) + top_offset * g, N_,
1.0,
weight_diff + weight_offset * g, K_);
}
}
*/
}
else if (direction == 2) { // backprop wrt. inputs
output = bottom;
// bottom is set to zero here rather than inside of col2im
PyArray_FILLWBYTE(bottom, 0);
// full convolution: gemm, then col2im
// Iterate over batch
int blas_threads_saved = %(blas_get_num_threads)s;
// Always forcing gemm to one thread when OpenMP is enabled for best and stable performance.
%(blas_set_num_threads)s(1);
%(omp_flags)s
for (int n = 0; n < batchSize; ++n) {
int tid = %(omp_get_thread_num)s;
if (unshared) {
for (int g = 0; g < numgroups; ++g){
for (int reg = 0; reg < N_; ++reg){
%(gemm)s(&NTrans, &Trans,
&one_int, &K_, &M_,
&one,
(%(float_type)s*)PyArray_DATA(top) + g * group_top_stride + n * batch_top_stride + reg, &N_,
(%(float_type)s*)PyArray_DATA(weight) + g * group_weight_stride + reg * K_, &ldw,
&zero,
(%(float_type)s*)PyArray_DATA(col) + tid * col_stride + g * group_col_stride + reg, &N_);
}
}
}
else {
for (int g = 0; g < numgroups; ++g) {
%(gemm)s(&NTrans, &Trans,
&N_, &K_, &M_,
&one,
(%(float_type)s*)PyArray_DATA(top) + g * group_top_stride + n * batch_top_stride, &N_,
(%(float_type)s*)PyArray_DATA(weight) + g * group_weight_stride, &K_,
&zero,
(%(float_type)s*)PyArray_DATA(col) + tid * col_stride + g * group_col_stride, &N_);
}
}
// col2im back to the data
col2im((%(float_type)s*)PyArray_DATA(col) + tid * col_stride, nChannels, bottomHeight, bottomWidth,
kH, kW, dilH, dilW, padH_l, padH_r, padW_l, padW_r,
dH, dW, (%(float_type)s*)PyArray_DATA(bottom) + n * batch_bottom_stride);
}
// Restore to previous blas threads
%(blas_set_num_threads)s(blas_threads_saved);
/*
// Original caffe code for comparison
// Note that this code was translated from the PyTensor GPU code,
// not the Caffe CPU code.
// https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
for (int n = 0; n < num_; ++n) {
// gradient w.r.t. bottom data, if necessary
if (propagate_down[i]) {
for (int g = 0; g < group_; ++g) {
caffe_gpu_gemm<Dtype>(CblasTrans, CblasNoTrans, K_, N_, M_,
(Dtype)1., weight + weight_offset * g,
top_diff + top[i]->offset(n) + top_offset * g,
(Dtype)0., col_diff + col_offset * g);
== (see https://github.com/BVLC/caffe/blob/master/src/caffe/util/math_functions.cu#L16)
cublasSgemm(CUBLAS_OP_N, CUBLAS_OP_T, N_, K_, M_,
1.,
top_diff + top[i]->offset(n) + top_offset * g, N_,
weight + weight_offset * g, K_,
0.,
col_diff + col_offset * g, N_);
}
// col2im back to the data
col2im_gpu(col_diff, channels_, height_, width_,
kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_,
bottom_diff + (*bottom)[i]->offset(n));
}
}
*/
}
// Free temporary columns
Py_DECREF(col);
// decref from contiguous check
Py_DECREF(bottom);
Py_DECREF(weight);
Py_DECREF(top);
// Note that we don't change the refcount of the output matrix here. Output
// (re)allocation and refcounting is done in BaseCorrMM.c_code_helper();
// in here output is just aliased to one of bottom, weights, or top.
return output;
}
import logging
from pathlib import Path
import pytensor
from pytensor.configdefaults import config
from pytensor.graph.basic import Apply
from pytensor.graph.op import _NoPythonOp
from pytensor.link.c.op import OpenMPOp
from pytensor.link.c.params_type import ParamsType
from pytensor.link.c.type import EnumList
from pytensor.scalar import int64
from pytensor.tensor import blas_headers
from pytensor.tensor.basic import as_tensor_variable
from pytensor.tensor.blas import blas_header_version, ldflags
from pytensor.tensor.conv.abstract_conv import get_conv_output_shape
from pytensor.tensor.type import TensorType
# Module-level logger, named after this module for targeted log filtering.
_logger = logging.getLogger(__name__)
# Directory containing the C implementation templates (e.g. "corr3d_gemm.c")
# that are read and %-substituted in c_support_code_apply below.
C_CODE_PATH = Path(__file__).parent / "c_code"
class BaseCorr3dMM(OpenMPOp, _NoPythonOp):
    """
    Base class for `Corr3dMM`, `Corr3dMM_gradWeights` and
    `Corr3dMM_gradInputs`. Cannot be used directly.

    Every sub-class must define internal attribute ``_direction`` out of __init__().
    ``_direction`` must take one of following values:

    - "forward" to correlate bottom with weights and store results in top.
    - "backprop weights" to do a valid convolution of bottom with top
      (swapping the first two dimensions) and store results in weights.
    - "backprop inputs" to do a full convolution of top with weights
      (swapping the first two dimensions) and store results in bottom.

    Parameters
    ----------
    border_mode : {'valid', 'full', 'half'}
        Additionally, the padding size could be directly specified by an integer
        or a tuple of three of integers
    subsample
        Perform subsampling of the output (default: (1, 1, 1)).
    filter_dilation
        Perform dilated correlation (default: (1, 1, 1))
    num_groups
        Perform grouped convolutions (default: 1)
    """

    check_broadcast = False
    __props__ = ("border_mode", "subsample", "filter_dilation", "num_groups")
    _direction: str | None = None

    # Runtime parameters passed to the C code as a single params struct.
    params_type = ParamsType(
        direction=EnumList(
            ("DIRECTION_FORWARD", "forward"),  # 0
            ("DIRECTION_BACKPROP_WEIGHTS", "backprop weights"),  # 1
            ("DIRECTION_BACKPROP_INPUTS", "backprop inputs"),
        ),  # 2
        dH=int64,
        dW=int64,
        dD=int64,
        dilH=int64,
        dilW=int64,
        dilD=int64,
        padH=int64,
        padW=int64,
        padD=int64,
        num_groups=int64,
    )

    def __init__(
        self,
        border_mode="valid",
        subsample=(1, 1, 1),
        filter_dilation=(1, 1, 1),
        openmp=None,
        num_groups=1,
    ):
        super().__init__(openmp=openmp)
        # Normalize border_mode: an int becomes symmetric 3-tuple padding.
        if isinstance(border_mode, int):
            if border_mode < 0:
                raise ValueError(
                    f"invalid border_mode {border_mode}, which must be a "
                    "non-negative integer"
                )
            border_mode = (border_mode, border_mode, border_mode)
        if isinstance(border_mode, tuple):
            if len(border_mode) != 3 or min(border_mode) < 0:
                raise ValueError(
                    f"invalid border_mode {border_mode}, which must be a tuple of "
                    "three non-negative integers"
                )
            pad_h, pad_w, pad_d = map(int, border_mode)
            border_mode = (pad_h, pad_w, pad_d)
        if not (
            (isinstance(border_mode, tuple) and min(border_mode) >= 0)
            or border_mode in ("valid", "full", "half")
        ):
            raise ValueError(
                f"invalid border_mode {border_mode}, which must be either "
                '"valid", "full", "half", an integer or a tuple of three'
                " integers"
            )
        self.border_mode = border_mode
        if len(subsample) != 3:
            raise ValueError("subsample must have three elements")
        if len(filter_dilation) != 3:
            raise ValueError("filter_dilation must have three elements")
        self.subsample = tuple(subsample)
        self.filter_dilation = tuple(filter_dilation)
        if num_groups < 1:
            raise ValueError("Number of groups should be greater than 0")
        self.num_groups = num_groups
        # Detect the BLAS flavor so the C code can pin its thread count
        # around the GEMM calls (see c_support_code_apply).
        if not config.blas__ldflags:
            # PyTensor will use a NumPy C implementation of [sd]gemm_ instead.
            self.blas_type = ""
        else:
            if "openblas" in config.blas__ldflags:
                self.blas_type = "openblas"
            elif "mkl" in config.blas__ldflags:
                self.blas_type = "mkl"
            else:
                self.blas_type = ""
        if self._direction not in ("forward", "backprop weights", "backprop inputs"):
            raise ValueError(
                "_direction must be one of 'forward', "
                "'backprop weights', 'backprop inputs'"
            )

    @property
    def pad(self):
        # Sentinel encoding used by the C code: -1 means "half", -2 "full".
        if self.border_mode == "half":
            return (-1, -1, -1)
        elif self.border_mode == "full":
            return (-2, -2, -2)
        elif isinstance(self.border_mode, tuple):
            return self.border_mode
        else:
            assert self.border_mode == "valid"
            return (0, 0, 0)

    # Direction should be converted to real enum value,
    # as it is compared to integer later in c_code_helper().
    direction = property(lambda self: self.params_type.enum_from_alias(self._direction))

    dH = property(lambda self: self.subsample[0])
    dW = property(lambda self: self.subsample[1])
    dD = property(lambda self: self.subsample[2])
    dilH = property(lambda self: self.filter_dilation[0])
    dilW = property(lambda self: self.filter_dilation[1])
    dilD = property(lambda self: self.filter_dilation[2])
    padH = property(lambda self: self.pad[0])
    padW = property(lambda self: self.pad[1])
    padD = property(lambda self: self.pad[2])

    def __str__(self):
        return f"{self.__class__.__name__}{{{self.border_mode}, {self.subsample!s}, {self.filter_dilation!s}, {self.num_groups!s}}}"

    @staticmethod
    def as_common_dtype(in1, in2):
        """
        Upcast input variables if necessary.
        """
        dtype = pytensor.scalar.upcast(in1.dtype, in2.dtype)
        return in1.astype(dtype), in2.astype(dtype)

    def __setstate__(self, d):
        # Backwards compatibility for pickles created before num_groups existed.
        self.__dict__.update(d)
        if not hasattr(self, "num_groups"):
            self.num_groups = 1

    def c_support_code(self, **kwargs):
        ccodes = blas_headers.blas_header_text()
        if self.blas_type == "openblas":
            ccodes += blas_headers.openblas_threads_text()
        elif self.blas_type == "mkl":
            ccodes += blas_headers.mkl_threads_text()
        return ccodes

    def c_libraries(self, **kwargs):
        return ldflags()

    def c_compile_args(self, **kwargs):
        compile_args = ldflags(libs=False, flags=True)
        compile_args += super().c_compile_args(**kwargs)
        return compile_args

    def c_lib_dirs(self, **kwargs):
        return ldflags(libs=False, libs_dir=True)

    def c_header_dirs(self, **kwargs):
        return ldflags(libs=False, include_dir=True)

    def c_headers(self, **kwargs):
        headers = ["<stdio.h>"]
        headers += super().c_headers(**kwargs)
        return headers

    def c_code_cache_version(self):
        # raise this whenever modifying any of the support_code_files
        # (bumped 8 -> 9: fixed the 4-D/5-D ndim check in c_code_helper)
        return (9, self.openmp, blas_header_version())

    def c_support_code_apply(self, node, nodename):
        # REMEMBER TO RAISE c_code_cache_version when changing any of
        # these files
        sub = {}
        dtype = str(node.__dict__["inputs"][0].dtype)
        assert dtype in ("float32", "float64")
        if dtype == "float32":
            sub["gemm"] = "sgemm_"
            sub["float_type"] = "npy_float"
            sub["float_typenum"] = "NPY_FLOAT"
            sub["n_bytes"] = 4
            sub["c_float_type"] = "float"
        else:
            sub["gemm"] = "dgemm_"
            sub["float_type"] = "npy_double"
            sub["float_typenum"] = "NPY_DOUBLE"
            sub["n_bytes"] = 8
            sub["c_float_type"] = "double"

        if self.openmp:
            sub["omp_flags"] = "#pragma omp parallel for schedule(static)"
            sub["omp_get_max_threads"] = "omp_get_max_threads()"
            sub["omp_get_thread_num"] = "omp_get_thread_num()"

            if self.blas_type == "openblas":
                sub["blas_set_num_threads"] = "openblas_set_num_threads"
                sub["blas_get_num_threads"] = "openblas_get_num_threads()"
            elif self.blas_type == "mkl":
                sub["blas_set_num_threads"] = "mkl_set_num_threads"
                sub["blas_get_num_threads"] = "mkl_get_max_threads()"
            else:
                sub["blas_set_num_threads"] = ""
                sub["blas_get_num_threads"] = "0"
        else:
            sub["omp_flags"] = ""
            sub["omp_get_max_threads"] = "1"
            sub["omp_get_thread_num"] = "0"
            sub["blas_set_num_threads"] = ""
            sub["blas_get_num_threads"] = "0"

        final_code = Path(C_CODE_PATH / "corr3d_gemm.c").read_text("utf-8")
        return final_code % sub

    def c_code_helper(
        self, bottom, weights, top, sub, height=None, width=None, depth=None
    ):
        """
        This generates the C code for Corr3dMM (direction="forward"),
        Corr3dMM_gradWeights (direction="backprop weights"), and
        Corr3dMM_gradInputs (direction="backprop inputs").
        Depending on the direction, one of bottom, weights, top will
        receive the output, while the other two serve as inputs.

        :param bottom: Variable name of the input images in the forward pass,
            or the gradient of the input images in backprop wrt. inputs
        :param weights: Variable name of the filters in the forward pass,
            or the gradient of the filters in backprop wrt. weights
        :param top: Variable name of the output images / feature maps in the
            forward pass, or the gradient of the outputs in the backprop passes
        :param sub: Dictionary of substitutions usable to help generating the
            C code.
        :param height: If self.subsample[0] != 1, a variable giving the height
            of the filters for direction="backprop weights" or the height of
            the input images for direction="backprop inputs".
            If self.border_mode == 'half', a variable giving the height of the
            filters for direction="backprop weights". Ignored otherwise.
        :param width: If self.subsample[1] != 1, a variable giving the width
            of the filters for direction="backprop weights" or the width of the
            input images for direction="backprop inputs".
            If self.border_mode == 'half', a variable giving the width of the
            filters for direction="backprop weights". Ignored otherwise.
        :param depth: If self.subsample[1] != 1, a variable giving the depth
            of the filters for direction="backprop weights" or the depth of the
            input images for direction="backprop inputs".
            If self.border_mode == 'half', a variable giving the depth of the
            filters for direction="backprop weights". Ignored otherwise.
        """
        # When subsampling, we cannot unambiguously infer the height and width
        # of bottom and weights from top, so we require them to be given.
        # Similarly, when border_mode="half", we cannot infer the weight size.
        if height:
            height = f"(*(npy_int64 *)(PyArray_DATA({height})))"
        else:
            if ((self.direction != 0) and (self.dH != 1)) or (
                (self.direction == 1) and (self.padH == -1)
            ):
                raise ValueError(
                    "height must be given for backprop with vertical sampling or border_mode='half'"
                )
            height = "-1"
        if width:
            width = f"(*(npy_int64 *)(PyArray_DATA({width})))"
        else:
            if ((self.direction != 0) and (self.dW != 1)) or (
                (self.direction == 1) and (self.padW == -1)
            ):
                raise ValueError(
                    "width must be given for backprop with horizontal sampling or border_mode='half'"
                )
            width = "-1"
        if depth:
            depth = f"(*(npy_int64 *)(PyArray_DATA({depth})))"
        else:
            if ((self.direction != 0) and (self.dD != 1)) or (
                (self.direction == 1) and (self.padD == -1)
            ):
                raise ValueError(
                    "depth must be given for backprop with depth sampling or border_mode='half'"
                )
            depth = "-1"

        fail = sub["fail"]
        params = sub["params"]

        return f"""
    // Mandatory args
    int direction = {params}->direction;  // forward, bprop weights, bprop inputs

    // Optional args
    int dH = {params}->dH;
    int dW = {params}->dW;
    int dD = {params}->dD;
    int dilH = {params}->dilH;
    int dilW = {params}->dilW;
    int dilD = {params}->dilD;
    int padH = {params}->padH;
    int padW = {params}->padW;
    int padD = {params}->padD;
    int numgroups = {params}->num_groups;

    PyArrayObject * bottom = {bottom};
    PyArrayObject * weights = {weights};
    PyArrayObject * top = {top};
    PyArrayObject * out2 = NULL;
    PyArrayObject **out = NULL;

    switch({params}->direction) {{
        case DIRECTION_FORWARD:
            out = &{top};
            break;
        case DIRECTION_BACKPROP_WEIGHTS:
            out = &{weights};
            break;
        case DIRECTION_BACKPROP_INPUTS:
            out = &{bottom};
            break;
        default:
            PyErr_SetString(PyExc_ValueError, "CPU Corr3dMM: Invalid direction.");
            {{{fail}}}
            break;
    }}

    // Obtain or infer kernel width, height and depth
    // (we need to know it early to be able to handle auto-padding)
    int kH, kW, kD, dil_kH, dil_kW, dil_kD;
    if (direction != 1) {{
        // weight is an input variable, we can just read its shape
        kH = PyArray_DIMS(weights)[2];
        kW = PyArray_DIMS(weights)[3];
        kD = PyArray_DIMS(weights)[4];
    }}
    else {{
        if ({height} != -1) {{
            // kernel height is specified (perhaps vertical subsampling or half padding)
            kH = {height};
        }}
        else if (padH == -2) {{
            // vertical full padding, we can infer the kernel height
            kH = (2 - PyArray_DIMS(bottom)[2] + (PyArray_DIMS(top)[2] - 1) * dH - 1)/ dilH + 1;
        }}
        else {{
            // explicit padding, we can infer the kernel height
            kH = (PyArray_DIMS(bottom)[2] + 2*padH - (PyArray_DIMS(top)[2] - 1) * dH - 1) / dilH +1;
        }}
        if ({width} != -1) {{
            kW = {width};
        }}
        else if (padW == -2) {{
            kW = (2 - PyArray_DIMS(bottom)[3] + (PyArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
        }}
        else {{
            kW = (PyArray_DIMS(bottom)[3] + 2*padW - (PyArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
        }}
        if ({depth} != -1) {{
            kD = {depth};
        }}
        else if (padD == -2) {{
            kD = (2 - PyArray_DIMS(bottom)[4] + (PyArray_DIMS(top)[4] - 1) * dD - 1) / dilD + 1;
        }}
        else {{
            kD = (PyArray_DIMS(bottom)[4] + 2*padD - (PyArray_DIMS(top)[4] - 1) * dD - 1) / dilD + 1;
        }}
    }}

    // Implicit dilated kernel size
    dil_kH = (kH - 1) * dilH + 1;
    dil_kW = (kW - 1) * dilW + 1;
    dil_kD = (kD - 1) * dilD + 1;

    // Auto-padding if requested
    if (padH == -1) {{  // vertical half padding
        padH = dil_kH / 2;
    }}
    else if (padH == -2) {{  // vertical full padding
        padH = dil_kH - 1;
    }}
    else if (padH < 0) {{
        PyErr_SetString(PyExc_ValueError, "BaseCorr3dMM: padH must be >= -2");
        {fail}
    }}
    if (padW == -1) {{  // horizontal half padding
        padW = dil_kW / 2;
    }}
    else if (padW == -2) {{  // horizontal full padding
        padW = dil_kW - 1;
    }}
    else if (padW < 0) {{
        PyErr_SetString(PyExc_ValueError, "BaseCorr3dMM: padW must be >= -2");
        {fail}
    }}
    if (padD == -1) {{  // depth half padding
        padD = dil_kD / 2;
    }}
    else if (padD == -2) {{  // depth full padding
        padD = dil_kD - 1;
    }}
    else if (padD < 0) {{
        PyErr_SetString(PyExc_ValueError, "BaseCorr3dMM: padD must be >= -2");
        {fail}
    }}

    // Infer output shape
    npy_intp out_dim[5];
    switch(direction) {{
    case 0:  // forward pass
        // output is top: (batchsize, num_filters, height, width, depth)
        // height and width: top = (bottom + 2*pad - ((weight-1)*dil + 1)) / sample + 1
        out_dim[0] = (npy_intp)PyArray_DIMS(bottom)[0];
        out_dim[1] = (npy_intp)PyArray_DIMS(weights)[0];
        out_dim[2] = (npy_intp)((PyArray_DIMS(bottom)[2] + 2*padH - ((PyArray_DIMS(weights)[2]-1)*dilH + 1)) / dH + 1);
        out_dim[3] = (npy_intp)((PyArray_DIMS(bottom)[3] + 2*padW - ((PyArray_DIMS(weights)[3]-1)*dilW + 1)) / dW + 1);
        out_dim[4] = (npy_intp)((PyArray_DIMS(bottom)[4] + 2*padD - ((PyArray_DIMS(weights)[4]-1)*dilD + 1)) / dD + 1);
        if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0 || out_dim[4] <= 0)
        {{
            PyErr_Format(PyExc_ValueError,
                         "Corr3dMM: impossible output shape\\n"
                         "  bottom shape: %ld x %ld x %ld x %ld x %ld\\n"
                         "  weights shape: %ld x %ld x %ld x %ld x %ld\\n"
                         "  top shape: %ld x %ld x %ld x %ld x %ld\\n",
                         (long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1],
                         (long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3],
                         (long int)PyArray_DIMS(bottom)[4],
                         (long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1],
                         (long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3],
                         (long int)PyArray_DIMS(weights)[4],
                         (long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
                         (long int)out_dim[3], (long int)out_dim[4]);
            {fail}
        }}
        break;
    case 1:  // backprop wrt. weights
        // output is weights: (num_filters, num_channels, height, width, depth)
        // height and width: weights = (bottom + 2*pad - (top - 1) * sample - 1) / dil + 1
        out_dim[0] = (npy_intp)PyArray_DIMS(top)[1];
        out_dim[1] = (npy_intp)PyArray_DIMS(bottom)[1] / numgroups;
        out_dim[2] = (npy_intp)kH;  // already inferred further above
        out_dim[3] = (npy_intp)kW;  // how convenient
        out_dim[4] = (npy_intp)kD;
        if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0 || out_dim[4] <= 0)
        {{
            PyErr_Format(PyExc_ValueError,
                         "Corr3dMM backprop wrt. weights: impossible output shape\\n"
                         "  bottom shape: %ld x %ld x %ld x %ld x %ld\\n"
                         "  weights shape: %ld x %ld x %ld x %ld x %ld\\n"
                         "  top shape: %ld x %ld x %ld x %ld x %ld\\n",
                         (long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1],
                         (long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3],
                         (long int)PyArray_DIMS(bottom)[4],
                         (long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
                         (long int)out_dim[3], (long int)out_dim[4],
                         (long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1],
                         (long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3],
                         (long int)PyArray_DIMS(top)[4]);
            {fail}
        }}
        break;
    case 2:  // backprop wrt. inputs
        // output is bottom: (batchsize, num_channels, height, width, depth)
        // height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
        out_dim[0] = (npy_intp)PyArray_DIMS(top)[0];
        out_dim[1] = (npy_intp)PyArray_DIMS(weights)[1] * numgroups;
        out_dim[2] = (npy_intp)(({height} != -1) ? {height} : (PyArray_DIMS(top)[2] - 1) * dH + (PyArray_DIMS(weights)[2]-1)*dilH + 1 - 2*padH);
        out_dim[3] = (npy_intp)(({width} != -1) ? {width} : (PyArray_DIMS(top)[3] - 1) * dW + (PyArray_DIMS(weights)[3]-1)*dilW + 1 - 2*padW);
        out_dim[4] = (npy_intp)(({depth} != -1) ? {depth} : (PyArray_DIMS(top)[4] - 1) * dD + (PyArray_DIMS(weights)[4]-1)*dilD + 1 - 2*padD);
        if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0 || out_dim[4] <= 0)
        {{
            PyErr_Format(PyExc_ValueError,
                         "Corr3dMM backprop wrt. inputs: impossible output shape\\n"
                         "  bottom shape: %ld x %ld x %ld x %ld x %ld\\n"
                         "  weights shape: %ld x %ld x %ld x %ld x %ld\\n"
                         "  top shape: %ld x %ld x %ld x %ld x %ld\\n",
                         (long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
                         (long int)out_dim[3], (long int)out_dim[4],
                         (long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1],
                         (long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3],
                         (long int)PyArray_DIMS(weights)[4],
                         (long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1],
                         (long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3],
                         (long int)PyArray_DIMS(top)[4]);
            {fail}
        }}
        break;
    default:
        PyErr_SetString(PyExc_ValueError, "BaseCorr3dMM: direction must be 0, 1, or 2\\n");
        {fail}
    }}

    // Prepare output array
    int typenum;
    // BUGFIX: the output of this op is always 5-D; the reuse check previously
    // tested NDIM==4, which could never match a valid 5-D output and would
    // read PyArray_DIMS(*out)[4] out of bounds on a genuinely 4-D array.
    if ( !(*out
           && PyArray_NDIM(*out)==5
           && PyArray_IS_C_CONTIGUOUS(*out)
           && PyArray_DIMS(*out)[0]==out_dim[0]
           && PyArray_DIMS(*out)[1]==out_dim[1]
           && PyArray_DIMS(*out)[2]==out_dim[2]
           && PyArray_DIMS(*out)[3]==out_dim[3]
           && PyArray_DIMS(*out)[4]==out_dim[4]))
    {{
        Py_XDECREF(*out);
        if (direction != 1) {{
          typenum = PyArray_TYPE(weights);
        }}
        else {{
          typenum = PyArray_TYPE(bottom);
        }}
        //Change to PyArray_ZEROS which is faster than PyArray_EMPTY.
        *out = (PyArrayObject*)PyArray_ZEROS(5,
                                          out_dim,
                                          typenum,
                                          0);
        if (NULL == *out)
        {{
            PyErr_Format(PyExc_RuntimeError,
                    "BaseCorr3dMM: Failed to allocate output of %lld x %lld x %lld x %lld x %lld",
                    (long long)out_dim[0], (long long)out_dim[1],
                    (long long)out_dim[2], (long long)out_dim[3], (long long)out_dim[4]);
            {fail}
        }}
    }}

    // Call corr3dMM code
    out2 = corr3dMM({bottom}, {weights}, {top}, direction,
                    dH, dW, dD, dilH, dilW, dilD, padH, padW, padD,
                    numgroups);
    if (out2==NULL){{
       {fail}
    }}
    assert (out2 == *out);
"""
class Corr3dMM(BaseCorr3dMM):
    """
    CPU 3D correlation implemented with matrix multiplication (im2col + GEMM).

    Parameters
    ----------
    border_mode
        The width of a border of implicit zeros to pad the
        input with. Must be a tuple with 3 elements giving the width of
        the padding on each side, or a single integer to pad the same
        on all sides, or a string shortcut setting the padding at runtime:
        ``'valid'`` for ``(0, 0, 0)`` (valid convolution, no padding), ``'full'``
        for ``(kernel_rows - 1, kernel_columns - 1, kernel_depth - 1)``
        (full convolution), ``'half'`` for ``(kernel_rows // 2,
        kernel_columns // 2, kernel_depth // 2)`` (same convolution for
        odd-sized kernels). Note that the three widths are each
        applied twice, once per side (left and right, top and bottom, front
        and back).
    subsample
        The subsample operation applied to each output image. Should be a tuple
        with 3 elements. Set to `(1, 1, 1)` to disable subsampling.
    filter_dilation
        The filter dilation operation applied to each input image.
        Should be a tuple with 3 elements.
        Set to `(1, 1, 1)` to disable filter dilation.
    num_groups
        Perform grouped convolutions (default: 1)
    """

    _direction = "forward"

    def make_node(self, img, kern):
        # Coerce both operands to tensor variables on a common dtype.
        img = as_tensor_variable(img)
        kern = as_tensor_variable(kern)
        img, kern = self.as_common_dtype(img, kern)
        if img.type.ndim != 5:
            raise TypeError("img must be 5D tensor")
        if kern.type.ndim != 5:
            raise TypeError("kern must be 5D tensor")
        # Static shape: batch and filter dims are known only when broadcastable;
        # the three spatial dims are never known symbolically here.
        batch_dim = 1 if img.type.shape[0] == 1 else None
        filter_dim = 1 if kern.type.shape[0] == 1 else None
        out_type = TensorType(
            img.type.dtype, shape=[batch_dim, filter_dim, None, None, None]
        )
        return Apply(self, [img, kern], [out_type()])

    def infer_shape(self, fgraph, node, input_shape):
        # Delegate the spatial arithmetic to the shared conv-shape helper.
        image_shape, kernel_shape = input_shape
        out_shape = get_conv_output_shape(
            image_shape,
            kernel_shape,
            self.border_mode,
            self.subsample,
            self.filter_dilation,
        )
        return [out_shape]

    def c_code(self, node, nodename, inp, out_, sub):
        # direction="forward": bottom and weights are inputs, top is the output.
        bottom, weights = inp
        (top,) = out_
        return super().c_code_helper(bottom, weights, top, sub)

    def grad(self, inp, grads):
        bottom, weights = inp
        (top,) = grads
        # Each gradient is computed by the matching backprop Op, configured
        # identically to this forward Op.
        grad_inputs_op = Corr3dMMGradInputs(
            self.border_mode,
            self.subsample,
            self.filter_dilation,
            num_groups=self.num_groups,
        )
        grad_weights_op = Corr3dMMGradWeights(
            self.border_mode,
            self.subsample,
            self.filter_dilation,
            num_groups=self.num_groups,
        )
        d_bottom = grad_inputs_op(weights, top, bottom.shape[-3:])
        d_weights = grad_weights_op(bottom, top, weights.shape[-3:])
        return d_bottom, d_weights
class Corr3dMMGradWeights(BaseCorr3dMM):
    """
    Gradient wrt. filters for `Corr3dMM`.

    Notes
    -----
    You will not want to use this directly, but rely on
    PyTensor's automatic differentiation or graph optimization to
    use it as needed.
    """

    _direction = "backprop weights"

    def make_node(self, img, topgrad, shape=None):
        img = as_tensor_variable(img)
        topgrad = as_tensor_variable(topgrad)
        img, topgrad = self.as_common_dtype(img, topgrad)
        if img.type.ndim != 5:
            raise TypeError("img must be 5D tensor")
        if topgrad.type.ndim != 5:
            raise TypeError("topgrad must be 5D tensor")
        if shape is None:
            # Without subsampling or half padding, the kernel shape is
            # recoverable from img/topgrad, so no explicit shape is needed.
            if self.subsample != (1, 1, 1) or self.border_mode == "half":
                raise ValueError(
                    "shape must be given if subsample != (1, 1, 1)"
                    ' or border_mode == "half"'
                )
            height_width_depth = []
        else:
            height_width_depth = [
                as_tensor_variable(shape[axis]).astype("int64") for axis in range(3)
            ]
        # Output static shape: (num_filters, channels, kH, kW, kD); only the
        # first two can be pinned (when the corresponding input dim is 1).
        nkern_dim = 1 if topgrad.type.shape[1] == 1 else None
        channel_dim = 1 if img.type.shape[1] == 1 else None
        out_type = TensorType(
            img.type.dtype, shape=[nkern_dim, channel_dim, None, None, None]
        )
        return Apply(self, [img, topgrad, *height_width_depth], [out_type()])

    def infer_shape(self, fgraph, node, input_shape):
        # Decode the border mode into per-axis pad sentinels
        # (-1 = half, -2 = full, >= 0 = explicit).
        mode = self.border_mode
        if mode == "half":
            pads = (-1, -1, -1)
        elif mode == "full":
            pads = (-2, -2, -2)
        elif isinstance(mode, tuple):
            pads = mode
        else:
            assert mode == "valid"
            pads = (0, 0, 0)

        imshp, topshp = input_shape
        channels = imshp[1] // self.num_groups
        nkern = topshp[1]
        im_spatial = list(imshp[2:])
        top_spatial = list(topshp[2:])
        shape_vars = node.inputs[-3:]

        kernel_spatial = []
        for axis in range(3):
            stride = self.subsample[axis]
            pad = pads[axis]
            if stride != 1 or pad == -1:
                # Subsampling or half padding: the kernel size was passed in.
                kernel_spatial.append(shape_vars[axis])
            elif pad == -2:
                # Full padding: invert the full-conv output-size formula.
                kernel_spatial.append(
                    2 - im_spatial[axis] + (top_spatial[axis] - 1) * stride
                )
            else:
                # Explicit padding: invert the valid-conv output-size formula.
                kernel_spatial.append(
                    im_spatial[axis] + 2 * pad - (top_spatial[axis] - 1) * stride
                )
        return [(nkern, channels, *kernel_spatial)]

    def c_code(self, node, nodename, inp, out_, sub):
        # Inputs may or may not carry the three kernel-size scalars.
        bottom, top, *size_vars = inp
        if not size_vars:
            size_vars = [None, None, None]
        (weights,) = out_
        return super().c_code_helper(bottom, weights, top, sub, *size_vars)

    def grad(self, inp, grads):
        bottom, top = inp[:2]
        (weights,) = grads
        d_bottom = Corr3dMMGradInputs(
            self.border_mode,
            self.subsample,
            self.filter_dilation,
            num_groups=self.num_groups,
        )(weights, top, bottom.shape[-3:])
        d_top = Corr3dMM(
            self.border_mode,
            self.subsample,
            self.filter_dilation,
            num_groups=self.num_groups,
        )(bottom, weights)
        if len(inp) == 5:
            # The three shape scalars are not differentiable.
            d_height_width_depth = (pytensor.gradient.DisconnectedType()(),) * 3
        else:
            d_height_width_depth = ()
        return (d_bottom, d_top, *d_height_width_depth)

    def connection_pattern(self, node):
        # height/width/depth (when present) are shape hints: disconnected.
        if node.nin == 2:
            return [[1], [1]]
        return [[1], [1], [0], [0], [0]]
class Corr3dMMGradInputs(BaseCorr3dMM):
    """
    Gradient wrt. inputs for `Corr3dMM`.
    Notes
    -----
    You will not want to use this directly, but rely on
    PyTensor's automatic differentiation or graph optimization to
    use it as needed.
    """
    _direction = "backprop inputs"
    def make_node(self, kern, topgrad, shape=None):
        """Build the Apply node computing d(bottom) from ``kern`` and ``topgrad``.

        Parameters
        ----------
        kern : 5D tensor
            Filters of the forward correlation.
        topgrad : 5D tensor
            Gradient wrt. the forward output.
        shape : sequence of three int scalars, optional
            Spatial shape (H, W, D) of the input images; mandatory when
            subsampling, since it cannot be inferred from ``topgrad``.
        """
        kern = as_tensor_variable(kern)
        topgrad = as_tensor_variable(topgrad)
        kern, topgrad = self.as_common_dtype(kern, topgrad)
        if kern.type.ndim != 5:
            raise TypeError("kern must be 5D tensor")
        if topgrad.type.ndim != 5:
            raise TypeError("topgrad must be 5D tensor")
        if shape is None:
            if self.subsample != (1, 1, 1):
                raise ValueError("shape must be given if subsample != (1, 1, 1)")
            height_width_depth = []
        else:
            height_width_depth = [
                as_tensor_variable(shape[0]).astype("int64"),
                as_tensor_variable(shape[1]).astype("int64"),
                as_tensor_variable(shape[2]).astype("int64"),
            ]
        if self.num_groups > 1:
            # With grouped convolution, the output channel count cannot be
            # read off a single input; leave it unknown.
            out_shape = [
                1 if topgrad.type.shape[0] == 1 else None,
                None,
                None,
                None,
                None,
            ]
        else:
            # Bug fix: the spatial entries here used to be ``False`` — a
            # leftover from the old ``broadcastable`` flags.  In a static
            # ``shape`` spec, ``False`` is coerced with ``int()`` and means a
            # dimension of size 0.  Use ``None`` ("unknown") like the 2D
            # ``CorrMM_gradInputs`` does.
            out_shape = [
                1 if topgrad.type.shape[0] == 1 else None,
                1 if kern.type.shape[1] == 1 else None,
                None,
                None,
                None,
            ]
        dtype = kern.type.dtype
        return Apply(
            self,
            [kern, topgrad, *height_width_depth],
            [TensorType(dtype, shape=out_shape)()],
        )
    def infer_shape(self, fgraph, node, input_shape):
        """Symbolically compute the output (bottom) shape.

        Negative pad sentinels: -1 means "half" padding, -2 means "full"
        padding; both are resolved from the kernel shape below.
        """
        if self.border_mode == "half":
            padH = padW = padD = -1
        elif self.border_mode == "full":
            padH = padW = padD = -2
        elif isinstance(self.border_mode, tuple):
            padH, padW, padD = self.border_mode
        else:
            assert self.border_mode == "valid"
            padH = padW = padD = 0
        dH, dW, dD = self.subsample
        kshp = input_shape[0]
        topshp = input_shape[1]
        ssize, kshp = kshp[1], list(kshp[2:])
        # Input channels = kernel channels * number of groups.
        ssize = ssize * self.num_groups
        bsize, topshp = topshp[0], list(topshp[2:])
        height_width_depth = node.inputs[-3:]
        if padH == -1:
            padH = kshp[0] // 2
        elif padH == -2:
            padH = kshp[0] - 1
        elif padH < -2:
            raise ValueError("Corr3dMM_gradInputs: border_mode must be >= 0.")
        if padW == -1:
            padW = kshp[1] // 2
        elif padW == -2:
            padW = kshp[1] - 1
        elif padW < -2:
            raise ValueError("Corr3dMM_gradInputs: border_mode must be >= 0.")
        if padD == -1:
            padD = kshp[2] // 2
        elif padD == -2:
            padD = kshp[2] - 1
        elif padD < -2:
            raise ValueError("Corr3dMM_gradInputs: border_mode must be >= 0.")
        # When subsampling, the input size is ambiguous and must come from
        # the explicit shape inputs; otherwise invert the forward formula.
        if dH != 1:
            out_shp0 = height_width_depth[0]
        else:
            out_shp0 = (topshp[0] - 1) * dH + kshp[0] - 2 * padH
        if dW != 1:
            out_shp1 = height_width_depth[1]
        else:
            out_shp1 = (topshp[1] - 1) * dW + kshp[1] - 2 * padW
        if dD != 1:
            out_shp2 = height_width_depth[2]
        else:
            out_shp2 = (topshp[2] - 1) * dD + kshp[2] - 2 * padD
        out_shp = (out_shp0, out_shp1, out_shp2)
        return [(bsize, ssize, *out_shp)]
    def c_code(self, node, nodename, inp, out_, sub):
        """Generate the C code computing the input gradient via the shared helper."""
        weights, top = inp[:2]
        height, width, depth = inp[2:] or (None, None, None)
        (bottom,) = out_
        return super().c_code_helper(bottom, weights, top, sub, height, width, depth)
    def grad(self, inp, grads):
        """Gradients of the input-gradient op wrt. its kernel and top inputs."""
        weights, top = inp[:2]
        (bottom,) = grads
        d_weights = Corr3dMMGradWeights(
            self.border_mode,
            self.subsample,
            self.filter_dilation,
            num_groups=self.num_groups,
        )(bottom, top, weights.shape[-3:])
        d_top = Corr3dMM(
            self.border_mode,
            self.subsample,
            self.filter_dilation,
            num_groups=self.num_groups,
        )(bottom, weights)
        d_height_width_depth = (
            (pytensor.gradient.DisconnectedType()(),) * 3 if len(inp) == 5 else ()
        )
        return (d_weights, d_top, *d_height_width_depth)
    def connection_pattern(self, node):
        """Outputs depend only on the first two inputs; shape inputs are disconnected."""
        if node.nin == 2:
            return [[1], [1]]
        else:
            return [[1], [1], [0], [0], [0]]  # no connection to height, width, depth
import logging
from pathlib import Path
import pytensor
from pytensor.configdefaults import config
from pytensor.graph.basic import Apply
from pytensor.graph.op import _NoPythonOp
from pytensor.link.c.op import OpenMPOp
from pytensor.link.c.params_type import ParamsType
from pytensor.link.c.type import EnumList
from pytensor.scalar import int8, int64
from pytensor.tensor import blas_headers
from pytensor.tensor.basic import as_tensor_variable
from pytensor.tensor.blas import blas_header_version, ldflags
from pytensor.tensor.conv.abstract_conv import get_conv_output_shape
from pytensor.tensor.type import TensorType
# Module-level logger, named after this module.
_logger = logging.getLogger(__name__)
# Directory holding the C implementation templates (corr_gemm.c).
C_CODE_PATH = Path(__file__).parent / "c_code"
class BaseCorrMM(OpenMPOp, _NoPythonOp):
    """
    Base class for `CorrMM`, `CorrMM_gradWeights` and
    `CorrMM_gradInputs`. Cannot be used directly.
    Every sub-class must define internal attribute ``_direction`` out of __init__().
    ``_direction`` must take one of following values:
    - "forward" to correlate bottom with weights and store results in top.
    - "backprop weights" to do a valid convolution of bottom with top
    (swapping the first two dimensions) and store results in weights.
    - "backprop inputs" to do a full convolution of top with weights
    (swapping the first two dimensions) and store results in bottom.
    Parameters
    ----------
    border_mode : {'valid', 'full', 'half'}
    Additionally, the padding size could be directly specified by an integer,
    a pair of integers, or two pairs of integers.
    subsample
    Perform subsampling of the output (default: (1, 1)).
    filter_dilation
    Perform dilated correlation (default: (1,1))
    num_groups
    Perform grouped convolutions (default: 1)
    unshared
    Perform unshared correlation (default: False)
    """
    # All three directions are handled by one C routine; shape agreement is
    # checked there, so the graph-level broadcast check is disabled.
    check_broadcast = False
    __props__ = (
        "border_mode",
        "subsample",
        "filter_dilation",
        "num_groups",
        "unshared",
    )
    # Set by each concrete subclass; validated in __init__.
    _direction: str | None = None
    # Runtime parameters handed to the generated C code as a params struct.
    params_type = ParamsType(
        direction=EnumList(
            ("DIRECTION_FORWARD", "forward"),  # 0
            ("DIRECTION_BACKPROP_WEIGHTS", "backprop weights"),  # 1
            ("DIRECTION_BACKPROP_INPUTS", "backprop inputs"),  # 2
        ),
        dH=int64,
        dW=int64,
        dilH=int64,
        dilW=int64,
        padH_l=int64,
        padH_r=int64,
        padW_l=int64,
        padW_r=int64,
        num_groups=int64,
        unshared=int8,
    )
    def __init__(
        self,
        border_mode="valid",
        subsample=(1, 1),
        filter_dilation=(1, 1),
        num_groups=1,
        unshared=False,
        openmp=None,
    ):
        """Validate and normalize all hyper-parameters.

        ``border_mode`` is normalized to either one of the strings
        'valid'/'full'/'half' or a pair of (left, right) padding pairs.
        Raises ValueError on any malformed argument.
        """
        super().__init__(openmp=openmp)
        if isinstance(border_mode, int):
            if border_mode < 0:
                raise ValueError(
                    f"invalid border_mode {border_mode}, which must be a "
                    "non-negative integer"
                )
            # Single int -> same symmetric padding on both axes.
            border_mode = ((border_mode, border_mode),) * 2
        elif isinstance(border_mode, tuple):
            if len(border_mode) != 2:
                raise ValueError(
                    f"invalid border_mode {border_mode} which must be a "
                    "tuple of length 2"
                )
            # Normalize each entry to a (left, right) pair of ints.
            border = ()
            for mode in border_mode:
                if isinstance(mode, tuple) and len(mode) == 2 and min(mode) >= 0:
                    border += ((int(mode[0]), int(mode[1])),)
                elif mode >= 0:
                    border += ((int(mode), int(mode)),)
                else:
                    raise ValueError(
                        f"invalid border mode {border_mode}. The tuple can only contain "
                        "integers or tuples of length 2"
                    )
            border_mode = border
        elif border_mode not in ("valid", "full", "half"):
            raise ValueError(
                f"invalid border_mode {border_mode}, which must be either "
                '"valid", "full", "half", an integer or a tuple '
                "of two integers or a pair of integers"
            )
        self.border_mode = border_mode
        if len(subsample) != 2:
            raise ValueError("subsample must have two elements")
        if len(filter_dilation) != 2:
            raise ValueError("filter_dilation must have two elements")
        self.subsample = tuple(subsample)
        self.filter_dilation = tuple(filter_dilation)
        self.unshared = unshared
        # Detect which BLAS the generated C code can use for threading hints.
        if not config.blas__ldflags:
            # PyTensor will use a NumPy C implementation of [sd]gemm_ instead.
            self.blas_type = ""
        else:
            if "openblas" in config.blas__ldflags:
                self.blas_type = "openblas"
            elif "mkl" in config.blas__ldflags:
                self.blas_type = "mkl"
            else:
                self.blas_type = ""
        if self._direction not in ("forward", "backprop weights", "backprop inputs"):
            raise ValueError(
                "_direction must be one of 'forward', "
                "'backprop weights', 'backprop inputs'"
            )
        if num_groups < 1:
            raise ValueError("Number of groups should be greater than 0")
        self.num_groups = num_groups
    @property
    def pad(self):
        """Padding as ((top, bottom), (left, right)).

        Sentinels: -1 encodes 'half' padding and -2 encodes 'full' padding;
        both are resolved against the kernel size inside the C code.
        """
        if self.border_mode == "half":
            return ((-1, -1),) * 2
        elif self.border_mode == "full":
            return ((-2, -2),) * 2
        elif isinstance(self.border_mode, tuple):
            return self.border_mode
        else:
            assert self.border_mode == "valid"
            return ((0, 0),) * 2
    # Direction should be converted to real enum value,
    # as it is compared to integer later in c_code_helper().
    direction = property(lambda self: self.params_type.enum_from_alias(self._direction))
    # Scalar views of the hyper-parameters, matching the params_type fields.
    dH = property(lambda self: self.subsample[0])
    dW = property(lambda self: self.subsample[1])
    dilH = property(lambda self: self.filter_dilation[0])
    dilW = property(lambda self: self.filter_dilation[1])
    padH_l = property(lambda self: self.pad[0][0])
    padH_r = property(lambda self: self.pad[0][1])
    padW_l = property(lambda self: self.pad[1][0])
    padW_r = property(lambda self: self.pad[1][1])
def __str__(self):
return f"{self.__class__.__name__}{{{self.border_mode}, {self.subsample!s}, {self.filter_dilation!s}, {self.num_groups!s} {self.unshared!s}}}"
@staticmethod
def as_common_dtype(in1, in2):
"""
Upcast input variables if necessary.
"""
dtype = pytensor.scalar.upcast(in1.dtype, in2.dtype)
return in1.astype(dtype), in2.astype(dtype)
    def __setstate__(self, d):
        """Restore pickled state, defaulting `num_groups` for old pickles."""
        self.__dict__.update(d)
        # Objects pickled before grouped convolution existed lack num_groups.
        if not hasattr(self, "num_groups"):
            self.num_groups = 1
def c_support_code(self, **kwargs):
ccodes = blas_headers.blas_header_text()
if self.blas_type == "openblas":
ccodes += blas_headers.openblas_threads_text()
elif self.blas_type == "mkl":
ccodes += blas_headers.mkl_threads_text()
return ccodes
    def c_libraries(self, **kwargs):
        # Link against the configured BLAS libraries.
        return ldflags()
    def c_compile_args(self, **kwargs):
        # BLAS compile flags plus OpenMP flags from the parent OpenMPOp.
        compile_args = ldflags(libs=False, flags=True)
        compile_args += super().c_compile_args(**kwargs)
        return compile_args
    def c_lib_dirs(self, **kwargs):
        # Library search paths for the configured BLAS.
        return ldflags(libs=False, libs_dir=True)
    def c_header_dirs(self, **kwargs):
        # Header search paths for the configured BLAS.
        return ldflags(libs=False, include_dir=True)
    def c_headers(self, **kwargs):
        # stdio is needed by the error reporting in the generated code.
        headers = ["<stdio.h>"]
        headers += super().c_headers(**kwargs)
        return headers
    def c_code_cache_version(self):
        # raise this whenever modifying any of the support_code_files
        # (the version also keys on openmp and the BLAS header version so
        # cached binaries are invalidated when either changes)
        return (10, self.openmp, blas_header_version())
    def c_support_code_apply(self, node, nodename):
        """Render corr_gemm.c with per-node substitutions (dtype, BLAS, OpenMP)."""
        # REMEMBER TO RAISE c_code_cache_version when changing any of
        # these files
        sub = {}
        dtype = str(node.__dict__["inputs"][0].dtype)
        assert dtype in ("float32", "float64")
        # Select single- vs double-precision BLAS symbols and C types.
        if dtype == "float32":
            sub["gemm"] = "sgemm_"
            sub["gemv"] = "sgemv_"
            sub["float_type"] = "npy_float"
            sub["float_typenum"] = "NPY_FLOAT"
            sub["n_bytes"] = 4
            sub["c_float_type"] = "float"
        else:
            sub["gemm"] = "dgemm_"
            sub["gemv"] = "dgemv_"
            sub["float_type"] = "npy_double"
            sub["float_typenum"] = "NPY_DOUBLE"
            sub["n_bytes"] = 8
            sub["c_float_type"] = "double"
        # OpenMP and BLAS threading hooks; empty stubs when disabled.
        if self.openmp:
            sub["omp_flags"] = "#pragma omp parallel for schedule(static)"
            sub["omp_get_max_threads"] = "omp_get_max_threads()"
            sub["omp_get_thread_num"] = "omp_get_thread_num()"
            if self.blas_type == "openblas":
                sub["blas_set_num_threads"] = "openblas_set_num_threads"
                sub["blas_get_num_threads"] = "openblas_get_num_threads()"
            elif self.blas_type == "mkl":
                sub["blas_set_num_threads"] = "mkl_set_num_threads"
                sub["blas_get_num_threads"] = "mkl_get_max_threads()"
            else:
                sub["blas_set_num_threads"] = ""
                sub["blas_get_num_threads"] = "0"
        else:
            sub["omp_flags"] = ""
            sub["omp_get_max_threads"] = "1"
            sub["omp_get_thread_num"] = "0"
            sub["blas_set_num_threads"] = ""
            sub["blas_get_num_threads"] = "0"
        final_code = (C_CODE_PATH / "corr_gemm.c").read_text("utf-8")
        return final_code % sub
    def c_code_helper(self, bottom, weights, top, sub, height=None, width=None):
        """
        This generates the C code for CorrMM (direction="forward"),
        CorrMM_gradWeights (direction="backprop weights"), and
        CorrMM_gradInputs (direction="backprop inputs").
        Depending on the direction, one of bottom, weights, top will
        receive the output, while the other two serve as inputs.
        :param bottom: Variable name of the input images in the forward pass,
        or the gradient of the input images in backprop wrt. inputs
        :param weights: Variable name of the filters in the forward pass,
        or the gradient of the filters in backprop wrt. weights
        :param top: Variable name of the output images / feature maps in the
        forward pass, or the gradient of the outputs in the backprop passes
        :param sub: Dictionary of substitutions usable to help generating the
        C code.
        :param height: If self.subsample[0] != 1, a variable giving the height
        of the filters for direction="backprop weights" or the height of
        the input images for direction="backprop inputs".
        If self.border_mode == 'half', a variable giving the height of the
        filters for direction="backprop weights". Ignored otherwise.
        :param width: If self.subsample[1] != 1, a variable giving the width
        of the filters for direction="backprop weights" or the width of the
        input images for direction="backprop inputs".
        If self.border_mode == 'half', a variable giving the width of the
        filters for direction="backprop weights". Ignored otherwise.
        :returns: A C code string that reads geometry from the params struct,
        infers kernel and output shapes, (re)allocates the output array, and
        dispatches to corrMM() defined in corr_gemm.c.
        """
        # When subsampling, we cannot unambiguously infer the height and width
        # of bottom and weights from top, so we require them to be given.
        # Similarly, when border_mode="half", we cannot infer the weight size.
        # `height`/`width` become C expressions reading the int64 shape inputs,
        # or the literal "-1" meaning "not provided" for the C code.
        if height:
            height = f"(*(npy_int64 *)(PyArray_DATA({height})))"
        else:
            if ((self.direction != 0) and (self.dH != 1)) or (
                (self.direction == 1) and (self.padH_l == -1 or self.padH_r == -1)
            ):
                raise ValueError(
                    "height must be given for backprop with vertical sampling or border_mode='half'"
                )
            height = "-1"
        if width:
            width = f"(*(npy_int64 *)(PyArray_DATA({width})))"
        else:
            if ((self.direction != 0) and (self.dW != 1)) or (
                (self.direction == 1) and (self.padW_l == -1 or self.padW_r == -1)
            ):
                raise ValueError(
                    "width must be given for backprop with horizontal sampling or border_mode='half'"
                )
            width = "-1"
        fail = sub["fail"]
        params = sub["params"]
        return f"""
    // Mandatory args
    int direction = {params}->direction;  // forward, bprop weights, bprop inputs
    // Optional args
    int dH = {params}->dH;
    int dW = {params}->dW;
    int dilH = {params}->dilH;
    int dilW = {params}->dilW;
    int padH_l = {params}->padH_l;
    int padH_r = {params}->padH_r;
    int padW_l = {params}->padW_l;
    int padW_r = {params}->padW_r;
    int numgroups = {params}->num_groups;
    int unshared = {params}->unshared;
    PyArrayObject * bottom = {bottom};
    PyArrayObject * weights = {weights};
    PyArrayObject * top = {top};
    PyArrayObject * out2 = NULL;
    PyArrayObject **out = NULL;
    switch({params}->direction) {{
        case DIRECTION_FORWARD:
            out = &{top};
            break;
        case DIRECTION_BACKPROP_WEIGHTS:
            out = &{weights};
            break;
        case DIRECTION_BACKPROP_INPUTS:
            out = &{bottom};
            break;
        default:
            PyErr_SetString(PyExc_ValueError, "CPU CorrMM: Invalid direction.");
            {{{fail}}}
            break;
    }}
    int wdim, odim;
    wdim = unshared ? 6 : 4;
    odim = 4; //Can be set to 6 later for unshared backprop wrt weights
    // Obtain or infer kernel width and height
    // (we need to know it early to be able to handle auto-padding)
    int kH, kW, dil_kH, dil_kW;
    if (direction != 1) {{
        // weight is an input variable, we can just read its shape
        kH = PyArray_DIMS(weights)[wdim-2];
        kW = PyArray_DIMS(weights)[wdim-1];
    }}
    else {{
        if ({height} != -1) {{
            // kernel height is specified (perhaps vertical subsampling or half padding)
            kH = {height};
        }}
        else if (padH_l == -2 || padH_r == -2) {{
            // vertical full padding, we can infer the kernel height
            kH = (2 - PyArray_DIMS(bottom)[2] + (PyArray_DIMS(top)[2] - 1) * dH - 1)/ dilH + 1;
        }}
        else {{
            // explicit padding, we can infer the kernel height
            kH = (PyArray_DIMS(bottom)[2] + padH_l + padH_r - (PyArray_DIMS(top)[2] - 1) * dH - 1) / dilH +1;
        }}
        if ({width} != -1) {{
            // kernel width is specified (perhaps horizontal subsampling or half padding)
            kW = {width};
        }}
        else if (padW_l == -2 || padW_r == -2) {{
            kW = (2 - PyArray_DIMS(bottom)[3] + (PyArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
        }}
        else {{
            kW = (PyArray_DIMS(bottom)[3] + padW_l + padW_r - (PyArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
        }}
    }}
    // Implicit dilated kernel size
    dil_kH = (kH - 1) * dilH + 1;
    dil_kW = (kW - 1) * dilW + 1;
    // Auto-padding if requested
    if (padH_l == -1 || padH_r == -1) {{  // vertical half padding
        padH_l = padH_r = dil_kH / 2;
    }}
    else if (padH_l == -2 || padH_r == -2) {{  // vertical full padding
        padH_l = padH_r = dil_kH - 1;
    }}
    else if (padH_l < -2 || padH_r < -2) {{
        PyErr_SetString(PyExc_ValueError, "BaseCorrMM: padH_l and padH_r must be >= -2");
        {fail}
    }}
    if (padW_l == -1 || padW_r == -1) {{  // horizontal half padding
        padW_l = padW_r = dil_kW / 2;
    }}
    else if (padW_l == -2 || padW_r == -2) {{  // horizontal full padding
        padW_l = padW_r = dil_kW - 1;
    }}
    else if (padW_l < -2 || padW_r < -2) {{
        PyErr_SetString(PyExc_ValueError, "BaseCorrMM: padW_l and padW_r must be >= -2");
        {fail}
    }}
    // Infer output shape
    npy_intp out_dim[6];
    out_dim[4] = out_dim[5] = 0; //Only used for unshared backprop wrt weights
    switch(direction) {{
    case 0:  // forward pass
        // output is top: (batchsize, num_filters, height, width)
        // height and width: top = (bottom + pad_l + pad_r - ((weight-1)*dil + 1)) / sample + 1
        out_dim[0] = (npy_intp)PyArray_DIMS(bottom)[0];
        out_dim[1] = (npy_intp)PyArray_DIMS(weights)[0];
        out_dim[2] = (npy_intp)((PyArray_DIMS(bottom)[2] + padH_l + padH_r - ((PyArray_DIMS(weights)[wdim-2]-1)*dilH + 1)) / dH + 1);
        out_dim[3] = (npy_intp)((PyArray_DIMS(bottom)[3] + padW_l + padW_r - ((PyArray_DIMS(weights)[wdim-1]-1)*dilW + 1)) / dW + 1);
        if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
        {{
            if (unshared) {{
                PyErr_Format(PyExc_ValueError,
                             "CorrMM: impossible output shape\\n"
                             "  bottom shape: %ld x %ld x %ld x %ld\\n"
                             "  weights shape: %ld x %ld x %ld x %ld x %ld x %ld\\n"
                             "  top shape: %ld x %ld x %ld x %ld\\n",
                             (long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1],
                             (long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3],
                             (long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1],
                             (long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3],
                             (long int)PyArray_DIMS(weights)[4], (long int)PyArray_DIMS(weights)[5],
                             (long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
                             (long int)out_dim[3]);
            }}
            else {{
                PyErr_Format(PyExc_ValueError,
                             "CorrMM: impossible output shape\\n"
                             "  bottom shape: %ld x %ld x %ld x %ld\\n"
                             "  weights shape: %ld x %ld x %ld x %ld\\n"
                             "  top shape: %ld x %ld x %ld x %ld\\n",
                             (long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1],
                             (long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3],
                             (long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1],
                             (long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3],
                             (long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
                             (long int)out_dim[3]);
            }}
            {fail}
        }}
        break;
    case 1:  // backprop wrt. weights
        // output is weights: (num_filters, num_channels, height, width)
        // height and width: weights = (bottom + pad_l + pad_r - (top - 1) * sample - 1) / dil + 1
        out_dim[0] = (npy_intp)PyArray_DIMS(top)[1];
        if (unshared){{
            odim = 6;
            out_dim[1] = (npy_intp)PyArray_DIMS(top)[2];
            out_dim[2] = (npy_intp)PyArray_DIMS(top)[3];
        }}
        out_dim[wdim-3] = (npy_intp)PyArray_DIMS(bottom)[1] / numgroups;
        out_dim[wdim-2] = (npy_intp)kH;  // already inferred further above
        out_dim[wdim-1] = (npy_intp)kW;  // how convenient
        if (unshared) {{
            if (out_dim[0] < 0 || out_dim[1] <= 0 || out_dim[2] <= 0 || out_dim[3] < 0
                    || out_dim[4] <= 0 || out_dim[5] <= 0){{
                PyErr_Format(PyExc_ValueError,
                             "CorrMM backprop wrt. weights: impossible output shape\\n"
                             "  bottom shape: %ld x %ld x %ld x %ld\\n"
                             "  weights shape: %ld x %ld x %ld x %ld x %ld x %ld\\n"
                             "  top shape: %ld x %ld x %ld x %ld\\n",
                             (long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1],
                             (long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3],
                             (long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
                             (long int)out_dim[3], (long int)out_dim[4], (long int)out_dim[5],
                             (long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1],
                             (long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3]);
                {fail}
            }}
        }}
        else {{
            if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
            {{
                PyErr_Format(PyExc_ValueError,
                             "CorrMM backprop wrt. weights: impossible output shape\\n"
                             "  bottom shape: %ld x %ld x %ld x %ld\\n"
                             "  weights shape: %ld x %ld x %ld x %ld\\n"
                             "  top shape: %ld x %ld x %ld x %ld\\n",
                             (long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1],
                             (long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3],
                             (long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
                             (long int)out_dim[3],
                             (long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1],
                             (long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3]);
                {fail}
            }}
        }}
        break;
    case 2:  // backprop wrt. inputs
        // output is bottom: (batchsize, num_channels, height, width)
        // height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
        out_dim[0] = (npy_intp)PyArray_DIMS(top)[0];
        out_dim[1] = (npy_intp)PyArray_DIMS(weights)[wdim-3] * numgroups;
        out_dim[2] = (npy_intp)(({height} != -1) ? {height} : (PyArray_DIMS(top)[2] - 1) * dH + (PyArray_DIMS(weights)[wdim-2]-1)*dilH + 1 - padH_l - padH_r);
        out_dim[3] = (npy_intp)(({width} != -1) ? {width} : (PyArray_DIMS(top)[3] - 1) * dW + (PyArray_DIMS(weights)[wdim-1]-1)*dilW + 1 - padW_l - padW_r);
        if (unshared) {{
            if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
            {{
                PyErr_Format(PyExc_ValueError,
                             "CorrMM backprop wrt. inputs: impossible output shape\\n"
                             "  bottom shape: %ld x %ld x %ld x %ld\\n"
                             "  weights shape: %ld x %ld x %ld x %ld x %ld x %ld\\n"
                             "  top shape: %ld x %ld x %ld x %ld\\n",
                             (long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
                             (long int)out_dim[3],
                             (long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1],
                             (long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3],
                             (long int)PyArray_DIMS(weights)[4], (long int)PyArray_DIMS(weights)[5],
                             (long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1],
                             (long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3]);
                {fail}
            }}
        }}
        else {{
            if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
            {{
                PyErr_Format(PyExc_ValueError,
                             "CorrMM backprop wrt. inputs: impossible output shape\\n"
                             "  bottom shape: %ld x %ld x %ld x %ld\\n"
                             "  weights shape: %ld x %ld x %ld x %ld\\n"
                             "  top shape: %ld x %ld x %ld x %ld\\n",
                             (long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
                             (long int)out_dim[3],
                             (long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1],
                             (long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3],
                             (long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1],
                             (long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3]);
                {fail}
            }}
        }}
        break;
    default:
        PyErr_SetString(PyExc_ValueError, "BaseCorrMM: direction must be 0, 1, or 2\\n");
        {fail}
    }}
    // Prepare output array
    int typenum;
    int failure;
    failure = !(*out
           && PyArray_NDIM(*out)==odim
           && PyArray_IS_C_CONTIGUOUS(*out)
           && PyArray_DIMS(*out)[0]==out_dim[0]
           && PyArray_DIMS(*out)[1]==out_dim[1]
           && PyArray_DIMS(*out)[2]==out_dim[2]
           && PyArray_DIMS(*out)[3]==out_dim[3]);
    if (odim == 6){{
        failure = failure || !(PyArray_DIMS(*out)[4]==out_dim[4]
                && PyArray_DIMS(*out)[5]==out_dim[5]);
    }}
    if ( failure )
    {{
        Py_XDECREF(*out);
        if (direction != 1) {{
          typenum = PyArray_TYPE(weights);
        }}
        else {{
          typenum = PyArray_TYPE(bottom);
        }}
        //Change to PyArray_ZEROS which is faster than PyArray_EMPTY.
        *out = (PyArrayObject*)PyArray_ZEROS(odim,
                                          out_dim,
                                          typenum,
                                          0);
        if (NULL == *out)
        {{
            if (odim == 4) {{
                PyErr_Format(PyExc_RuntimeError,
                        "BaseCorrMM: Failed to allocate output of %lld x %lld x %lld x %lld",
                        (long long)out_dim[0], (long long)out_dim[1], (long long)out_dim[2], (long long)out_dim[3]);
            }}
            if (odim == 6) {{
                PyErr_Format(PyExc_RuntimeError,
                        "BaseCorrMM: Failed to allocate output of %lld x %lld x %lld x %lld %lld %lld",
                        (long long)out_dim[0], (long long)out_dim[1], (long long)out_dim[2], (long long)out_dim[3],
                        (long long)out_dim[4], (long long)out_dim[5]);
            }}
            {fail}
        }}
    }}
    // Call corrMM code
    out2 = corrMM({bottom}, {weights}, {top}, direction, dH, dW, dilH, dilW,
                padH_l, padH_r, padW_l, padW_r, numgroups, unshared);
    if (out2==NULL){{
       {fail}
    }}
    assert (out2 == *out);
    """
class CorrMM(BaseCorrMM):
    """
    CPU correlation implementation using Matrix Multiplication.
    Parameters
    ----------
    border_mode
        The width of a border of implicit zeros to pad the
        input with. Must be a tuple with 2 elements giving the numbers of rows
        and columns to pad on each side, or a single integer to pad the same
        on all sides, or a string shortcut setting the padding at runtime:
        ``'valid'`` for ``(0, 0)`` (valid convolution, no padding), ``'full'``
        for ``(kernel_rows - 1, kernel_columns - 1)`` (full convolution),
        ``'half'`` for ``(kernel_rows // 2, kernel_columns // 2)`` (same
        convolution for odd-sized kernels).
        If it is a tuple containing 2 pairs of integers, then these specify
        the padding to be applied on each side ((left, right), (top, bottom)).
        Otherwise, each width is applied twice, once per side (left and right,
        top and bottom).
    subsample
        The subsample operation applied to each output image.
        Should be a tuple with 2 elements.
        `(sv, sh)` is equivalent to `CorrMM(...)(...)[:,:,::sv, ::sh]`,
        but faster.
        Set to `(1, 1)` to disable subsampling.
    filter_dilation
        The filter dilation operation applied to each input image.
        Should be a tuple with 2 elements.
        Set to `(1, 1)` to disable filter dilation.
    num_groups
        Divides the image, kernel and output tensors into num_groups
        separate groups. Each which carry out convolutions separately.
        Should be an integer.
    unshared
        Boolean value. If true, then a different filter will be applied to
        each region of the input image.
    """
    _direction = "forward"
    def make_node(self, img, kern):
        """Build the Apply node for the forward correlation of img with kern."""
        img = as_tensor_variable(img)
        kern = as_tensor_variable(kern)
        img, kern = self.as_common_dtype(img, kern)
        if img.type.ndim != 4:
            raise TypeError("img must be 4D tensor")
        # Unshared correlation uses a 6D kernel (nkern, topH, topW, ch, kH, kW).
        if self.unshared is True:
            if kern.type.ndim != 6:
                raise TypeError("kern must be 6D tensor")
        else:
            if kern.type.ndim != 4:
                raise TypeError("kern must be 4D tensor")
        # Only batch/filter dims can be statically known (broadcastable) here.
        out_shape = (
            1 if img.type.shape[0] == 1 else None,
            1 if kern.type.shape[0] == 1 else None,
            None,
            None,
        )
        dtype = img.type.dtype
        return Apply(self, [img, kern], [TensorType(dtype, shape=out_shape)()])
    def infer_shape(self, fgraph, node, input_shape):
        """Delegate output-shape computation to the shared conv shape helper."""
        imshp = input_shape[0]
        kshp = input_shape[1]
        res = get_conv_output_shape(
            imshp, kshp, self.border_mode, self.subsample, self.filter_dilation
        )
        return [res]
    def c_code(self, node, nodename, inp, out_, sub):
        """Generate C code for the forward pass via the shared helper."""
        bottom, weights = inp
        (top,) = out_
        return super().c_code_helper(bottom, weights, top, sub)
    def grad(self, inp, grads):
        """Gradients wrt. image (via gradInputs) and kernel (via gradWeights)."""
        bottom, weights = inp
        (top,) = grads
        d_bottom = CorrMM_gradInputs(
            self.border_mode,
            self.subsample,
            self.filter_dilation,
            self.num_groups,
            self.unshared,
        )(weights, top, bottom.shape[-2:])
        d_weights = CorrMM_gradWeights(
            self.border_mode,
            self.subsample,
            self.filter_dilation,
            self.num_groups,
            self.unshared,
        )(bottom, top, weights.shape[-2:])
        return d_bottom, d_weights
class CorrMM_gradWeights(BaseCorrMM):
    """
    Gradient wrt. filters for `CorrMM`.
    Notes
    -----
    You will not want to use this directly, but rely on
    PyTensor's automatic differentiation or graph optimization to
    use it as needed.
    """
    _direction = "backprop weights"
    def make_node(self, img, topgrad, shape=None):
        """Build the Apply node computing d(weights) from img and topgrad.

        ``shape`` gives the (kH, kW) of the filters; it is mandatory when
        subsampling or using half padding, since it cannot be inferred.
        """
        img = as_tensor_variable(img)
        topgrad = as_tensor_variable(topgrad)
        img, topgrad = self.as_common_dtype(img, topgrad)
        if img.type.ndim != 4:
            raise TypeError("img must be 4D tensor")
        if topgrad.type.ndim != 4:
            raise TypeError("topgrad must be 4D tensor")
        if shape is None:
            if self.subsample != (1, 1) or self.border_mode == "half":
                raise ValueError(
                    "shape must be given if subsample != (1, 1)"
                    ' or border_mode == "half"'
                )
            height_width = []
        else:
            height_width = [
                as_tensor_variable(shape[0]).astype("int64"),
                as_tensor_variable(shape[1]).astype("int64"),
            ]
        # Unshared filters are 6D: (nkern, topH, topW, channels, kH, kW).
        if self.unshared is True:
            out_shape = [
                1 if topgrad.type.shape[1] == 1 else None,
                None,
                None,
                1 if img.type.shape[1] == 1 else None,
                None,
                None,
            ]
        else:
            out_shape = [
                1 if topgrad.type.shape[1] == 1 else None,
                1 if img.type.shape[1] == 1 else None,
                None,
                None,
            ]
        dtype = img.type.dtype
        return Apply(
            self, [img, topgrad, *height_width], [TensorType(dtype, shape=out_shape)()]
        )
    def infer_shape(self, fgraph, node, input_shape):
        """Symbolically compute the filter-gradient shape.

        Negative pad sentinels: -1 means 'half', -2 means 'full' padding.
        """
        if self.border_mode == "half":
            padH_l = padH_r = padW_l = padW_r = -1
        elif self.border_mode == "full":
            padH_l = padH_r = padW_l = padW_r = -2
        elif isinstance(self.border_mode, tuple):
            border = ()
            for mode in self.border_mode:
                if isinstance(mode, tuple):
                    border += ((int(mode[0]), int(mode[1])),)
                else:
                    border += ((int(mode), int(mode)),)
            (padH_l, padH_r), (padW_l, padW_r) = border
        else:
            assert self.border_mode == "valid"
            padH_l = padH_r = padW_l = padW_r = 0
        dH, dW = self.subsample
        imshp = input_shape[0]
        topshp = input_shape[1]
        ssize, imshp = imshp[1], list(imshp[2:])
        # Channels per group.
        ssize = ssize // self.num_groups
        nkern, topshp = topshp[1], list(topshp[2:])
        height_width = node.inputs[-2:]
        if (dH != 1) or (padH_l == -1) or (padH_r == -1):
            # vertical subsampling or half padding, kernel height is specified
            kH = height_width[0]
        elif (padH_l == -2) or (padH_r == -2):
            # vertical full padding, we can infer the kernel height
            kH = 2 - imshp[0] + (topshp[0] - 1) * dH
        else:
            # explicit padding, we can infer the kernel height
            kH = imshp[0] + padH_l + padH_r - (topshp[0] - 1) * dH
        if (dW != 1) or (padW_l == -1) or (padW_r == -1):
            kW = height_width[1]
        elif (padW_l == -2) or (padW_r == -2):
            kW = 2 - imshp[1] + (topshp[1] - 1) * dW
        else:
            kW = imshp[1] + padW_l + padW_r - (topshp[1] - 1) * dW
        if self.unshared is True:
            return [(nkern, topshp[0], topshp[1], ssize, kH, kW)]
        else:
            return [(nkern, ssize, kH, kW)]
    def c_code(self, node, nodename, inp, out_, sub):
        """Generate C code for the weight gradient via the shared helper."""
        bottom, top = inp[:2]
        height, width = inp[2:] or (None, None)
        (weights,) = out_
        return super().c_code_helper(bottom, weights, top, sub, height, width)
    def grad(self, inp, grads):
        """Gradients of the weight-gradient op wrt. its image and top inputs."""
        bottom, top = inp[:2]
        (weights,) = grads
        d_bottom = CorrMM_gradInputs(
            self.border_mode,
            self.subsample,
            self.filter_dilation,
            self.num_groups,
            self.unshared,
        )(weights, top, bottom.shape[-2:])
        d_top = CorrMM(
            self.border_mode,
            self.subsample,
            self.filter_dilation,
            self.num_groups,
            self.unshared,
        )(bottom, weights)
        # The explicit shape inputs carry no gradient.
        d_height_width = (
            (pytensor.gradient.DisconnectedType()(),) * 2 if len(inp) == 4 else ()
        )
        return (d_bottom, d_top, *d_height_width)
    def connection_pattern(self, node):
        """Outputs depend only on the first two inputs; shape inputs are disconnected."""
        if node.nin == 2:
            return [[1], [1]]
        else:
            return [[1], [1], [0], [0]]  # no connection to height, width
class CorrMM_gradInputs(BaseCorrMM):
"""
Gradient wrt. inputs for `CorrMM`.
Notes
-----
You will not want to use this directly, but rely on
PyTensor's automatic differentiation or graph optimization to
use it as needed.
"""
_direction = "backprop inputs"
def make_node(self, kern, topgrad, shape=None):
kern = as_tensor_variable(kern)
topgrad = as_tensor_variable(topgrad)
kern, topgrad = self.as_common_dtype(kern, topgrad)
if self.unshared is True:
if kern.type.ndim != 6:
raise TypeError("kern must be 6D tensor")
else:
if kern.type.ndim != 4:
raise TypeError("kern must be 4D tensor")
if topgrad.type.ndim != 4:
raise TypeError("topgrad must be 4D tensor")
if shape is None:
if self.subsample != (1, 1):
raise ValueError("shape must be given if subsample != (1, 1)")
height_width = []
else:
height_width = [
as_tensor_variable(shape[0]).astype("int64"),
as_tensor_variable(shape[1]).astype("int64"),
]
if self.num_groups > 1:
out_shape = [1 if topgrad.type.shape[0] == 1 else None, None, None, None]
else:
out_shape = [
1 if topgrad.type.shape[0] == 1 else None,
1 if kern.type.shape[-3] == 1 else None,
None,
None,
]
dtype = kern.type.dtype
return Apply(
self, [kern, topgrad, *height_width], [TensorType(dtype, shape=out_shape)()]
)
def infer_shape(self, fgraph, node, input_shape):
if self.border_mode == "half":
padH_l = padH_r = padW_l = padW_r = -1
elif self.border_mode == "full":
padH_l = padH_r = padW_l = padW_r = -2
elif isinstance(self.border_mode, tuple):
border = ()
for mode in self.border_mode:
if isinstance(mode, tuple):
border += ((int(mode[0]), int(mode[1])),)
else:
border += ((int(mode), int(mode)),)
(padH_l, padH_r), (padW_l, padW_r) = border
else:
assert self.border_mode == "valid"
padH_l = padH_r = padW_l = padW_r = 0
dH, dW = self.subsample
kshp = input_shape[0]
topshp = input_shape[1]
ssize, kshp = kshp[-3], list(kshp[-2:])
ssize = ssize * self.num_groups
bsize, topshp = topshp[0], list(topshp[2:])
height_width = node.inputs[-2:]
if padH_l == -1 or padH_r == -1:
padH_l = padH_r = kshp[0] // 2
elif padH_l == -2 or padH_r == -2:
padH_l = padH_r = kshp[0] - 1
elif padH_l < -2 or padH_r < -2:
raise ValueError("CorrMM_gradInputs: border_mode must be >= 0.")
if padW_l == -1 or padW_r == -1:
padW_l = padW_r = kshp[1] // 2
elif padW_l == -2 or padW_r == -2:
padW_l = padW_r = kshp[1] - 1
elif padW_l < -2 or padW_r < -2:
raise ValueError("CorrMM_gradInputs: border_mode must be >= 0.")
if dH != 1:
out_shp0 = height_width[0]
else:
out_shp0 = (topshp[0] - 1) * dH + kshp[0] - padH_l - padH_r
if dW != 1:
out_shp1 = height_width[1]
else:
out_shp1 = (topshp[1] - 1) * dW + kshp[1] - padW_l - padW_r
out_shp = (out_shp0, out_shp1)
return [(bsize, ssize, *out_shp)]
def c_code(self, node, nodename, inp, out_, sub):
weights, top = inp[:2]
height, width = inp[2:] or (None, None)
(bottom,) = out_
return super().c_code_helper(bottom, weights, top, sub, height, width)
    def grad(self, inp, grads):
        """Gradients of the grad-inputs Op.

        ``inp`` is (weights, topgrad[, height, width]) and ``grads`` holds
        the gradient w.r.t. the reconstructed "bottom" image.  The gradient
        w.r.t. the weights is a weight-gradient correlation of (bottom, top),
        and the gradient w.r.t. topgrad is a forward correlation of
        (bottom, weights).  The optional shape inputs are disconnected.
        """
        weights, top = inp[:2]
        (bottom,) = grads
        d_weights = CorrMM_gradWeights(
            self.border_mode,
            self.subsample,
            self.filter_dilation,
            self.num_groups,
            self.unshared,
        )(bottom, top, weights.shape[-2:])
        d_top = CorrMM(
            self.border_mode,
            self.subsample,
            self.filter_dilation,
            self.num_groups,
            self.unshared,
        )(bottom, weights)
        # height/width (when present) carry no gradient information.
        d_height_width = (
            (pytensor.gradient.DisconnectedType()(),) * 2 if len(inp) == 4 else ()
        )
        return (d_weights, d_top, *d_height_width)
def connection_pattern(self, node):
if node.nin == 2:
return [[1], [1]]
else:
return [[1], [1], [0], [0]] # no connection to height, width
import numpy as np
import pytest
import pytensor
import pytensor.tensor as pt
from pytensor.compile import get_default_mode
from pytensor.compile.mode import Mode
from pytensor.configdefaults import config
from pytensor.graph.rewriting.basic import check_stack_trace
from pytensor.link.numba import NumbaLinker
from pytensor.tensor.conv import abstract_conv
from pytensor.tensor.conv.abstract_conv import (
AbstractConv2d,
AbstractConv2d_gradInputs,
AbstractConv2d_gradWeights,
assert_conv_shape,
assert_shape,
bilinear_kernel_1D,
bilinear_kernel_2D,
bilinear_upsampling,
causal_conv1d,
check_conv_gradinputs_shape,
conv2d_transpose,
get_conv_gradinputs_shape,
get_conv_gradweights_shape,
get_conv_output_shape,
separable_conv2d,
separable_conv3d,
)
from pytensor.tensor.type import (
TensorType,
ftensor4,
iscalar,
lvector,
tensor3,
tensor4,
tensor5,
)
from tests import unittest_tools as utt
from tests.tensor.conv import c_conv3d_corr3d_ref, c_conv_corr_ref
def conv2d_corr(
    inputs,
    filters,
    border_mode="valid",
    subsample=(1, 1),
    conv_mode="conv",
    filter_dilation=(1, 1),
):
    """Reference 2D convolution built on the C ``CorrMM`` Op.

    ``CorrMM`` computes correlation, so true convolution (``conv_mode ==
    "conv"``) is obtained by flipping the filters on both spatial axes first.
    """
    flipped_filters = filters[:, :, ::-1, ::-1] if conv_mode == "conv" else filters
    corr_op = c_conv_corr_ref.CorrMM(border_mode, subsample, filter_dilation)
    return corr_op(inputs, flipped_filters)
def conv2d_corr_gw(
    inputs,
    topgrad,
    filters_shape,
    border_mode="valid",
    subsample=(1, 1),
    conv_mode="conv",
    filter_dilation=(1, 1),
):
    """Reference gradient of 2D convolution w.r.t. the weights.

    The ``CorrMM_gradWeights`` Op produces correlation weights, so for true
    convolution the result is flipped back on both spatial axes.
    """
    grad_op = c_conv_corr_ref.CorrMM_gradWeights(
        border_mode, subsample, filter_dilation
    )
    grad = grad_op(inputs, topgrad, filters_shape[2:])
    if conv_mode == "conv":
        grad = grad[:, :, ::-1, ::-1]
    return grad
def conv2d_corr_gi(
    filters,
    topgrad,
    inputs_shape,
    border_mode="valid",
    subsample=(1, 1),
    conv_mode="conv",
    filter_dilation=(1, 1),
):
    """Reference gradient of 2D convolution w.r.t. the inputs.

    For true convolution the filters are flipped before being handed to the
    correlation-based ``CorrMM_gradInputs`` Op.
    """
    flipped_filters = filters[:, :, ::-1, ::-1] if conv_mode == "conv" else filters
    grad_op = c_conv_corr_ref.CorrMM_gradInputs(
        border_mode, subsample, filter_dilation
    )
    return grad_op(flipped_filters, topgrad, inputs_shape[2:])
def conv3d_corr(
    inputs,
    filters,
    border_mode="valid",
    subsample=(1, 1, 1),
    conv_mode="conv",
    filter_dilation=(1, 1, 1),
):
    """Reference 3D convolution built on the C ``Corr3dMM`` Op.

    ``Corr3dMM`` computes correlation, so true convolution is obtained by
    flipping the filters on all three spatial axes first.
    """
    flipped_filters = (
        filters[:, :, ::-1, ::-1, ::-1] if conv_mode == "conv" else filters
    )
    corr_op = c_conv3d_corr3d_ref.Corr3dMM(border_mode, subsample, filter_dilation)
    return corr_op(inputs, flipped_filters)
def conv3d_corr_gw(
    inputs,
    topgrad,
    filters_shape,
    border_mode="valid",
    subsample=(1, 1, 1),
    conv_mode="conv",
    filter_dilation=(1, 1, 1),
):
    """Reference gradient of 3D convolution w.r.t. the weights.

    The correlation-based Op produces correlation weights, so for true
    convolution the result is flipped back on all three spatial axes.
    """
    grad_op = c_conv3d_corr3d_ref.Corr3dMMGradWeights(
        border_mode, subsample, filter_dilation
    )
    grad = grad_op(inputs, topgrad, filters_shape[2:])
    if conv_mode == "conv":
        grad = grad[:, :, ::-1, ::-1, ::-1]
    return grad
def conv3d_corr_gi(
    filters,
    topgrad,
    inputs_shape,
    border_mode="valid",
    subsample=(1, 1, 1),
    conv_mode="conv",
    filter_dilation=(1, 1, 1),
):
    """Reference gradient of 3D convolution w.r.t. the inputs.

    For true convolution the filters are flipped before being handed to the
    correlation-based ``Corr3dMMGradInputs`` Op.
    """
    flipped_filters = (
        filters[:, :, ::-1, ::-1, ::-1] if conv_mode == "conv" else filters
    )
    grad_op = c_conv3d_corr3d_ref.Corr3dMMGradInputs(
        border_mode, subsample, filter_dilation
    )
    return grad_op(flipped_filters, topgrad, inputs_shape[2:])
class TestGetConvOutShape:
    """Checks of ``get_conv_output_shape`` against hand-computed shapes."""

    def test_basic(self):
        """2D case: each border mode maps to a known output shape."""
        image_shape, kernel_shape = (3, 2, 12, 9), (4, 2, 5, 6)
        sub_sample = (1, 2)
        filter_dilation = (2, 1)
        expected_by_border = {
            "valid": (3, 4, 4, 2),
            "half": (3, 4, 12, 5),
            "full": (3, 4, 20, 7),
            (1, 2): (3, 4, 6, 4),
        }
        for border_mode, expected in expected_by_border.items():
            computed = get_conv_output_shape(
                image_shape, kernel_shape, border_mode, sub_sample, filter_dilation
            )
            assert computed == expected

    def test_basic_3d(self):
        """3D case: each border mode maps to a known output shape."""
        image_shape, kernel_shape = (3, 2, 12, 9, 7), (4, 2, 5, 6, 4)
        sub_sample = (1, 2, 1)
        filter_dilation = (2, 1, 1)
        expected_by_border = {
            "valid": (3, 4, 4, 2, 4),
            "half": (3, 4, 12, 5, 8),
            "full": (3, 4, 20, 7, 10),
            (1, 2, 3): (3, 4, 6, 4, 10),
        }
        for border_mode, expected in expected_by_border.items():
            computed = get_conv_output_shape(
                image_shape, kernel_shape, border_mode, sub_sample, filter_dilation
            )
            assert computed == expected
class TestConvGradInputsShape:
    """Consistency between the forward shape helper and the grad-side
    helpers ``check_conv_gradinputs_shape``, ``get_conv_gradinputs_shape``
    and ``get_conv_gradweights_shape``."""

    def test_check_shape(self):
        """The exact forward output shape (including versions with ``None``
        holes) must be accepted; shapes differing in any known dim must be
        rejected."""
        for i in range(1, 20):
            for k in range(1, 10):
                for b in ("valid", "half", "full", (0, 2)):
                    for s in (1, 2, 3):
                        for d in (1, 2, 3):
                            image_shape = (59, 61, i, i)
                            kernel_shape = (67, 61, k, k)
                            # compute the output that these inputs and parameters would produce
                            computed_shape = get_conv_output_shape(
                                image_shape, kernel_shape, b, (s, s), (d, d)
                            )
                            # this should be accepted
                            assert check_conv_gradinputs_shape(
                                image_shape,
                                kernel_shape,
                                computed_shape,
                                b,
                                (s, s),
                                (d, d),
                            )
                            # one or more None should also be accepted
                            trial_shape = (None, None, computed_shape[2], None)
                            assert check_conv_gradinputs_shape(
                                image_shape,
                                kernel_shape,
                                trial_shape,
                                b,
                                (s, s),
                                (d, d),
                            )
                            # the batch size and number of filters are important
                            trial_shape = (1, 1, computed_shape[2], computed_shape[3])
                            assert not check_conv_gradinputs_shape(
                                image_shape,
                                kernel_shape,
                                trial_shape,
                                b,
                                (s, s),
                                (d, d),
                            )
                            # outputs that are too large or too small should be rejected
                            for o in (-3, -2, -1, 1, 2, 3):
                                trial_shape = (
                                    computed_shape[0],
                                    computed_shape[1],
                                    computed_shape[2] + o,
                                    computed_shape[3] + o,
                                )
                                assert not check_conv_gradinputs_shape(
                                    image_shape,
                                    kernel_shape,
                                    trial_shape,
                                    b,
                                    (s, s),
                                    (d, d),
                                )

    def test_get_shape(self):
        """Inverting the forward shape computation must recover the image
        (or kernel) shape when it is unambiguous, and ``None`` spatial dims
        when subsampling (or "half" padding) makes it ambiguous."""
        for i in range(1, 20):
            for k in range(1, 10):
                for b in ("valid", "half", "full", (0, 2)):
                    for d in (1, 2, 3):
                        image_shape = (59, 61, i, i)
                        kernel_shape = (67, 61, k, k)
                        # compute the output that these inputs and parameters would produce
                        output_shape = get_conv_output_shape(
                            image_shape, kernel_shape, b, (1, 1), (d, d)
                        )
                        # compute the image_shape given this output_shape
                        computed_image_shape = get_conv_gradinputs_shape(
                            kernel_shape, output_shape, b, (1, 1), (d, d)
                        )
                        assert computed_image_shape == image_shape
                        # if subsample > 1, the shape should be None
                        computed_image_shape = get_conv_gradinputs_shape(
                            kernel_shape, output_shape, b, (2, 3), (d, d)
                        )
                        image_shape_with_None = (*image_shape[:2], None, None)
                        assert computed_image_shape == image_shape_with_None
                        # compute the kernel_shape given this output_shape
                        computed_kernel_shape = get_conv_gradweights_shape(
                            image_shape, output_shape, b, (1, 1), (d, d)
                        )
                        # if border_mode == 'half', the shape should be None
                        if b == "half":
                            kernel_shape_with_None = (*kernel_shape[:2], None, None)
                            assert computed_kernel_shape == kernel_shape_with_None
                        else:
                            assert computed_kernel_shape == kernel_shape
                        # if subsample > 1, the shape should be None
                        computed_kernel_shape = get_conv_gradweights_shape(
                            kernel_shape, output_shape, b, (2, 3), (d, d)
                        )
                        kernel_shape_with_None = (*kernel_shape[:2], None, None)
                        assert computed_kernel_shape == kernel_shape_with_None
class TestAssertConvShape:
    """Behaviour of ``assert_conv_shape`` on valid and invalid 4D shapes."""

    def test_basic(self):
        shape = tuple(iscalar() for _ in range(4))
        f = pytensor.function(shape, assert_conv_shape(shape))
        # Valid shapes pass through unchanged: spatial dims must be >= 1,
        # batch/channel dims may be 0.
        assert [1, 2, 3, 4] == f(1, 2, 3, 4)
        assert [0, 0, 1, 1] == f(0, 0, 1, 1)
        # Zero or negative spatial dims, and negative leading dims, fail.
        for bad_shape in (
            (3, 3, 3, 0),
            (3, 3, 0, 3),
            (3, 3, -1, 3),
            (3, -1, 3, 3),
            (-1, 3, 3, 3),
        ):
            with pytest.raises(AssertionError):
                f(*bad_shape)
class TestAssertShape:
    """Runtime shape assertions injected by ``assert_shape`` and by the
    ``input_shape``/``filter_shape`` arguments of the abstract conv
    builders (active only under ``conv__assert_shape``)."""

    @config.change_flags(conv__assert_shape=True)
    def test_basic(self):
        """``assert_shape`` checks only the non-None entries of the
        expected shape against the runtime value."""
        x = tensor4()
        s1 = iscalar()
        s2 = iscalar()
        expected_shape = [None, s1, s2, None]
        f = pytensor.function([x, s1, s2], assert_shape(x, expected_shape))
        v = np.zeros((3, 5, 7, 11), dtype="float32")
        assert 0 == np.sum(f(v, 5, 7))
        with pytest.raises(AssertionError):
            f(v, 5, 0)
        with pytest.raises(AssertionError):
            f(v, 5, 9)
        with pytest.raises(AssertionError):
            f(v, 0, 7)
        with pytest.raises(AssertionError):
            f(v, 7, 7)

    @config.change_flags(conv__assert_shape=True)
    def test_shape_check_conv2d(self):
        """Declared input/filter shapes are enforced at call time."""
        input = tensor4()
        filters = tensor4()
        out = abstract_conv.abstract_conv2d(
            input, filters, input_shape=(3, 5, 7, 11), filter_shape=(7, 5, 3, 3)
        )
        f = pytensor.function([input, filters], out)
        # mismatched input_shape
        with pytest.raises(AssertionError):
            f(
                np.zeros((3, 5, 9, 11), dtype="float32"),
                np.zeros((7, 5, 3, 3), dtype="float32"),
            )
        # mismatched filter_shape
        with pytest.raises(AssertionError):
            f(
                np.zeros((3, 5, 7, 11), dtype="float32"),
                np.zeros((7, 5, 2, 2), dtype="float32"),
            )

    @config.change_flags(conv__assert_shape=True)
    @pytest.mark.skipif(config.cxx == "", reason="test needs cxx")
    def test_shape_check_conv3d(self):
        """Same enforcement for the 3D forward convolution."""
        input = tensor5()
        filters = tensor5()
        out = abstract_conv.conv3d(
            input, filters, input_shape=(3, 5, 7, 11, 13), filter_shape=(7, 5, 3, 3, 3)
        )
        f = pytensor.function([input, filters], out)
        # mismatched input_shape
        with pytest.raises(AssertionError):
            f(
                np.zeros((3, 5, 9, 11, 13), dtype="float32"),
                np.zeros((7, 5, 3, 3, 3), dtype="float32"),
            )
        # mismatched filter_shape
        with pytest.raises(AssertionError):
            f(
                np.zeros((3, 5, 7, 11, 13), dtype="float32"),
                np.zeros((7, 5, 2, 2, 2), dtype="float32"),
            )

    @config.change_flags(conv__assert_shape=True)
    def test_shape_check_conv2d_grad_wrt_inputs(self):
        """Declared filter shape is enforced for the grad-wrt-inputs path."""
        output_grad = tensor4()
        filters = tensor4()
        out = abstract_conv.conv2d_grad_wrt_inputs(
            output_grad,
            filters,
            input_shape=(None, None, 7, 11),
            filter_shape=(7, 5, 3, 3),
        )
        f = pytensor.function([output_grad, filters], out)
        # mismatched filter_shape
        with pytest.raises(AssertionError):
            f(
                np.zeros((3, 6, 5, 9), dtype="float32"),
                np.zeros((7, 6, 3, 3), dtype="float32"),
            )

    @config.change_flags(conv__assert_shape=True)
    @pytest.mark.skipif(config.cxx == "", reason="test needs cxx")
    def test_shape_check_conv3d_grad_wrt_inputs(self):
        """Same enforcement for the 3D grad-wrt-inputs path."""
        output_grad = tensor5()
        filters = tensor5()
        out = abstract_conv.conv3d_grad_wrt_inputs(
            output_grad,
            filters,
            input_shape=(None, None, 7, 11, 13),
            filter_shape=(7, 5, 3, 3, 3),
        )
        f = pytensor.function([output_grad, filters], out)
        # mismatched filter_shape
        with pytest.raises(AssertionError):
            f(
                np.zeros((3, 6, 5, 9, 11), dtype="float32"),
                np.zeros((7, 6, 3, 3, 3), dtype="float32"),
            )

    @config.change_flags(conv__assert_shape=True)
    def test_shape_check_conv2d_grad_wrt_weights(self):
        """Declared input shape is enforced for the grad-wrt-weights path."""
        input = tensor4()
        output_grad = tensor4()
        out = abstract_conv.conv2d_grad_wrt_weights(
            input,
            output_grad,
            filter_shape=(None, None, 3, 3),
            input_shape=(3, 5, 7, 11),
        )
        f = pytensor.function([input, output_grad], out)
        # mismatched filter_shape
        with pytest.raises(AssertionError):
            f(
                np.zeros((3, 6, 7, 11), dtype="float32"),
                np.zeros((3, 7, 5, 9), dtype="float32"),
            )

    @config.change_flags(conv__assert_shape=True)
    @pytest.mark.skipif(config.cxx == "", reason="test needs cxx")
    def test_shape_check_conv3d_grad_wrt_weights(self):
        """Same enforcement for the 3D grad-wrt-weights path."""
        input = tensor5()
        output_grad = tensor5()
        out = abstract_conv.conv3d_grad_wrt_weights(
            input,
            output_grad,
            filter_shape=(None, None, 3, 3, 3),
            input_shape=(3, 5, 7, 11, 13),
        )
        f = pytensor.function([input, output_grad], out)
        # mismatched filter_shape
        with pytest.raises(AssertionError):
            f(
                np.zeros((3, 6, 7, 11, 13), dtype="float32"),
                np.zeros((3, 7, 5, 9, 11), dtype="float32"),
            )
class BaseTestConv:
    """Base harness for abstract convolution tests.

    Subclasses populate the parameter grids (``inputs_shapes``,
    ``subsamples``, ``border_modes``, ...) in ``setup_class`` and implement
    ``run_test_case``.  The ``run_*`` helpers here compile the abstract conv
    Op under test, compare it against a CorrMM-based reference, and verify
    gradients numerically.
    """

    def get_output_shape(
        self, inputs_shape, filters_shape, subsample, border_mode, filter_dilation
    ):
        """Compute the expected forward output shape for the parameters;
        ``None`` spatial entries propagate to the result."""
        # Effective (dilated) filter size per spatial axis.
        dil_filters = tuple(
            (s - 1) * d + 1
            for s, d in zip(filters_shape[2:], filter_dilation, strict=True)
        )
        if border_mode == "valid":
            border_mode = (0,) * (len(inputs_shape) - 2)
        if border_mode == "half":
            border_mode = tuple(d // 2 for d in dil_filters)
        if border_mode == "full":
            border_mode = tuple(d - 1 for d in dil_filters)
        batch_size = inputs_shape[0]
        num_filters = filters_shape[0]
        return (
            batch_size,
            num_filters,
            *(
                None
                if i is None or k is None
                else (i + 2 * pad - ((k - 1) * fd + 1)) // d + 1
                for i, k, d, pad, fd in zip(
                    inputs_shape[2:],
                    filters_shape[2:],
                    subsample,
                    border_mode,
                    filter_dilation,
                    strict=True,
                )
            ),
        )

    def run_fwd(
        self,
        inputs_shape,
        filters_shape,
        conv_fn,
        conv_op,
        ref,
        subsample=None,
        verify_grad=True,
        mode=None,
        border_mode="valid",
        filter_flip=True,
        provide_shape=False,
        target_op=None,
        check_trace=False,
        filter_dilation=None,
    ):
        """Compile the forward conv built by ``conv_fn`` and compare it
        against the reference ``ref``; optionally check that ``target_op``
        appears in the compiled graph and numerically verify gradients."""
        if subsample is None:
            subsample = (1,) * (len(inputs_shape) - 2)
        if filter_dilation is None:
            filter_dilation = (1,) * (len(inputs_shape) - 2)
        inputs_val = np.random.random(inputs_shape).astype("float32")
        filters_val = np.random.random(filters_shape).astype("float32")
        # scale down values to prevent rounding errors
        inputs_val /= 10
        filters_val /= 10
        inputs = self.shared(inputs_val)
        filters = self.shared(filters_val)
        if provide_shape:
            imshp = inputs_shape
            kshp = filters_shape
        else:
            imshp = None
            kshp = None
        # filter_flip=True means true convolution; False means correlation.
        if filter_flip:
            conv_mode = "conv"
        else:
            conv_mode = "cross"
        c_ref = ref(
            inputs,
            filters,
            border_mode=border_mode,
            subsample=subsample,
            conv_mode=conv_mode,
            filter_dilation=filter_dilation,
        )
        c = conv_fn(
            inputs,
            filters,
            border_mode=border_mode,
            subsample=subsample,
            filter_flip=filter_flip,
            input_shape=imshp,
            filter_shape=kshp,
            filter_dilation=filter_dilation,
        )
        f_ref = pytensor.function([], c_ref, mode="FAST_RUN")
        f = pytensor.function([], c, mode=mode)
        if target_op is not None:
            assert any(isinstance(n.op, target_op) for n in f.maker.fgraph.toposort())
            if check_trace:
                assert check_stack_trace(f, ops_to_check=target_op)
        res_ref = np.array(f_ref())
        res = np.array(f())
        utt.assert_allclose(res_ref, res)
        # Skip gradient verification for degenerate (empty) problems.
        if (
            verify_grad
            and inputs_val.size > 0
            and filters_val.size > 0
            and res.size > 0
        ):
            utt.verify_grad(
                conv_op(
                    border_mode=border_mode,
                    imshp=imshp,
                    kshp=kshp,
                    subsample=subsample,
                    filter_dilation=filter_dilation,
                ),
                [inputs_val, filters_val],
                mode=mode,
            )

    def run_gradweight(
        self,
        inputs_shape,
        filters_shape,
        output_shape,
        gradWeights_fn,
        ref,
        subsample=None,
        filter_flip=True,
        verify_grad=True,
        mode=None,
        border_mode="valid",
        provide_shape=False,
        target_op=None,
        check_trace=False,
        filter_dilation=None,
    ):
        """Compile the grad-wrt-weights Op built by ``gradWeights_fn`` and
        compare it against the reference ``ref``; optionally verify its own
        gradients numerically."""
        if subsample is None:
            subsample = (1,) * (len(inputs_shape) - 2)
        if filter_dilation is None:
            filter_dilation = (1,) * (len(inputs_shape) - 2)
        inputs_val = np.random.random(inputs_shape).astype("float32")
        output_val = np.random.random(output_shape).astype("float32")
        inputs = self.shared(inputs_val)
        output = self.shared(output_val)
        if provide_shape:
            imshp = inputs_shape
            kshp = filters_shape
        else:
            imshp = None
            kshp = None
        if filter_flip:
            conv_mode = "conv"
        else:
            conv_mode = "cross"
        c = gradWeights_fn(
            border_mode=border_mode,
            filter_flip=filter_flip,
            subsample=subsample,
            imshp=imshp,
            kshp=kshp,
            filter_dilation=filter_dilation,
        )
        # The Op takes the spatial filter shape as an explicit input.
        c = c(inputs, output, filters_shape[2:])
        c_ref = ref(
            inputs,
            output,
            filters_shape,
            border_mode=border_mode,
            subsample=subsample,
            conv_mode=conv_mode,
            filter_dilation=filter_dilation,
        )
        f = pytensor.function([], c, mode=mode)
        f_ref = pytensor.function([], c_ref, mode="FAST_RUN")
        if target_op is not None:
            assert any(isinstance(n.op, target_op) for n in f.maker.fgraph.toposort())
            if check_trace:
                assert check_stack_trace(f, ops_to_check=target_op)
        res_ref = np.array(f_ref())
        res = np.array(f())
        utt.assert_allclose(res_ref, res)

        def abstract_conv_gradweight(inputs_val, output_val):
            # Closure used by verify_grad: rebuild the Op without the
            # shape hints so only the numeric inputs vary.
            conv_op = gradWeights_fn(
                border_mode=border_mode,
                subsample=subsample,
                filter_dilation=filter_dilation,
            )
            return conv_op(inputs_val, output_val, filters_shape[2:])

        if verify_grad and inputs_val.size > 0 and output_val.size > 0 and res.size > 0:
            utt.verify_grad(
                abstract_conv_gradweight, [inputs_val, output_val], mode=mode, eps=1
            )

    def run_gradinput(
        self,
        inputs_shape,
        filters_shape,
        output_shape,
        gradInputs_fn,
        ref,
        subsample=None,
        filter_flip=True,
        verify_grad=True,
        mode=None,
        border_mode="valid",
        provide_shape=False,
        target_op=None,
        check_trace=False,
        filter_dilation=None,
    ):
        """Compile the grad-wrt-inputs Op built by ``gradInputs_fn`` and
        compare it against the reference ``ref`` (``ref=None`` for the
        expected-failure, inconsistent-shape tests)."""
        if subsample is None:
            subsample = (1,) * (len(inputs_shape) - 2)
        if filter_dilation is None:
            filter_dilation = (1,) * (len(inputs_shape) - 2)
        output_val = np.random.random(output_shape).astype("float32")
        filters_val = np.random.random(filters_shape).astype("float32")
        output = self.shared(output_val)
        filters = self.shared(filters_val)
        if provide_shape:
            imshp = inputs_shape
            kshp = filters_shape
        else:
            imshp = None
            kshp = None
        if filter_flip:
            conv_mode = "conv"
        else:
            conv_mode = "cross"
        c = gradInputs_fn(
            border_mode=border_mode,
            subsample=subsample,
            filter_flip=filter_flip,
            imshp=imshp,
            kshp=kshp,
            filter_dilation=filter_dilation,
        )
        # The Op takes the spatial input shape as an explicit input.
        c = c(filters, output, inputs_shape[2:])
        f = pytensor.function([], c, mode=mode)
        # ref is set to None for the inconsistent-shape tests.
        # The reference function also raises an exception, which would
        # mask the exception generated by the target implementation.
        if ref is not None:
            c_ref = ref(
                filters,
                output,
                inputs_shape,
                border_mode=border_mode,
                subsample=subsample,
                conv_mode=conv_mode,
                filter_dilation=filter_dilation,
            )
            f_ref = pytensor.function([], c_ref, mode="FAST_RUN")
        if target_op is not None:
            assert any(isinstance(n.op, target_op) for n in f.maker.fgraph.toposort())
            if check_trace:
                assert check_stack_trace(f, ops_to_check=target_op)
        res = np.array(f())
        if ref is not None:
            res_ref = np.array(f_ref())
            utt.assert_allclose(res_ref, res)

        def abstract_conv_gradinputs(filters_val, output_val):
            # Closure used by verify_grad: rebuild the Op without the
            # shape hints so only the numeric inputs vary.
            conv_op = gradInputs_fn(
                border_mode=border_mode,
                subsample=subsample,
                filter_dilation=filter_dilation,
            )
            return conv_op(filters_val, output_val, inputs_shape[2:])

        if (
            verify_grad
            and filters_val.size > 0
            and output_val.size > 0
            and res.size > 0
        ):
            utt.verify_grad(
                abstract_conv_gradinputs, [filters_val, output_val], mode=mode, eps=1
            )

    def run_test_case(self, *args, **kargs):
        """Single parameterized test case; implemented by subclasses."""
        raise NotImplementedError()

    @pytest.mark.xfail(
        condition=isinstance(get_default_mode().linker, NumbaLinker),
        reason="Involves Ops with no Python implementation for numba to use as fallback",
    )
    def test_all(self):
        """Sweep the class's parameter grids through ``run_test_case``."""
        ds = self.default_subsamples
        db = self.default_border_mode
        dflip = self.default_filter_flip
        dprovide_shape = self.default_provide_shape
        for i, f in zip(self.inputs_shapes, self.filters_shapes, strict=True):
            for provide_shape in self.provide_shape:
                self.run_test_case(i, f, ds, db, dflip, provide_shape)
            # Empty shapes (any 0 dim) only run the default combination.
            if min(i) > 0 and min(f) > 0:
                for fd in self.filters_dilations:
                    for s in self.subsamples:
                        for b in self.border_modes:
                            self.run_test_case(i, f, s, b, dflip, dprovide_shape, fd)
                for flip in self.filter_flip:
                    self.run_test_case(i, f, ds, db, flip, dprovide_shape)
class BaseTestConv2d(BaseTestConv):
    """Parameter grids and runner bindings for 2D abstract conv tests."""

    @classmethod
    def setup_class(cls):
        # This tests can run even when config.blas__ldflags is empty.
        cls.inputs_shapes = [
            (8, 1, 6, 6),
            (8, 1, 8, 8),
            (2, 1, 7, 7),
            (6, 1, 10, 11),
            (2, 1, 6, 5),
            (1, 5, 9, 9),
            (0, 1, 6, 6),
            (1, 0, 6, 6),
            (1, 1, 6, 6),
        ]
        cls.filters_shapes = [
            (5, 1, 2, 2),
            (4, 1, 3, 3),
            (2, 1, 3, 3),
            (1, 1, 2, 3),
            (4, 1, 1, 3),
            (4, 5, 3, 2),
            (1, 1, 2, 2),
            (1, 0, 2, 2),
            (0, 1, 2, 2),
        ]
        cls.subsamples = [(1, 1), (2, 2), (2, 4)]
        cls.default_subsamples = (1, 1)
        cls.filters_dilations = [(1, 1), (1, 2), (2, 1)]
        cls.default_filters_dilations = (1, 1)
        cls.border_modes = ["valid", "half", "full", (0, 0), (1, 1), (5, 5), (5, 2)]
        cls.default_border_mode = (0, 0)
        cls.filter_flip = [True, False]
        cls.default_filter_flip = True
        cls.provide_shape = [True, False]
        cls.default_provide_shape = True
        cls.shared = staticmethod(pytensor.compile.shared)

    def run_test_case_gi(self, *args, **kwargs):
        """Grad-wrt-inputs test case; implemented by concrete subclasses."""
        raise NotImplementedError()

    @pytest.mark.xfail(
        condition=isinstance(get_default_mode().linker, NumbaLinker),
        reason="Involves Ops with no Python implementation for numba to use as fallback",
    )
    def test_gradinput_arbitrary_output_shapes(self):
        # this computes the grad wrt inputs for an output shape
        # that the forward convolution would not produce
        input_shape = (2, 1, 7, 7)
        filter_shape = (2, 1, 3, 3)
        for output_shape in [(2, 2, 8, 8), (2, 2, 9, 9), (2, 2, 12, 12)]:
            for border_mode in ["valid", "half", "full"]:
                computed_shape = get_conv_output_shape(
                    input_shape,
                    filter_shape,
                    border_mode,
                    self.default_subsamples,
                    self.default_filters_dilations,
                )
                # is this a valid combination?
                if tuple(computed_shape) == output_shape:
                    self.run_test_case_gi(
                        input_shape,
                        filter_shape,
                        output_shape,
                        self.default_subsamples,
                        border_mode,
                        True,
                        True,
                        self.default_filters_dilations,
                        False,
                    )
                else:
                    # expect an error
                    self.run_test_case_gi(
                        input_shape,
                        filter_shape,
                        output_shape,
                        self.default_subsamples,
                        border_mode,
                        True,
                        True,
                        self.default_filters_dilations,
                        True,
                    )

    def test_gradinput_impossible_output_shapes(self):
        """Grad-wrt-inputs must reject output shapes the forward conv could
        never have produced."""

        def run_for_output_offsets(image_shape, kernel_shape, s, border_mode, d):
            # outputs that are too large or too small should be rejected
            for o in (-3, -1, 1, 2):
                output_shape = (1, 1, computed_shape[2] + o, computed_shape[3] + o)
                # expect an error
                self.run_test_case_gi(
                    image_shape,
                    kernel_shape,
                    output_shape,
                    (s, s),
                    border_mode,
                    True,
                    True,
                    (d, d),
                    True,
                )

        for i, k in ((1, 1), (1, 2), (2, 1), (4, 2), (4, 3), (7, 3), (9, 5)):
            for border_mode in ("valid", "half", "full", (0, 2)):
                for s, d in ((1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (1, 3)):
                    image_shape = (1, 1, i, i)
                    kernel_shape = (1, 1, k, k)
                    # compute the output that these inputs and parameters would produce
                    computed_shape = get_conv_output_shape(
                        image_shape, kernel_shape, border_mode, (s, s), (d, d)
                    )
                    run_for_output_offsets(
                        image_shape,
                        kernel_shape,
                        s,
                        border_mode,
                        d,
                    )

    def run_fwd(
        self,
        inputs_shape,
        filters_shape,
        conv_fn=abstract_conv.abstract_conv2d,
        conv_op=abstract_conv.AbstractConv2d,
        ref=conv2d_corr,
        **kwargs,
    ):
        """Forward runner bound to the 2D Ops and 2D CorrMM reference."""
        super().run_fwd(
            inputs_shape=inputs_shape,
            filters_shape=filters_shape,
            conv_fn=conv_fn,
            conv_op=conv_op,
            ref=ref,
            **kwargs,
        )

    def run_gradweight(
        self,
        inputs_shape,
        filters_shape,
        output_shape,
        gradWeights_fn=abstract_conv.AbstractConv2d_gradWeights,
        ref=conv2d_corr_gw,
        **kwargs,
    ):
        """Grad-wrt-weights runner bound to the 2D Ops and reference."""
        super().run_gradweight(
            inputs_shape=inputs_shape,
            filters_shape=filters_shape,
            output_shape=output_shape,
            gradWeights_fn=gradWeights_fn,
            ref=ref,
            **kwargs,
        )

    def run_gradinput(
        self,
        inputs_shape,
        filters_shape,
        output_shape,
        gradInputs_fn=abstract_conv.AbstractConv2d_gradInputs,
        ref=conv2d_corr_gi,
        **kwargs,
    ):
        """Grad-wrt-inputs runner bound to the 2D Ops and reference."""
        super().run_gradinput(
            inputs_shape=inputs_shape,
            filters_shape=filters_shape,
            output_shape=output_shape,
            gradInputs_fn=gradInputs_fn,
            ref=ref,
            **kwargs,
        )
@pytest.mark.skipif(config.cxx == "", reason="cxx needed")
class TestAbstractConvNoOptim(BaseTestConv2d):
    """Run the 2D conv tests with graph optimization disabled, exercising
    the abstract Ops' own Python/C implementations directly."""

    @classmethod
    def setup_class(cls):
        # This tests can run even when config.blas__ldflags is empty.
        super().setup_class()
        # Reduced parameter grid: optimizations are off, so keep it small.
        cls.inputs_shapes = [(8, 1, 6, 6)]
        cls.filters_shapes = [(5, 1, 2, 2)]
        cls.subsamples = [(1, 1), (2, 2)]
        cls.filters_dilations = [(1, 1), (1, 2), (2, 1)]
        cls.border_modes = ["valid", "half", "full"]
        cls.filter_flip = [True]
        cls.provide_shape = [False]

    def run_test_case(self, i, f, s, b, flip, provide_shape, fd=(1, 1)):
        """Run forward, grad-weights and grad-inputs for one combination,
        all with the optimizer disabled."""
        o = self.get_output_shape(i, f, s, b, fd)
        mode = Mode(optimizer=None)
        self.run_fwd(
            inputs_shape=i,
            filters_shape=f,
            subsample=s,
            verify_grad=True,
            provide_shape=provide_shape,
            border_mode=b,
            filter_flip=flip,
            target_op=None,
            check_trace=True,
            filter_dilation=fd,
            mode=mode,
        )
        self.run_gradweight(
            inputs_shape=i,
            filters_shape=f,
            output_shape=o,
            subsample=s,
            verify_grad=True,
            provide_shape=provide_shape,
            border_mode=b,
            filter_flip=flip,
            target_op=None,
            check_trace=True,
            filter_dilation=fd,
            mode=mode,
        )
        self.run_gradinput(
            inputs_shape=i,
            filters_shape=f,
            output_shape=o,
            subsample=s,
            verify_grad=True,
            provide_shape=provide_shape,
            border_mode=b,
            filter_flip=flip,
            target_op=None,
            check_trace=True,
            filter_dilation=fd,
            mode=mode,
        )

    def run_test_case_gi(
        self, i, f, o, s, b, flip, provide_shape, fd=(1, 1), expect_error=False
    ):
        """Grad-wrt-inputs for one combination; when ``expect_error`` the
        shape combination is invalid and must raise ``ValueError``."""
        mode = Mode(optimizer=None)
        if not expect_error:
            self.run_gradinput(
                inputs_shape=i,
                filters_shape=f,
                output_shape=o,
                subsample=s,
                verify_grad=True,
                provide_shape=provide_shape,
                border_mode=b,
                filter_flip=flip,
                target_op=None,
                check_trace=True,
                filter_dilation=fd,
                mode=mode,
            )
        else:
            with pytest.raises(ValueError):
                # ref=None: the reference would also raise, masking the
                # exception from the implementation under test.
                self.run_gradinput(
                    inputs_shape=i,
                    filters_shape=f,
                    output_shape=o,
                    subsample=s,
                    verify_grad=False,
                    provide_shape=provide_shape,
                    border_mode=b,
                    filter_flip=flip,
                    target_op=None,
                    check_trace=True,
                    filter_dilation=fd,
                    ref=None,
                    mode=mode,
                )
class BaseTestConv3d(BaseTestConv):
    """Parameter grids and runner bindings for 3D abstract conv tests.

    Fix: ``test_gradinput_impossible_output_shapes`` previously passed
    2-tuples ``(s, s)`` / ``(d, d)`` for the subsample and dilation of a 3D
    convolution (copied from the 2D base class), so the expected
    ``ValueError`` came from the Op constructor rejecting the tuple length
    instead of from the impossible output shape being detected.  They are
    now 3-tuples, matching the ``get_conv_output_shape`` call below.
    """

    @classmethod
    def setup_class(cls):
        # This tests can run even when config.blas__ldflags is empty.
        cls.inputs_shapes = [
            (2, 1, 5, 5, 5),
            (1, 2, 7, 5, 6),
            (0, 1, 5, 5, 5),
            (1, 0, 5, 5, 5),
            (1, 1, 5, 5, 5),
        ]
        cls.filters_shapes = [
            (2, 1, 2, 2, 2),
            (1, 2, 2, 1, 3),
            (1, 1, 2, 2, 2),
            (1, 0, 2, 2, 2),
            (0, 1, 2, 2, 2),
        ]
        cls.subsamples = [(1, 1, 1), (2, 2, 2), (1, 2, 3)]
        cls.default_subsamples = (1, 1, 1)
        cls.filters_dilations = [(1, 1, 1), (1, 2, 1), (2, 1, 2)]
        cls.default_filters_dilations = (1, 1, 1)
        cls.border_modes = ["valid", "half", "full", (0, 0, 0), (2, 2, 3)]
        cls.default_border_mode = (0, 0, 0)
        cls.filter_flip = [True, False]
        cls.default_filter_flip = True
        cls.provide_shape = [True, False]
        cls.default_provide_shape = True
        cls.shared = staticmethod(pytensor.compile.shared)

    def test_gradinput_arbitrary_output_shapes(self):
        # this computes the grad wrt inputs for an output shape
        # that the forward convolution would not produce
        input_shape = (2, 1, 7, 7, 7)
        filter_shape = (1, 1, 3, 3, 3)
        for output_shape in [(2, 1, 8, 8, 8), (2, 1, 9, 9, 9), (2, 1, 12, 12, 12)]:
            for border_mode in ["valid", "half", "full"]:
                # compute the output that these inputs and parameters would produce
                computed_shape = get_conv_output_shape(
                    input_shape,
                    filter_shape,
                    border_mode,
                    self.default_subsamples,
                    self.default_filters_dilations,
                )
                # is this a valid combination?
                if tuple(computed_shape) == output_shape:
                    self.run_test_case_gi(
                        input_shape,
                        filter_shape,
                        output_shape,
                        self.default_subsamples,
                        border_mode,
                        True,
                        True,
                        self.default_filters_dilations,
                        False,
                    )
                else:
                    # expect an error
                    self.run_test_case_gi(
                        input_shape,
                        filter_shape,
                        output_shape,
                        self.default_subsamples,
                        border_mode,
                        True,
                        True,
                        self.default_filters_dilations,
                        True,
                    )

    def test_gradinput_impossible_output_shapes(self):
        """Grad-wrt-inputs must reject output shapes the forward conv could
        never have produced."""

        def run_for_output_offsets(image_shape, kernel_shape, s, border_mode, d):
            # outputs that are too large or too small should be rejected
            for o in (-3, -1, 1, 2):
                output_shape = (
                    1,
                    1,
                    computed_shape[2] + o,
                    computed_shape[3] + o,
                    computed_shape[4] + o,
                )
                # expect an error; subsample/dilation must be 3-tuples so the
                # failure comes from the shape check, not the Op constructor.
                self.run_test_case_gi(
                    image_shape,
                    kernel_shape,
                    output_shape,
                    (s, s, s),
                    border_mode,
                    True,
                    True,
                    (d, d, d),
                    True,
                )

        for i, k in ((1, 1), (1, 2), (2, 1), (4, 2), (4, 3), (7, 3), (9, 5)):
            for border_mode in ("valid", "half", "full", (0, 2, 1)):
                for s, d in ((1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (1, 3)):
                    image_shape = (1, 1, i, i, i)
                    kernel_shape = (1, 1, k, k, k)
                    # compute the output that these inputs and parameters would produce
                    computed_shape = get_conv_output_shape(
                        image_shape, kernel_shape, border_mode, (s, s, s), (d, d, d)
                    )
                    run_for_output_offsets(
                        image_shape,
                        kernel_shape,
                        s,
                        border_mode,
                        d,
                    )

    def run_fwd(
        self,
        inputs_shape,
        filters_shape,
        conv_fn=abstract_conv.conv3d,
        conv_op=abstract_conv.AbstractConv3d,
        ref=conv3d_corr,
        **kwargs,
    ):
        """Forward runner bound to the 3D Ops and 3D CorrMM reference."""
        super().run_fwd(
            inputs_shape=inputs_shape,
            filters_shape=filters_shape,
            conv_fn=conv_fn,
            conv_op=conv_op,
            ref=ref,
            **kwargs,
        )

    def run_gradweight(
        self,
        inputs_shape,
        filters_shape,
        output_shape,
        gradWeights_fn=abstract_conv.AbstractConv3d_gradWeights,
        ref=conv3d_corr_gw,
        **kwargs,
    ):
        """Grad-wrt-weights runner bound to the 3D Ops and reference."""
        super().run_gradweight(
            inputs_shape=inputs_shape,
            filters_shape=filters_shape,
            output_shape=output_shape,
            gradWeights_fn=gradWeights_fn,
            ref=ref,
            **kwargs,
        )

    def run_gradinput(
        self,
        inputs_shape,
        filters_shape,
        output_shape,
        gradInputs_fn=abstract_conv.AbstractConv3d_gradInputs,
        ref=conv3d_corr_gi,
        **kwargs,
    ):
        """Grad-wrt-inputs runner bound to the 3D Ops and reference."""
        super().run_gradinput(
            inputs_shape=inputs_shape,
            filters_shape=filters_shape,
            output_shape=output_shape,
            gradInputs_fn=gradInputs_fn,
            ref=ref,
            **kwargs,
        )
def test_constant_shapes():
    """The ``imshp``/``kshp`` parameters of the AbstractConv Ops must
    reject symbolic (non-constant) shapes and accept constants, plain
    tuples and lists."""
    dummy_t4 = ftensor4()
    alloc_dummy_t4 = pt.zeros((3, 5, 7, 11), dtype="float32")
    tuple_shape = (3, 5, 7, 11)
    constant_list_shape = [pt.constant(i, dtype="int64") for i in tuple_shape]

    # Symbolic or vector-valued shapes are not allowed.
    bad_shapes = (
        lvector(),
        pt.ones(4, dtype="int64"),
        dummy_t4.shape,
        alloc_dummy_t4.shape,
        pt.constant([3, 5, 7, 11]),
    )
    # Sequences of Python ints or scalar constants are allowed.
    good_shapes = (
        constant_list_shape,
        tuple(constant_list_shape),
        tuple_shape,
        list(tuple_shape),
    )

    for op in (AbstractConv2d, AbstractConv2d_gradInputs, AbstractConv2d_gradWeights):
        for shp in bad_shapes:
            for kwargs in ({"imshp": shp}, {"kshp": shp}):
                with pytest.raises(ValueError):
                    op(**kwargs)
        for shp in good_shapes:
            op(imshp=shp)
            op(kshp=shp)
class TestConvTypes:
    """The gradients of the abstract conv Ops must produce variables whose
    types match (or generalize to) the types of the variables they are
    gradients of, including when some inputs are numpy constants."""

    def setup_method(self):
        self.input = ftensor4()
        self.filters = ftensor4()
        self.topgrad = ftensor4()
        # Constant numpy input used to exercise the constant-input paths.
        self.constant_tensor = np.zeros((3, 5, 7, 11), dtype="float32")

    def test_grad_types(self):
        # This function simply tests the behaviour of the AbstractConv
        # Ops, not their optimizations
        input = self.input
        filters = self.filters
        topgrad = self.topgrad
        out_shape = lvector()
        # Forward Op: grads of both inputs keep their exact types.
        output = abstract_conv.abstract_conv2d(input, filters)
        grad_input, grad_filters = pytensor.grad(output.sum(), wrt=(input, filters))
        assert grad_input.type == input.type, (
            grad_input,
            grad_input.type,
            input,
            input.type,
        )
        assert grad_filters.type == filters.type, (
            grad_filters,
            grad_filters.type,
            filters,
            filters.type,
        )
        # Grad-wrt-weights Op: grads of (input, topgrad) keep their types.
        grad_filters = abstract_conv.AbstractConv2d_gradWeights()(
            input, topgrad, out_shape
        )
        grad_input, grad_topgrad = pytensor.grad(
            grad_filters.sum(), wrt=(input, topgrad)
        )
        assert grad_input.type == input.type, (
            grad_input,
            grad_input.type,
            input,
            input.type,
        )
        assert grad_topgrad.type == topgrad.type, (
            grad_topgrad,
            grad_topgrad.type,
            topgrad,
            topgrad.type,
        )
        # Grad-wrt-inputs Op: grads of (filters, topgrad) keep their types.
        grad_input = abstract_conv.AbstractConv2d_gradInputs()(
            filters, topgrad, out_shape
        )
        grad_filters, grad_topgrad = pytensor.grad(
            grad_input.sum(), wrt=(filters, topgrad)
        )
        assert grad_filters.type == filters.type, (
            grad_filters,
            grad_filters.type,
            filters,
            filters.type,
        )
        assert grad_topgrad.type == topgrad.type, (
            grad_topgrad,
            grad_topgrad.type,
            topgrad,
            topgrad.type,
        )

    def test_constant_input(self):
        # Check the AbstractConv Ops for constant inputs
        input = self.input
        filters = self.filters
        topgrad = self.topgrad
        constant_tensor = self.constant_tensor
        out_shape = lvector()
        # Check the forward Op
        # With a constant counterpart, the grad's type only needs to be a
        # subtype (is_super) of the symbolic variable's type.
        output = abstract_conv.abstract_conv2d(constant_tensor, filters)
        grad_filters = pytensor.grad(output.sum(), wrt=filters)
        assert filters.type.is_super(grad_filters.type), (
            grad_filters,
            grad_filters.type,
            filters,
            filters.type,
        )
        output = abstract_conv.abstract_conv2d(input, constant_tensor)
        grad_input = pytensor.grad(output.sum(), wrt=input)
        assert input.type.is_super(grad_input.type), (
            grad_input,
            grad_input.type,
            input,
            input.type,
        )
        # Check grad wrt weights
        grad_filters = abstract_conv.AbstractConv2d_gradWeights()(
            constant_tensor, topgrad, out_shape
        )
        grad_topgrad = pytensor.grad(grad_filters.sum(), wrt=topgrad)
        assert topgrad.type.is_super(grad_topgrad.type), (
            grad_topgrad,
            grad_topgrad.type,
            topgrad,
            topgrad.type,
        )
        grad_filters = abstract_conv.AbstractConv2d_gradWeights()(
            input, constant_tensor, out_shape
        )
        grad_input = pytensor.grad(grad_filters.sum(), wrt=input)
        assert grad_input.type == input.type, (
            grad_input,
            grad_input.type,
            input,
            input.type,
        )
        # Check grad wrt inputs
        grad_input = abstract_conv.AbstractConv2d_gradInputs()(
            constant_tensor, topgrad, out_shape
        )
        grad_topgrad = pytensor.grad(grad_input.sum(), wrt=topgrad)
        assert topgrad.type.is_super(grad_topgrad.type), (
            grad_topgrad,
            grad_topgrad.type,
            topgrad,
            topgrad.type,
        )
        grad_input = abstract_conv.AbstractConv2d_gradInputs()(
            filters, constant_tensor, out_shape
        )
        grad_filters = pytensor.grad(grad_input.sum(), wrt=filters)
        assert grad_filters.type == filters.type, (
            grad_filters,
            grad_filters.type,
            filters,
            filters.type,
        )
class TestBilinearUpsampling:
    """Tests for ``bilinear_upsampling`` and its 1D/2D kernel helpers.

    Expected values are computed with plain-NumPy reference implementations
    (``numerical_kernel_1D`` / ``numerical_kernel_2D`` and
    ``get_upsampled_twobytwo_mat``) and compared against compiled graphs.
    """

    # If config.blas__ldflags is empty, PyTensor will use
    # a NumPy C implementation of [sd]gemm_.
    compile_mode = pytensor.compile.mode.get_default_mode()
    if config.mode == "FAST_COMPILE":
        compile_mode = compile_mode.excluding("conv_gemm")
        compile_mode = compile_mode.excluding("AbstractConvCheck")
    elif not config.cxx:
        # Without a C++ compiler the check rewrite cannot be applied.
        compile_mode = compile_mode.excluding("AbstractConvCheck")
    def numerical_kernel_1D(self, ratio):
        """
        Gets numerical 1D kernel for bilinear upsampling
        """
        # Triangular kernel: 1, 2, ..., ratio, ..., 2, 1 (length 2*ratio - 1).
        return np.array(list(range(1, ratio + 1)) + list(range(ratio - 1, 0, -1)))
    def numerical_kernel_2D(self, ratio):
        """
        Gets numerical 2D kernel for bilinear upsampling
        """
        # Outer product of the 1D kernel with itself.
        return np.array(
            [
                i * j
                for i in self.numerical_kernel_1D(ratio)
                for j in self.numerical_kernel_1D(ratio)
            ]
        ).reshape(2 * ratio - 1, 2 * ratio - 1)
    def test_bilinear_kernel_2D(self):
        # Test 2D kernels used in bilinear upsampling
        #
        # This method tests the correctness of the
        # 2D kernel values used in bilinear upsampling
        # for some upsampling ratios.
        for ratio in [2, 3, 4, 5, 6, 7, 8, 9]:
            # getting the un-normalized kernel
            kernel = bilinear_kernel_2D(ratio=ratio, normalize=False)
            f = pytensor.function([], kernel)
            kernel_2D = self.numerical_kernel_2D(ratio)
            utt.assert_allclose(kernel_2D, f())
            # getting the normalized kernel
            kernel = bilinear_kernel_2D(ratio=ratio, normalize=True)
            f = pytensor.function([], kernel)
            kernel_2D = kernel_2D / float(ratio**2)
            utt.assert_allclose(kernel_2D, f())
    def test_bilinear_kernel_1D(self):
        # Test 1D kernels used in bilinear upsampling
        #
        # This method tests the correctness of the
        # 1D kernel values used in bilinear upsampling
        # for some upsampling ratios.
        # Also checks the symbolic-ratio path (ratio given as an iscalar).
        rat = iscalar()
        kernel_ten = bilinear_kernel_1D(ratio=rat, normalize=False)
        f_ten = pytensor.function([rat], kernel_ten)
        kernel_ten_norm = bilinear_kernel_1D(ratio=rat, normalize=True)
        f_ten_norm = pytensor.function([rat], kernel_ten_norm)
        for ratio in [2, 3, 4, 5, 6, 7, 8, 9]:
            # getting the un-normalized kernel
            kernel = bilinear_kernel_1D(ratio=ratio, normalize=False)
            f = pytensor.function([], kernel)
            kernel_1D = self.numerical_kernel_1D(ratio)
            utt.assert_allclose(kernel_1D, f())
            utt.assert_allclose(kernel_1D, f_ten(ratio))
            # getting the normalized kernel
            kernel = bilinear_kernel_1D(ratio=ratio, normalize=True)
            f = pytensor.function([], kernel)
            kernel_1D = kernel_1D / float(ratio)
            utt.assert_allclose(kernel_1D, f())
            utt.assert_allclose(kernel_1D, f_ten_norm(ratio))
    def numerical_upsampling_multiplier(self, ratio):
        """
        Compute upsampling multiplier
        This method computes the multipliers of an array
        that will be upsampled using bilinear interpolation.
        Parameters
        ----------
        ratio: int
            the ratio by which the array will be upsampled.
        Returns
        -------
        1D numpy array
            The multipliers that can be used in bilinear interpolation
            to upsample an array.
        int
            The size of the multipliers array
        """
        kern = np.arange(ratio + 1)
        return kern, kern.shape[0]
    def get_upsampled_twobytwo_mat(self, two_by_two, ratio):
        """
        Upsample 4D array with two rows and two columns
        This method gets a 4D numpy array with two rows and two columns
        and computes its upsampled array by using bilinear interpolation
        Parameters
        ----------
        two_by_two: numpy 4D array
            The array that will be upsampled by bilinear interpolation.
            Array is of shape (batch size, num channels, 2, 2)
        ratio: int
            The ratio by which two_by_two's last
            two dimensions (row and col) will be upsampled.
        Returns
        -------
        4D numpy array
            The array upsampled by using bilinear interpolation. Array
            is of shape (batch size, num channels, 2*ratio, 2*ratio).
        """
        kern, _shp = self.numerical_upsampling_multiplier(ratio)
        # Interpolate along columns, then rows.
        up_1D = two_by_two[:, :, :, :1] * kern[::-1] + two_by_two[:, :, :, 1:] * kern
        up_2D = (
            up_1D[:, :, :1, :] * kern[::-1][:, np.newaxis]
            + up_1D[:, :, 1:, :] * kern[:, np.newaxis]
        )
        # Replicate border rows/cols to reach the target 2*ratio size.
        num_concat = (ratio - 1) // 2
        for i in range(num_concat):
            up_2D = np.concatenate([up_2D[:, :, :1, :], up_2D], axis=2)
            up_2D = np.concatenate([up_2D, up_2D[:, :, -1:, :]], axis=2)
            up_2D = np.concatenate([up_2D[:, :, :, :1], up_2D], axis=3)
            up_2D = np.concatenate([up_2D, up_2D[:, :, :, -1:]], axis=3)
        if ratio % 2 == 0:
            # Even ratios need one extra trailing row and column.
            up_2D = np.concatenate([up_2D, up_2D[:, :, -1:, :]], axis=2)
            up_2D = np.concatenate([up_2D, up_2D[:, :, :, -1:]], axis=3)
        return up_2D / float(ratio) ** 2
    def test_bilinear_upsampling_1D(self):
        # Test bilinear upsampling using 1D kernels
        #
        # This method tests the bilinear_upsampling method
        # when using 1D kernels for some upsampling ratios.
        # upsampling for a ratio of two
        input_x = np.array([[[[1, 2], [3, 4]]]], dtype=config.floatX)
        for ratio in [2, 3, 4, 5, 6, 7, 8, 9]:
            bilin_mat = bilinear_upsampling(
                input=input_x,
                ratio=ratio,
                batch_size=1,
                num_input_channels=1,
                use_1D_kernel=True,
            )
            f = pytensor.function([], bilin_mat, mode=self.compile_mode)
            up_mat_2d = self.get_upsampled_twobytwo_mat(input_x, ratio)
            utt.assert_allclose(f(), up_mat_2d, rtol=1e-06)
    def test_bilinear_upsampling_reshaping(self):
        # Test bilinear upsampling without giving shape information
        #
        # This method tests the bilinear_upsampling method
        # without giving batch_size and num_input_channels
        # upsampling for a ratio of two
        input_x = np.array([[[[1, 2], [3, 4]]]], dtype=config.floatX)
        for ratio in [2, 3]:
            for use_1D_kernel in [True, False]:
                bilin_mat = bilinear_upsampling(
                    input=input_x,
                    ratio=ratio,
                    batch_size=None,
                    num_input_channels=None,
                    use_1D_kernel=use_1D_kernel,
                )
                f = pytensor.function([], bilin_mat, mode=self.compile_mode)
                up_mat_2d = self.get_upsampled_twobytwo_mat(input_x, ratio)
                utt.assert_allclose(f(), up_mat_2d, rtol=1e-06)
    def test_compare_1D_and_2D_upsampling_values(self):
        # Compare 1D and 2D upsampling
        #
        # This method verifies the bilinear upsampling done by using
        # 1D and 2D kernels will generate the same result.
        # checking upsampling with ratio 5
        rng = np.random.default_rng(280284)
        input_x = rng.random((5, 4, 6, 7)).astype(config.floatX)
        mat_1D = bilinear_upsampling(
            input=input_x,
            ratio=5,
            batch_size=5,
            num_input_channels=4,
            use_1D_kernel=True,
        )
        mat_2D = bilinear_upsampling(
            input=input_x,
            ratio=5,
            batch_size=5,
            num_input_channels=4,
            use_1D_kernel=False,
        )
        f_1D = pytensor.function([], mat_1D, mode=self.compile_mode)
        f_2D = pytensor.function([], mat_2D, mode=self.compile_mode)
        utt.assert_allclose(f_1D(), f_2D(), rtol=1e-06)
        # checking upsampling with ratio 8
        input_x = rng.random((12, 11, 10, 7)).astype(config.floatX)
        mat_1D = bilinear_upsampling(
            input=input_x,
            ratio=8,
            batch_size=12,
            num_input_channels=11,
            use_1D_kernel=True,
        )
        mat_2D = bilinear_upsampling(
            input=input_x,
            ratio=8,
            batch_size=12,
            num_input_channels=11,
            use_1D_kernel=False,
        )
        f_1D = pytensor.function([], mat_1D, mode=self.compile_mode)
        f_2D = pytensor.function([], mat_2D, mode=self.compile_mode)
        utt.assert_allclose(f_1D(), f_2D(), rtol=1e-06)
    def test_fractional_bilinear_upsampling(self):
        """Test bilinear upsampling with nonsimilar fractional
        row and col ratios
        """
        input_x = np.array(
            [[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[9, 10], [11, 12]]], ndmin=4
        ).astype(config.floatX)
        # Row ratio 7/4, column ratio 5/3.
        up_x = bilinear_upsampling(
            input=input_x, frac_ratio=((7, 4), (5, 3)), use_1D_kernel=False
        )
        # Precomputed reference output for the fixture above.
        num_up_x = np.array(
            [
                [
                    [
                        [1.0, 1.2, 1.8, 2.0],
                        [1.28571429, 1.48571429, 2.08571429, 2.28571429],
                        [2.42857143, 2.62857143, 3.22857143, 3.42857143],
                        [3.0, 3.2, 3.8, 4.0],
                    ],
                    [
                        [5.0, 5.2, 5.8, 6.0],
                        [5.28571429, 5.48571429, 6.08571429, 6.28571429],
                        [6.42857143, 6.62857143, 7.22857143, 7.42857143],
                        [7.0, 7.2, 7.8, 8.0],
                    ],
                    [
                        [9.0, 9.2, 9.8, 10.0],
                        [9.28571429, 9.48571429, 10.08571429, 10.28571429],
                        [10.42857143, 10.62857143, 11.22857143, 11.42857143],
                        [11.0, 11.2, 11.8, 12.0],
                    ],
                ]
            ]
        ).astype(config.floatX)
        f_up_x = pytensor.function([], up_x, mode=self.compile_mode)
        utt.assert_allclose(f_up_x(), num_up_x, rtol=1e-6)
    def test_fractional_bilinear_upsampling_shape(self):
        # Check only the output shape when a single frac_ratio pair
        # is applied to both spatial dimensions.
        x = np.random.random((1, 1, 200, 200)).astype(config.floatX)
        resize = (24, 20)
        z = bilinear_upsampling(
            pt.as_tensor_variable(x), frac_ratio=resize, use_1D_kernel=False
        )
        out = pytensor.function([], z.shape, mode="FAST_RUN")()
        # 200 * 24 / 20 = 240 on both spatial axes.
        utt.assert_allclose(out, (1, 1, 240, 240))
class TestConv2dTranspose:
    """Tests for the ``conv2d_transpose`` user-facing wrapper."""

    # Compilation mode; None means the default mode.
    mode = None
    @pytest.mark.skipif(config.cxx == "", reason="Test needs cxx")
    def test_interface(self):
        # Test conv2d_transpose wrapper.
        #
        # This method tests that the order of the filter's
        # axes expected by the function produces the correct
        # output shape.
        mode = self.mode
        if config.mode == "FAST_COMPILE":
            mode = (
                pytensor.compile.get_mode(mode)
                .excluding("conv_gemm")
                .excluding("AbstractConvCheck")
            )
        # All-ones input and filters make the expected output a simple
        # count of overlapping filter taps at each position.
        output = pytensor.function(
            inputs=[],
            outputs=conv2d_transpose(
                input=pt.ones((2, 2, 4, 4)),
                filters=pt.ones((2, 1, 4, 4)),
                output_shape=(2, 1, 10, 10),
                input_dilation=(2, 2),
            ),
            mode=mode,
        )()
        # Precomputed tap counts for the configuration above; replicated
        # across the batch dimension.
        expected_output = np.array(
            [
                [
                    [
                        [2, 2, 4, 4, 4, 4, 4, 4, 2, 2],
                        [2, 2, 4, 4, 4, 4, 4, 4, 2, 2],
                        [4, 4, 8, 8, 8, 8, 8, 8, 4, 4],
                        [4, 4, 8, 8, 8, 8, 8, 8, 4, 4],
                        [4, 4, 8, 8, 8, 8, 8, 8, 4, 4],
                        [4, 4, 8, 8, 8, 8, 8, 8, 4, 4],
                        [4, 4, 8, 8, 8, 8, 8, 8, 4, 4],
                        [4, 4, 8, 8, 8, 8, 8, 8, 4, 4],
                        [2, 2, 4, 4, 4, 4, 4, 4, 2, 2],
                        [2, 2, 4, 4, 4, 4, 4, 4, 2, 2],
                    ]
                ]
            ]
            * 2
        )
        np.testing.assert_equal(output, expected_output)
@pytest.mark.skipif(
    not config.cxx or config.mode == "FAST_COMPILE",
    # Fixed: this class tests conv2d grads, not conv3d.
    reason="Need blas to test conv2d",
)
class TestConv2dGrads:
    """Compare ``conv2d_grad_wrt_inputs`` / ``conv2d_grad_wrt_weights``
    against gradients derived from the forward ``conv2d`` graph.

    Each test sweeps border modes, subsampling factors and filter-flip
    settings over a small set of image/filter shape pairs.
    """

    def setup_method(self):
        self.random_stream = np.random.default_rng(utt.fetch_seed())
        # (batch, channels, rows, cols) image shapes, paired elementwise
        # with the filter shapes below.
        self.inputs_shapes = [(8, 1, 12, 12), (1, 1, 5, 5), (1, 1, 5, 6), (1, 1, 6, 6)]
        self.filters_shapes = [(5, 1, 2, 2), (1, 1, 3, 3)] * 2
        self.subsamples = [(1, 1), (2, 2)]
        self.border_modes = ["valid", "full"]
        self.filter_flip = [True, False]
        self.output_grad = tensor4()
        self.output_grad_wrt = tensor4()
        self.x = tensor4("x", dtype=config.floatX)  # inputs
        self.w = tensor4("w", dtype=config.floatX)  # filter weights
    def test_conv2d_grad_wrt_inputs(self):
        # Compares calculated abstract grads wrt inputs with the fwd grads
        # This method checks the outputs of `conv2d_grad_wrt_inputs` against
        # the outputs of `pytensor.tensor.conv` forward grads to make sure the
        # results are the same.
        for in_shape, fltr_shape in zip(
            self.inputs_shapes, self.filters_shapes, strict=True
        ):
            for bm in self.border_modes:
                for ss in self.subsamples:
                    for ff in self.filter_flip:
                        input_val = self.random_stream.random(in_shape).astype(
                            config.floatX
                        )
                        filter_val = self.random_stream.random(fltr_shape).astype(
                            config.floatX
                        )
                        out_grad_shape = abstract_conv.get_conv_output_shape(
                            image_shape=in_shape,
                            kernel_shape=fltr_shape,
                            border_mode=bm,
                            subsample=ss,
                        )
                        out_grad_val = self.random_stream.random(out_grad_shape).astype(
                            config.floatX
                        )
                        # Reference: grad of the forward conv wrt the image,
                        # with a known output gradient.
                        conv_out = abstract_conv.conv2d(
                            self.x,
                            filters=self.w,
                            border_mode=bm,
                            subsample=ss,
                            input_shape=in_shape,
                            filter_shape=fltr_shape,
                            filter_flip=ff,
                        )
                        conv_grad = pytensor.grad(
                            conv_out.sum(),
                            wrt=self.x,
                            known_grads={conv_out: self.output_grad},
                        )
                        f_old = pytensor.function(
                            [self.x, self.w, self.output_grad], conv_grad
                        )
                        # Candidate: direct grad-wrt-inputs helper.
                        conv_wrt_i_out = abstract_conv.conv2d_grad_wrt_inputs(
                            output_grad=self.output_grad_wrt,
                            filters=self.w,
                            border_mode=bm,
                            subsample=ss,
                            input_shape=in_shape,
                            filter_shape=fltr_shape,
                            filter_flip=ff,
                        )
                        f_new = pytensor.function(
                            [self.w, self.output_grad_wrt], conv_wrt_i_out
                        )
                        # check that they're equal
                        utt.assert_allclose(
                            f_new(filter_val, out_grad_val),
                            f_old(input_val, filter_val, out_grad_val),
                        )
    def test_conv2d_grad_wrt_weights(self):
        # Compares calculated abstract grads wrt weights with the fwd grads
        # This method checks the outputs of `conv2d_grad_wrt_weights` against
        # the outputs of `pytensor.tensor.conv` forward grads to make sure the
        # results are the same.
        for in_shape, fltr_shape in zip(
            self.inputs_shapes, self.filters_shapes, strict=True
        ):
            for bm in self.border_modes:
                for ss in self.subsamples:
                    for ff in self.filter_flip:
                        input_val = self.random_stream.random(in_shape).astype(
                            config.floatX
                        )
                        filter_val = self.random_stream.random(fltr_shape).astype(
                            config.floatX
                        )
                        out_grad_shape = abstract_conv.get_conv_output_shape(
                            image_shape=in_shape,
                            kernel_shape=fltr_shape,
                            border_mode=bm,
                            subsample=ss,
                        )
                        out_grad_val = self.random_stream.random(out_grad_shape).astype(
                            config.floatX
                        )
                        # Reference: grad of the forward conv wrt the weights.
                        conv_out = abstract_conv.conv2d(
                            self.x,
                            filters=self.w,
                            border_mode=bm,
                            subsample=ss,
                            input_shape=in_shape,
                            filter_shape=fltr_shape,
                            filter_flip=ff,
                        )
                        conv_grad = pytensor.grad(
                            conv_out.sum(),
                            wrt=self.w,
                            known_grads={conv_out: self.output_grad},
                        )
                        f_old = pytensor.function(
                            [self.x, self.w, self.output_grad], conv_grad
                        )
                        # Candidate: direct grad-wrt-weights helper.
                        conv_wrt_w_out = abstract_conv.conv2d_grad_wrt_weights(
                            self.x,
                            output_grad=self.output_grad_wrt,
                            border_mode=bm,
                            subsample=ss,
                            input_shape=in_shape,
                            filter_shape=fltr_shape,
                            filter_flip=ff,
                        )
                        f_new = pytensor.function(
                            [self.x, self.output_grad_wrt], conv_wrt_w_out
                        )
                        utt.assert_allclose(
                            f_new(input_val, out_grad_val),
                            f_old(input_val, filter_val, out_grad_val),
                        )
@pytest.mark.skipif(config.cxx == "", reason="cxx needed")
@pytest.mark.xfail(
    condition=isinstance(get_default_mode().linker, NumbaLinker),
    reason="Involves Ops with no Python implementation for numba to use as fallback",
)
class TestGroupedConvNoOptim:
    """Check grouped 2D convolution (num_groups > 1) against a reference
    built by splitting images/kernels per group and running an ungrouped
    correlation on each split, then concatenating the results.

    Subclasses (e.g. the 3D variant) override the Op aliases and fixtures.
    """

    # Ops under test and the Op classes expected in the compiled graph.
    conv = abstract_conv.AbstractConv2d
    conv_gradw = abstract_conv.AbstractConv2d_gradWeights
    conv_gradi = abstract_conv.AbstractConv2d_gradInputs
    conv_op = abstract_conv.AbstractConv2d
    conv_gradw_op = abstract_conv.AbstractConv2d_gradWeights
    conv_gradi_op = abstract_conv.AbstractConv2d_gradInputs
    # No graph rewrites: the abstract Ops must remain in the graph.
    mode = Mode(optimizer=None)
    is_dnn = False
    def setup_method(self):
        # Per-case group counts, matched elementwise with the shape lists.
        self.num_groups = [3, 2, 4, 4]
        self.border_mode = "valid"
        self.subsample = (1, 1)
        self.img_shape = [(5, 6, 5, 5), (4, 4, 7, 5), (3, 8, 5, 3), (2, 4, 7, 7)]
        self.kern_shape = [(6, 2, 3, 3), (6, 2, 5, 3), (4, 2, 3, 3), (4, 1, 3, 5)]
        self.top_shape = [(5, 6, 3, 3), (4, 6, 3, 3), (3, 4, 3, 1), (2, 4, 5, 3)]
        self.filter_dilation = (1, 1)
        self.ref_mode = "FAST_RUN"
        self.convdim = 2
        # Reference correlation helpers (forward / grad-weights / grad-inputs).
        self.corr_fwd = conv2d_corr
        self.corr_gradw = conv2d_corr_gw
        self.corr_gradi = conv2d_corr_gi
    def test_fwd(self):
        # Forward grouped conv vs. per-group ungrouped reference.
        if self.convdim == 2:
            img_sym = tensor4("img")
            kern_sym = tensor4("kern")
        else:
            img_sym = tensor5("img")
            kern_sym = tensor5("kern")
        for imshp, kshp, groups in zip(
            self.img_shape, self.kern_shape, self.num_groups, strict=True
        ):
            img = np.random.random(imshp).astype(config.floatX)
            kern = np.random.random(kshp).astype(config.floatX)
            # Split channels (axis 1) of the image and output channels
            # (axis 0) of the kernels into per-group chunks.
            split_imgs = np.split(img, groups, axis=1)
            split_kern = np.split(kern, groups, axis=0)
            grouped_conv_op = self.conv(
                border_mode=self.border_mode,
                subsample=self.subsample,
                filter_dilation=self.filter_dilation,
                num_groups=groups,
            )
            grouped_conv_output = grouped_conv_op(img_sym, kern_sym)
            grouped_func = pytensor.function(
                [img_sym, kern_sym], grouped_conv_output, mode=self.mode
            )
            # The abstract Op must survive compilation (no optimizer).
            assert any(
                isinstance(node.op, self.conv_op)
                for node in grouped_func.maker.fgraph.toposort()
            )
            grouped_output = grouped_func(img, kern)
            ref_conv_op = self.corr_fwd(
                img_sym,
                kern_sym,
                border_mode=self.border_mode,
                subsample=self.subsample,
                filter_dilation=self.filter_dilation,
            )
            ref_func = pytensor.function(
                [img_sym, kern_sym], ref_conv_op, mode=self.ref_mode
            )
            ref_concat_output = [
                ref_func(img_arr, kern_arr)
                for img_arr, kern_arr in zip(split_imgs, split_kern, strict=True)
            ]
            # Group outputs are concatenated along the channel axis.
            ref_concat_output = np.concatenate(ref_concat_output, axis=1)
            utt.assert_allclose(grouped_output, ref_concat_output)
            utt.verify_grad(grouped_conv_op, [img, kern], mode=self.mode, eps=1)
    def test_gradweights(self):
        # Grad-wrt-weights of grouped conv vs. per-group reference.
        if self.convdim == 2:
            img_sym = tensor4("img")
            top_sym = tensor4("kern")
        else:
            img_sym = tensor5("img")
            top_sym = tensor5("kern")
        for imshp, kshp, tshp, groups in zip(
            self.img_shape,
            self.kern_shape,
            self.top_shape,
            self.num_groups,
            strict=True,
        ):
            img = np.random.random(imshp).astype(config.floatX)
            top = np.random.random(tshp).astype(config.floatX)
            split_imgs = np.split(img, groups, axis=1)
            split_top = np.split(top, groups, axis=1)
            grouped_convgrad_op = self.conv_gradw(
                border_mode=self.border_mode,
                subsample=self.subsample,
                filter_dilation=self.filter_dilation,
                num_groups=groups,
            )
            # The gradWeights Op also takes the spatial kernel shape.
            grouped_conv_output = grouped_convgrad_op(
                img_sym, top_sym, pt.as_tensor_variable(kshp[-self.convdim :])
            )
            grouped_func = pytensor.function(
                [img_sym, top_sym], grouped_conv_output, mode=self.mode
            )
            assert any(
                isinstance(node.op, self.conv_gradw_op)
                for node in grouped_func.maker.fgraph.toposort()
            )
            grouped_output = grouped_func(img, top)
            ref_conv_op = self.corr_gradw(
                img_sym,
                top_sym,
                kshp,
                border_mode=self.border_mode,
                subsample=self.subsample,
                filter_dilation=self.filter_dilation,
            )
            ref_func = pytensor.function(
                [img_sym, top_sym], ref_conv_op, mode=self.ref_mode
            )
            ref_concat_output = [
                ref_func(img_arr, top_arr)
                for img_arr, top_arr in zip(split_imgs, split_top, strict=True)
            ]
            # Weight gradients concatenate along the output-channel axis.
            ref_concat_output = np.concatenate(ref_concat_output, axis=0)
            utt.assert_allclose(grouped_output, ref_concat_output)
            def conv_gradweight(inputs_val, output_val):
                # Wrapper fixing the shape argument for verify_grad.
                return grouped_convgrad_op(
                    inputs_val,
                    output_val,
                    pt.as_tensor_variable(kshp[-self.convdim :]),
                )
            utt.verify_grad(conv_gradweight, [img, top], mode=self.mode, eps=1)
    def test_gradinputs(self):
        # Grad-wrt-inputs of grouped conv vs. per-group reference.
        if self.convdim == 2:
            kern_sym = tensor4("kern")
            top_sym = tensor4("top")
        else:
            kern_sym = tensor5("kern")
            top_sym = tensor5("top")
        for imshp, kshp, tshp, groups in zip(
            self.img_shape,
            self.kern_shape,
            self.top_shape,
            self.num_groups,
            strict=True,
        ):
            kern = np.random.random(kshp).astype(config.floatX)
            top = np.random.random(tshp).astype(config.floatX)
            split_kerns = np.split(kern, groups, axis=0)
            split_top = np.split(top, groups, axis=1)
            grouped_convgrad_op = self.conv_gradi(
                border_mode=self.border_mode,
                subsample=self.subsample,
                filter_dilation=self.filter_dilation,
                num_groups=groups,
            )
            # The gradInputs Op also takes the spatial image shape.
            grouped_conv_output = grouped_convgrad_op(
                kern_sym, top_sym, pt.as_tensor_variable(imshp[-self.convdim :])
            )
            grouped_func = pytensor.function(
                [kern_sym, top_sym], grouped_conv_output, mode=self.mode
            )
            assert any(
                isinstance(node.op, self.conv_gradi_op)
                for node in grouped_func.maker.fgraph.toposort()
            )
            grouped_output = grouped_func(kern, top)
            ref_conv_op = self.corr_gradi(
                kern_sym,
                top_sym,
                imshp,
                border_mode=self.border_mode,
                subsample=self.subsample,
                filter_dilation=self.filter_dilation,
            )
            ref_func = pytensor.function(
                [kern_sym, top_sym], ref_conv_op, mode=self.ref_mode
            )
            ref_concat_output = [
                ref_func(kern_arr, top_arr)
                for kern_arr, top_arr in zip(split_kerns, split_top, strict=True)
            ]
            # Input gradients concatenate along the channel axis.
            ref_concat_output = np.concatenate(ref_concat_output, axis=1)
            utt.assert_allclose(grouped_output, ref_concat_output)
            def conv_gradinputs(filters_val, output_val):
                # Wrapper fixing the shape argument for verify_grad.
                return grouped_convgrad_op(
                    filters_val,
                    output_val,
                    pt.as_tensor_variable(imshp[-self.convdim :]),
                )
            utt.verify_grad(conv_gradinputs, [kern, top], mode=self.mode, eps=1)
@pytest.mark.skipif(config.cxx == "", reason="cxx needed")
@pytest.mark.xfail(
    condition=isinstance(get_default_mode().linker, NumbaLinker),
    reason="Involves Ops with no Python implementation for numba to use as fallback",
)
class TestGroupedConv3dNoOptim(TestGroupedConvNoOptim):
    """3D variant of the grouped-convolution tests: swaps in the
    AbstractConv3d Ops and 5D fixtures; the test methods are inherited.
    """

    conv = abstract_conv.AbstractConv3d
    conv_gradw = abstract_conv.AbstractConv3d_gradWeights
    conv_gradi = abstract_conv.AbstractConv3d_gradInputs
    conv_op = abstract_conv.AbstractConv3d
    conv_gradw_op = abstract_conv.AbstractConv3d_gradWeights
    conv_gradi_op = abstract_conv.AbstractConv3d_gradInputs
    mode = Mode(optimizer=None)
    def setup_method(self):
        # 5D (batch, channels, depth, rows, cols) fixtures, matched
        # elementwise with num_groups.
        self.num_groups = [3, 2, 4, 4]
        self.border_mode = "valid"
        self.subsample = (1, 1, 1)
        self.img_shape = [
            (2, 6, 5, 5, 5),
            (1, 4, 7, 5, 7),
            (1, 8, 5, 3, 5),
            (2, 4, 7, 7, 7),
        ]
        self.kern_shape = [
            (3, 2, 3, 3, 3),
            (6, 2, 5, 3, 5),
            (4, 2, 3, 3, 3),
            (4, 1, 3, 5, 3),
        ]
        self.top_shape = [
            (2, 3, 3, 3, 3),
            (1, 6, 3, 3, 3),
            (1, 4, 3, 1, 3),
            (2, 4, 5, 3, 5),
        ]
        self.filter_dilation = (1, 1, 1)
        self.ref_mode = "FAST_RUN"
        self.convdim = 3
        # 3D reference correlation helpers.
        self.corr_fwd = conv3d_corr
        self.corr_gradw = conv3d_corr_gw
        self.corr_gradi = conv3d_corr_gi
class TestSeparableConv:
    """Tests for ``separable_conv2d`` / ``separable_conv3d`` against
    hand-precomputed outputs for a fixed depthwise + pointwise filter pair.
    """

    def setup_method(self):
        # Input image: (1 batch, 2 channels, 5, 5).
        self.x = np.array(
            [
                [
                    [
                        [1, 2, 3, 4, 5],
                        [3, 2, 1, 4, 5],
                        [3, 3, 1, 3, 6],
                        [5, 3, 2, 1, 1],
                        [4, 7, 1, 2, 1],
                    ],
                    [
                        [3, 3, 1, 2, 6],
                        [6, 5, 4, 3, 1],
                        [3, 4, 5, 2, 3],
                        [6, 4, 1, 3, 4],
                        [2, 3, 4, 2, 5],
                    ],
                ]
            ]
        ).astype(config.floatX)
        # Depthwise filters: (4 output maps, 1, 3, 3).
        self.depthwise_filter = np.array(
            [
                [[[3, 2, 1], [5, 3, 2], [6, 4, 2]]],
                [[[5, 5, 2], [3, 7, 4], [3, 5, 4]]],
                [[[7, 4, 7], [5, 3, 3], [1, 3, 1]]],
                [[[4, 4, 4], [2, 4, 6], [0, 0, 7]]],
            ]
        ).astype(config.floatX)
        # Pointwise (1x1) filters mixing the 4 depthwise maps into 2 outputs.
        self.pointwise_filter = np.array(
            [[[[4]], [[1]], [[3]], [[5]]], [[[2]], [[1]], [[2]], [[8]]]]
        ).astype(config.floatX)
        # Precomputed expected outputs for "valid" and "full" border modes.
        self.precomp_output_valid = np.array(
            [
                [
                    [[1385, 1333, 1339], [1382, 1243, 1291], [1303, 1120, 1228]],
                    [[1532, 1410, 1259], [1522, 1346, 1314], [1379, 1192, 1286]],
                ]
            ]
        ).astype(config.floatX)
        self.precomp_output_full = np.array(
            [
                [
                    [
                        [140, 266, 343, 206, 59],
                        [395, 697, 979, 585, 245],
                        [429, 863, 1385, 919, 453],
                        [243, 499, 864, 627, 371],
                        [90, 183, 291, 254, 202],
                    ],
                    [
                        [149, 289, 359, 213, 58],
                        [400, 750, 1076, 662, 266],
                        [387, 854, 1532, 1091, 540],
                        [174, 411, 971, 786, 518],
                        [51, 110, 286, 299, 298],
                    ],
                ]
            ]
        ).astype(config.floatX)
    @pytest.mark.skipif(config.cxx == "", reason="test needs cxx")
    def test_interface2d(self):
        # 2D separable conv: default args, shape inference, subsample
        # and border_mode variants.
        x_sym = tensor4("x")
        dfilter_sym = tensor4("d")
        pfilter_sym = tensor4("p")
        sep_op = separable_conv2d(x_sym, dfilter_sym, pfilter_sym, self.x.shape[1])
        fun = pytensor.function(
            [x_sym, dfilter_sym, pfilter_sym], sep_op, mode="FAST_RUN"
        )
        # test for square matrix
        top = fun(self.x, self.depthwise_filter, self.pointwise_filter)
        utt.assert_allclose(top, self.precomp_output_valid)
        # test for non-square matrix
        top = fun(self.x[:, :, :3, :], self.depthwise_filter, self.pointwise_filter)
        utt.assert_allclose(top, self.precomp_output_valid[:, :, :1, :])
        # test if it infers shape
        sep_op = separable_conv2d(
            x_sym,
            dfilter_sym,
            pfilter_sym,
            self.x.shape[1],
            input_shape=self.x.shape,
            depthwise_filter_shape=self.depthwise_filter.shape,
            pointwise_filter_shape=self.pointwise_filter.shape,
        )
        fun = pytensor.function(
            [x_sym, dfilter_sym, pfilter_sym], sep_op, mode="FAST_RUN"
        )
        top = fun(self.x, self.depthwise_filter, self.pointwise_filter)
        utt.assert_allclose(top, self.precomp_output_valid)
        # test non-default subsample
        sep_op = separable_conv2d(
            x_sym, dfilter_sym, pfilter_sym, self.x.shape[1], subsample=(2, 2)
        )
        fun = pytensor.function(
            [x_sym, dfilter_sym, pfilter_sym], sep_op, mode="FAST_RUN"
        )
        top = fun(self.x, self.depthwise_filter, self.pointwise_filter)
        # Subsampling by 2 keeps every other row/col of the valid output.
        utt.assert_allclose(
            top, np.delete(np.delete(self.precomp_output_valid, 1, axis=3), 1, axis=2)
        )
        # test non-default border_mode
        sep_op = separable_conv2d(
            x_sym, dfilter_sym, pfilter_sym, self.x.shape[1], border_mode="full"
        )
        fun = pytensor.function(
            [x_sym, dfilter_sym, pfilter_sym], sep_op, mode="FAST_RUN"
        )
        top = fun(self.x[:, :, :3, :3], self.depthwise_filter, self.pointwise_filter)
        utt.assert_allclose(top, self.precomp_output_full)
    @pytest.mark.skipif(config.cxx == "", reason="test needs cxx")
    def test_interface3d(self):
        # 3D separable conv, built by tiling the 2D fixtures along depth.
        # Expand the filter along the depth
        x = np.tile(np.expand_dims(self.x, axis=2), (1, 1, 5, 1, 1))
        depthwise_filter = np.tile(
            np.expand_dims(self.depthwise_filter, axis=2), (1, 1, 3, 1, 1)
        )
        pointwise_filter = np.expand_dims(self.pointwise_filter, axis=2)
        # Depth-3 filter over a constant depth axis scales the 2D result by 3.
        precomp_output = (
            np.tile(np.expand_dims(self.precomp_output_valid, axis=2), (1, 1, 3, 1, 1))
            * 3
        )
        x_sym = tensor5("x")
        dfilter_sym = tensor5("d")
        pfilter_sym = tensor5("p")
        sep_op = separable_conv3d(x_sym, dfilter_sym, pfilter_sym, x.shape[1])
        fun = pytensor.function(
            [x_sym, dfilter_sym, pfilter_sym], sep_op, mode="FAST_RUN"
        )
        # test for square matrix
        top = fun(x, depthwise_filter, pointwise_filter)
        utt.assert_allclose(top, precomp_output)
        # test for non-square matrix
        top = fun(x[:, :, :3, :, :3], depthwise_filter, pointwise_filter)
        utt.assert_allclose(top, precomp_output[:, :, :1, :, :1])
        # test if it infers shape
        sep_op = separable_conv3d(
            x_sym,
            dfilter_sym,
            pfilter_sym,
            x.shape[1],
            input_shape=x.shape,
            depthwise_filter_shape=depthwise_filter.shape,
            pointwise_filter_shape=pointwise_filter.shape,
        )
        fun = pytensor.function(
            [x_sym, dfilter_sym, pfilter_sym], sep_op, mode="FAST_RUN"
        )
        top = fun(x, depthwise_filter, pointwise_filter)
        utt.assert_allclose(top, precomp_output)
        # test non-default subsample
        sep_op = separable_conv3d(
            x_sym, dfilter_sym, pfilter_sym, x.shape[1], subsample=(2, 2, 2)
        )
        fun = pytensor.function(
            [x_sym, dfilter_sym, pfilter_sym], sep_op, mode="FAST_RUN"
        )
        top = fun(x, depthwise_filter, pointwise_filter)
        utt.assert_allclose(
            top,
            np.delete(
                np.delete(np.delete(precomp_output, 1, axis=4), 1, axis=3), 1, axis=2
            ),
        )
        # test non-default border_mode
        # "full" over the tiled depth weights slices by overlap count 1,2,3,2,1.
        precomp_output = np.tile(
            np.expand_dims(self.precomp_output_full, axis=2), (1, 1, 5, 1, 1)
        ) * np.array([[[[[1]], [[2]], [[3]], [[2]], [[1]]]]])
        sep_op = separable_conv3d(
            x_sym, dfilter_sym, pfilter_sym, x.shape[1], border_mode="full"
        )
        fun = pytensor.function(
            [x_sym, dfilter_sym, pfilter_sym], sep_op, mode="FAST_RUN"
        )
        top = fun(x[:, :, :3, :3, :3], depthwise_filter, pointwise_filter)
        utt.assert_allclose(top, precomp_output)
@pytest.mark.skipif(
    config.cxx == "",
    reason="SciPy and cxx needed",
)
class TestUnsharedConv:
    """Check unshared 2D convolution (a distinct filter per output position,
    6D kernels) against repeated shared-filter convolutions: for each output
    position the unshared result must match the shared conv run with that
    position's filter slice.
    """

    # Ops under test and the Op classes expected in the compiled graph.
    conv2d = abstract_conv.AbstractConv2d
    conv2d_gradw = abstract_conv.AbstractConv2d_gradWeights
    conv2d_gradi = abstract_conv.AbstractConv2d_gradInputs
    conv2d_op = abstract_conv.AbstractConv2d
    conv2d_gradw_op = abstract_conv.AbstractConv2d_gradWeights
    conv2d_gradi_op = abstract_conv.AbstractConv2d_gradInputs
    # No graph rewrites: the abstract Ops must remain in the graph.
    mode = Mode(optimizer="None")
    def setup_method(self):
        self.img_shape = [(2, 2, 4, 4), (3, 2, 4, 2), (3, 3, 5, 3), (3, 4, 4, 4)]
        # 6D unshared kernels: (out_chan, out_rows, out_cols, in_chan, kr, kc).
        self.kern_shape = [
            (2, 2, 2, 2, 3, 3),
            (2, 4, 2, 2, 4, 2),
            (3, 2, 1, 1, 3, 3),
            (4, 3, 3, 2, 4, 2),
        ]
        self.topgrad_shape = [(2, 2, 2, 2), (3, 2, 4, 2), (3, 3, 2, 1), (3, 4, 3, 3)]
        self.border_mode = ["valid", "full", "valid", "full"]
        self.subsample = [(1, 1), (2, 2), (2, 1), (3, 2)]
        self.filter_dilation = (1, 1)
        self.num_groups = [1, 1, 3, 2]
        # self.verify_flags = np.random.choice([True, False], 4, [0.5, 0.5])
        # Above line can be used instead if speed is a concern
        self.verify_flags = [True] * 4
        self.ref_mode = "FAST_RUN"
    def test_fwd(self):
        # Forward unshared conv vs. per-position shared reference.
        tensor6 = TensorType(config.floatX, shape=(None,) * 6)
        img_sym = tensor4("img")
        kern_sym = tensor6("kern")
        ref_kern_sym = tensor4("ref_kern")
        for imshp, kshp, mode, sub, groups, verify in zip(
            self.img_shape,
            self.kern_shape,
            self.border_mode,
            self.subsample,
            self.num_groups,
            self.verify_flags,
            strict=True,
        ):
            img = np.random.random(imshp).astype(config.floatX)
            kern = np.random.random(kshp).astype(config.floatX)
            unshared_conv_op = self.conv2d(
                border_mode=mode,
                subsample=sub,
                filter_dilation=self.filter_dilation,
                num_groups=groups,
                unshared=True,
            )
            unshared_out_sym = unshared_conv_op(img_sym, kern_sym)
            unshared_func = pytensor.function(
                [img_sym, kern_sym], unshared_out_sym, mode=self.mode
            )
            # The abstract Op must survive compilation (no optimizer).
            assert any(
                isinstance(node.op, self.conv2d_op)
                for node in unshared_func.maker.fgraph.toposort()
            )
            unshared_output = unshared_func(img, kern)
            # Shape of one position's shared (4D) kernel slice.
            single_kshp = kshp[:1] + kshp[3:]
            ref_conv_op = self.conv2d(
                border_mode=mode,
                subsample=sub,
                filter_dilation=self.filter_dilation,
                num_groups=groups,
                unshared=False,
            )
            ref_out_sym = ref_conv_op(img_sym, ref_kern_sym)
            ref_func = pytensor.function(
                [img_sym, ref_kern_sym], ref_out_sym, mode=self.mode
            )
            # Compare each output position against the shared conv with the
            # filter assigned to that position.
            for i in range(0, kshp[1]):
                for j in range(0, kshp[2]):
                    single_kern = kern[:, i, j, ...].reshape(single_kshp)
                    ref_val = ref_func(img, single_kern)
                    utt.assert_allclose(
                        ref_val[:, :, i, j], unshared_output[:, :, i, j]
                    )
            if verify:
                utt.verify_grad(unshared_conv_op, [img, kern], mode=self.mode, eps=1)
    def test_gradweight(self):
        # Grad-wrt-weights of unshared conv vs. per-position reference.
        img_sym = tensor4("img")
        top_sym = tensor4("top")
        for imshp, kshp, topshp, mode, sub, groups, verify in zip(
            self.img_shape,
            self.kern_shape,
            self.topgrad_shape,
            self.border_mode,
            self.subsample,
            self.num_groups,
            self.verify_flags,
            strict=True,
        ):
            img = np.random.random(imshp).astype(config.floatX)
            top = np.random.random(topshp).astype(config.floatX)
            unshared_conv_op = self.conv2d_gradw(
                border_mode=mode,
                subsample=sub,
                filter_dilation=self.filter_dilation,
                num_groups=groups,
                unshared=True,
            )
            unshared_out_sym = unshared_conv_op(
                img_sym, top_sym, pt.as_tensor_variable(kshp[-2:])
            )
            unshared_func = pytensor.function(
                [img_sym, top_sym], unshared_out_sym, mode=self.mode
            )
            assert any(
                isinstance(node.op, self.conv2d_gradw_op)
                for node in unshared_func.maker.fgraph.toposort()
            )
            unshared_output = unshared_func(img, top)
            single_kshp = kshp[:1] + kshp[3:]
            ref_conv_op = self.conv2d_gradw(
                border_mode=mode,
                subsample=sub,
                filter_dilation=self.filter_dilation,
                num_groups=groups,
                unshared=False,
            )
            ref_out_sym = ref_conv_op(
                img_sym, top_sym, pt.as_tensor_variable(single_kshp[-2:])
            )
            ref_func = pytensor.function(
                [img_sym, top_sym], ref_out_sym, mode=self.mode
            )
            # Masking the topgrad to one position isolates that position's
            # weight gradient in the shared reference.
            for i in range(0, topshp[2]):
                for j in range(0, topshp[3]):
                    top_single = np.zeros_like(top)
                    top_single[:, :, i, j] = top[:, :, i, j]
                    ref_output = ref_func(img, top_single)
                    utt.assert_allclose(unshared_output[:, i, j, ...], ref_output)
            def conv_gradweight(inputs_val, output_val):
                # Wrapper fixing the shape argument for verify_grad.
                return unshared_conv_op(
                    inputs_val, output_val, pt.as_tensor_variable(kshp[-2:])
                )
            if verify:
                utt.verify_grad(conv_gradweight, [img, top], mode=self.mode, eps=1)
    def test_gradinput(self):
        # Grad-wrt-inputs of unshared conv vs. accumulated per-position
        # shared references.
        tensor6 = TensorType(config.floatX, shape=(None,) * 6)
        kern_sym = tensor6("kern")
        top_sym = tensor4("top")
        ref_kern_sym = tensor4("ref_kern")
        for imshp, kshp, topshp, mode, sub, groups, verify in zip(
            self.img_shape,
            self.kern_shape,
            self.topgrad_shape,
            self.border_mode,
            self.subsample,
            self.num_groups,
            self.verify_flags,
            strict=True,
        ):
            single_kshp = kshp[:1] + kshp[3:]
            kern = np.random.random(kshp).astype(config.floatX)
            top = np.random.random(topshp).astype(config.floatX)
            unshared_conv_op = self.conv2d_gradi(
                border_mode=mode,
                subsample=sub,
                filter_dilation=self.filter_dilation,
                num_groups=groups,
                unshared=True,
            )
            unshared_out_sym = unshared_conv_op(
                kern_sym, top_sym, pt.as_tensor_variable(imshp[-2:])
            )
            unshared_func = pytensor.function(
                [kern_sym, top_sym], unshared_out_sym, mode=self.mode
            )
            assert any(
                isinstance(node.op, self.conv2d_gradi_op)
                for node in unshared_func.maker.fgraph.toposort()
            )
            unshared_output = unshared_func(kern, top)
            ref_conv_op = self.conv2d_gradi(
                border_mode=mode,
                subsample=sub,
                filter_dilation=self.filter_dilation,
                num_groups=groups,
                unshared=False,
            )
            ref_out_sym = ref_conv_op(
                ref_kern_sym, top_sym, pt.as_tensor_variable(imshp[-2:])
            )
            ref_func = pytensor.function(
                [ref_kern_sym, top_sym], ref_out_sym, mode=self.mode
            )
            # The input gradient is the sum of each position's contribution.
            ref_output = np.zeros(imshp)
            for i in range(0, topshp[2]):
                for j in range(0, topshp[3]):
                    single_kern = kern[:, i, j, ...].reshape(single_kshp)
                    top_single = np.zeros_like(top)
                    top_single[:, :, i, j] = top[:, :, i, j]
                    ref_output += ref_func(single_kern, top_single)
            utt.assert_allclose(ref_output, unshared_output)
            def conv_gradinputs(filters_val, output_val):
                # Wrapper fixing the shape argument for verify_grad.
                return unshared_conv_op(
                    filters_val, output_val, pt.as_tensor_variable(imshp[-2:])
                )
            if verify:
                utt.verify_grad(conv_gradinputs, [kern, top], mode=self.mode, eps=1)
class TestAsymmetricPadding:
    """Check conv Ops with asymmetric per-edge padding against the same Ops
    run in "valid" mode on an explicitly zero-padded image.
    """

    # Ops under test and the Op classes expected in the compiled graph.
    conv2d = abstract_conv.AbstractConv2d
    conv2d_gradw = abstract_conv.AbstractConv2d_gradWeights
    conv2d_gradi = abstract_conv.AbstractConv2d_gradInputs
    conv2d_op = abstract_conv.AbstractConv2d
    conv2d_gradw_op = abstract_conv.AbstractConv2d_gradWeights
    conv2d_gradi_op = abstract_conv.AbstractConv2d_gradInputs
    # No graph rewrites: the abstract Ops must remain in the graph.
    mode = Mode(optimizer="None")
    img_shape = [(2, 2, 4, 4), (3, 2, 4, 2), (3, 3, 5, 3)]
    kern_shape = [(4, 2, 2, 2), (2, 2, 4, 2), (2, 3, 3, 3)]
    topgrad_shape = [(2, 4, 6, 6), (3, 2, 3, 4), (3, 2, 6, 1)]
    # Per-case ((top, bottom), (left, right)) padding amounts.
    border_mode = [((1, 2), (2, 1)), ((1, 1), (0, 3)), ((2, 1), (0, 0))]
    @pytest.mark.skipif(
        config.cxx == "",
        reason="SciPy and cxx needed",
    )
    def test_fwd(self):
        # Forward conv with asymmetric padding vs. "valid" conv on an
        # explicitly zero-padded image.
        img_sym = tensor4("img")
        kern_sym = tensor4("kern")
        for imshp, kshp, pad in zip(
            self.img_shape, self.kern_shape, self.border_mode, strict=True
        ):
            img = np.random.random(imshp).astype(config.floatX)
            kern = np.random.random(kshp).astype(config.floatX)
            asymmetric_conv_op = self.conv2d(
                border_mode=pad, subsample=(1, 1), filter_dilation=(1, 1)
            )
            asymmetric_out_sym = asymmetric_conv_op(img_sym, kern_sym)
            asymmetric_func = pytensor.function(
                [img_sym, kern_sym], asymmetric_out_sym, mode=self.mode
            )
            # The abstract Op must survive compilation (no optimizer).
            assert any(
                isinstance(node.op, self.conv2d_op)
                for node in asymmetric_func.maker.fgraph.toposort()
            )
            asymmetric_output = asymmetric_func(img, kern)
            ref_conv_op = self.conv2d(
                border_mode="valid", subsample=(1, 1), filter_dilation=(1, 1)
            )
            ref_out_sym = ref_conv_op(img_sym, kern_sym)
            ref_func = pytensor.function(
                [img_sym, kern_sym], ref_out_sym, mode=self.mode
            )
            # Build the zero-padded image matching pad = ((top, bottom),
            # (left, right)) and place the original image inside it.
            exp_imshp = (
                imshp[0],
                imshp[1],
                imshp[2] + pad[0][0] + pad[0][1],
                imshp[3] + pad[1][0] + pad[1][1],
            )
            exp_img = np.zeros(exp_imshp, dtype=config.floatX)
            exp_img[
                :, :, pad[0][0] : imshp[2] + pad[0][0], pad[1][0] : imshp[3] + pad[1][0]
            ] = img
            ref_output = ref_func(exp_img, kern)
            utt.assert_allclose(asymmetric_output, ref_output)
            utt.verify_grad(asymmetric_conv_op, [img, kern], mode=self.mode, eps=1)
@pytest.mark.skipif(
config.cxx == "",
reason="SciPy and cxx needed",
)
def test_gradweight(self):
img_sym = tensor4("img")
top_sym = tensor4("top")
for imshp, kshp, topshp, pad in zip(
self.img_shape,
self.kern_shape,
self.topgrad_shape,
self.border_mode,
strict=True,
):
img = np.random.random(imshp).astype(config.floatX)
top = np.random.random(topshp).astype(config.floatX)
asymmetric_conv_op = self.conv2d_gradw(
border_mode=pad, subsample=(1, 1), filter_dilation=(1, 1)
)
asymmetric_out_sym = asymmetric_conv_op(img_sym, top_sym, kshp[-2:])
asymmetric_func = pytensor.function(
[img_sym, top_sym], asymmetric_out_sym, mode=self.mode
)
assert any(
isinstance(node.op, self.conv2d_gradw_op)
for node in asymmetric_func.maker.fgraph.toposort()
)
asymmetric_output = asymmetric_func(img, top)
ref_conv_op = self.conv2d_gradw(
border_mode="valid", subsample=(1, 1), filter_dilation=(1, 1)
)
ref_out_sym = ref_conv_op(img_sym, top_sym, kshp[-2:])
ref_func = pytensor.function(
[img_sym, top_sym], ref_out_sym, mode=self.mode
)
exp_imshp = (
imshp[0],
imshp[1],
imshp[2] + pad[0][0] + pad[0][1],
imshp[3] + pad[1][0] + pad[1][1],
)
exp_img = np.zeros(exp_imshp, dtype=config.floatX)
exp_img[
:, :, pad[0][0] : imshp[2] + pad[0][0], pad[1][0] : imshp[3] + pad[1][0]
] = img
ref_output = ref_func(exp_img, top)
utt.assert_allclose(asymmetric_output, ref_output)
def conv_gradweight(inputs_val, output_val):
return asymmetric_conv_op(
inputs_val, output_val, pt.as_tensor_variable(kshp[-2:])
)
utt.verify_grad(conv_gradweight, [img, top], mode=self.mode, eps=1)
@pytest.mark.skipif(
config.cxx == "",
reason="SciPy and cxx needed",
)
def test_gradinput(self):
kern_sym = tensor4("kern")
top_sym = tensor4("top")
for imshp, kshp, topshp, pad in zip(
self.img_shape,
self.kern_shape,
self.topgrad_shape,
self.border_mode,
strict=True,
):
kern = np.random.random(kshp).astype(config.floatX)
top = np.random.random(topshp).astype(config.floatX)
asymmetric_conv_op = self.conv2d_gradi(
border_mode=pad, subsample=(1, 1), filter_dilation=(1, 1)
)
asymmetric_out_sym = asymmetric_conv_op(kern_sym, top_sym, imshp[-2:])
asymmetric_func = pytensor.function(
[kern_sym, top_sym], asymmetric_out_sym, mode=self.mode
)
assert any(
isinstance(node.op, self.conv2d_gradi_op)
for node in asymmetric_func.maker.fgraph.toposort()
)
asymmetric_output = asymmetric_func(kern, top)
ref_conv_op = self.conv2d_gradi(
border_mode="valid", subsample=(1, 1), filter_dilation=(1, 1)
)
exp_imshp = [
imshp[2] + pad[0][0] + pad[0][1],
imshp[3] + pad[1][0] + pad[1][1],
]
ref_out_sym = ref_conv_op(kern_sym, top_sym, exp_imshp)
ref_func = pytensor.function(
[kern_sym, top_sym], ref_out_sym, mode=self.mode
)
ref_output = ref_func(kern, top)
ref_output = ref_output[
:, :, pad[0][0] : imshp[2] + pad[0][0], pad[1][0] : imshp[3] + pad[1][0]
]
utt.assert_allclose(asymmetric_output, ref_output)
def conv_gradinputs(filters_val, output_val):
return asymmetric_conv_op(
filters_val, output_val, pt.as_tensor_variable(imshp[-2:])
)
utt.verify_grad(conv_gradinputs, [kern, top], mode=self.mode, eps=1)
class TestCausalConv:
    """Causal 1d convolution: forward output is checked against hand-computed
    values, and the gradient is verified numerically."""

    # Compile without graph rewrites.
    mode = Mode(optimizer="None")

    # Input: 3 samples x 2 channels x 5 time steps.
    img = np.array(
        [
            [[2, 4, 9, 5, 8], [0, 0, 4, 0, 5]],
            [[2, 5, 8, 5, 5], [1, 3, 0, 7, 9]],
            [[7, 0, 7, 1, 0], [0, 1, 4, 7, 2]],
        ]
    ).astype(config.floatX)
    # Filters: 2 output channels x 2 input channels x width 3.
    kern = np.array([[[5, 3, 1], [3, 1, 0]], [[6, 4, 9], [2, 2, 7]]]).astype(
        config.floatX
    )
    dilation = 2
    # Expected forward result of convolving ``img`` with ``kern`` at the
    # dilation above, precomputed by hand.
    precomp_top = np.array(
        [
            [[10, 20, 63, 37, 88], [12, 24, 70, 46, 120]],
            [[13, 34, 47, 64, 78], [14, 36, 58, 70, 105]],
            [[35, 3, 68, 27, 38], [42, 2, 78, 22, 103]],
        ]
    ).astype(config.floatX)

    @pytest.mark.skipif(
        config.cxx == "",
        reason="SciPy and cxx needed",
    )
    def test_interface(self):
        """Compare ``causal_conv1d`` against the precomputed reference output,
        then numerically verify its gradient."""
        inp = tensor3("img")
        flt = tensor3("kern")
        graph_out = causal_conv1d(
            inp, flt, self.kern.shape, filter_dilation=self.dilation
        )
        compiled = pytensor.function([inp, flt], graph_out, mode=self.mode)
        result = compiled(self.img, self.kern)
        utt.assert_allclose(result, self.precomp_top)

        def _undilated_conv(inputs_val, filters_val):
            # Gradient is checked at dilation 1.
            return causal_conv1d(
                inputs_val, filters_val, self.kern.shape, filter_dilation=1
            )

        utt.verify_grad(
            _undilated_conv, [self.img, self.kern], mode=self.mode, eps=1
        )
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论