提交 02974b9d authored 作者: Maxim Kochurov's avatar Maxim Kochurov 提交者: Maxim Kochurov

remove deprecated pytensor.tensor.nnet

上级 15637d23
...@@ -77,7 +77,7 @@ jobs: ...@@ -77,7 +77,7 @@ jobs:
- "tests/tensor tests/sparse --ignore=tests/tensor/test_basic.py --ignore=tests/tensor/test_math.py --ignore=tests/tensor/test_math_scipy.py --ignore=tests/tensor/test_inplace.py --ignore=tests/tensor/test_elemwise.py --ignore=tests/tensor/rewriting/test_basic.py --ignore=tests/tensor/rewriting/test_math.py --ignore=tests/tensor/nnet --ignore=tests/tensor/signal" - "tests/tensor tests/sparse --ignore=tests/tensor/test_basic.py --ignore=tests/tensor/test_math.py --ignore=tests/tensor/test_math_scipy.py --ignore=tests/tensor/test_inplace.py --ignore=tests/tensor/test_elemwise.py --ignore=tests/tensor/rewriting/test_basic.py --ignore=tests/tensor/rewriting/test_math.py --ignore=tests/tensor/nnet --ignore=tests/tensor/signal"
- "tests/tensor/test_basic.py tests/tensor/test_math.py tests/tensor/test_math_scipy.py tests/tensor/test_inplace.py" - "tests/tensor/test_basic.py tests/tensor/test_math.py tests/tensor/test_math_scipy.py tests/tensor/test_inplace.py"
- "tests/tensor/test_elemwise.py tests/tensor/rewriting/test_basic.py tests/tensor/rewriting/test_math.py" - "tests/tensor/test_elemwise.py tests/tensor/rewriting/test_basic.py tests/tensor/rewriting/test_math.py"
- "tests/tensor/nnet/test_conv.py" - "tests/tensor/conv/test_abstract_conv.py"
include: include:
- python-version: "3.7" - python-version: "3.7"
fast-compile: 1 fast-compile: 1
......
=========================================
:mod:`tensor.conv` -- Tensor Convolutions
=========================================
.. module:: tensor.conv
:platform: Unix, Windows
:synopsis: Tensor Convolutions
.. moduleauthor:: LISA, PyMC Developers, PyTensor Developers
.. automodule:: pytensor.tensor.conv
:members:
\ No newline at end of file
...@@ -26,5 +26,6 @@ They are grouped into the following sections: ...@@ -26,5 +26,6 @@ They are grouped into the following sections:
slinalg slinalg
nlinalg nlinalg
fft fft
conv
math_opt math_opt
basic_opt basic_opt
import warnings
warnings.warn(
"The module `pytensor.scalar.basic_scipy` is deprecated "
"and has been renamed to `pytensor.scalar.math`",
DeprecationWarning,
stacklevel=2,
)
from .abstract_conv import (
bilinear_upsampling,
causal_conv1d,
conv2d,
conv2d_transpose,
conv3d,
frac_bilinear_upsampling,
separable_conv2d,
separable_conv3d,
)
...@@ -5,14 +5,8 @@ Abstract conv interface ...@@ -5,14 +5,8 @@ Abstract conv interface
import logging import logging
import sys import sys
try:
from math import gcd
except ImportError:
from fractions import gcd
import warnings import warnings
from math import gcd
import numpy as np import numpy as np
...@@ -35,8 +29,7 @@ from pytensor.tensor.exceptions import NotScalarConstantError ...@@ -35,8 +29,7 @@ from pytensor.tensor.exceptions import NotScalarConstantError
from pytensor.tensor.var import TensorConstant, TensorVariable from pytensor.tensor.var import TensorConstant, TensorVariable
__docformat__ = "restructuredtext en" _logger = logging.getLogger(__name__)
_logger = logging.getLogger("pytensor.tensor.nnet.abstract_conv")
def get_conv_output_shape( def get_conv_output_shape(
...@@ -678,7 +671,7 @@ def abstract_conv2d( ...@@ -678,7 +671,7 @@ def abstract_conv2d(
stack of 2D inputs with a set of 2D filters. The implementation is modelled stack of 2D inputs with a set of 2D filters. The implementation is modelled
after Convolutional Neural Networks (CNN). after Convolutional Neural Networks (CNN).
Refer to :func:`nnet.conv2d <pytensor.tensor.nnet.conv2d>` for a more detailed documentation. Refer to :func:`nnet.conv2d <pytensor.tensor.conv.conv2d>` for a more detailed documentation.
""" """
input = as_tensor_variable(input) input = as_tensor_variable(input)
...@@ -2430,7 +2423,7 @@ class BaseAbstractConv(Op): ...@@ -2430,7 +2423,7 @@ class BaseAbstractConv(Op):
class AbstractConv(BaseAbstractConv): class AbstractConv(BaseAbstractConv):
"""Abstract Op for the forward convolution. """Abstract Op for the forward convolution.
Refer to :func:`BaseAbstractConv <pytensor.tensor.nnet.abstract_conv.BaseAbstractConv>` Refer to :func:`BaseAbstractConv <pytensor.tensor.conv.abstract_conv.BaseAbstractConv>`
for a more detailed documentation. for a more detailed documentation.
""" """
...@@ -2646,7 +2639,7 @@ class AbstractConv(BaseAbstractConv): ...@@ -2646,7 +2639,7 @@ class AbstractConv(BaseAbstractConv):
class AbstractConv2d(AbstractConv): class AbstractConv2d(AbstractConv):
"""Abstract Op for the forward convolution. """Abstract Op for the forward convolution.
Refer to :func:`BaseAbstractConv <pytensor.tensor.nnet.abstract_conv.BaseAbstractConv>` Refer to :func:`BaseAbstractConv <pytensor.tensor.conv.abstract_conv.BaseAbstractConv>`
for a more detailed documentation. for a more detailed documentation.
""" """
...@@ -2708,7 +2701,7 @@ class AbstractConv2d(AbstractConv): ...@@ -2708,7 +2701,7 @@ class AbstractConv2d(AbstractConv):
class AbstractConv3d(AbstractConv): class AbstractConv3d(AbstractConv):
"""Abstract Op for the forward convolution. """Abstract Op for the forward convolution.
Refer to :func:`BaseAbstractConv <pytensor.tensor.nnet.abstract_conv.BaseAbstractConv>` Refer to :func:`BaseAbstractConv <pytensor.tensor.conv.abstract_conv.BaseAbstractConv>`
for a more detailed documentation. for a more detailed documentation.
""" """
...@@ -3489,11 +3482,9 @@ def conv2d( ...@@ -3489,11 +3482,9 @@ def conv2d(
border_mode="valid", border_mode="valid",
subsample=(1, 1), subsample=(1, 1),
filter_flip=True, filter_flip=True,
image_shape=None,
filter_dilation=(1, 1), filter_dilation=(1, 1),
num_groups=1, num_groups=1,
unshared=False, unshared=False,
**kwargs,
): ):
""" """
This function will build the symbolic graph for convolving a mini-batch of a This function will build the symbolic graph for convolving a mini-batch of a
...@@ -3584,36 +3575,6 @@ def conv2d( ...@@ -3584,36 +3575,6 @@ def conv2d(
of shape (batch size, output channels, output rows, output columns) of shape (batch size, output channels, output rows, output columns)
""" """
if "imshp_logical" in kwargs or "kshp_logical" in kwargs:
raise ValueError(
"Keyword arguments 'imshp_logical' and 'kshp_logical' for conv2d "
"are not supported anymore (and have not been a reliable way to "
"perform upsampling). That feature is still available by calling "
"pytensor.tensor.nnet.conv.conv2d() for the time being."
)
if len(kwargs.keys()) > 0:
warnings.warn(
str(kwargs.keys()) + " are now deprecated in "
"`tensor.nnet.abstract_conv.conv2d` interface"
" and will be ignored.",
stacklevel=2,
)
if image_shape is not None:
warnings.warn(
"The `image_shape` keyword argument to "
"`tensor.nnet.conv2d` is deprecated, it has been "
"renamed to `input_shape`.",
stacklevel=2,
)
if input_shape is None:
input_shape = image_shape
else:
raise ValueError(
"input_shape and image_shape should not"
" be provided at the same time."
)
return abstract_conv2d( return abstract_conv2d(
input, input,
filters, filters,
......
import warnings
warnings.warn(
"The module `pytensor.tensor.nnet` is deprecated and will "
"be removed from PyTensor in version 2.9.0",
DeprecationWarning,
stacklevel=2,
)
import pytensor.tensor.nnet.rewriting
from pytensor.tensor.nnet.abstract_conv import (
abstract_conv2d,
conv2d,
conv2d_grad_wrt_inputs,
conv2d_transpose,
conv3d,
separable_conv2d,
)
from pytensor.tensor.nnet.basic import (
binary_crossentropy,
categorical_crossentropy,
confusion_matrix,
crossentropy_categorical_1hot,
crossentropy_categorical_1hot_grad,
crossentropy_softmax_1hot,
crossentropy_softmax_1hot_with_bias,
crossentropy_softmax_1hot_with_bias_dx,
crossentropy_softmax_argmax_1hot_with_bias,
crossentropy_softmax_max_and_argmax_1hot,
crossentropy_softmax_max_and_argmax_1hot_with_bias,
crossentropy_to_crossentropy_with_softmax,
crossentropy_to_crossentropy_with_softmax_with_bias,
elu,
graph_merge_softmax_with_crossentropy_softmax,
h_softmax,
logsoftmax,
prepend_0_to_each_row,
prepend_1_to_each_row,
prepend_scalar_to_each_row,
relu,
selu,
sigmoid_binary_crossentropy,
softmax,
softmax_grad_legacy,
softmax_legacy,
softmax_simplifier,
softmax_with_bias,
softsign,
)
from pytensor.tensor.nnet.batchnorm import batch_normalization
from pytensor.tensor.nnet.sigm import hard_sigmoid, ultra_fast_sigmoid
"""
Provides neural-network specific Ops.
Notes
-----
TODO: factor this out into a neural-network toolbox.
"""
import numpy as np
import pytensor
from pytensor import scalar as aes
from pytensor.compile import optdb
from pytensor.gradient import DisconnectedType, grad_not_implemented
from pytensor.graph.basic import Apply
from pytensor.graph.op import Op
from pytensor.graph.rewriting.basic import (
copy_stack_trace,
graph_rewriter,
node_rewriter,
)
from pytensor.link.c.op import COp
from pytensor.raise_op import Assert
from pytensor.scalar import UnaryScalarOp
from pytensor.tensor import basic as at
from pytensor.tensor.basic import ARange
from pytensor.tensor.elemwise import DimShuffle, Elemwise
from pytensor.tensor.exceptions import NotScalarConstantError
from pytensor.tensor.extra_ops import Unique
from pytensor.tensor.math import (
MaxAndArgmax,
Sum,
add,
dot,
eq,
exp,
expm1,
log,
max_and_argmax,
mul,
neg,
or_,
sigmoid,
softplus,
)
from pytensor.tensor.math import sum as at_sum
from pytensor.tensor.math import tanh, tensordot, true_div
from pytensor.tensor.nnet.blocksparse import sparse_block_dot
from pytensor.tensor.rewriting.basic import (
register_canonicalize,
register_specialize,
register_stabilize,
)
from pytensor.tensor.rewriting.math import local_mul_canonizer
from pytensor.tensor.shape import Shape, shape_padleft
from pytensor.tensor.special import Softmax, SoftmaxGrad, log_softmax, softmax
from pytensor.tensor.subtensor import AdvancedIncSubtensor, AdvancedSubtensor
from pytensor.tensor.type import (
TensorType,
discrete_dtypes,
float_dtypes,
integer_dtypes,
)
class SoftmaxWithBias(COp):
    """
    An L{Op} for the output of neural-net multiclass classifiers.

    Attributes
    ----------
    x : a matrix of floats (32 or 64)
    b : a [row] vector of floats (32 or 64), length is number of cols in x

    This L{Op}'s output is softmax(x+b).
    softmax(x[i]) is the i'th distribution over len(x[i]) options.
    """

    nin = 2  # inputs: x, b
    nout = 1  # output: softmax(x + b)
    __props__ = ()

    def make_node(self, x, b):
        # Validate ranks/dtypes; the output variable shares x's (2-d float) type.
        x = at.as_tensor_variable(x)
        b = at.as_tensor_variable(b)
        if x.type.ndim != 2 or x.type.dtype not in float_dtypes:
            raise ValueError("x must be 2-d tensor of floats")
        if b.type.ndim != 1 or b.type.dtype not in float_dtypes:
            raise ValueError("b must be 1-d tensor of floats")
        sm = x.type()
        return Apply(self, [x, b], [sm])

    def perform(self, node, input_storage, output_storage):
        # Numerically-stable row-wise softmax of (x + b).
        x, b = input_storage
        if b.shape[0] != x.shape[1]:
            raise ValueError("b must have same number of columns as x")

        # Reference (per-row) implementation kept for documentation:
        # sm = numpy.zeros_like(x)
        # for i in range(sm.shape[0]):
        #     row = x[i] + b
        #     sm[i] = numpy.exp(row - numpy.max(row))
        #     sm[i] *= 1.0 / numpy.sum(sm[i])
        # output_storage[0][0] = sm

        if x.size == 0:
            # Numpy doesn't like the max of a zero-sized object.
            output_storage[0][0] = np.zeros(x.shape, dtype=x.dtype)
            return

        x_dtype = x.dtype
        # Perform computations in float32 otherwise the result is too imprecise
        if x.dtype == "float16":
            x = x.astype("float32")

        x_plus_b = x + b[None, :]
        # Subtract the row max before exponentiating for numerical stability.
        e_x = np.exp(x_plus_b - x_plus_b.max(axis=1)[:, None])
        e_x *= 1.0 / e_x.sum(axis=1)[:, None]
        # default for copy is True and we don't need a copy if the
        # data type matches.
        output_storage[0][0] = e_x.astype(x_dtype, copy=False)

    def L_op(self, inp, outputs, grads):
        # dx comes from the legacy softmax gradient; db is the column-sum of dx.
        x, b = inp
        (g_sm,) = grads

        if isinstance(g_sm.type, DisconnectedType):
            return [DisconnectedType()(), DisconnectedType()()]

        dx = softmax_grad_legacy(g_sm, outputs[0])
        db = at_sum(dx, axis=0)
        return dx, db

    def infer_shape(self, fgraph, node, shape):
        # Output has the same shape as x.
        return [shape[0]]

    def c_headers(self, **kwargs):
        return ["<iostream>", "<cmath>"]

    @staticmethod
    def c_code_template(dtype):
        # Returns four C fragments (declarations, row-loop header, row body,
        # row-loop footer) that callers concatenate and %-substitute.
        # this implementation was lifted from
        # /u/bergstrj/cvs/bergstrj/src/feb07/nn.cxx
        # TODO: put this into a templated function, in the support code
        # TODO: declare the max of each row as an Op output
        # TODO: set error messages for failures in this code
        # TODO: use this to accept float32 and int32:
        # node.inputs[0].type.dtype_specs()[1]
        init_decl = """
        npy_intp* Nx = PyArray_DIMS(%(x)s);
        npy_intp Sx = 0;
        npy_intp Sb = 0;
        npy_intp Ssm = 0;
        if (PyArray_NDIM(%(x)s) != 2)
        {
            PyErr_SetString(PyExc_ValueError, "not a 2d tensor");
            %(fail)s;
        }
        if (PyArray_NDIM(%(b)s) != 1)
        {
            PyErr_SetString(PyExc_ValueError, "b not 1d tensor");
            %(fail)s;
        }
        if ((PyArray_TYPE(%(x)s) != NPY_DOUBLE) &&
            (PyArray_TYPE(%(x)s) != NPY_FLOAT))
        {
            PyErr_SetString(PyExc_TypeError, "not a float");
            %(fail)s;
        }
        if ((PyArray_TYPE(%(b)s) != NPY_DOUBLE) &&
            (PyArray_TYPE(%(b)s) != NPY_FLOAT))
        {
            PyErr_SetString(PyExc_TypeError, "b not float");
            %(fail)s;
        }
        if ((PyArray_DIMS(%(x)s)[1] != PyArray_DIMS(%(b)s)[0]))
        {
            PyErr_Format(PyExc_ValueError,
                         "number of columns in x (%%ld) does not match length of b (%%ld)",
                         (long int)PyArray_DIMS(%(x)s)[1], (long int)PyArray_DIMS(%(b)s)[0]);
            %(fail)s;
        }
        if ((NULL == %(sm)s)
            || (PyArray_DIMS(%(sm)s)[0] != PyArray_DIMS(%(x)s)[0])
            || (PyArray_DIMS(%(sm)s)[1] != PyArray_DIMS(%(x)s)[1]))
        {
            if (NULL != %(sm)s) Py_XDECREF(%(sm)s);
            %(sm)s = (PyArrayObject*)PyArray_SimpleNew(2, PyArray_DIMS(%(x)s),
                                                       PyArray_TYPE(%(x)s));
            if(!%(sm)s) {
                PyErr_SetString(PyExc_MemoryError,
                     "failed to alloc sm output");
                %(fail)s
            }
        }
        Sx = PyArray_STRIDES(%(x)s)[1]/sizeof(dtype_%(x)s);
        Sb = PyArray_STRIDES(%(b)s)[0]/sizeof(dtype_%(b)s);
        Ssm = PyArray_STRIDES(%(sm)s)[1]/sizeof(dtype_%(sm)s);
        """

        begin_row_loop = """
        for (size_t i = 0; i < Nx[0]; ++i)
        {
            size_t j;
            double sum = 0.0;
            const dtype_%(x)s* __restrict__ x_i = (dtype_%(x)s*)(PyArray_BYTES(%(x)s) + PyArray_STRIDES(%(x)s)[0] * i);
            const dtype_%(b)s* __restrict__ b_i = (dtype_%(b)s*)(PyArray_BYTES(%(b)s));
            dtype_%(sm) s* __restrict__ sm_i = (dtype_%(sm)s*)(PyArray_BYTES(%(sm)s) + PyArray_STRIDES(%(sm)s)[0] * i);
            npy_intp Sx = PyArray_STRIDES(%(x)s)[1]/sizeof(dtype_%(x)s);
            npy_intp Sb = PyArray_STRIDES(%(b)s)[0]/sizeof(dtype_%(b)s);
            npy_intp Ssm = PyArray_STRIDES(%(sm)s)[1]/sizeof(dtype_%(sm)s);
            size_t row_max_j=0;
            dtype_%(sm)s row_max = x_i[0] + b_i[0];
            //std::cout << "0 " << row_max << "\\n";
            // Get the maximum value of the row
            for (j = 1; j < Nx[1]; ++j)
            {
                dtype_%(sm)s row_ij = x_i[j * Sx] + b_i[j * Sb];
                //std::cout << "1 " << row_ij << "\\n";
                row_max_j = (row_ij > row_max) ? j : row_max_j;
                row_max = (row_ij > row_max) ? row_ij : row_max;
            }
        """

        inside_row_loop = """
            for (j = 0; j < Nx[1]; ++j)
            {
                dtype_%(sm)s row_ij = x_i[j * Sx] + b_i[j * Sb];
                //std::cout << "2 " << j << " " << row_ij << " " << row_max << "\\n";
                dtype_%(sm)s sm_ij = exp(row_ij - row_max);
                //std::cout << "3 " << j << " " << sm_ij << "\\n";
                sum += sm_ij;
                sm_i[j * Ssm] = sm_ij;
            }
            //cblas_dscal(x.N, 1.0 / sum, &mat_at(s,i,0), s.n);
            double sum_inv = 1.0 / sum;
            for (j = 0; j < Nx[1]; ++j)
            {
                sm_i[j * Ssm] *= sum_inv;
            }
        """

        # Get the vectorized version of exp if it exist
        try:
            vec_exp = pytensor.scalar.exp.c_code_contiguous_raw(
                dtype, "Nx[1]", "sm_i", "sm_i"
            )
            inside_row_loop_contig = (
                """
            for (j = 0; j < Nx[1]; ++j)
            {
                dtype_%%(sm)s row_ij = x_i[j * Sx] + b_i[j * Sb];
                //std::cout << "2 " << j << " " << row_ij << " " << row_max << "\\n";
                dtype_%%(sm)s sm_ij = row_ij - row_max;
                //std::cout << "3 " << j << " " << sm_ij << "\\n";
                sm_i[j * Ssm] = sm_ij;
            }
            %(vec_exp)s;
            for (j = 0; j < Nx[1]; ++j)
            {
                sum += sm_i[j * Ssm];
            }
            //cblas_dscal(x.N, 1.0 / sum, &mat_at(s,i,0), s.n);
            double sum_inv = 1.0 / sum;
            for (j = 0; j < Nx[1]; ++j)
            {
                sm_i[j * Ssm] *= sum_inv;
            }
            """
                % locals()
            )
            # Use the contiguous fast path only when the output stride is 1.
            inside_row_loop = (
                """
            if(Ssm == 1){
                %(inside_row_loop_contig)s
            }else{
                %(inside_row_loop)s
            }
            """
                % locals()
            )
        except pytensor.graph.utils.MethodNotDefined:
            pass

        end_row_loop = """
        }
        """
        return (init_decl, begin_row_loop, inside_row_loop, end_row_loop)

    def c_code(self, node, name, inp, out, sub):
        x, b = inp
        (sm,) = out
        code_template = "".join(
            self.c_code_template(node.inputs[0].type.dtype_specs()[1])
        )
        return code_template % dict(locals(), **sub)

    @staticmethod
    def c_code_cache_version():
        return (8,)
# Module-level singleton instances used by the rewriters below.
softmax_with_bias = SoftmaxWithBias()
# Legacy softmax/gradient ops fixed to the last axis.
softmax_grad_legacy = SoftmaxGrad(axis=-1)
softmax_legacy = Softmax(axis=-1)
@register_specialize("fast_compile")
@node_rewriter([softmax_legacy])
def local_softmax_with_bias(fgraph, node):
    """
    Try to turn softmax(sum_of_stuff) -> softmax_w_bias(matrix, bias).

    Splits the inputs of the `add` feeding a 2-d legacy softmax into
    broadcasted-row "vector" terms and everything else, then rebuilds the
    node as ``softmax_with_bias(sum(non_vectors), sum(vectors))``.
    """
    if node.op == softmax_legacy and node.outputs[0].ndim == 2:
        (x,) = node.inputs
        if x.owner and x.owner.op == add:
            vectors = []
            non_vectors = []
            for x_in in x.owner.inputs:
                if list(x_in.type.broadcastable) == [True, False]:
                    # print isinstance(x_in.owner.op,
                    # DimShuffle) since specialization comes
                    # relatively late in optimization, we don't want to
                    # put in extra DimShuffles un-necessarily.
                    if (
                        x_in.owner
                        and isinstance(x_in.owner.op, DimShuffle)
                        and list(x_in.owner.inputs[0].type.broadcastable) == [False]
                    ):
                        # cut out the DimShuffle that was broadcasting a vector
                        vectors.append(x_in.owner.inputs[0])
                    else:
                        # insert an extra DimShuffle to correct the old one
                        vectors.append(DimShuffle((True, False), (1,))(x_in))
                else:
                    non_vectors.append(x_in)
            # If all the inputs were vectors or broadcasted vectors,
            # we broadcast one of them to be used as a matrix
            if len(non_vectors) == 0:
                assert len(vectors) > 0  # we should have at least 1 input...
                promoted_vector = vectors.pop()
                non_vectors.append(shape_padleft(promoted_vector))
            assert non_vectors  # not empty
            if vectors:
                # we're in business...
                if len(vectors) > 1:
                    vector_sum = add(*vectors)
                    copy_stack_trace(x_in, vector_sum)
                else:
                    vector_sum = vectors[0]
                if len(non_vectors) > 1:
                    non_vector_sum = add(*non_vectors)
                    copy_stack_trace(x_in, non_vector_sum)
                else:
                    non_vector_sum = non_vectors[0]
                try:
                    sm_bias = softmax_with_bias(non_vector_sum, vector_sum)
                    copy_stack_trace(node.outputs[0], sm_bias)
                except Exception:
                    # if our arguments have the wrong types, then
                    # forget about it
                    return
                # Only substitute when the replacement has the exact same type.
                out_type = node.outputs[0].type
                if (
                    out_type.dtype == sm_bias.type.dtype
                    and out_type.broadcastable == sm_bias.type.broadcastable
                ):
                    # This condition is not always true. See the test
                    # nnet/tests/test_basic.py:T_SoftmaxWithBias.test_broadcast
                    return [sm_bias]
def softmax_simplifier(numerators, denominators):
    """
    Replace a matching ``exp(x) / sum(exp(x))`` pair in a multiplication's
    numerator/denominator lists with a single ``Softmax`` node.

    Called by `local_mul_canonizer` (see registration below); returns the
    possibly-rewritten ``(numerators, denominators)`` lists.
    """
    for numerator in list(numerators):
        if not numerator.type.dtype.startswith("float"):
            continue

        if not (numerator.owner and numerator.owner.op == exp):
            continue

        matching_denom = None

        for denominator in denominators:
            # Division with dimshuffle
            if denominator.owner and isinstance(denominator.owner.op, DimShuffle):
                ds_order = denominator.owner.op.new_order
                # Check that at most only one dimension is being reintroduced by
                # a dimshuffle. The cases where all dimensions are reintroduced
                # after a complete sum reduction end up in the else branch
                if ds_order.count("x") != 1:
                    continue
                # Check that dimshuffle does not change order of original dims
                ds_order_without_x = tuple(dim for dim in ds_order if dim != "x")
                if tuple(sorted(ds_order_without_x)) != ds_order_without_x:
                    continue
                new_dim = ds_order.index("x")
                z = denominator.owner.inputs[0]
                if z.owner and isinstance(z.owner.op, Sum):
                    sum_axis = z.owner.op.axis
                    # Check that reintroduced dim was the one reduced
                    if (
                        (sum_axis is not None)
                        and (len(sum_axis) == 1)
                        and (sum_axis[0] == new_dim)
                    ):
                        if z.owner.inputs[0] is numerator:
                            (sum_axis,) = sum_axis
                            matching_denom = denominator
                            break
            # Division without dimshuffle
            else:
                z = denominator
                if z.owner and isinstance(z.owner.op, Sum):
                    sum_axis = z.owner.op.axis
                    # Filter out partial summations over more than one axis
                    # The cases where all axis of summation are explicitly given
                    # as in `sum(matrix, axis=(0, 1))` are eventually rewritten
                    # to `sum(matrix)` and this branch is not a blocker
                    if sum_axis is not None and len(sum_axis) != 1:
                        continue
                    if z.owner.inputs[0] is numerator:
                        if sum_axis is not None:
                            (sum_axis,) = sum_axis
                        matching_denom = denominator
                        break

        if matching_denom:
            # Replace the exp/sum pair by a single Softmax on the reduced axis.
            softmax = Softmax(axis=sum_axis)(numerator.owner.inputs[0])
            copy_stack_trace(numerator, softmax)
            numerators.remove(numerator)
            denominators.remove(matching_denom)
            numerators.append(softmax)

    return numerators, denominators
local_mul_canonizer.add_simplifier(softmax_simplifier, "softmax_simplifier")
class CrossentropySoftmaxArgmax1HotWithBias(COp):
    """
    A special compound L{Op} for the output of neural-net classifiers.

    Parameters
    ----------
    x : a matrix of floats (32 or 64)
    b : a [row] vector of floats (32 or 64), length is number of cols in x
    y_idx : a [column] vector of int (32 or 64), length is number of rows in x

    Returns
    -------
    object
        row-wise NLL, softmax(x+b), row-wise argmax of (x+b).

    @precondition: every entry in y_idx is a valid (non-negative)
    column index into x

    This L{Op} has three outputs:
    - KL(softmax(x+b), y)
    - softmax(x+b)
    - argmax(x+b)

    softmax(x[i]) is the i'th distribution over len(x[i]) options
    argmax(x) is the index of x's greatest element
    y_idx[i] is an integer index, encoding a 1-hot distribution.

    In practice, when we are trying to do classification, we have one row in x
    and y_idx per example, and y[i] is the index of the (correct) class of the
    i'th example.
    """

    nin = 3  # inputs: x, b, y_idx
    nout = 3  # outputs: nll, softmax, argmax
    __props__ = ()

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def make_node(self, x, b, y_idx):
        # Validate ranks/dtypes and build the three output variables.
        x = at.as_tensor_variable(x)
        b = at.as_tensor_variable(b)
        y_idx = at.as_tensor_variable(y_idx)
        if x.type.ndim != 2 or x.type.dtype not in float_dtypes:
            raise ValueError("x must be 2-d tensor of floats", x.type)
        if b.type.ndim != 1 or x.type.dtype not in float_dtypes:
            raise ValueError("b must be 1-d tensor of floats", b.type)
        if y_idx.type.ndim != 1 or y_idx.type.dtype not in discrete_dtypes:
            raise ValueError("y_idx must be 1-d tensor of [u]ints", y_idx.type)

        # TODO: Is this correct? It used to be y, not y_idx
        out_shape = tuple(1 if s == 1 else None for s in y_idx.type.shape)
        nll = TensorType(x.type.dtype, shape=out_shape).make_variable()
        sm = x.type()
        am = y_idx.type()
        return Apply(self, [x, b, y_idx], [nll, sm, am])

    def perform(self, node, input_storage, output_storage):
        """
        The math, where x is an input vector, and t is a target index:

            softmax(x)[i] = exp(x[i]) / sum_j(exp(x[j]))
            nll(x,t) = -log(softmax(x)[t])

        We compute this by subtracting off the max of x. This avoids
        numerical instability.

            m = max_j x[j]
            softmax(x)[i] = exp(x[i] -m) / sum_j(exp(x[j] - m))
            nll = -log(exp(x[t] -m) / sum_j(exp(x[j] - m)))
                = -x[t] + m + log( sum_j(exp(x[j] - m)))
        """
        x, b, y_idx = input_storage
        if b.shape[0] != x.shape[1]:
            raise ValueError("b must have same number of columns as x")
        if y_idx.shape[0] != x.shape[0]:
            raise ValueError("y_idx must have same number of rows as x")
        if any(y_idx < 0):
            raise ValueError("y_i value out of bounds")
        sm = np.zeros_like(x)  # softmax
        nll = np.zeros(
            x.shape[0], dtype=node.outputs[0].type.dtype
        )  # nll(y | softmax(x))
        am = np.zeros_like(y_idx)
        for i in range(sm.shape[0]):
            # add the bias vector to the i'th row of x
            row = x[i] + b
            # get the maximum value of i'th row for numerically safe
            # softmax / nll
            am[i] = np.argmax(row)
            m = row[am[i]]
            # compute the unnormalized softmax, and normalization constant
            sm[i] = np.exp(row - m)
            sum_j = np.sum(sm[i])  # sum_j(exp(x[j] - m))
            # normalized our softmax
            sm[i] *= 1.0 / sum_j
            # store the nll
            nll[i] = -row[y_idx[i]] + m + np.log(sum_j)
        output_storage[0][0] = nll
        output_storage[1][0] = sm
        output_storage[2][0] = am

    def infer_shape(self, fgraph, node, shapes):
        # nll is one value per row of x; sm matches x; am matches y_idx.
        x_shp, b_shp, idx_shp = shapes
        nll_shp = (x_shp[0],)
        sm_shp = x_shp
        am_shp = idx_shp
        return [nll_shp, sm_shp, am_shp]

    def connection_pattern(self, node):
        # Rows are inputs (x, b, y_idx); columns are outputs (nll, sm, am).
        return [
            [True, True, True],  # x
            [True, True, True],  # b
            [False, False, True],
        ]  # y_idx

    def grad(self, inp, grads):
        x, b, y_idx = inp
        g_nll, g_sm, g_am = grads

        # Accumulate gradient contributions per connected output, then sum.
        dx_terms = []
        db_terms = []
        d_idx_terms = []

        if not isinstance(g_nll.type, DisconnectedType):
            nll, sm = crossentropy_softmax_1hot_with_bias(x, b, y_idx)
            dx = crossentropy_softmax_1hot_with_bias_dx(g_nll, sm, y_idx)
            db = at_sum(dx, axis=[0])
            dx_terms.append(dx)
            db_terms.append(db)

        if not isinstance(g_sm.type, DisconnectedType):
            dx, db = softmax_with_bias.L_op((x, b), [softmax_with_bias(x, b)], (g_sm,))
            dx_terms.append(dx)
            db_terms.append(db)

        if not isinstance(g_am.type, DisconnectedType):
            # argmax contributes a zero gradient everywhere.
            dx_terms.append(x.zeros_like())
            db_terms.append(b.zeros_like())
            d_idx_terms.append(y_idx.zeros_like())

        def fancy_sum(terms):
            # Sum a (possibly empty) list of gradient terms; empty means
            # the corresponding input is disconnected.
            if len(terms) == 0:
                return DisconnectedType()()
            rval = terms[0]
            for term in terms[1:]:
                rval = rval + term
            return rval

        return [fancy_sum(terms) for terms in [dx_terms, db_terms, d_idx_terms]]

    def c_headers(self, **kwargs):
        return ["<iostream>", "<cmath>"]

    @staticmethod
    def c_code_template(dtype):
        # Reuses SoftmaxWithBias's fragments, interleaving the extra
        # allocation/validation of nll/am and the per-row NLL computation.
        # this implementation was lifted from
        # /u/bergstrj/cvs/bergstrj/src/feb07/nn.cxx
        # TODO: put this into a templated function, in the support code
        # TODO: declare the max of each row as an Op output
        # TODO: set error messages for failures in this code
        # TODO: use this to accept float32 and int32: node.inputs[0].type.dtype_specs()[1]
        (
            init_decl,
            begin_row_loop,
            inside_row_loop,
            end_row_loop,
        ) = SoftmaxWithBias.c_code_template(dtype)
        return (
            init_decl,
            """
        if (PyArray_NDIM(%(y_idx)s) != 1)
        {
            PyErr_SetString(PyExc_ValueError, "y_idx not 1d tensor");
            %(fail)s;
        }
        if (PyArray_DIMS(%(x)s)[0] != PyArray_DIMS(%(y_idx)s)[0])
        {
            PyErr_Format(PyExc_ValueError,
                "number of rows in x (%%ld) does not match length of y (%%ld)",
                (long int)PyArray_DIMS(%(x)s)[0],
                (long int)PyArray_DIMS(%(y_idx)s)[0]);
            %(fail)s;
        }
        if ((NULL == %(nll)s) //initial condition
            || (PyArray_DIMS(%(nll)s)[0] != PyArray_DIMS(%(y_idx)s)[0]))
        {
            if (NULL != %(nll)s) Py_XDECREF(%(nll)s);
            %(nll)s = (PyArrayObject*)PyArray_SimpleNew(1,
                PyArray_DIMS(%(y_idx)s), PyArray_TYPE(%(x)s));
            if(!%(nll)s)
            {
                PyErr_SetString(PyExc_MemoryError,
                     "failed to alloc nll output");
                %(fail)s;
            }
        }
        if ((NULL == %(am)s)
            || (PyArray_DIMS(%(am)s)[0] != PyArray_DIMS(%(y_idx)s)[0]))
        {
            Py_XDECREF(%(am)s);
            %(am)s = (PyArrayObject*) PyArray_SimpleNew(1,
                PyArray_DIMS(%(y_idx)s), PyArray_TYPE(%(y_idx)s));
            if(!%(am)s)
            {
                PyErr_SetString(PyExc_MemoryError,
                     "failed to alloc am output");
                %(fail)s;
            }
        }
                """,
            begin_row_loop,
            """
            const %(y_idx_type) s y_i = ((%(y_idx_type)s*)(PyArray_BYTES(%(y_idx)s) + PyArray_STRIDES(%(y_idx)s)[0] * i))[0];
            dtype_%(nll) s* __restrict__ nll_i = (dtype_%(nll)s*)(PyArray_BYTES(%(nll)s) + PyArray_STRIDES(%(nll)s)[0] * i);
            %(am_type)s* __restrict__ am_i = (%(am_type)s*) (PyArray_BYTES(%(am)s) + PyArray_STRIDES(%(am)s)[0] * i);
                """,
            inside_row_loop,
            """
            if ((y_i >= PyArray_DIMS(%(x)s)[1]) || (y_i < 0))
            {
                PyErr_SetString(PyExc_ValueError, "y_i value out of bounds");
                %(fail)s;
            }
            nll_i[0] = - x_i[y_i*Sx]
                       - b_i[y_i*Sb]
                       + row_max
                       + log(sum);
            am_i[0] = row_max_j;
                """,
            end_row_loop,
        )

    def c_code_cache_version(self):
        return (5,) + SoftmaxWithBias.c_code_cache_version()

    def c_code(self, node, name, inp, out, sub):
        x, b, y_idx = inp
        nll, sm, am = out
        y_idx_type = node.inputs[2].type.dtype_specs()[1]
        am_type = y_idx_type
        dtype = node.inputs[0].type.dtype_specs()[1]
        code_template = "".join(self.c_code_template(dtype))
        return code_template % dict(locals(), **sub)
class CrossentropySoftmax1HotWithBiasDx(COp):
    """
    Gradient wrt x of the CrossentropySoftmaxArgmax1HotWithBias Op.
    """

    nin = 3  # inputs: dy, sm, y_idx
    nout = 1  # output: dx
    __props__ = ()

    def make_node(self, dy, sm, y_idx, **kwargs):
        # dy may be a scalar (broadcast over rows) or a vector of per-row
        # gradients; sm is the softmax output; y_idx the target indices.
        dy = at.as_tensor_variable(dy)
        sm = at.as_tensor_variable(sm)
        y_idx = at.as_tensor_variable(y_idx)
        if dy.type.ndim > 1 or dy.type.dtype not in float_dtypes:
            raise ValueError("dy must be {0,1}-d tensor of floats", dy.type)
        if sm.type.ndim != 2 or sm.type.dtype not in float_dtypes:
            raise ValueError("sm must be 2-d tensor of floats", sm.type)
        if y_idx.type.ndim != 1 or y_idx.type.dtype not in discrete_dtypes:
            raise ValueError("y_idx must be 1-d tensor of [u]ints", y_idx.type)
        return Apply(self, [dy, sm, y_idx], [sm.type()])

    def perform(self, node, input_storage, output_storage):
        dy, sm, y_idx = input_storage
        if any(y_idx < 0):
            raise ValueError("y_i value out of bounds")
        dx = np.zeros_like(sm)
        if dy.ndim == 0:
            dy = dy[None]
        # incr is 0 when dy has a single element (broadcast), 1 otherwise.
        incr = int(dy.shape[0] > 1)
        for i in range(sm.shape[0]):
            dy_i = dy[i * incr]
            dx[i] = dy_i * sm[i]  # vector scale
            dx[i, y_idx[i]] -= dy_i  # scalar decrement
        output_storage[0][0] = dx

    def infer_shape(self, fgraph, node, shapes):
        # dx has the same shape as sm.
        return [shapes[1]]

    def grad(self, inp, grads):
        dy, sm, y_idx = inp
        (g_dx,) = grads
        # TODO: currently we do not compute the gradient w.r.t. dy, because
        # advanced indexing is not working yet. When it works, do it to avoid
        # potentially misleading behavior in gradient computations! (although
        # typically we should not need the gradient w.r.t. dy).
        y_idx_range = at.arange(y_idx.shape[0])
        g_dy = at_sum(
            g_dx * AdvancedIncSubtensor()(sm, at.fill(dy, -1), y_idx_range, y_idx),
            axis=1,
        )
        g_sm = dy.dimshuffle(0, "x") * g_dx
        g_y_idx = grad_not_implemented(self, 2, y_idx)
        return [g_dy, g_sm, g_y_idx]

    def c_code_cache_version(self):
        return (6,)

    def c_code(self, node, name, inp, out, sub):
        dnll, sm, y_idx = inp
        (dx,) = out
        y_idx_type = node.inputs[2].type.dtype_specs()[1]
        return """
        if ((PyArray_TYPE(%(dnll)s) != NPY_DOUBLE) &&
            (PyArray_TYPE(%(dnll)s) != NPY_FLOAT))
        {
            PyErr_SetString(PyExc_TypeError,
                 "dnll type should be float32 or float64");
            %(fail)s;
        }
        if ((PyArray_TYPE(%(sm)s) != NPY_DOUBLE) &&
            (PyArray_TYPE(%(sm)s) != NPY_FLOAT))
        {
            PyErr_SetString(PyExc_TypeError,
                 "sm type should be float32 or float64");
            %(fail)s;
        }

        // new scope because of variable declaration
        // TODO: proper indentation, but the diff will get messy
        {
        // Get `dnll.shape[0]` or set it to zero if `dnll` is a scalar.
        const npy_intp %(dnll)s_dims0 = (PyArray_NDIM(%(dnll)s) > 0 ?
                                         PyArray_DIMS(%(dnll)s)[0] :
                                         (npy_intp) 0);

        // Get `dnll.strides[0]` and set it to zero if `dnll` is a scalar
        // or a vector with just one element.
        const npy_intp %(dnll)s_strides0 = (%(dnll)s_dims0 > 1 ?
                                            PyArray_STRIDES(%(dnll)s)[0] :
                                            (npy_intp) 0);

        if ((PyArray_NDIM(%(dnll)s) > 1)
            || (PyArray_NDIM(%(sm)s) != 2)
            || (PyArray_NDIM(%(y_idx)s) != 1))
        {
            PyErr_SetString(PyExc_ValueError, "rank error");
            %(fail)s;
        }
        if (%(dnll)s_dims0 != PyArray_DIMS(%(sm)s)[0] && %(dnll)s_dims0 > 1)
        {
            PyErr_Format(PyExc_ValueError,
                         "dnll.shape[0] (%%ld) != sm.shape[0] (%%ld)",
                         (long int)%(dnll)s_dims0,
                         (long int)PyArray_DIMS(%(sm)s)[0]);
            %(fail)s;
        }
        if (%(dnll)s_dims0 != PyArray_DIMS(%(y_idx)s)[0] && %(dnll)s_dims0 > 1)
        {
            PyErr_Format(PyExc_ValueError,
                         "dnll.shape[0] (%%ld) != y_idx.shape[0] (%%ld)",
                         (long int)%(dnll)s_dims0,
                         (long int)PyArray_DIMS(%(y_idx)s)[0]);
            %(fail)s;
        }
        if (PyArray_DIMS(%(sm)s)[0] !=
            PyArray_DIMS(%(y_idx)s)[0])
        {
            PyErr_SetString(PyExc_ValueError,
                            "sm.shape[0] != y_idx.shape[0]");
            %(fail)s;
        }
        if ((NULL == %(dx)s)
            || (PyArray_DIMS(%(dx)s)[0] != PyArray_DIMS(%(sm)s)[0])
            || (PyArray_DIMS(%(dx)s)[1] != PyArray_DIMS(%(sm)s)[1]))
        {
            if (NULL != %(dx)s) Py_XDECREF(%(dx)s);
            %(dx)s = (PyArrayObject*) PyArray_SimpleNew(2,
                                                        PyArray_DIMS(%(sm)s),
                                                        PyArray_TYPE(%(sm)s));
            if(!%(dx)s) {
                PyErr_SetString(PyExc_MemoryError,
                     "failed to alloc dx output");
                %(fail)s
            }
        }

        for (size_t i = 0; i < PyArray_DIMS(%(dx)s)[0]; ++i)
        {
            const dtype_%(dnll)s dnll_i = ((dtype_%(dnll)s*)(PyArray_BYTES(%(dnll)s) + %(dnll)s_strides0 * i))[0];

            const %(y_idx_type) s y_i = ((%(y_idx_type)s*)(PyArray_BYTES(%(y_idx)s) + PyArray_STRIDES(%(y_idx)s)[0] * i))[0];

            const dtype_%(sm)s* __restrict__ sm_i = (dtype_%(sm)s*)(PyArray_BYTES(%(sm)s) + PyArray_STRIDES(%(sm)s)[0] * i);
            npy_intp Ssm = PyArray_STRIDES(%(sm)s)[1]/sizeof(dtype_%(sm)s);

            dtype_%(dx) s* __restrict__ dx_i = (dtype_%(dx)s*)(PyArray_BYTES(%(dx)s) + PyArray_STRIDES(%(dx)s)[0] * i);
            npy_intp Sdx = PyArray_STRIDES(%(dx)s)[1]/sizeof(dtype_%(dx)s);

            for (size_t j = 0; j < PyArray_DIMS(%(dx)s)[1]; ++j)
            {
                dx_i[j * Sdx] = dnll_i * sm_i[j * Ssm];
            }
            if (y_i >= PyArray_DIMS(%(dx)s)[1] || (y_i < 0))
            {
                PyErr_SetString(PyExc_ValueError, "y_i >= dx dimensions[1] or y_i < 0.");
                %(fail)s;
            }
            dx_i[y_i * Sdx] -= dnll_i;
        }
        }
        """ % dict(
            locals(), **sub
        )
# Singleton instances of the compound cross-entropy ops defined above.
crossentropy_softmax_argmax_1hot_with_bias = CrossentropySoftmaxArgmax1HotWithBias()
crossentropy_softmax_1hot_with_bias_dx = CrossentropySoftmax1HotWithBiasDx()
def crossentropy_softmax_1hot_with_bias(x, b, y_idx, **kwargs):
    """Return (cross-entropy, softmax) for ``x`` with bias ``b`` and integer targets ``y_idx``.

    Thin wrapper over `crossentropy_softmax_argmax_1hot_with_bias` that
    discards its third (argmax) output.
    """
    all_outputs = crossentropy_softmax_argmax_1hot_with_bias(x, b, y_idx, **kwargs)
    return all_outputs[:2]
def crossentropy_softmax_1hot(x, y_idx, **kwargs):
    """Cross-entropy and softmax of ``x`` with an implicit all-zero bias."""
    zero_bias = at.zeros_like(x[0, :])
    return crossentropy_softmax_1hot_with_bias(x, zero_bias, y_idx, **kwargs)
def crossentropy_softmax_max_and_argmax_1hot_with_bias(x, b, y_idx, **kwargs):
    """
    Returns
    -------
    object
        The cross-entropy, the softmax output, the max probability,
        and the argmax index.

    TODO: Since we are recomputing the argmax,
           we might as well assert that it is correct.

    TODO: Make this entire function is
           unnecessary? e.g. CrossentropySoftmaxArgmax1HotWithBias should return
           the appropriate information (i.e. the max probability)?

    """
    nll, sm = crossentropy_softmax_1hot_with_bias(x, b, y_idx, **kwargs)
    # Recompute the best probability / index from the softmax output.
    best_pr, best_idx = max_and_argmax(sm, axis=-1)
    return (nll, sm, best_pr, best_idx)
def crossentropy_softmax_max_and_argmax_1hot(x, y_idx, **kwargs):
    """Same as the `_with_bias` variant, with an implicit all-zero bias."""
    zero_bias = at.zeros_like(x[0, :])
    return crossentropy_softmax_max_and_argmax_1hot_with_bias(
        x, zero_bias, y_idx, **kwargs
    )
class CrossentropyCategorical1HotGrad(Op):
    """Gradient of `CrossentropyCategorical1Hot` w.r.t. the coding distribution.

    For ``y[i] = -log(coding_dist[i, true_one_of_n[i]])`` the only nonzero
    entry of the gradient in row ``i`` is at column ``true_one_of_n[i]``,
    with value ``-g_y[i] / coding_dist[i, true_one_of_n[i]]``.
    """

    __props__ = ()

    def make_node(self, g_y, coding_dist, true_one_of_n):
        return Apply(self, [g_y, coding_dist, true_one_of_n], [coding_dist.type()])

    def perform(self, node, inp, out):
        g_y, coding_dist, true_one_of_n = inp
        (g_coding_strg,) = out
        g_coding = np.zeros_like(coding_dist)
        # Vectorized form of the per-row update (each row index appears
        # exactly once, so fancy-index assignment matches the old loop):
        # g_coding[i, t[i]] = -g_y[i] / coding_dist[i, t[i]]
        rows = np.arange(len(g_y))
        g_coding[rows, true_one_of_n] = -g_y / coding_dist[rows, true_one_of_n]
        g_coding_strg[0] = g_coding

    def infer_shape(self, fgraph, node, in_shapes):
        # Gradient has the shape of the coding distribution (input 1).
        return [in_shapes[1]]
# Singleton instance used by `CrossentropyCategorical1Hot.grad` below.
crossentropy_categorical_1hot_grad = CrossentropyCategorical1HotGrad()
class CrossentropyCategorical1Hot(Op):
    r"""
    Compute the cross entropy between a coding distribution and
    a true distribution of the form [0, 0, ... 0, 1, 0, ..., 0].

    .. math::

        y[i] = - \log(coding\_dist[i, one\_of\_n[i]])

    Notes
    -----
    In the case that the coding distribution is the output of a
    softmax, an application of this Op will probably be optimized
    away in favour of one with a C implementation.

    """

    __props__ = ()

    def make_node(self, coding_dist, true_one_of_n):
        """
        Parameters
        ----------
        coding_dist : dense matrix
        true_one_of_n : lvector

        Returns
        -------
        dvector

        """
        _coding_dist = at.as_tensor_variable(coding_dist)
        _true_one_of_n = at.as_tensor_variable(true_one_of_n)
        if _coding_dist.type.ndim != 2:
            raise TypeError("Matrix required for argument `coding_dist`")
        if not (
            _true_one_of_n.type.ndim == 1
            and _true_one_of_n.type.dtype in integer_dtypes
        ):
            raise TypeError("Integer vector required for argument `true_one_of_n`")

        return Apply(
            self,
            [_coding_dist, _true_one_of_n],
            [TensorType(dtype=_coding_dist.dtype, shape=(None,))()],
        )

    def perform(self, node, inp, out):
        coding, one_of_n = inp
        (y_out,) = out
        # Vectorized gather of the target-class probabilities; same values
        # and dtype as the previous per-row Python loop.
        rows = np.arange(coding.shape[0])
        y_out[0] = -np.log(coding[rows, one_of_n])

    def infer_shape(self, fgraph, node, in_shapes):
        # One cross-entropy value per row of the coding distribution.
        return [(in_shapes[0][0],)]

    def grad(self, inp, grads):
        coding, one_of_n = inp
        (g_y,) = grads
        return [
            crossentropy_categorical_1hot_grad(g_y, coding, one_of_n),
            # No gradient w.r.t. the integer labels.
            grad_not_implemented(self, 1, one_of_n),
        ]
# Singleton instance; the graph rewrites below match against this exact object.
crossentropy_categorical_1hot = CrossentropyCategorical1Hot()
@register_stabilize("fast_compile")
@register_specialize("fast_compile")
@graph_rewriter
def crossentropy_to_crossentropy_with_softmax_with_bias(fgraph):
    """Fuse ``crossentropy_categorical_1hot(softmax_with_bias(x, b), y)``
    into a single `crossentropy_softmax_argmax_1hot_with_bias` application.

    Runs to a fixed point: each successful substitution restarts the
    toposort scan, until no matching node remains.
    """

    def search_make_one_sub():
        # Perform at most one substitution per scan; return whether one
        # was made so the caller can loop to a fixed point.
        for node in fgraph.toposort():
            if node.op == crossentropy_categorical_1hot:
                (nll,) = node.outputs
                sm, one_of_n = node.inputs
                if sm.owner and sm.owner.op == softmax_with_bias:
                    x, b = sm.owner.inputs
                    (
                        new_nll,
                        new_sm,
                        new_am,
                    ) = crossentropy_softmax_argmax_1hot_with_bias(x, b, one_of_n)
                    # Replace both the NLL and the softmax output so other
                    # consumers of `sm` use the fused op's softmax.
                    fgraph.replace_all_validate(
                        [(nll, new_nll), (sm, new_sm)],
                        reason="crossentropy_to_crossentropy_with_softmax_with_bias",
                    )
                    return True

        return False

    while search_make_one_sub():
        pass
    return
@graph_rewriter
def crossentropy_to_crossentropy_with_softmax(fgraph):
    """
    This is a stabilization rewrite that is more general than
    `crossentropy_to_crossentropy_with_softmax_with_bias`.

    It fuses ``crossentropy_categorical_1hot`` applied to either a plain
    2-d legacy softmax (a zero bias is synthesized) or a
    ``softmax_with_bias`` into `crossentropy_softmax_argmax_1hot_with_bias`.

    Notes
    -----
    It must be executed after `local_softmax_with_bias` during the
    specialization passes.

    """

    def search_make_one_sub():
        # One substitution per scan; loop below runs this to a fixed point.
        for node in fgraph.toposort():
            if node.op == crossentropy_categorical_1hot:
                (nll,) = node.outputs
                sm, one_of_n = node.inputs
                if sm.owner and sm.owner.op == softmax_legacy and sm.ndim == 2:
                    (x,) = sm.owner.inputs
                    (
                        new_nll,
                        new_sm,
                        new_am,
                    ) = crossentropy_softmax_argmax_1hot_with_bias(
                        x, at.zeros_like(x[0]), one_of_n
                    )
                    fgraph.replace_all_validate(
                        [(nll, new_nll), (sm, new_sm)],
                        reason="crossentropy_to_crossentropy_with_softmax",
                    )
                    return True
                if sm.owner and sm.owner.op == softmax_with_bias:
                    x, b = sm.owner.inputs
                    (
                        new_nll,
                        new_sm,
                        new_am,
                    ) = crossentropy_softmax_argmax_1hot_with_bias(x, b, one_of_n)
                    fgraph.replace_all_validate(
                        [(nll, new_nll), (sm, new_sm)],
                        reason="crossentropy_to_crossentropy_with_softmax",
                    )
                    return True

        return False

    while search_make_one_sub():
        pass
    return
# Registered directly with `optdb` (rather than via a decorator) so an
# explicit position (2.01) can be given: per its docstring, this rewrite must
# run after `local_softmax_with_bias` during the specialization passes.
optdb.register(
    "crossentropy_to_crossentropy_with_softmax",
    crossentropy_to_crossentropy_with_softmax,
    "fast_run",
    "xent",
    "fast_compile",
    position=2.01,
)
@register_specialize(
    "fast_compile", "local_crossentropy_to_crossentropy_with_softmax_grad"
)  # old name
@node_rewriter([softmax_grad_legacy])
def local_softmax_grad_to_crossentropy_with_softmax_grad(fgraph, node):
    """Rewrite ``softmax_grad(crossentropy_categorical_1hot_grad(...), sm)``
    into the fused `crossentropy_softmax_1hot_with_bias_dx` op (2-d case only).
    """
    if node.op == softmax_grad_legacy and node.inputs[1].ndim == 2:
        g_coding_dist, coding_dist = node.inputs
        if (
            g_coding_dist.owner
            and g_coding_dist.owner.op == crossentropy_categorical_1hot_grad
        ):
            g_nll, coding_dist, true_one_of_n = g_coding_dist.owner.inputs
            dx = crossentropy_softmax_1hot_with_bias_dx(
                g_nll, coding_dist, true_one_of_n
            )
            copy_stack_trace(node.outputs[0], dx)
            return [dx]
@register_specialize("fast_compile")
@node_rewriter([MaxAndArgmax])
def local_argmax_pushdown(fgraph, node):
    """Push a `MaxAndArgmax` through a monotonic elemwise op (or a softmax).

    Only applies when the max-value output has no clients (i.e. only the
    argmax is used): a monotonic transform preserves the argmax location,
    so computing it on the pre-activation input is equivalent and cheaper.
    """
    if (
        isinstance(node.op, MaxAndArgmax)
        and node.inputs[0].owner
        and len(fgraph.clients[node.outputs[0]]) == 0
    ):
        x_max, x_argmax = node.outputs
        x = node.inputs[0]
        axis = node.op.get_params(node)
        # TODO: Make a list/set of monotonic ops...
        if x.owner and (
            x.owner.op
            in (
                softplus,
                exp,
                log,
                tanh,
                sigmoid,
            )
            or isinstance(x.owner.op, Softmax)
        ):
            (pre_x,) = x.owner.inputs
            ret = max_and_argmax(pre_x, axis)
            copy_stack_trace(x_max, ret)
            return ret
        if x.owner and x.owner.op == softmax_with_bias:
            pre_x, pre_bias = x.owner.inputs
            # The bias must be added back (broadcast over rows) before the
            # argmax, since it shifts each column differently.
            ret = max_and_argmax(
                pre_x + DimShuffle(pre_bias.broadcastable, ("x", 0))(pre_bias),
                axis,
            )
            # copy both stack traces
            copy_stack_trace(x_max, ret)
            return ret
def _check_rows_is_arange_len_labels(fgraph, rows, labels):
    """Check that `rows` is the same node as `at.arange(labels.shape[0])`.

    Also considers the case where `labels.shape[0]` is constant and equal to 1,
    and `at.arange(labels.shape[0])` has been constant-folded into
    0.

    NOTE(review): falls through (implicitly returning ``None``, i.e. falsy)
    whenever no case matches — callers treat this the same as ``False``.
    """
    shape_of = None
    if hasattr(fgraph, "shape_feature"):
        shape_of = fgraph.shape_feature.shape_of
        # TODO: consider cases where shape_of[labels] is constant, and
        # has a value different from 1.
        # This case is harder, as _is_const only accepts a scalar value
        # as second argument, so checking for
        # _is_const(rows, numpy.arange(...)) does not work for the moment.
        if len(shape_of[labels]) == 1 and _is_const(shape_of[labels][0], 1):
            return _is_const(rows, 0)

    if rows.owner and isinstance(rows.owner.op, ARange):
        start, stop, step = rows.owner.inputs
        if getattr(start, "data", None) != 0:  # constants will have data
            return False
        if getattr(step, "data", None) != 1:  # constant step will have data
            return False
        if not stop.owner:
            return False

        # Not sure if that case happens any more after the introduction of
        # ShapeOptimizer, but we keep it if ShapeOptimizer is not present
        if isinstance(stop.owner.op, DimShuffle) and stop.owner.op.new_order == ():
            shape_var = stop.owner.inputs[0]
            if shape_var.owner and isinstance(shape_var.owner.op, Shape):
                # `stop` is a 0-d view of `labels.shape[0]` taken directly
                # from a `Shape` node.
                return shape_var.owner.inputs[0] is labels
        elif shape_of:
            shape_of = fgraph.shape_feature.shape_of
            # `stop` is the symbolic shape entry tracked by the shape feature.
            return shape_of[labels][0] is stop
def _is_const(z, val, approx=False):
    """Return whether `z` is a scalar constant (approximately) equal to `val`."""
    try:
        constant_value = at.get_scalar_constant_value(z)
    except NotScalarConstantError:
        # Not a scalar constant at all.
        return False
    if approx:
        return np.allclose(constant_value, val)
    return np.all(constant_value == val)
@register_specialize("fast_compile")
@node_rewriter([AdvancedSubtensor, log])
def local_advanced_indexing_crossentropy_onehot(fgraph, node):
    """Rewrite one-hot cross-entropy expressed via advanced indexing into
    the fused (numerically stable) `crossentropy_softmax_argmax_1hot_with_bias`.

    Matches either ``log(softmax(x))[arange(n), y]`` or
    ``log(softmax(x)[arange(n), y])`` and replaces it with the negation of
    the fused op's NLL output.
    """
    log_op = None
    sm = None

    # First case: log(softmax(x))[rows, labels]
    if isinstance(node.op, AdvancedSubtensor):
        try:
            log_op, rows, labels = node.inputs
        except Exception:
            pass

        if log_op and log_op.owner and log_op.owner.op == log:
            sm = log_op.owner.inputs[0]

    # Second case: log(softmax(x)[rows, labels])
    elif node.op == log:
        pre_log = node.inputs[0].owner
        if pre_log and isinstance(pre_log.op, AdvancedSubtensor):
            try:
                sm, rows, labels = pre_log.inputs
            except Exception:
                pass

    if (
        sm is not None
        and sm.owner
        and sm.owner.op in (softmax_legacy, softmax_with_bias)
        and sm.ndim == 2
    ):
        # Normalize to the `softmax_with_bias` form, synthesizing a zero
        # bias when the legacy softmax cannot be converted.
        sm_w_bias = local_softmax_with_bias.transform(fgraph, sm.owner)
        if sm_w_bias:
            assert sm_w_bias[0].owner.op == softmax_with_bias
            x_var, b_var = sm_w_bias[0].owner.inputs
        else:
            x_var = sm.owner.inputs[0]
            b_var = at.zeros_like(x_var[0])

        # Check that rows == arange(labels.shape[0])
        if _check_rows_is_arange_len_labels(fgraph, rows, labels):
            if labels.ndim == 1 and x_var.ndim == 2:
                minus_ret = crossentropy_softmax_argmax_1hot_with_bias(
                    x_var, b_var, labels
                )[0]
                # The fused op computes the NLL; the matched expression is
                # the log-probability, hence the negation.
                ret = -minus_ret
                copy_stack_trace(node.outputs[0], [minus_ret, ret])
                return [ret]
@register_specialize("fast_compile")
@node_rewriter([softmax_grad_legacy])
def local_advanced_indexing_crossentropy_onehot_grad(fgraph, node):
    """Rewrite the gradient graph of advanced-indexing one-hot cross-entropy
    into the fused `crossentropy_softmax_1hot_with_bias_dx` op.

    See the long comment below for the two graph shapes that are matched.
    """
    if not (node.op == softmax_grad_legacy and node.inputs[1].ndim == 2):
        return

    sm = None
    try:
        d_sm, sm = node.inputs
    except Exception:
        return

    if (
        (sm is not None)
        and sm.owner
        and (sm.owner.op in (softmax_legacy, softmax_with_bias))
        and sm.ndim == 2
    ):
        # Normalize to `softmax_with_bias` when possible; `x_var` is the
        # softmax's pre-activation input (used later for shape/fill).
        sm_w_bias = local_softmax_with_bias.transform(fgraph, sm.owner)
        if sm_w_bias:
            assert sm_w_bias[0].owner.op == softmax_with_bias
            x_var, b_var = sm_w_bias[0].owner.inputs
        else:
            x_var = sm.owner.inputs[0]
    else:
        return

    # Two cases are supported:
    # 1. AdvancedIncSubtensor(
    #        zeros_like(softmax(x)),
    #        -out_grad / AdvancedSubtensor(softmax(x), arange(y.shape[0]), y),
    #        arange(y.shape[0]),
    #        y)
    #    which arises from the gradient of log(softmax(x)[arange(y.shape[0]), y])
    #
    # 2. AdvancedIncSubtensor(
    #        zeros_like(log(softmax(x))),
    #        -out_grad,
    #        arange(y.shape[0]),
    #        y)
    #    / softmax(x)
    #    which arises from the gradient of log(softmax(x))[arange(y.shape[0]), y]
    #
    # out_grad represents the gradient of the (final) cost wrt the output.
    #
    # N.B. Regarding clients -- This substitution is important for numerical stability, so we
    # perform the substitution even when intermediate values have multiple clients.
    #
    # First case.
    # After the check for AdvancedIncSubtensor, if anything does not fit with
    # the formula above, there's no way to fit it with the the second case,
    # so we return immediately.
    if d_sm.owner and isinstance(d_sm.owner.op, AdvancedIncSubtensor):
        try:
            z, incr, rows, labels = d_sm.owner.inputs
        except Exception:
            return
        # Check that z == zeros_like(softmax(x))
        # We know z has the right size because z has the same size as d_sm,
        # and d_sm and sm are both inputs of softmax_grad (so they have
        # the same size).
        if not _is_const(z, 0):
            return

        # In the base case (output gradient = 1), incr is -1./sm[arange(len(y)), y]
        # Here, we are looking for the AdvancedSubtensor term (sm[arange(len(y)), y]),
        # and constructing out_grad by incorporating the other terms.
        # out_grad will be constructed in 3 steps as follow:
        # out_grad = +/- 1. (according to sign)
        # out_grad *= -numerator
        # out_grad /= denominator
        # Then, if out_grad is a scalar, it will be allocated as a vector
        adv_subtensor = None
        out_grad = 1.0

        # If there's a 'minus' sign before the whole expression, put it in
        # out_grad and iterate
        if incr.owner and incr.owner.op == neg:
            out_grad = -out_grad
            incr = incr.owner.inputs[0]

        if incr.owner and incr.owner.op == true_div:
            num, denom = incr.owner.inputs

            # set out_grad according to the numerator, it may be divided later
            # num should be a vector or a scalar
            if num.ndim == 1 or all(num.broadcastable):
                out_grad *= -num
            else:
                return

            if not denom.owner:
                return

            if isinstance(denom.owner.op, AdvancedSubtensor):
                # Base case
                adv_subtensor = denom
                # out_grad /= 1.
            elif denom.owner.op == mul:
                # Try to find the AdvancedSubtensor node mentioned above,
                # and the output gradient
                for i, input in enumerate(denom.owner.inputs):
                    if input.owner and isinstance(input.owner.op, AdvancedSubtensor):
                        other_inputs = [
                            in_ for (j, in_) in enumerate(denom.owner.inputs) if j != i
                        ]
                        if len(other_inputs) == 1:
                            rest = other_inputs[0]
                        else:
                            # NOTE(review): `mul(*[other_inputs])` passes the
                            # whole list as a single argument (same as
                            # `mul(other_inputs)`); `mul(*other_inputs)` may
                            # have been intended — confirm before relying on
                            # the >2-factor path.
                            rest = mul(*[other_inputs])

                        # Check that rest is a vector or a scalar
                        if rest.ndim == 1 or all(rest.broadcastable):
                            adv_subtensor = input
                            out_grad /= rest
                            break
            else:
                return

            # The output gradient needs to be a vector
            out_grad = at.fill(x_var[:, 0], out_grad)

            if adv_subtensor is not None:
                try:
                    maybe_sm, maybe_rows, maybe_labels = adv_subtensor.owner.inputs
                except Exception:
                    return

                if not (
                    maybe_sm is sm and maybe_rows is rows and maybe_labels is labels
                ):
                    return
                # else: OK
            else:
                return
        else:
            return

        # Check that rows is arange(labels.shape[0])
        if not _check_rows_is_arange_len_labels(fgraph, rows, labels):
            return
        # else, arguments of AdvancedIncSubtensor are OK,
        # it was really case 1.

    # Second case
    elif d_sm.owner and d_sm.owner.op == true_div:
        # we're looking for
        # AdvIncSubtensor(zeros, grad_nll, arange(len(y)), y) / softmax
        try:
            num, denom = d_sm.owner.inputs
        except Exception:
            return

        if denom != sm:
            return

        # Check the numerator (AdvancedIncSubtensor)
        if num.owner and isinstance(num.owner.op, AdvancedIncSubtensor):
            try:
                z, incr, rows, labels = num.owner.inputs
            except Exception:
                return

            # Check z is zeros_like(log(sm))
            if not _is_const(z, 0):
                return
            if z.broadcastable not in [(False, False), (True, False)]:
                return
            # here we know that we are incrementing a matrix of zeros
            # (or a broadcasted vector).
            # Since d_sm and sm are the inputs of softmax_grad,
            # if the graph is valid, they have the same shape, so we
            # also know that z has the right shape.

            if incr.ndim != 1 or incr.dtype not in float_dtypes:
                return

            # here we know that we are incrementing some part of
            # matrix z by a vector

            # unless the user has taken care to mark that the data and
            # labels have the same number of rows, we cannot be sure
            # here that len(y) == len(z) However, in the common case
            # that these are predictions and labels it is true. We
            # leave it to the Op to crash (and the user to complain)
            # if this assumption is ever not true.

            out_grad = -incr

            # Check that rows is arange(labels.shape[0])
            if not _check_rows_is_arange_len_labels(fgraph, rows, labels):
                return
            # else, arguments of AdvancedIncSubtensor are OK
        else:
            return

        # numerator and denominator are OK,
        # it was really case 2.

    else:
        return

    # Dimension check before substitution
    if labels.ndim == 1 and x_var.ndim == 2:
        ret = crossentropy_softmax_1hot_with_bias_dx(out_grad, sm, labels)
        # The stack trace is not added to output_grad, sm and labels at
        # the moment but may need to be added at a future point
        copy_stack_trace(node.outputs[0], ret)
        return [ret]
    else:
        return
@register_specialize("fast_compile")
@node_rewriter([softmax_with_bias])
def graph_merge_softmax_with_crossentropy_softmax(fgraph, node):
    """Replace a standalone `softmax_with_bias` with the softmax output of an
    existing `crossentropy_softmax_argmax_1hot_with_bias` node that consumes
    the same ``x`` and ``b``, avoiding a duplicate softmax computation.
    """
    if node.op == softmax_with_bias:
        x, b = node.inputs
        for x_client in fgraph.clients[x]:
            if x_client[0].op == crossentropy_softmax_argmax_1hot_with_bias:
                big_client = x_client[0]
                # Require the candidate node to also consume `b`.
                if big_client in [b_client[0] for b_client in fgraph.clients[b]]:
                    xx, bb, ll = big_client.inputs
                    mergeable_client = big_client.op(x, b, ll)
                    # Output 1 of the fused op is the softmax.
                    copy_stack_trace(node.outputs[0], mergeable_client[1])
                    return [mergeable_client[1]]
@register_specialize
@register_stabilize
@register_canonicalize
@node_rewriter([CrossentropySoftmax1HotWithBiasDx])
def local_useless_crossentropy_softmax_1hot_with_bias_dx_alloc(fgraph, node):
    """
    Replace a CrossentropySoftmax1HotWithBiasDx op, whose incoming gradient is
    an `alloc` of a scalar variable or one that has either broadcastable or
    matching dimensions with the output variable, by one that skips the
    intermediate `alloc`.

    """
    if isinstance(node.op, CrossentropySoftmax1HotWithBiasDx):
        dy, sm, y_idx = node.inputs

        # Those cases are directly handled by the internal broadcasting of the
        # `CrossentropySoftmax1HotWithBiasDx` op.
        if dy.ndim == 0:
            return False
        if dy.ndim == 1 and dy.broadcastable[0]:
            return False

        assert dy.ndim == 1

        if dy.owner is not None and isinstance(dy.owner.op, at.Alloc):
            # dz is the input of the Alloc op, i.e. at.alloc(dz, <shape>)
            dz = dy.owner.inputs[0]

            try:
                shape_feature = fgraph.shape_feature
            except AttributeError:
                # The shape feature may not be available in some mode, but we
                # need it for this optimization, so don't continue.
                return False

            shape_of = shape_feature.shape_of
            same_shape = shape_feature.same_shape

            # Build `dz_broad` explicitly to include extra implicit dimensions.
            dz_broad = (True,) * (dy.ndim - dz.ndim) + dz.broadcastable

            # If we can infer statically that the shape of `sm` and
            # `dy` are the same in dimension `k` or the shape of `dy` is equal
            # to 1 (which triggers the internal broadcasting in
            # `CrossentropySoftmax1HotWithBiasDx`) we do not need to
            # check it at runtime.
            if (
                dz_broad[0]
                and not same_shape(sm, dy, dim_x=0, dim_y=0)
                and shape_of[dy][0] != 1
            ):
                # If `dz` is broadcastable, we need to check whether the shapes
                # of `dy` and `sm` are the same or whether the shape of `dy` is
                # equal to 1.
                cond = or_(eq(dy.shape[0], 1), eq(dy.shape[0], sm.shape[0]))
                msg = "`sm` and `dy` do not have the same shape."
                dz = Assert(msg)(dz, cond)

            ret = node.op(dz, sm, y_idx)
            copy_stack_trace(node.outputs[0], ret)
            return [ret]
def binary_crossentropy(output, target):
    """
    Compute the crossentropy of binary random variables.

    Output and target are each expectations of binary random
    variables; target may be exactly 0 or 1 but output must
    lie strictly between 0 and 1.

    Notes
    -----
    We could use the x log y op to support output=0 and output=1.
    The gradient would still be undefined though.

    We do not sum, crossentropy is computed by component.
    TODO : Rewrite as a scalar, and then broadcast to tensor.

    """
    positive_term = target * log(output)
    negative_term = (1.0 - target) * log(1.0 - output)
    return -(positive_term + negative_term)
def sigmoid_binary_crossentropy(output, target):
    """
    Compute the cross-entropy of binary random variables.

    `output` should be real-valued (range (-inf, +inf)); `sigmoid` will be
    applied to produce a (0, 1) valued input.

    `target` is assumed to be probabilities in [0, 1].

    Notes
    -----
    Mathematically equivalent to `binary_crossentropy(sigmoid(output), target)`,
    but with more efficient and numerically stable computation.

    """

    def grad(inputs, out_grads):
        # Analytic gradient override: d/dlogit = sigmoid(logit) - target.
        logit, label = inputs
        (out_grad,) = out_grads
        return [
            out_grad * (sigmoid(logit) - label),
            out_grad * (-logit),
        ]

    # Stable formulation: softplus(-|x|) + x * (1[x > 0] - t).
    stable_xent = softplus(-abs(output)) + output * ((output > 0) - target)
    fused_op = pytensor.compile.builders.OpFromGraph(
        [output, target],
        [stable_xent],
        grad_overrides=grad,
        inline=True,
        name="sigmoid_binary_crossentropy",
    )
    return fused_op(output, target)
def categorical_crossentropy(coding_dist, true_dist):
    r"""
    Return the cross-entropy between an approximating distribution and a true
    distribution.

    .. warning:: THIS FUNCTION IS UNNECESSARILY POLYMORPHIC.
    We ultimately don't want the polymorphism, and will move this function
    to pylearn.algorithms.cost. The 1hot version will be removed.
    The length of the documentation here is a form of code smell.

    The cross entropy between two probability distributions measures the average
    number of bits needed to identify an event from a set of possibilities, if a
    coding scheme is used based on a given probability distribution q, rather
    than the "true" distribution p.

    Mathematically it is defined as follows:

    .. math::

        H(p,q) = - \sum_x p(x) \log(q(x))

    Parameters
    ----------
    coding_dist : a dense matrix
        Each slice along axis represents one distribution.
    true_dist : a dense matrix or sparse matrix or integer vector
        In the case of a matrix argument, each slice along axis represents one
        distribution. In the case of an integer vector argument, each element
        represents the position of the '1' in a 1-of-N encoding.

    Returns
    -------
    tensor of rank one-less-than `coding_dist`
        The cross entropy between each coding and true distribution.

    Notes
    -----
    axis : int
        The dimension over which each distribution runs
        (1 for row distributions, 0 for column distributions).

    """
    # Integer-label form: delegate to the dedicated 1-hot Op.
    if true_dist.ndim == coding_dist.ndim - 1:
        return crossentropy_categorical_1hot(coding_dist, true_dist)
    # Dense-distribution form: -sum(p * log(q)) along the last axis.
    if true_dist.ndim == coding_dist.ndim:
        return -at_sum(true_dist * log(coding_dist), axis=coding_dist.ndim - 1)
    raise TypeError("rank mismatch between coding and true distributions")
class Prepend_scalar_constant_to_each_row(Op):
    """Prepend a fixed scalar `val` as the first column of each row of a matrix.

    The scalar is fixed at Op-construction time; see
    `Prepend_scalar_to_each_row` for a runtime-scalar variant.
    """

    # NOTE(review): `self.val` is not part of `__props__`, so two instances
    # with different `val` compare equal — confirm whether that is intended.
    __props__ = ()

    def __init__(self, val=0):
        if isinstance(val, float):
            val = aes.constant(val)
        self.val = val

    def __str__(self):
        return f"{self.__class__.__name__}{{{self.val}}}"

    def make_node(self, mat):
        # check type of input
        x = at.as_tensor_variable(mat)
        if mat.type.broadcastable != (False, False):
            raise TypeError("Expected a matrix as input")
        y = at.as_tensor_variable(self.val)
        assert y.ndim == 0
        if x.type.dtype != y.type.dtype:
            # BUG FIX: this exception was previously constructed but never
            # raised, silently allowing a dtype mismatch.
            raise TypeError(
                "the value to prepend don't have the same type as the matrix"
            )

        node = Apply(op=self, inputs=[mat], outputs=[mat.type()])
        return node

    def perform(self, node, inp, out):
        (mat,) = inp
        (output,) = out
        new_shape = (mat.shape[0], mat.shape[1] + 1)
        if output[0] is None:
            output[0] = np.empty(new_shape, dtype=mat.dtype)
            out = output[0]
        else:
            # Try to reuse the existing output buffer, resizing in place.
            if output[0].shape != new_shape:
                try:
                    output[0].resize(new_shape)
                except Exception:
                    output[0] = np.empty(new_shape, dtype=mat.dtype)
            out = output[0]

        out[:, 0].fill(self.val.data)
        out[:, 1:] = mat

    def infer_shape(self, fgraph, node, in_shapes):
        # One extra column is added.
        shp = (in_shapes[0][0], in_shapes[0][1] + 1)
        return [shp]

    def grad(self, inp, grads):
        (mat,) = inp
        (goutput,) = grads
        # The prepended constant column receives no gradient; drop it.
        return goutput[:, 1:]
class Prepend_scalar_to_each_row(Op):
    """Prepend a runtime scalar `val` as the first column of each row of a matrix."""

    __props__ = ()

    def make_node(self, val, mat):
        # check type of input
        x = at.as_tensor_variable(mat)
        if isinstance(val, float):
            val = aes.constant(val)
        if mat.type.broadcastable != (False, False):
            raise TypeError("Expected a matrix as input")
        y = at.as_tensor_variable(val)
        assert y.ndim == 0
        if x.type.dtype != y.type.dtype:
            # BUG FIX: this exception was previously constructed but never
            # raised, silently allowing a dtype mismatch.
            raise TypeError(
                "the value to prepend don't have the same type as the matrix"
            )

        node = Apply(op=self, inputs=[val, mat], outputs=[mat.type()])
        return node

    def perform(self, node, inp, out):
        val, mat = inp
        (output,) = out
        new_shape = (mat.shape[0], mat.shape[1] + 1)
        if output[0] is None:
            output[0] = np.empty(new_shape, dtype=mat.dtype)
            out = output[0]
        else:
            # Try to reuse the existing output buffer, resizing in place.
            if output[0].shape != new_shape:
                try:
                    output[0].resize(new_shape)
                except Exception:
                    output[0] = np.empty(new_shape, dtype=mat.dtype)
            out = output[0]
        out[:, 0].fill(val)
        out[:, 1:] = mat

    def infer_shape(self, fgraph, node, in_shapes):
        # Output shape follows the matrix (input 1) with one extra column.
        shp = (in_shapes[1][0], in_shapes[1][1] + 1)
        return [shp]

    def grad(self, inp, grads):
        val, mat = inp
        (goutput,) = grads
        # Gradient of the scalar is the first column; the rest goes to `mat`.
        return goutput[:, 0], goutput[:, 1:]
# Ready-to-use instances: prepend a runtime scalar, a constant 0, or a
# constant 1 column to each row of a matrix.
prepend_scalar_to_each_row = Prepend_scalar_to_each_row()
prepend_0_to_each_row = Prepend_scalar_constant_to_each_row(0.0)
prepend_1_to_each_row = Prepend_scalar_constant_to_each_row(1.0)
def relu(x, alpha=0):
    """
    Compute the element-wise rectified linear activation function.

    .. versionadded:: 0.7.1

    Parameters
    ----------
    x : symbolic tensor
        Tensor to compute the activation function for.
    alpha : `scalar or tensor, optional`
        Slope for negative input, usually between 0 and 1. The default value
        of 0 will lead to the standard rectifier, 1 will lead to
        a linear activation function, and any value in between will give a
        leaky rectifier. A shared variable (broadcastable against `x`) will
        result in a parameterized rectifier with learnable slope(s).

    Returns
    -------
    symbolic tensor
        Element-wise rectifier applied to `x`.

    Notes
    -----
    This is numerically equivalent to ``switch(x > 0, x, alpha * x)``
    (or ``maximum(x, alpha * x)`` for ``alpha < 1``), but uses a faster
    formulation or an optimized Op, so we encourage to use this function.

    """
    # This is probably the fastest implementation for GPUs. Both the forward
    # pass and the gradient get compiled into a single GpuElemwise call.
    # TODO: Check if it's optimal for CPU as well; add an "if" clause if not.
    # TODO: Check if there's a faster way for the gradient; create an Op if so.
    if alpha == 0:
        # max(x, 0) expressed as 0.5 * (x + |x|).
        return 0.5 * (x + abs(x))
    # We can't use 0.5 and 1 for one and half. as if alpha is a
    # numpy dtype, they will be considered as float64, so would
    # cause upcast to float64.
    alpha = at.as_tensor_variable(alpha)
    half_sum = 0.5 * (1 + alpha)
    half_diff = 0.5 * (1 - alpha)
    return half_sum * x + half_diff * abs(x)
def h_softmax(
    x,
    batch_size,
    n_outputs,
    n_classes,
    n_outputs_per_class,
    W1,
    b1,
    W2,
    b2,
    target=None,
):
    """Two-level hierarchical softmax.

    This function implements a two-layer hierarchical softmax. It is commonly
    used as an alternative of the softmax when the number of outputs is
    important (it is common to use it for millions of outputs). See
    reference [1]_ for more information about the computational gains.

    The `n_outputs` outputs are organized in `n_classes` classes, each class
    containing the same number `n_outputs_per_class` of outputs.
    For an input `x` (last hidden activation), the first softmax layer predicts
    its class and the second softmax layer predicts its output among its class.

    If `target` is specified, it will only compute the outputs of the
    corresponding targets. Otherwise, if `target` is `None`, it will compute
    all the outputs.

    The outputs are grouped in classes in the same order as they are initially
    defined: if `n_outputs=10` and `n_classes=2`, then the first class is
    composed of the outputs labeled `{0,1,2,3,4}` while the second class is
    composed of `{5,6,7,8,9}`. If you need to change the classes, you have to
    re-label your outputs.

    .. versionadded:: 0.7.1

    Parameters
    ----------
    x: tensor of shape (batch_size, number of features)
        the minibatch input of the two-layer hierarchical softmax.
    batch_size: int
        the size of the minibatch input x.
    n_outputs: int
        the number of outputs.
    n_classes: int
        the number of classes of the two-layer hierarchical softmax. It
        corresponds to the number of outputs of the first softmax. See note at
        the end.
    n_outputs_per_class: int
        the number of outputs per class. See note at the end.
    W1: tensor of shape (number of features of the input x, n_classes)
        the weight matrix of the first softmax, which maps the input x to the
        probabilities of the classes.
    b1: tensor of shape (n_classes,)
        the bias vector of the first softmax layer.
    W2: tensor of shape (n_classes, number of features of the input x,
            n_outputs_per_class)
        the weight matrix of the second softmax, which maps the input x to
        the probabilities of the outputs.
    b2: tensor of shape (n_classes, n_outputs_per_class)
        the bias vector of the second softmax layer.
    target: tensor of shape either (batch_size,) or (batch_size, 1)
        (optional, default None)
        contains the indices of the targets for the minibatch
        input x. For each input, the function computes the output for its
        corresponding target. If target is None, then all the outputs are
        computed for each input.

    Returns
    -------
    tensor of shape (`batch_size`, `n_outputs`) or (`batch_size`, 1)
        Output tensor of the two-layer hierarchical softmax for input `x`.
        Depending on argument `target`, it can have two different shapes.
        If `target` is not specified (`None`), then all the outputs are
        computed and the returned tensor has shape (`batch_size`, `n_outputs`).
        Otherwise, when `target` is specified, only the corresponding outputs
        are computed and the returned tensor has thus shape (`batch_size`, 1).

    Notes
    -----
    The product of `n_outputs_per_class` and `n_classes` has to be greater or
    equal to `n_outputs`. If it is strictly greater, then the irrelevant
    outputs will be ignored.
    `n_outputs_per_class` and `n_classes` have to be the same as the
    corresponding dimensions of the tensors of `W1`, `b1`, `W2` and `b2`.
    The most computational efficient configuration is when
    `n_outputs_per_class` and `n_classes` are equal to the square root of
    `n_outputs`.

    Examples
    --------
    The following example builds a simple hierarchical softmax layer.

    >>> import numpy as np
    >>> import pytensor
    >>> import pytensor.tensor as at
    >>> from pytensor.tensor.nnet import h_softmax
    >>>
    >>> # Parameters
    >>> batch_size = 32
    >>> n_outputs = 100
    >>> dim_x = 10  # dimension of the input
    >>> n_classes = int(np.ceil(np.sqrt(n_outputs)))
    >>> n_outputs_per_class = n_classes
    >>> output_size = n_outputs_per_class * n_outputs_per_class
    >>>
    >>> # First level of h_softmax
    >>> floatX = pytensor.config.floatX
    >>> W1 = pytensor.shared(
    ...     np.random.normal(0, 0.001, (dim_x, n_classes)).astype(floatX))
    >>> b1 = pytensor.shared(np.zeros((n_classes,), floatX))
    >>>
    >>> # Second level of h_softmax
    >>> W2 = np.random.normal(0, 0.001,
    ...     size=(n_classes, dim_x, n_outputs_per_class)).astype(floatX)
    >>> W2 = pytensor.shared(W2)
    >>> b2 = pytensor.shared(np.zeros((n_classes, n_outputs_per_class), floatX))
    >>>
    >>> # We can now build the graph to compute a loss function, typically the
    >>> # negative log-likelihood:
    >>>
    >>> x = at.imatrix('x')
    >>> target = at.imatrix('target')
    >>>
    >>> # This only computes the output corresponding to the target.
    >>> # The complexity is O(n_classes + n_outputs_per_class).
    >>> y_hat_tg = h_softmax(x, batch_size, output_size, n_classes,
    ...                      n_outputs_per_class, W1, b1, W2, b2, target)
    >>>
    >>> negll = -at.mean(at.log(y_hat_tg))
    >>>
    >>> # We may need to compute all the outputs (at test time usually):
    >>>
    >>> # This computes all the outputs.
    >>> # The complexity is O(n_classes * n_outputs_per_class).
    >>> output = h_softmax(x, batch_size, output_size, n_classes,
    ...                    n_outputs_per_class, W1, b1, W2, b2)

    References
    ----------
    .. [1] J. Goodman, "Classes for Fast Maximum Entropy Training,"
        ICASSP, 2001, <http://arxiv.org/abs/cs/0108006>`.
    """
    # First softmax that computes the probabilities of belonging to each class
    class_probs = softmax(dot(x, W1) + b1)

    if target is None:  # Computes the probabilities of all the outputs
        # Second softmax that computes the output probabilities
        activations = tensordot(x, W2, (1, 1)) + b2
        output_probs = softmax(activations.reshape((-1, n_outputs_per_class)))
        output_probs = output_probs.reshape((batch_size, n_classes, -1))
        # P(output) = P(class) * P(output | class), broadcast over outputs.
        output_probs = class_probs.dimshuffle(0, 1, "x") * output_probs
        output_probs = output_probs.reshape((batch_size, -1))
        # output_probs.shape[1] is n_classes * n_outputs_per_class, which might
        # be greater than n_outputs, so we ignore the potential irrelevant
        # outputs with the next line:
        output_probs = output_probs[:, :n_outputs]

    else:  # Computes the probabilities of the outputs specified by the targets
        target = target.flatten()

        # Classes to which belong each target
        target_classes = target // n_outputs_per_class

        # Outputs to which belong each target inside a class
        target_outputs_in_class = target % n_outputs_per_class

        # Second softmax that computes the output probabilities
        activations = sparse_block_dot(
            W2.dimshuffle("x", 0, 1, 2),
            x.dimshuffle(0, "x", 1),
            at.zeros((batch_size, 1), dtype="int32"),
            b2,
            target_classes.dimshuffle(0, "x"),
        )

        output_probs = softmax(activations.dimshuffle(0, 2))
        # Joint probability of the target: P(class) * P(output | class).
        target_class_probs = class_probs[at.arange(batch_size), target_classes]
        output_probs = output_probs[at.arange(batch_size), target_outputs_in_class]
        output_probs = target_class_probs * output_probs

    return output_probs
def elu(x, alpha=1):
    """
    Compute the element-wise exponential linear activation function [2]_.

    .. versionadded:: 0.8.0

    Parameters
    ----------
    x : symbolic tensor
        Tensor to compute the activation function for.
    alpha : scalar

    Returns
    -------
    symbolic tensor
        Element-wise exponential linear activation function applied to `x`.

    References
    -----
    .. [2] Djork-Arne Clevert, Thomas Unterthiner, Sepp Hochreiter
        "Fast and Accurate Deep Network Learning by
        Exponential Linear Units (ELUs)" <http://arxiv.org/abs/1511.07289>`.

    """
    # Identity for positive inputs; alpha * (exp(x) - 1) otherwise.
    negative_branch = alpha * expm1(x)
    return at.switch(x > 0, x, negative_branch)
def selu(x):
    """Compute the element-wise Scaled Exponential Linear unit [3]_.

    .. versionadded:: 0.9.0

    Parameters
    ----------
    x : symbolic tensor
        Tensor to compute the activation function for.

    Returns
    -------
    symbolic tensor
        Element-wise scaled exponential linear activation function applied to `x`.

    References
    ----------
    .. [3] Klambauer G, Unterthiner T, Mayr A, Hochreiter S.
        "Self-Normalizing Neural Networks" <https://arxiv.org/abs/1706.02515>
    """
    # Fixed constants from Klambauer et al. (2017): a SELU is an ELU with
    # this particular alpha, rescaled by this particular factor.
    selu_alpha = 1.6732632423543772848170429916717
    selu_scale = 1.0507009873554804934193349852946
    return selu_scale * elu(x, selu_alpha)
class ScalarSoftsign(UnaryScalarOp):
    """
    Softsign activation function
    :math:`\\varphi(\\mathbf{x}) = \\frac{x}{1+|x|}`
    """

    @staticmethod
    def static_impl(x):
        # Reference (Python-level) implementation of the softsign.
        return x / (1.0 + abs(x))

    def impl(self, x):
        return ScalarSoftsign.static_impl(x)

    def grad(self, inp, grads):
        (x,) = inp
        (gz,) = grads
        if "float" in x.type.dtype:
            # d/dx [x / (1 + |x|)] = 1 / (1 + |x|)**2
            d = 1.0 + abs(x)
            return [gz / (d * d)]
        else:
            # Gradient is only defined for floating-point inputs.
            return NotImplemented

    def c_code(self, node, name, inp, out, sub):
        (x,) = inp
        (z,) = out
        # C implementation is only provided for float32/float64.
        if node.inputs[0].type in [aes.float32, aes.float64]:
            return f"{z} = {x} / (1.0+fabs({x}));"
        raise NotImplementedError("only floating point x is implemented")
# Scalar op instance and its element-wise counterpart; `softsign` is the
# public, tensor-level symbol.
scalar_softsign = ScalarSoftsign(aes.upgrade_to_float, name="scalar_softsign")
softsign = Elemwise(scalar_softsign, name="softsign")
def confusion_matrix(actual, pred):
    """
    Computes the confusion matrix of given vectors containing
    actual observations and predicted observations.

    Parameters
    ----------
    actual : 1-d tensor variable
    pred : 1-d tensor variable

    Returns
    -------
    conf_mat : Confusion matrix of actual and predictions observations as shown below.

               | Predicted
    ___________|___________
       Actual  |
               |

    order : 1-d array of order of entries in rows and columns

    Examples
    --------
    >>> import pytensor
    >>> import pytensor.tensor as at
    >>> from pytensor.tensor.nnet import confusion_matrix
    >>> x = at.vector()
    >>> y = at.vector()
    >>> f = pytensor.function([x, y], confusion_matrix(x, y))
    >>> y_true = [2, 0, 2, 2, 0, 1]
    >>> y_pred = [0, 0, 2, 2, 0, 2]
    >>> print(f(y_true, y_pred))
    [array([[2, 0, 0],
           [0, 0, 1],
           [1, 0, 2]]), array([ 0.,  1.,  2.])]
    """
    if actual.ndim != 1:
        raise ValueError("actual must be 1-d tensor variable")
    if pred.ndim != 1:
        raise ValueError("pred must be 1-d tensor variable")

    # Distinct labels occurring in either vector; defines row/column order.
    order = Unique(False, False, False)(at.concatenate([actual, pred]))

    # One-hot encode each vector against the label set by broadcasting a
    # column of observations against the row of labels.
    actual_one_hot = eq(actual.dimshuffle(0, "x"), order).astype("int64")
    pred_one_hot = eq(pred.dimshuffle(0, "x"), order).astype("int64")

    # Cross-tabulate: entry (i, j) counts samples with actual label i
    # predicted as label j.
    return [dot(actual_one_hot.T, pred_one_hot), order]
# Triples of (old_name, deprecation_message, replacement_object), consumed
# lazily by the module-level ``__getattr__`` below, which emits a
# ``DeprecationWarning`` on access.
DEPRECATED_NAMES = [
    (
        "softmax",
        "`pytensor.tensor.nnet.basic.softmax` has been moved to `pytensor.tensor.special.softmax`.",
        softmax,
    ),
    (
        "logsoftmax",
        "`pytensor.tensor.nnet.basic.logsoftmax` has been moved to `pytensor.tensor.special.log_softmax`.",
        log_softmax,
    ),
]
def __getattr__(name):
    """Intercept module-level attribute access of deprecated symbols.

    Adapted from https://stackoverflow.com/a/55139609/3006474.
    """
    from warnings import warn

    for deprecated_name, message, replacement in DEPRECATED_NAMES:
        if name != deprecated_name:
            continue
        warn(message, DeprecationWarning, stacklevel=2)
        return replacement

    raise AttributeError(f"module {__name__} has no attribute {name}")
import numpy as np
import pytensor
from pytensor.configdefaults import config
from pytensor.graph.basic import Apply
from pytensor.graph.op import Op
from pytensor.graph.rewriting.basic import copy_stack_trace, node_rewriter
from pytensor.scalar import Composite, add, as_common_dtype, mul, sub, true_div
from pytensor.tensor import basic as at
from pytensor.tensor.basic import as_tensor_variable
from pytensor.tensor.elemwise import Elemwise
from pytensor.tensor.math import mean, prod, reciprocal, sqrt
from pytensor.tensor.math import sum as at_sum
from pytensor.tensor.rewriting.basic import register_specialize_device
from pytensor.tensor.shape import specify_broadcastable
from pytensor.tensor.type import TensorType
class BNComposite(Composite):
    """Scalar composite computing ``(x - mean) / std * gamma + beta``.

    Used by `batch_normalization(mode="low_mem")` so the whole expression
    fuses into a single `Elemwise` loop instead of several intermediates.
    """

    # Parameter used to reconstruct the composite (its dtype).
    init_param = ("dtype",)

    @config.change_flags(compute_test_value="off")
    def __init__(self, dtype):
        self.dtype = dtype
        # Five scalar inputs of the same dtype feed the fused expression.
        x = pytensor.scalar.ScalarType(dtype=dtype).make_variable()
        mean = pytensor.scalar.ScalarType(dtype=dtype).make_variable()
        std = pytensor.scalar.ScalarType(dtype=dtype).make_variable()
        gamma = pytensor.scalar.ScalarType(dtype=dtype).make_variable()
        beta = pytensor.scalar.ScalarType(dtype=dtype).make_variable()
        o = add(mul(true_div(sub(x, mean), std), gamma), beta)
        inputs = [x, mean, std, gamma, beta]
        outputs = [o]
        super().__init__(inputs, outputs)

    def grad(self, inps, grads):
        # Hand-written gradients of the composite w.r.t. all five inputs.
        x, mean, std, gamma, beta = inps
        (top,) = grads
        top_gamma = top * gamma
        x_mean = x - mean
        dx = top_gamma / std
        dmean = -dx
        dstd = -(top_gamma * x_mean) / (std * std)
        dgamma = top * x_mean / std
        # Gradient w.r.t. beta is the incoming gradient itself.
        return [dx, dmean, dstd, dgamma, top]
def batch_normalization(inputs, gamma, beta, mean, std, mode="low_mem"):
    """
    This function will build the symbolic graph for applying batch normalization
    to a set of activations.

    .. versionadded:: 0.7.1

    Parameters
    ----------
    inputs : symbolic tensor
        Mini-batch of activations
    gamma: symbolic tensor
        BN scale parameter, must be of same dimensionality as
        inputs and broadcastable against it
    beta: symbolic tensor
        BN shift parameter, must be of same dimensionality as
        inputs and broadcastable against it
    mean: symbolic tensor
        inputs means, must be of same dimensionality as
        inputs and broadcastable against it
    std: symbolic tensor
        inputs standard deviation, must be of same dimensionality as
        inputs and broadcastable against it
    mode: 'low_mem' or 'high_mem'
        Specify which batch_normalization implementation that will be
        used.
        As no intermediate representations are stored for the back-propagation,
        'low_mem' implementation lower the memory usage, however,
        it is 5-10% slower than 'high_mem' implementation. Note that 5-10% computation
        time difference compare the batch_normalization operation only, time difference
        between implementation is likely to be less important on the full model fprop/bprop.
    """
    if mode == "low_mem":
        # Fuse the full expression into one Elemwise over a scalar composite.
        fused = Elemwise(scalar_op=BNComposite(dtype=inputs.dtype))
        return fused(inputs, mean, std, gamma, beta)
    if mode == "high_mem":
        # Plain tensor expression; keeps intermediates for back-propagation.
        return (inputs - mean) * (gamma / std) + beta
    raise ValueError('mode must be either "low_mem", "high_mem"')
def _prepare_batch_normalization_axes(axes, ndim):
if axes == "per-activation":
axes = (0,)
elif axes == "spatial":
axes = (0,) + tuple(range(2, ndim))
elif isinstance(axes, (tuple, list, np.ndarray)):
axes = tuple(int(a) for a in axes)
else:
raise ValueError(f"invalid axes: {axes}")
axes = tuple(sorted(axes))
if len(axes) == 0:
raise ValueError("there should be at least one normalization axis")
if min(axes) < 0 or max(axes) >= ndim:
raise ValueError(
f"axes should be less than ndim (<{int(ndim)}), but {axes} given"
)
non_bc_axes = tuple(i for i in range(ndim) if i not in axes)
return axes, non_bc_axes
def batch_normalization_train(
    inputs,
    gamma,
    beta,
    axes="per-activation",
    epsilon=1e-4,
    running_average_factor=0.1,
    running_mean=None,
    running_var=None,
):
    """
    Performs batch normalization of the given inputs, using the mean and
    variance of the inputs.

    Parameters
    ----------
    inputs : tensor
        Mini-batch of activations to normalize.
    axes : 'per-activation', 'spatial' or a tuple of ints
        The axes along which the input should be normalized. ``'per-activation'``
        normalizes per activation and is equal to ``axes=(0,)``.
        ``'spatial'`` shares normalization factors across spatial dimensions
        (i.e., all dimensions past the second), which for 4D inputs would be
        equal to ``axes=(0, 2, 3)``.
    gamma : tensor
        Learnable scale factors. The shape must match the shape of `inputs`,
        except for the axes in `axes`. These axes should be set to 1 or be
        skipped altogether (such that `gamma.ndim == inputs.ndim - len(axes)`).
    beta : tensor
        Learnable biases. Must match the tensor layout of `gamma`.
    epsilon : float
        Epsilon value used in the batch normalization formula. Minimum allowed
        value is 1e-5 (imposed by cuDNN).
    running_average_factor : float
        Factor for updating the values or `running_mean` and `running_var`.
        If the factor is close to one, the running averages will update quickly,
        if the factor is close to zero it will update slowly.
    running_mean : tensor or None
        Previous value of the running mean. If this is given, the new value
        ``running_mean * (1 - r_a_factor) + batch mean * r_a_factor``
        will be returned as one of the outputs of this function.
        `running_mean` and `running_var` should either both be given or
        both be None. The shape should match that of `gamma` and `beta`.
    running_var : tensor or None
        Previous value of the running variance. If this is given, the new value
        ``running_var * (1 - r_a_factor) + (m / (m - 1)) * batch var * r_a_factor``
        will be returned as one of the outputs of this function,
        where `m` is the product of lengths of the averaged-over dimensions.
        `running_mean` and `running_var` should either both be given or
        both be None. The shape should match that of `gamma` and `beta`.

    Returns
    -------
    out : tensor
        Batch-normalized inputs.
    mean : tensor
        Means of `inputs` across the normalization axes.
    invstd : tensor
        Inverse standard deviations of `inputs` across the normalization axes.
    new_running_mean : tensor
        New value of the running mean (only if both `running_mean` and
        `running_var` were given).
    new_running_var : tensor
        New value of the running variance (only if both `running_var` and
        `running_mean` were given).

    Notes
    -----
    If per-activation or spatial normalization is selected, this operation
    will use the cuDNN implementation. (This requires cuDNN 5 or newer.)

    The returned values are equivalent to:

    .. code-block:: python

        # for per-activation normalization
        axes = (0,)
        # for spatial normalization
        axes = (0,) + tuple(range(2, inputs.ndim))
        mean = inputs.mean(axes, keepdims=True)
        var = inputs.var(axes, keepdims=True)
        invstd = at.reciprocal(at.sqrt(var + epsilon))
        out = (inputs - mean) * gamma * invstd + beta

        m = at.cast(at.prod(inputs.shape) / at.prod(mean.shape), 'float32')
        running_mean = running_mean * (1 - running_average_factor) + \\
                       mean * running_average_factor
        running_var = running_var * (1 - running_average_factor) + \\
                      (m / (m - 1)) * var * running_average_factor
    """
    ndim = inputs.ndim
    axes, non_bc_axes = _prepare_batch_normalization_axes(axes, ndim)

    # have the parameter tensors been broadcasted yet?
    if gamma.ndim == ndim:
        params_ndim = ndim
    else:
        # Parameters were given without the normalized axes; build the
        # dimshuffle pattern that re-inserts them as broadcastable dims.
        params_ndim = len(non_bc_axes)
        params_dimshuffle_pattern = ["x"] * ndim
        for i, axis in enumerate(non_bc_axes):
            params_dimshuffle_pattern[axis] = i

    if gamma.ndim != params_ndim or beta.ndim != params_ndim:
        raise ValueError(
            "gamma and beta dimensionality must match the "
            "number of non-normalized axes, or have the "
            "same number of dimensions as the inputs; "
            f"got {int(gamma.ndim)} and {int(beta.ndim)} instead of {int(params_ndim)}"
        )
    if (running_mean is None) != (running_var is None):
        raise ValueError(
            "running_mean and running_var must either both be given or both be None"
        )
    if running_mean is not None and running_mean.ndim != params_ndim:
        raise ValueError(
            "running_mean must be of the same dimensionality "
            f"as gamma and beta; got {int(running_mean.ndim)} instead of {int(params_ndim)}"
        )
    if running_var is not None and running_var.ndim != params_ndim:
        raise ValueError(
            "running_var must be of the same dimensionality "
            f"as gamma and beta; got {int(running_var.ndim)} instead of {int(params_ndim)}"
        )

    # epsilon will be converted to floatX later. we need to check
    # for rounding errors now, since numpy.float32(1e-5) < 1e-5.
    epsilon = np.cast[config.floatX](epsilon)
    if epsilon < 1e-5:
        raise ValueError(f"epsilon must be at least 1e-5, got {epsilon}")

    inputs = as_tensor_variable(inputs)
    gamma = as_tensor_variable(gamma)
    beta = as_tensor_variable(beta)

    if params_ndim != ndim:
        gamma = gamma.dimshuffle(params_dimshuffle_pattern)
        beta = beta.dimshuffle(params_dimshuffle_pattern)
    else:
        gamma = specify_broadcastable(gamma, *axes)
        beta = specify_broadcastable(beta, *axes)

    batchnorm_op = AbstractBatchNormTrain(axes=axes)
    if running_mean is not None and running_var is not None:
        running_mean = as_tensor_variable(running_mean)
        running_var = as_tensor_variable(running_var)
        if params_ndim != ndim:
            running_mean = running_mean.dimshuffle(params_dimshuffle_pattern)
            running_var = running_var.dimshuffle(params_dimshuffle_pattern)
        else:
            running_mean = specify_broadcastable(running_mean, *axes)
            running_var = specify_broadcastable(running_var, *axes)
        out, mean, invstd, new_running_mean, new_running_var = batchnorm_op(
            inputs,
            gamma,
            beta,
            epsilon=epsilon,
            running_average_factor=running_average_factor,
            running_mean=running_mean,
            running_var=running_var,
        )
        # The Op may lose the broadcastable pattern of the running stats;
        # restore it so the outputs can replace the originals in updates.
        if new_running_mean.broadcastable != running_mean.broadcastable:
            new_running_mean = specify_broadcastable(
                new_running_mean,
                *(ax for (ax, b) in enumerate(running_mean.type.broadcastable) if b),
            )
        if new_running_var.broadcastable != running_var.broadcastable:
            new_running_var = specify_broadcastable(
                new_running_var,
                *(ax for (ax, b) in enumerate(running_var.type.broadcastable) if b),
            )
        results = (out, mean, invstd, new_running_mean, new_running_var)
    else:
        results = batchnorm_op(inputs, gamma, beta, epsilon=epsilon)

    if params_ndim != ndim:
        # remove the broadcasted dimensions (except from the output)
        results = [results[0]] + [r.dimshuffle(non_bc_axes) for r in results[1:]]
    return tuple(results)
def batch_normalization_test(
    inputs, gamma, beta, mean, var, axes="per-activation", epsilon=1e-4
):
    """
    Performs batch normalization of the given inputs, using the given mean and
    variance.

    Parameters
    ----------
    axes : 'per-activation', 'spatial' or a tuple of ints
        The axes along which the input should be normalized. ``'per-activation'``
        normalizes per activation and is equal to ``axes=(0,)``.
        ``'spatial'`` shares normalization factors across spatial dimensions
        (i.e., all dimensions past the second), which for 4D inputs would be
        equal to ``axes=(0, 2, 3)``.
    gamma : tensor
        Scale factors. The shape must match the shape of `inputs`,
        except for the axes in `axes`. These axes should be set to 1 or be
        skipped altogether (such that `gamma.ndim == inputs.ndim - len(axes)`).
    beta : tensor
        Biases. Must match the tensor layout of `gamma`.
    mean : tensor
        Means. Usually these are running averages computed during training.
        Must match the tensor layout of `gamma`.
    var : tensor
        Variances. Usually these are running averages computed during training.
        Must match the tensor layout of `gamma`.
    epsilon : float
        Epsilon value used in the batch normalization formula. Minimum allowed
        value is 1e-5 (imposed by cuDNN).

    Returns
    -------
    out : tensor
        Batch-normalized inputs.

    Notes
    -----
    If per-activation or spatial normalization is selected, this operation
    will use the cuDNN implementation. (This requires cuDNN 5 or newer.)

    The returned value is equivalent to:

    .. code-block:: python

        # for per-activation normalization
        axes = (0,)
        # for spatial normalization
        axes = (0,) + tuple(range(2, inputs.ndim))
        gamma, beta, mean, var = (at.specify_broadcastable(t, *axes)
                                  for t in (gamma, beta, mean, var))
        out = (inputs - mean) * gamma / at.sqrt(var + epsilon) + beta
    """
    ndim = inputs.ndim
    axes, non_bc_axes = _prepare_batch_normalization_axes(axes, ndim)

    # have the parameter tensors been broadcasted yet?
    if gamma.ndim == ndim:
        params_ndim = ndim
    else:
        # Parameters were given without the normalized axes; build the
        # dimshuffle pattern that re-inserts them as broadcastable dims.
        params_ndim = len(non_bc_axes)
        params_dimshuffle_pattern = ["x"] * ndim
        for i, axis in enumerate(non_bc_axes):
            params_dimshuffle_pattern[axis] = i

    if gamma.ndim != params_ndim or beta.ndim != params_ndim:
        raise ValueError(
            "gamma and beta dimensionality must match the "
            "number of non-normalized axes, or have the "
            "same number of dimensions as the inputs; "
            f"got {int(gamma.ndim)} and {int(beta.ndim)} instead of {int(params_ndim)}"
        )
    if mean.ndim != params_ndim or var.ndim != params_ndim:
        raise ValueError(
            "mean and var must be of the same dimensionality "
            f"as gamma and beta; got {int(mean.ndim)} and {int(var.ndim)} instead of {int(params_ndim)}"
        )

    # epsilon will be converted to floatX later. we need to check
    # for rounding errors now, since numpy.float32(1e-5) < 1e-5.
    epsilon = np.cast[config.floatX](epsilon)
    if epsilon < 1e-5:
        raise ValueError(f"epsilon must be at least 1e-5, got {epsilon}")

    gamma = as_tensor_variable(gamma)
    beta = as_tensor_variable(beta)
    mean = as_tensor_variable(mean)
    var = as_tensor_variable(var)

    if params_ndim != ndim:
        gamma = gamma.dimshuffle(params_dimshuffle_pattern)
        beta = beta.dimshuffle(params_dimshuffle_pattern)
        mean = mean.dimshuffle(params_dimshuffle_pattern)
        var = var.dimshuffle(params_dimshuffle_pattern)
    else:
        gamma = specify_broadcastable(gamma, *axes)
        beta = specify_broadcastable(beta, *axes)
        mean = specify_broadcastable(mean, *axes)
        var = specify_broadcastable(var, *axes)

    batchnorm_op = AbstractBatchNormInference(axes=axes)
    return batchnorm_op(inputs, gamma, beta, mean, var, epsilon=epsilon)
class AbstractBatchNormTrain(Op):
    """
    Abstract Op for Batch Normalization.

    Parameters
    ----------
    axes : a tuple of ints
        The axes along which the input should be normalized.
    x : tensor
        The input to be normalized along `axes`.
    scale : tensor
        `scale` should have the same number of dimensions as `x`.
        All dimensions listed in `axes` should have length 1.
    bias : tensor
        `bias` should have the same number of dimensions as `x`.
        All dimensions listed in `axes` should have length 1.
    epsilon
        Epsilon value used in the batch normalization formula. Minimum allowed
        value is 1e-5 (imposed by cuDNN).
    running_average_factor : float
        Factor for updating the values or `running_mean` and `running_var`.
        If the factor is close to one, the running averages will update quickly,
        if the factor is close to zero it will update slowly.
    running_mean : tensor or None
        Previous value of the running mean. If this is given, the new value
        ``running_mean * (1 - running_average_factor) + batch mean * running_average_factor``
        will be returned as one of the outputs of this function.
        `running_mean` and `running_var` should either both be given or
        both be None.
    running_var : tensor or None
        Previous value of the running variance. If this is given, the new value
        ``running_var * (1 - running_average_factor) + (m / (m - 1)) * batch var * running_average_factor``
        will be returned as one of the outputs of this function,
        where `m` is the product of lengths of the averaged-over dimensions.
        `running_mean` and `running_var` should either both be given or
        both be None.
    """

    __props__ = ("axes",)

    def __init__(self, axes=(0,)):
        assert isinstance(axes, (tuple, list))
        assert len(axes) > 0
        axes = tuple(int(a) for a in axes)
        self.axes = axes

    def infer_shape(self, fgraph, node, shape):
        # Output 0 matches x; mean/invstd (and running stats) match scale.
        return [shape[0]] + [shape[1]] * (len(node.outputs) - 1)

    def make_node(
        self,
        x,
        scale,
        bias,
        epsilon=1e-4,
        running_average_factor=0.1,
        running_mean=None,
        running_var=None,
    ):
        x = as_tensor_variable(x)
        scale = as_tensor_variable(scale)
        bias = as_tensor_variable(bias)
        epsilon = as_tensor_variable(epsilon)
        running_average_factor = as_tensor_variable(running_average_factor)
        if running_mean is not None:
            running_mean = as_tensor_variable(running_mean)
        if running_var is not None:
            running_var = as_tensor_variable(running_var)
        assert x.ndim == scale.ndim == bias.ndim
        assert (running_mean is None and running_var is None) or (
            running_mean is not None and running_var is not None
        )
        assert running_mean is None or running_mean.ndim == x.ndim
        assert running_var is None or running_var.ndim == x.ndim
        # Upcast to common dtype on the non-scalar
        # Keep as is dtype of scalar (epsilon and running_average_factor)
        # NOTE(review): this truth-tests a Variable; `running_mean is not None`
        # would be the safer spelling — confirm Variable.__bool__ semantics.
        if running_mean:
            x, scale, bias, running_mean, running_var = as_common_dtype(
                x, scale, bias, running_mean, running_var
            )
        else:
            x, scale, bias = as_common_dtype(x, scale, bias)
        inputs = [x, scale, bias, epsilon, running_average_factor]
        output_types = [x.type(), scale.type(), scale.type()]
        if running_mean is not None and running_var is not None:
            inputs.append(running_mean)
            inputs.append(running_var)
            output_types.append(scale.type())
            output_types.append(scale.type())
        return Apply(self, inputs, output_types)

    def L_op(self, inputs, outputs, grads):
        x, scale, bias, epsilon, running_average_factor = inputs[:5]
        dy = grads[0]
        _, x_mean, x_invstd = outputs[:3]
        disconnected_outputs = [
            pytensor.gradient.DisconnectedType()(),  # epsilon
            pytensor.gradient.DisconnectedType()(),
        ]  # running_average_factor
        # Optional running_mean and running_var.
        for i in range(5, len(inputs)):
            disconnected_outputs.append(pytensor.gradient.DisconnectedType()())
        return (
            AbstractBatchNormTrainGrad(self.axes)(
                x, dy, scale, x_mean, x_invstd, epsilon
            )
            + disconnected_outputs
        )

    def connection_pattern(self, node):
        # Specify that epsilon and running_average_factor are not connected to outputs.
        patterns = [
            [True, True, True],  # x
            [True, True, True],  # scale
            [True, True, True],  # bias
            [False, False, False],  # epsilon
            [False, False, False],
        ]  # running_average_factor
        # Optional running_mean and running_var are only
        # connected to their new values.
        for i in range(5, len(node.inputs)):
            patterns[0].append(True)
            for pattern in patterns[1:]:
                pattern.append(False)
            patterns.append([False] * (3 + i - 5) + [True])
        return patterns

    def perform(self, node, inputs, output_storage):
        # Reference NumPy implementation (used when no rewrite replaced the Op).
        x, scale, bias, epsilon, running_average_factor = inputs[:5]
        axes = self.axes
        if min(axes) < 0 or max(axes) >= x.ndim:
            raise ValueError(
                f"axes should be less than ndim (<{x.ndim}), but {axes} given"
            )

        mean = x.mean(axes, keepdims=True)
        var = x.var(axes, keepdims=True)
        invstd = 1.0 / np.sqrt(var + epsilon)
        out = (x - mean) * (scale * invstd) + bias

        output_storage[0][0] = out
        output_storage[1][0] = mean
        output_storage[2][0] = invstd

        if len(inputs) > 5:
            running_mean = inputs[5]
            running_mean = (
                running_mean * (1.0 - running_average_factor)
                + mean * running_average_factor
            )
            output_storage[3][0] = running_mean
        if len(inputs) > 6:
            # m / (m - 1) corrects the biased batch variance estimate.
            m = float(np.prod(x.shape) / np.prod(scale.shape))
            running_var = inputs[6]
            running_var = (
                running_var * (1.0 - running_average_factor)
                + (m / (m - 1)) * var * running_average_factor
            )
            output_storage[4][0] = running_var
class AbstractBatchNormInference(Op):
    """
    Abstract Op for Batch Normalization.

    Parameters
    ----------
    axes : a tuple of ints
        The axes along which the input is normalized.
    epsilon
        Epsilon value used in the batch normalization formula. Minimum allowed
        value is 1e-5 (imposed by cuDNN).
    """

    __props__ = ("axes",)

    def __init__(self, axes=(0,)):
        assert isinstance(axes, (tuple, list))
        assert len(axes) > 0
        axes = tuple(int(a) for a in axes)
        self.axes = axes

    def infer_shape(self, fgraph, node, shape):
        # Single output has the same shape as x.
        return [shape[0]]

    def make_node(
        self, x, scale, bias, estimated_mean, estimated_variance, epsilon=1e-4
    ):
        x = as_tensor_variable(x)
        scale = as_tensor_variable(scale)
        bias = as_tensor_variable(bias)
        estimated_mean = as_tensor_variable(estimated_mean)
        estimated_variance = as_tensor_variable(estimated_variance)
        epsilon = as_tensor_variable(epsilon)
        # Upcast to common dtype on the non-scalar
        # Keep as is dtype of scalar (epsilon)
        x, scale, bias, estimated_mean, estimated_variance = as_common_dtype(
            x, scale, bias, estimated_mean, estimated_variance
        )
        assert (
            x.ndim
            == scale.ndim
            == bias.ndim
            == estimated_mean.ndim
            == estimated_variance.ndim
        )
        return Apply(
            self,
            [x, scale, bias, estimated_mean, estimated_variance, epsilon],
            [x.type()],
        )

    def grad(self, inputs, grads):
        # Symbolic gradients of out = (x - mean) * scale / sqrt(var + eps) + bias
        # w.r.t. every input except epsilon (disconnected).
        x, scale, bias, est_mean, est_var, epsilon = inputs
        dy = grads[0]
        axes = self.axes
        if min(axes) < 0 or max(axes) >= x.ndim:
            raise ValueError(
                f"axes should be less than ndim (<{x.ndim}), but {axes} given"
            )

        scale, bias, est_mean, est_var = (
            specify_broadcastable(t, *axes) for t in (scale, bias, est_mean, est_var)
        )

        # define helper expressions
        est_var_eps = est_var + epsilon
        est_std = sqrt(est_var_eps)
        two = at.constant(2.0)

        # define and return gradients
        dx = dy * (scale / est_std)
        dscale = (dy * (x - est_mean)).sum(axes, keepdims=True) / est_std
        dbias = dy.sum(axes, keepdims=True)
        dmean = -dy.sum(axes, keepdims=True) * (scale / est_std)
        dvar = -(dy * (x - est_mean)).sum(axes, keepdims=True) * (
            scale / (two * est_var_eps * est_std)
        )
        return [dx, dscale, dbias, dmean, dvar, pytensor.gradient.DisconnectedType()()]

    def connection_pattern(self, node):
        # Specify that epsilon is not connected to outputs.
        return [[True], [True], [True], [True], [True], [False]]

    def perform(self, node, inputs, output_storage):
        # Reference NumPy implementation.
        x, scale, bias, estimated_mean, estimated_variance, epsilon = inputs
        out = (x - estimated_mean) * (
            scale / np.sqrt(estimated_variance + epsilon)
        ) + bias
        output_storage[0][0] = out
class AbstractBatchNormTrainGrad(Op):
    """Gradient Op for `AbstractBatchNormTrain`.

    Given the forward inputs (`x`, `scale`), the saved forward statistics
    (`x_mean`, `x_invstd`) and the output gradient `dy`, computes the
    gradients w.r.t. `x`, `scale` and `bias`.
    """

    __props__ = ("axes",)

    def __init__(self, axes=(0,)):
        assert isinstance(axes, (tuple, list))
        assert len(axes) > 0
        axes = tuple(int(a) for a in axes)
        self.axes = axes

    def make_node(self, x, dy, scale, x_mean, x_invstd, epsilon=1e-4):
        x = as_tensor_variable(x)
        dy = as_tensor_variable(dy)
        scale = as_tensor_variable(scale)
        x_mean = as_tensor_variable(x_mean)
        x_invstd = as_tensor_variable(x_invstd)
        epsilon = as_tensor_variable(epsilon)

        # Upcast to common dtype on the non-scalar
        # Keep as is dtype of scalar (epsilon)
        x, dy, scale, x_mean, x_invstd = as_common_dtype(x, dy, scale, x_mean, x_invstd)
        assert x.ndim == dy.ndim == scale.ndim == x_mean.ndim == x_invstd.ndim
        return Apply(
            self,
            [x, dy, scale, x_mean, x_invstd, epsilon],
            [x.type(), scale.type(), scale.type()],
        )

    def grad(self, inp, grads):
        # Second-order gradient (grad of the grad Op), needed e.g. for R-op /
        # double-backprop through batch normalization.
        x, dy, scale, x_mean, x_invstd, epsilon = inp
        ddinputs, ddscale, ddbias = grads

        x_diff = x - x_mean
        mean_dy_x_diff = mean(dy * x_diff, axis=self.axes, keepdims=True)

        # compute gradients given each of the output gradients
        g_wrt_x = 0
        g_wrt_dy = 0
        g_wrt_scale = 0
        g_wrt_x_mean = 0
        g_wrt_x_invstd = 0

        if not isinstance(ddinputs.type, pytensor.gradient.DisconnectedType):
            ccc = scale * (ddinputs - mean(ddinputs, axis=self.axes, keepdims=True))
            ddd = (x_invstd**3) * (
                ccc * mean(dy * x_diff, axis=self.axes, keepdims=True)
                + dy * mean(ccc * x_diff, axis=self.axes, keepdims=True)
            )

            g_wrt_x = g_wrt_x - ddd
            g_wrt_dy = g_wrt_dy + (
                (ccc * x_invstd)
                - (
                    (x_invstd**3)
                    * x_diff
                    * mean(ccc * x_diff, axis=self.axes, keepdims=True)
                )
            )

            eee = (dy * x_invstd) - ((x_invstd**3) * x_diff * mean_dy_x_diff)
            g_wrt_scale = g_wrt_scale + at_sum(
                ddinputs * (eee - mean(eee, axis=self.axes, keepdims=True)),
                axis=self.axes,
                keepdims=True,
            )

            g_wrt_x_mean = g_wrt_x_mean + at_sum(ddd, axis=self.axes, keepdims=True)
            g_wrt_x_invstd = g_wrt_x_invstd + at_sum(
                ccc * (dy - 3 * (x_invstd**2) * x_diff * mean_dy_x_diff),
                axis=self.axes,
                keepdims=True,
            )

        if not isinstance(ddscale.type, pytensor.gradient.DisconnectedType):
            g_wrt_x = g_wrt_x + (x_invstd * ddscale * dy)
            g_wrt_dy = g_wrt_dy + (x_invstd * ddscale * x_diff)
            g_wrt_x_mean = g_wrt_x_mean - (
                x_invstd * ddscale * at_sum(dy, axis=self.axes, keepdims=True)
            )
            g_wrt_x_invstd = g_wrt_x_invstd + (
                ddscale * at_sum(dy * x_diff, axis=self.axes, keepdims=True)
            )

        if not isinstance(ddbias.type, pytensor.gradient.DisconnectedType):
            g_wrt_dy = g_wrt_dy + at.fill(dy, ddbias)

        # depending on which output gradients are given,
        # some inputs should be disconnected
        results = [
            g_wrt_x,
            g_wrt_dy,
            g_wrt_scale,
            g_wrt_x_mean,
            g_wrt_x_invstd,
            pytensor.gradient.DisconnectedType()(),
        ]
        # Accumulators still equal to the integer 0 were never touched above,
        # so the corresponding input is disconnected from the gradients.
        return [
            pytensor.gradient.DisconnectedType()()
            if (isinstance(r, int) and r == 0)
            else r
            for r in results
        ]

    def connection_pattern(self, node):
        return [
            [True, True, False],  # x
            [True, True, True],  # dy
            [True, False, False],  # scale
            [True, True, False],  # x_mean
            [True, True, False],  # x_invstd
            [False, False, False],
        ]  # epsilon

    def infer_shape(self, fgraph, node, shape):
        # g_wrt_inputs matches x; g_wrt_scale and g_wrt_bias match scale.
        return [shape[0], shape[2], shape[2]]

    def perform(self, node, inputs, output_storage):
        # Reference NumPy implementation of the backward pass.
        x, dy, scale, x_mean, x_invstd, epsilon = inputs
        axes = self.axes
        if min(axes) < 0 or max(axes) >= x.ndim:
            raise ValueError(
                f"axes should be less than ndim (<{x.ndim}), but {axes} given"
            )

        x_diff = x - x_mean
        mean_dy_x_diff = np.mean(dy * x_diff, axis=axes, keepdims=True)
        c = (dy * x_invstd) - (x_diff * mean_dy_x_diff * (x_invstd**3))

        g_wrt_inputs = scale * (c - np.mean(c, axis=axes, keepdims=True))
        g_wrt_scale = np.sum(dy * x_invstd * x_diff, axis=axes, keepdims=True)
        g_wrt_bias = np.sum(dy, axis=axes, keepdims=True)

        output_storage[0][0] = g_wrt_inputs
        output_storage[1][0] = g_wrt_scale
        output_storage[2][0] = g_wrt_bias
@node_rewriter([AbstractBatchNormTrain])
def local_abstract_batch_norm_train(fgraph, node):
    """Lower `AbstractBatchNormTrain` to plain tensor ops (CPU fallback)."""
    if not isinstance(node.op, AbstractBatchNormTrain):
        return None

    x, scale, bias, epsilon, running_average_factor = node.inputs[:5]
    axes = node.op.axes
    if min(axes) < 0 or max(axes) > x.ndim:
        return None
    if (
        not isinstance(x.type, TensorType)
        or not isinstance(scale.type, TensorType)
        or not isinstance(bias.type, TensorType)
        or not isinstance(epsilon.type, TensorType)
        or not isinstance(running_average_factor.type, TensorType)
    ):
        return None
    # optional running_mean and running_var
    if len(node.inputs) > 5 and not isinstance(node.inputs[5].type, TensorType):
        return None
    if len(node.inputs) > 6 and not isinstance(node.inputs[6].type, TensorType):
        return None

    mean = x.mean(axes, keepdims=True)
    var = x.var(axes, keepdims=True)
    # The epsilon should not upcast the dtype.
    if var.dtype == "float32" and epsilon.dtype == "float64":
        epsilon = epsilon.astype("float32")

    invstd = reciprocal(sqrt(var + epsilon))
    out = (x - mean) * (scale * invstd) + bias
    results = [out, mean, invstd]

    if len(node.inputs) > 5:
        running_mean = node.inputs[5]
        running_mean = (
            running_mean * (1.0 - running_average_factor)
            + mean * running_average_factor
        )
        results.append(running_mean)
    if len(node.inputs) > 6:
        # m / (m - 1) corrects the biased batch variance estimate.
        m = at.cast(prod(x.shape) / prod(scale.shape), config.floatX)
        running_var = node.inputs[6]
        running_var = (
            running_var * (1.0 - running_average_factor)
            + (m / (m - 1)) * var * running_average_factor
        )
        results.append(running_var)

    # Fix: use a distinct loop variable so it does not shadow the `var`
    # graph variable computed above while propagating stack traces.
    for new_var in pytensor.graph.basic.vars_between(node.inputs, results):
        if new_var not in node.inputs:
            copy_stack_trace(node.outputs[0], new_var)

    return results
@node_rewriter([AbstractBatchNormTrainGrad])
def local_abstract_batch_norm_train_grad(fgraph, node):
    """Lower `AbstractBatchNormTrainGrad` to plain tensor ops (CPU fallback)."""
    if not isinstance(node.op, AbstractBatchNormTrainGrad):
        return None

    x, dy, scale, x_mean, x_invstd, epsilon = node.inputs
    axes = node.op.axes
    if min(axes) < 0 or max(axes) > x.ndim:
        return None

    # Only rewrite when every input is a plain TensorType.
    if any(
        not isinstance(inp.type, TensorType)
        for inp in (x, dy, scale, x_mean, x_invstd, epsilon)
    ):
        return None

    # Same algebra as AbstractBatchNormTrainGrad.perform, built symbolically.
    x_diff = x - x_mean
    mean_dy_x_diff = mean(dy * x_diff, axis=axes, keepdims=True)
    c = (dy * x_invstd) - x_diff * (mean_dy_x_diff * (x_invstd**3))

    g_wrt_inputs = scale * (c - mean(c, axis=axes, keepdims=True))
    g_wrt_scale = at_sum(dy * x_invstd * x_diff, axis=axes, keepdims=True)
    g_wrt_bias = at_sum(dy, axis=axes, keepdims=True)
    results = [g_wrt_inputs, g_wrt_scale, g_wrt_bias]

    # Propagate stack traces onto every variable introduced by the rewrite.
    for new_var in pytensor.graph.basic.vars_between(node.inputs, results):
        if new_var not in node.inputs:
            copy_stack_trace(node.outputs[0], new_var)
    return results
@node_rewriter([AbstractBatchNormInference])
def local_abstract_batch_norm_inference(fgraph, node):
    """Lower `AbstractBatchNormInference` to plain tensor ops (CPU fallback)."""
    if not isinstance(node.op, AbstractBatchNormInference):
        return None

    x, scale, bias, estimated_mean, estimated_variance, epsilon = node.inputs

    # Only rewrite when every input is a plain TensorType.
    if any(
        not isinstance(inp.type, TensorType)
        for inp in (x, scale, bias, estimated_mean, estimated_variance, epsilon)
    ):
        return None

    # The epsilon should not upcast the dtype.
    if estimated_variance.dtype == "float32" and epsilon.dtype == "float64":
        epsilon = epsilon.astype("float32")

    result = (x - estimated_mean) * (scale / sqrt(estimated_variance + epsilon)) + bias

    # Propagate stack traces onto every variable introduced by the rewrite.
    for new_var in pytensor.graph.basic.vars_between(node.inputs, [result]):
        if new_var not in node.inputs:
            copy_stack_trace(node.outputs[0], new_var)
    return [result]
# Register the CPU lowerings: group the abstract batch-norm rewrites in one
# LocalGroupDB and hook it into the device-specialization phase so they run
# under both "fast_compile" and "fast_run".
bn_groupopt = pytensor.graph.rewriting.db.LocalGroupDB()
bn_groupopt.__name__ = "batchnorm_opts"
register_specialize_device(bn_groupopt, "fast_compile", "fast_run")

bn_groupopt.register(
    "local_abstract_batch_norm_train",
    local_abstract_batch_norm_train,
    "fast_compile",
    "fast_run",
    position=30,
)
bn_groupopt.register(
    "local_abstract_batch_norm_train_grad",
    local_abstract_batch_norm_train_grad,
    "fast_compile",
    "fast_run",
    position=30,
)
bn_groupopt.register(
    "local_abstract_batch_norm_inference",
    local_abstract_batch_norm_inference,
    "fast_compile",
    "fast_run",
    position=30,
)
from typing import List
import numpy as np
import pytensor
from pytensor.gradient import grad_undefined
from pytensor.graph.basic import Apply
from pytensor.graph.op import Op
from pytensor.tensor.type import discrete_dtypes
class SparseBlockGemv(Op):
    """
    This op computes the dot product of specified pieces of vectors
    and matrices, returning pieces of vectors::

        for b in range(batch_size):
            for j in range(o.shape[1]):
                for i in range(h.shape[1]):
                    o[b, j, :] += numpy.dot(h[b, i], W[iIdx[b, i], oIdx[b, j]])

    where b, h, W, o iIdx, oIdx are defined in the docstring of make_node.

    .. image:: ../../../images/blocksparse.png
        :scale: 50 %
    """

    __props__ = ("inplace",)
    registered_opts: List = []

    def __init__(self, inplace=False):
        self.inplace = inplace
        if self.inplace:
            # In-place variant overwrites its first input (the `o` buffer).
            self.destroy_map = {0: [0]}

    def make_node(self, o, W, h, inputIdx, outputIdx):
        """
        Compute the dot product of the specified pieces of vectors
        and matrices.

        The parameter types are actually their expected shapes
        relative to each other.

        Parameters
        ----------
        o : batch, oWin, oSize
            output vector
        W : iBlocks, oBlocks, iSize, oSize
            weight matrix
        h : batch, iWin, iSize
            input from lower layer (sparse)
        inputIdx : batch, iWin
            indexes of the input blocks
        outputIdx : batch, oWin
            indexes of the output blocks

        Returns
        -------
        (batch, oWin, oSize)
            dot(W[i, j], h[i]) + o[j]

        Notes
        -----
        - `batch` is the number of examples in a minibatch (batch size).
        - `iBlocks` is the total number of blocks in the input (from lower
          layer).
        - `iSize` is the size of each of these input blocks.
        - `iWin` is the number of blocks that will be used as inputs. Which
          blocks will be used is specified in `inputIdx`.
        - `oBlocks` is the number or possible output blocks.
        - `oSize` is the size of each of these output blocks.
        - `oWin` is the number of output blocks that will actually be computed.
          Which blocks will be computed is specified in `outputIdx`.

        """
        o = pytensor.tensor.as_tensor_variable(o)
        W = pytensor.tensor.as_tensor_variable(W)
        h = pytensor.tensor.as_tensor_variable(h)
        inputIdx = pytensor.tensor.as_tensor_variable(inputIdx)
        outputIdx = pytensor.tensor.as_tensor_variable(outputIdx)

        if o.ndim != 3:
            # Fixed error message: the check requires a 3D tensor but the
            # message previously said "2D".
            raise TypeError("The output o must be a 3D tensor")
        if W.ndim != 4:
            raise TypeError("The weight matrix W must be a 4D tensor")
        if h.ndim != 3:
            raise TypeError("The input h must be a 3D tensor")
        if inputIdx.ndim != 2:
            raise TypeError("The input indices inputIdx must be a 2D tensor")
        if outputIdx.ndim != 2:
            raise TypeError("The output indices outputIdx must be a 2D tensor")

        assert inputIdx.type.dtype in discrete_dtypes
        assert outputIdx.type.dtype in discrete_dtypes

        return Apply(self, [o, W, h, inputIdx, outputIdx], [o.type()])

    def perform(self, node, inp, out_):
        # Reference (pure NumPy) implementation of the block-sparse gemv.
        o, W, h, iIdx, oIdx = inp[:5]

        if not self.inplace:
            o = o.copy()

        for b in range(o.shape[0]):
            for j in range(o.shape[1]):
                outputIdx = oIdx[b, j]
                for i in range(h.shape[1]):
                    inputIdx = iIdx[b, i]
                    w = W[inputIdx, outputIdx]
                    # Accumulate the contribution of input block i into
                    # output block j.
                    o[b, j, :] += np.dot(h[b, i], w)
        out_[0][0] = o

    def infer_shape(self, fgraph, node, input_shapes):
        # Output has exactly the shape of the `o` input.
        return [input_shapes[0]]

    def grad(self, inputs, grads):
        o, W, h, inputIdx, outputIdx = inputs
        go = grads[0]

        outer_fun = SparseBlockOuter(self.inplace)
        gemv_fun = SparseBlockGemv(self.inplace)

        # grad wrt W: outer products of the used h blocks with the output
        # gradient, accumulated into a zero weight tensor.
        Wgrad = outer_fun(W.zeros_like(), h, go, inputIdx, outputIdx)
        # grad wrt h: a gemv with transposed block layout (swap the block
        # axes and the within-block axes of W, and swap the index roles).
        hgrad = gemv_fun(
            h.zeros_like(), W.dimshuffle((1, 0, 3, 2)), go, outputIdx, inputIdx
        )
        return [
            go,
            Wgrad,
            hgrad,
            grad_undefined(self, 3, inputIdx, "grad of inputIdx makes no sense"),
            grad_undefined(self, 4, outputIdx, "grad of outputIdx makes no sense"),
        ]
class SparseBlockOuter(Op):
    """
    This computes the outer product of two sets of pieces of vectors
    updating a full matrix with the results::

        for b in range(batch_size):
            o[xIdx[b, i], yIdx[b, j]] += (alpha * outer(x[b, i], y[b, j]))

    This op is involved in the gradient of SparseBlockGemv.
    """

    __props__ = ("inplace",)
    registered_opts: List = []

    def __init__(self, inplace=False):
        # When inplace, the first input (`o`) is overwritten by the output.
        self.inplace = inplace
        if self.inplace:
            self.destroy_map = {0: [0]}

    def make_node(self, o, x, y, xIdx, yIdx, alpha=None):
        """
        Compute the dot product of the specified pieces of vectors
        and matrices.

        The parameter types are actually their expected shapes
        relative to each other.

        Parameters
        ----------
        o : xBlocks, yBlocks, xSize, ySize
        x : batch, xWin, xSize
        y : batch, yWin, ySize
        xIdx : batch, iWin
            indexes of the x blocks
        yIdx : batch, oWin
            indexes of the y blocks

        Returns
        -------
        (xBlocks, yBlocks, xSize, ySize)
            outer(x[i], y[j]) + o[i, j]

        Notes
        -----
        - `batch` is the number of examples in a minibatch (batch size).
        - `xBlocks` is the total number of blocks in x.
        - `xSize` is the size of each of these x blocks.
        - `xWin` is the number of blocks that will be used as x. Which blocks
          will be used is specified in `xIdx`.
        - `yBlocks` is the number or possible y blocks.
        - `ySize` is the size of each of these y blocks.
        - `yWin` is the number of y blocks that will actually be computed.
          Which blocks will be computed is specified in `yIdx`.
        """
        # Default scaling factor is a float32 1.0 constant.
        one = pytensor.tensor.constant(np.asarray(1.0, dtype="float32"))
        o = pytensor.tensor.as_tensor_variable(o)
        x = pytensor.tensor.as_tensor_variable(x)
        y = pytensor.tensor.as_tensor_variable(y)

        if alpha is None:
            alpha = one

        return Apply(self, [o, x, y, xIdx, yIdx, alpha], [o.type()])

    def infer_shape(self, fgraph, node, input_shapes):
        # Output has exactly the shape of the `o` input.
        return [input_shapes[0]]

    def perform(self, node, inp, out_):
        # Reference (pure NumPy) implementation.
        # NOTE(review): `alpha` is unpacked but never applied below, so the
        # update is `o += outer(x, y)` regardless of alpha.  The default
        # alpha is 1.0 (see make_node), for which this matches the class
        # docstring — confirm whether non-default alpha should scale the
        # update before relying on it.
        o, x, y, xIdx, yIdx, alpha = inp[:6]

        if not self.inplace:
            o = o.copy()

        for b in range(x.shape[0]):
            for i in range(xIdx.shape[1]):
                for j in range(yIdx.shape[1]):
                    o[xIdx[b, i], yIdx[b, j]] += np.outer(x[b, i], y[b, j, :])
        out_[0][0] = o
# Pre-built op instances: out-of-place and in-place variants of each op.
# The in-place variants destroy their first input (see destroy_map in the
# respective __init__) and are normally only introduced by graph rewrites.
sparse_block_gemv = SparseBlockGemv(False)
sparse_block_gemv_inplace = SparseBlockGemv(True)
sparse_block_outer = SparseBlockOuter(False)
sparse_block_outer_inplace = SparseBlockOuter(True)
def sparse_block_dot(W, h, inputIdx, b, outputIdx):
    """
    Compute the dot product (plus bias) of the specified pieces of vectors
    and matrices. See SparseBlockGemv to get more information.

    The parameter types are actually their expected shapes relative to
    each other.

    Parameters
    ----------
    W : iBlocks, oBlocks, iSize, oSize
        weight matrix
    h : batch, iWin, iSize
        input from lower layer (sparse)
    inputIdx : batch, iWin
        indexes of the input blocks
    b : oBlocks, oSize
        bias vector
    outputIdx : batch, oWin
        indexes of the output blocks

    Returns
    -------
    (batch, oWin, oSize)
        dot(W[i, j], h[i]) + b[j] but b[j] is only added once

    Notes
    -----
    - `batch` is the number of examples in a minibatch (batch size).
    - `iBlocks` is the total number of blocks in the input (from lower layer).
    - `iSize` is the size of each of these input blocks.
    - `iWin` is the number of blocks that will be used as inputs. Which blocks
      will be used is specified in `inputIdx`.
    - `oBlocks` is the number or possible output blocks.
    - `oSize` is the size of each of these output blocks.
    - `oWin` is the number of output blocks that will actually be computed.
      Which blocks will be computed is specified in `outputIdx`.
    """
    assert inputIdx.ndim == h.ndim - 1
    assert outputIdx.ndim == inputIdx.ndim

    if h.ndim == 2:
        # Unbatched input: promote everything to a batch of size 1.
        h, inputIdx, outputIdx = (
            h.dimshuffle("x", 0, 1),
            inputIdx.dimshuffle("x", 0),
            outputIdx.dimshuffle("x", 0),
        )

    # Seed the accumulator with the bias rows selected by outputIdx so that
    # each bias block is added exactly once.
    initial = b.take(outputIdx, axis=0)
    return SparseBlockGemv()(initial, W, h, inputIdx, outputIdx)
#section support_code
/* Per-apply state for a CPU CTC computation: the library options plus the
   heap buffers this wrapper owns.  All pointers are either NULL or owned
   allocations released by ctc_context_destroy(). */
typedef struct ctc_context {
    struct ctcOptions options;   // warp-ctc configuration (CPU, thread count)
    void * workspace;            // scratch buffer sized by get_workspace_size()
    int * input_lengths;         // per-minibatch-item number of time steps
    int * flat_labels;           // all label sequences concatenated, padding removed
    int * label_lengths;         // per-minibatch-item label sequence length
} ctc_context_t;
/* Initialize a CTC context for CPU execution.  After this call every owned
   pointer is NULL, so ctc_context_destroy() is always safe to invoke. */
void ctc_context_init(ctc_context_t * context)
{
    struct ctcOptions * opts = &(context->options);

    /* Start from an all-zero option block, then set the CPU specifics. */
    memset(opts, 0, sizeof(struct ctcOptions));
    opts->loc = CTC_CPU;
#if defined(_OPENMP)
    opts->num_threads = omp_get_num_threads();
#else
    opts->num_threads = 1;
#endif

    /* No buffers allocated yet. */
    context->workspace = NULL;
    context->flat_labels = NULL;
    context->label_lengths = NULL;
    context->input_lengths = NULL;
}
/* Release every buffer owned by the context.  Each pointer is either a
   valid allocation or NULL (as set by ctc_context_init), and free(NULL)
   is a no-op, so this can be called from any error path. */
void ctc_context_destroy(ctc_context_t * context)
{
    free( context->workspace );
    free( context->input_lengths );
    free( context->flat_labels );
    free( context->label_lengths );
}
/* Translate a warp-ctc status code into a Python RuntimeError.
   Returns 0 on success, 1 when an exception has been set. */
int ctc_check_result(ctcStatus_t retcode, const char * msg)
{
    if( CTC_STATUS_SUCCESS == retcode )
        return 0;

    // Get error message from underlying library
    const char * ctc_msg = ctcGetStatusString( retcode );
    PyErr_Format( PyExc_RuntimeError,
                  "ConnectionistTemporalClassification: %s CTC error: %s",
                  msg,
                  ctc_msg );
    return 1;
}
/* Copy the npy_int input-length array into a freshly calloc'd plain-int
   buffer, as required by the warp-ctc API.  On allocation failure
   *input_lengths is left NULL; the caller must check for that. */
void create_contiguous_input_lengths( PyArrayObject * input_lengths_arr,
    int ** input_lengths )
{
    npy_int n_lengths = PyArray_DIMS( input_lengths_arr )[0];

    *input_lengths = (int *) calloc( n_lengths, sizeof(int) );
    if ( NULL == (*input_lengths) )
        return;

    for( npy_int idx = 0; idx < n_lengths; ++idx )
    {
        (*input_lengths)[idx] =
            *( (npy_int *) PyArray_GETPTR1( input_lengths_arr, idx ) );
    }
}
/* Flatten a (rows x cols) label matrix into the layout warp-ctc expects:
   one contiguous int array of all labels with padding removed
   (*flat_labels), plus a per-row count of real labels (*label_lengths).
   On allocation failure both output pointers are left NULL; the caller
   must check for that.  The flat buffer is sized rows*cols (an upper
   bound); only the first sum(label_lengths) entries are meaningful. */
void create_flat_labels( PyArrayObject * label_matrix, int ** flat_labels,
    int ** label_lengths )
{
    npy_int rows = PyArray_DIMS( label_matrix )[0];
    npy_int cols = PyArray_DIMS( label_matrix )[1];

    *flat_labels = (int *) calloc( rows * cols, sizeof(int) );
    if ( NULL == (*flat_labels) )
        return;

    *label_lengths = (int *) calloc( rows, sizeof(int) );
    if ( NULL == (*label_lengths) )
    {
        /* Second allocation failed: roll back the first so the caller sees
           a consistent all-NULL failure state. */
        free( *flat_labels );
        *flat_labels = NULL;
        return;
    }

    npy_int label_index = 0;
    for( npy_int row_idx = 0; row_idx < rows; ++row_idx )
    {
        npy_int label_length = 0;
        for( npy_int col_idx = 0; col_idx < cols; ++col_idx )
        {
            npy_int label = *( (npy_int *) PyArray_GETPTR2( label_matrix, row_idx, col_idx ) );
            if ( label >= 0 )  // negative values are assumed to be padding
            {
                (*flat_labels)[ label_index++ ] = label;
                ++label_length;
            }
        }
        (*label_lengths)[ row_idx ] = label_length;
    }
}
#section support_code_apply
/* Compute the CTC loss (and optionally its gradient) on CPU via warp-ctc.
 *
 * in_activations    : float32 (time, batch, alphabet) network outputs;
 *                     must be C-contiguous.
 * in_labels         : int label matrix, negative entries are padding.
 * in_input_lengths  : int per-item number of valid time steps.
 * out_costs         : (out) float32 vector of per-item losses, reallocated
 *                     if missing or wrongly shaped.
 * out_gradients     : (out) float32 gradient with the activations' shape,
 *                     or NULL to skip gradient computation.
 *
 * Returns 0 on success; on failure sets a Python exception, frees all
 * context-owned buffers, and returns 1.
 */
int APPLY_SPECIFIC(ctc_cost_cpu)(PyArrayObject *   in_activations,
                                 PyArrayObject *   in_labels,
                                 PyArrayObject *   in_input_lengths,
                                 PyArrayObject **  out_costs,
                                 PyArrayObject **  out_gradients)
{
    // Stack-allocated context; only its members own heap memory.
    ctc_context_t ctc_object;
    ctc_context_t * context = &ctc_object;
    ctc_context_init( context );

    if ( !PyArray_IS_C_CONTIGUOUS( in_activations ) )
    {
        PyErr_SetString( PyExc_RuntimeError,
            "ConnectionistTemporalClassification: activations array must be C-contiguous." );
        return 1;
    }

    npy_float32 * activations = (npy_float32 *) PyArray_DATA( in_activations );

    create_contiguous_input_lengths( in_input_lengths, &(context->input_lengths) );

    if ( NULL == context->input_lengths )
    {
        // Destroy previous CTC context before returning exception
        ctc_context_destroy( context );

        PyErr_Format( PyExc_MemoryError,
            "ConnectionistTemporalClassification: Could not allocate memory for input lengths" );
        return 1;
    }

    // flatten labels to conform with library memory layout
    create_flat_labels( in_labels, &(context->flat_labels),
        &(context->label_lengths) );

    if ( ( NULL == context->label_lengths ) || ( NULL == context->flat_labels ) )
    {
        // Destroy previous CTC context before returning exception
        ctc_context_destroy( context );

        PyErr_Format( PyExc_MemoryError,
            "ConnectionistTemporalClassification: Could not allocate memory for labels and their lengths" );
        return 1;
    }

    npy_int minibatch_size = PyArray_DIMS( in_activations )[1];
    npy_int alphabet_size = PyArray_DIMS( in_activations )[2];

    npy_float32 * costs = NULL;
    npy_intp cost_size = minibatch_size;

    // (Re)allocate the cost vector unless a correctly-shaped one exists.
    if ( (*out_costs) == NULL ||                       // Symbolic variable has no memory backing
         PyArray_NDIM( *out_costs ) != 1 ||            // or, matrix has the wrong size
         PyArray_DIMS( *out_costs )[0] != cost_size )
    {
        Py_XDECREF( *out_costs );
        // Allocate new matrix
        *out_costs = (PyArrayObject *) PyArray_ZEROS( 1, &cost_size, NPY_FLOAT32, 0 );

        if ( NULL == (*out_costs) )
        {
            // Destroy previous CTC context before returning exception
            ctc_context_destroy( context );

            PyErr_Format( PyExc_MemoryError,
                "ConnectionistTemporalClassification: Could not allocate memory for CTC costs" );
            return 1;
        }
    }

    costs = (npy_float32 *) PyArray_DATA( *out_costs );

    npy_float32 * gradients = NULL;

    if ( NULL != out_gradients )  // If gradient computation is not disabled
    {
        // (Re)allocate the gradient array unless one with the activations'
        // shape already exists.
        if ( NULL == (*out_gradients) ||  // Symbolic variable has no real backing
             PyArray_NDIM( *out_gradients ) != 3 ||
             PyArray_DIMS( *out_gradients )[0] != PyArray_DIMS( in_activations )[0] ||
             PyArray_DIMS( *out_gradients )[1] != PyArray_DIMS( in_activations )[1] ||
             PyArray_DIMS( *out_gradients )[2] != PyArray_DIMS( in_activations )[2] )
        {
            // Existing matrix is the wrong size. Make a new one.
            // Decrement ref counter to existing array
            Py_XDECREF( *out_gradients );
            // Allocate new array
            *out_gradients = (PyArrayObject *) PyArray_ZEROS(3, PyArray_DIMS( in_activations ),
                NPY_FLOAT32, 0);

            if ( NULL == (*out_gradients) )
            {
                // Destroy previous CTC context before returning exception
                ctc_context_destroy( context );

                PyErr_Format( PyExc_MemoryError,
                    "ConnectionistTemporalClassification: Could not allocate memory for CTC gradients!" );
                return 1;
            }
        }
        gradients = (npy_float32 *) PyArray_DATA( *out_gradients );
    }

    size_t cpu_workspace_size;
    int ctc_error;

    // Ask warp-ctc how much scratch memory this problem needs.
    ctc_error = ctc_check_result( get_workspace_size( context->label_lengths,
        context->input_lengths, alphabet_size, minibatch_size, context->options,
        &cpu_workspace_size ),
        "Failed to obtain CTC workspace size." );

    if ( ctc_error )  // Exception is set by ctc_check_result, return error here
    {
        // Destroy previous CTC context before returning exception
        ctc_context_destroy( context );

        return 1;
    }

    context->workspace = malloc( cpu_workspace_size );

    if ( NULL == context->workspace )
    {
        // Destroy previous CTC context before returning exception
        ctc_context_destroy( context );

        PyErr_Format( PyExc_MemoryError,
            "ConnectionistTemporalClassification: Failed to allocate memory for CTC workspace." );
        return 1;
    }

    // Run the actual loss (and gradient, if `gradients` != NULL) computation.
    ctc_error = ctc_check_result( compute_ctc_loss( activations, gradients,
        context->flat_labels, context->label_lengths, context->input_lengths,
        alphabet_size, minibatch_size, costs, context->workspace,
        context->options ), "Failed to compute CTC loss function." );

    if ( ctc_error )  // Exception is set by ctc_check_result, return error here
    {
        ctc_context_destroy( context );

        return 1;
    }

    ctc_context_destroy( context );

    return 0;
}
"""
Contains an Op for convolving input images with a set of filters. This was
developed especially for Convolutional Neural Networks.
For related ops, including downsampling and subsampling, see
tensor.signal and tensor.signal.pool.
See especially conv2d().
"""
import logging
import warnings
import numpy as np
try:
from scipy.signal.signaltools import _bvalfromboundary, _valfrommode
from scipy.signal.sigtools import _convolve2d
except ImportError:
from scipy.signal._signaltools import _bvalfromboundary, _valfrommode
from scipy.signal._sigtools import _convolve2d
import pytensor
from pytensor.graph.basic import Apply
from pytensor.link.c.op import OpenMPOp
from pytensor.tensor import blas
from pytensor.tensor.basic import as_tensor_variable, get_scalar_constant_value
from pytensor.tensor.exceptions import NotScalarConstantError
from pytensor.tensor.nnet.abstract_conv import (
get_conv_output_shape,
get_conv_shape_1axis,
)
from pytensor.tensor.shape import specify_broadcastable
from pytensor.tensor.type import discrete_dtypes, tensor
__docformat__ = "restructuredtext en"

# Module-level logger.  NOTE(review): the logger name still says "nnet"
# even though the nnet namespace is being removed — confirm whether this
# should be renamed (renaming would change log-filter configuration).
_logger = logging.getLogger("pytensor.tensor.nnet.conv")
def conv2d(
    input,
    filters,
    image_shape=None,
    filter_shape=None,
    border_mode="valid",
    subsample=(1, 1),
    **kargs,
):
    """Build the symbolic graph for convolving a stack of input images with a set of filters.

    The implementation is modelled after Convolutional Neural Networks
    (CNN). It is simply a wrapper to the `ConvOp` but provides a much cleaner
    interface.

    This is deprecated.

    Parameters
    ----------
    input : symbolic 4D tensor
        Mini-batch of feature map stacks, of shape
        (batch size, stack size, nb row, nb col)
        see the optional parameter image_shape
    filters: symbolic 4D tensor
        Set of filters used in CNN layer of shape
        (nb filters, stack size, nb row, nb col)
        see the optional parameter filter_shape
    border_mode : {'valid', 'full'}
        'valid'only apply filter to complete patches of the image. Generates
        output of shape: image_shape - filter_shape + 1.
        'full' zero-pads image to multiple of filter shape to generate output
        of shape: image_shape + filter_shape - 1.
    subsample: tuple of len 2
        Factor by which to subsample the output. Also called strides elsewhere.
    image_shape: None, tuple/list of len 4 of int, None or Constant variable
        The shape of the input parameter.
        Optional, used for optimization like loop unrolling
        You can put None for any element of the list to tell that this element
        is not constant.
    filter_shape : None, tuple/list of len 4 of int, None or Constant variable
        Optional, used for optimization like loop unrolling
        You can put None for any element of the list
        to tell that this element is not constant.
    kwargs
        Kwargs are passed onto ConvOp. Can be used to set the following:
        unroll_batch, unroll_kern, unroll_patch, openmp (see ConvOp doc).

        openmp: By default have the same value as
                config.openmp. For small image, filter,
                batch size, nkern and stack size, it can be
                faster to disable manually openmp. A fast and
                incomplete test show that with image size
                6x6, filter size 4x4, batch size==1,
                n kern==1 and stack size==1, it is faster
                to disable it in valid mode. But if we
                grow the batch size to 10, it is faster
                with openmp on a core 2 duo.

    Returns
    -------
    symbolic 4D tensor
        Set of feature maps generated by convolutional layer. Tensor is
        of shape (batch size, nb filters, output row, output col).

    Raises
    ------
    NotScalarConstantError
        If any provided shape entry is symbolic and not a constant.
    """
    warnings.warn(
        "pytensor.tensor.nnet.conv.conv2d is deprecated."
        " Use pytensor.tensor.nnet.conv2d instead.",
        DeprecationWarning,
    )

    # accept Constant value for image_shape and filter_shape.
    if image_shape is not None:
        image_shape = list(image_shape)
        for i in range(len(image_shape)):
            if image_shape[i] is not None:
                try:
                    image_shape[i] = get_scalar_constant_value(
                        as_tensor_variable(image_shape[i])
                    )
                except NotScalarConstantError:
                    # Bug fix: this message previously lacked the `f` prefix,
                    # so "{image_shape[i]}" was emitted literally.
                    raise NotScalarConstantError(
                        "The convolution need that the shape"
                        " information are constant values. We got"
                        f" {image_shape[i]} for the image_shape parameter"
                    )
                assert image_shape[i].dtype in discrete_dtypes
                image_shape[i] = int(image_shape[i])

    if filter_shape is not None:
        filter_shape = list(filter_shape)
        for i in range(len(filter_shape)):
            if filter_shape[i] is not None:
                try:
                    filter_shape[i] = get_scalar_constant_value(
                        as_tensor_variable(filter_shape[i])
                    )
                except NotScalarConstantError:
                    # Bug fix: missing `f` prefix here as well.
                    raise NotScalarConstantError(
                        "The convolution need that the shape"
                        " information are constant values. We got"
                        f" {filter_shape[i]} for the filter_shape "
                        "parameter"
                    )
                assert filter_shape[i].dtype in discrete_dtypes
                filter_shape[i] = int(filter_shape[i])

    # The stack size of the images must match the stack size of the filters.
    if image_shape and filter_shape:
        try:
            if image_shape[1] is not None and filter_shape[1] is not None:
                assert image_shape[1] == filter_shape[1]
        except Exception:
            print("image ", image_shape, " filters ", filter_shape)
            raise

    # Split the 4D shapes into the pieces ConvOp wants.
    if filter_shape is not None:
        nkern = filter_shape[0]
        kshp = filter_shape[2:]
    else:
        nkern, kshp = None, None

    if image_shape is not None:
        bsize = image_shape[0]
        imshp = image_shape[1:]
    else:
        bsize, imshp = None, None

    op = ConvOp(
        output_mode=border_mode,
        dx=subsample[0],
        dy=subsample[1],
        imshp=imshp,
        kshp=kshp,
        nkern=nkern,
        bsize=bsize,
        **kargs,
    )

    return op(input, filters)
class ConvOp(OpenMPOp):
r"""
This Op serves a dual purpose: it can implement a vanilla 2D convolution
(as taught in any signal processing class) or implement the
convolutional layers found in Convolutional Neural Networks.
In this setting, a set of 3D images is convolved with a set of 3D kernels,
with the particularity that their leading dimensions are of equal length.
Vanilla 2D convolution is treated as a special case of this.
The input parameter represents a mini-batch of multiple images. Its shape is:
batch size x num. input feature maps x image height x image width
The kernel parameter represents a set of 3D kernels. Its shape is:
number of filters x num. input images x filter height x filter width
The output of ConvOp is a 4D tensor, generated as follows:
output[b,k,:,:] = \sum_i input[b,i,:,:] * filter[k,i,:,:] \forall b,k
where b is the mini-batch index, k the filter index and * is the
convolution operator.
The constructor initializes a ConvOp with given output_mode (full/valid).
All other parameters are optional and are only used to generate more
optimized c code, or to enable graph optimizers to optimally replace the
ConvOp.
NOTES ON OPTIMIZATION:
There are two types of optimization. The first is the selection of the
fastest algo when bsize and nkern are provided with imshp and kshp.
By default we try to select the fastest version. You can specify it
with the unroll_batch, unroll_kern, and unroll_patch parameter.
The second type of optimization is hardcoding some dimensions into the
code when all shape are know.
This make a significant difference for the 'full' output_mode.
Sometimes, the fastest implementation on x86-64 uses
{unroll_batch=4, unroll_kern=4, unroll_patch=False}
with all other shape parameters being provided.
For optimizing other architectures, see:
Kazushige Goto and Robert A. Van De Geijn, Anatomy of High-Performance
Matrix Multiplication, (mr x nr). ACM Transactions on Mathematical
Software, May 2008.
Figure 12: (mr x nr). For x86 use 2x4, itanium 8x8, etc.
Parameters
----------
output_mode : {'valid', 'full'}
'valid' gives an output smaller then the image.
'full' gives an output bigger then the image.
See 'border_mode' in conv2d's doc.
Optional parameters: (will generate more optimal c code)
imshp : tuple of len 2 or 3: 2 for 2d image, 3 for a stack of 2d images.
Stacksize, nb image row, nb image col.
kshp : tuple of len 2
Nb kernel row, nb kernel col.
nkern : int
The number of kernel.
bsize : int
The size of the minibatch.
dx : int
Patch stride rows.
dy : int
Patch stride cols
Params which select the version of code used:
unroll_patch : bool
Use a version of c_code that unroll the patch loop that don't
request all shape information to work, but if all shape information
are present, will use it to hardcode the value in the code for
faster code.
unroll_batch : int
Use a version of c_code that unroll the batch (by unroll_batch)
and the nkern (by unroll_kern) loop. The size must by a multiple
of bsize or nkern respectively.
unroll_kern : int
Use a version of c_code that unroll the batch
(by unroll_batch) and the nkern(by unroll_kern) loop. The size
must by a multiple of bsize or nkern respectively.
The 3 following parameters are used internally when we generate
the gradient when dx!=1 or dy!=1.
imshp_logical
Default None. None value is equivalent to imshp value.
When imshp_logical != imshp, it tell we need to insert 0 in
the image before we do the convolution. For example, when dx==dy==2
and the image is [[1, 2], [3, 4]], we should make as if the image
was [[1, 0, 2, 0], [0, 0, 0, 0], [3, 0, 4, 0], [0, 0, 0, 0]].
Our python code insert the zero, but the c code optimize it.
imshp_logical != imshp when taking the grad again the weights or
the image when the output_mode is full and `dx != 1` or `dy != 1`.
kshp_logical
Idem but for kshp and used for the grad again the
weights when the output_mode is valid and `dx != 1` or `dy != 1`.
kshp_logical_top_aligned
Used in the same case. Default to True.
Set to False in the grad again the weight when the
output_mode is full.
"""
__attrnames = [
"imshp",
"kshp",
"nkern",
"bsize",
"dx",
"dy",
"out_mode",
"unroll_batch",
"unroll_kern",
"unroll_patch",
"imshp_logical",
"kshp_logical",
"kshp_logical_top_aligned",
]
"""These attributes uniquely identify the behaviour of this op for
given inputs. Do not set openmp here.
"""
# the value of speed_unroll_batch_kern,speed_unroll_patch_noshape,speed_unroll_patch_shape
# have bean calculated on maggie36 when their is only 1 session logged on and only this was running.
# It is an Intel(R) Xeon(R) CPU E5430 @ 2.66GHz. It is computer with pytensor/tensor/nnet/tests/speed_test_conv.py
# and took 5 minutes to run.
# TODO: we should compute this table for each computer/os as this can change.
# I saw on one computer that the speed with the shape can be slower than without!
# using the real shape and the same dtype could also help.
# unroll_batch, unroll_kern, valid time, full time
speed_unroll_batch_kern = [
(1, 1, 2.4661250114440918, 6.5472931861877441),
(1, 2, 1.5869178771972656, 5.1499760150909424),
(1, 3, 1.4270510673522949, 3.6593470573425293),
(1, 4, 1.3373479843139648, 3.3451821804046631),
(1, 5, 1.2818830013275146, 3.1444568634033203),
(1, 6, 1.2521560192108154, 3.0256359577178955),
(1, 10, 1.2134110927581787, 2.9174180030822754),
(2, 1, 1.657214879989624, 4.5261678695678711),
(2, 2, 1.2123160362243652, 2.9747390747070312),
(2, 3, 1.0758891105651855, 2.5690360069274902),
(2, 4, 1.0683329105377197, 2.4233770370483398),
(2, 5, 1.0955719947814941, 2.3999948501586914),
(2, 6, 1.5935721397399902, 2.6878271102905273),
(2, 10, 1.8511250019073486, 3.2417428493499756),
(3, 1, 1.5948119163513184, 3.631148099899292),
(3, 2, 1.0761330127716064, 2.6011371612548828),
(3, 3, 1.0551531314849854, 2.4200370311737061),
(3, 4, 1.3930759429931641, 2.5211219787597656),
(3, 5, 1.4330689907073975, 2.5704989433288574),
(3, 6, 1.362138032913208, 2.5964410305023193),
(3, 10, 1.6582000255584717, 2.9907989501953125),
(4, 1, 1.4793620109558105, 3.3473429679870605),
(4, 2, 1.0671560764312744, 2.4171769618988037),
(4, 3, 1.2569692134857178, 2.2807950973510742),
(4, 4, 1.3456289768218994, 2.6219108104705811),
(4, 5, 1.4055080413818359, 2.4606490135192871),
(4, 6, 1.372107982635498, 2.551663875579834),
(4, 10, 1.599470853805542, 2.9172940254211426),
(5, 1, 1.4115700721740723, 3.2077109813690186),
(5, 2, 1.0635769367218018, 2.2648060321807861),
(5, 3, 1.3842809200286865, 2.6135518550872803),
(5, 4, 1.3470511436462402, 2.3852400779724121),
(5, 5, 1.3539440631866455, 2.5245928764343262),
(5, 6, 1.4037849903106689, 2.5985310077667236),
(5, 10, 1.6120610237121582, 2.8127608299255371),
(6, 1, 1.3623628616333008, 3.021122932434082),
(6, 2, 1.1697649955749512, 2.6285450458526611),
(6, 3, 1.2980999946594238, 2.4746189117431641),
(6, 4, 1.3739941120147705, 2.5579929351806641),
(6, 5, 1.3967819213867188, 2.5522029399871826),
(6, 6, 1.4279270172119141, 2.6127138137817383),
(6, 10, 1.605496883392334, 2.864037036895752),
(10, 1, 1.6401121616363525, 2.970099925994873),
(10, 2, 1.46710205078125, 2.7231831550598145),
(10, 3, 1.4193780422210693, 2.6087639331817627),
(10, 4, 1.4657118320465088, 2.6246678829193115),
(10, 5, 1.5052611827850342, 2.6542458534240723),
(10, 6, 1.5214400291442871, 2.7243161201477051),
(10, 10, 1.6116268634796143, 2.956165075302124),
]
# valid time, full time
speed_unroll_patch_noshape = [2.0109100341796875, 5.8175678253173828]
# valid time, full time
speed_unroll_patch_shape = [1.2967290878295898, 5.5283889770507812]
@staticmethod
def has_all_shape(imshp, kshp, nkern=1, bsize=1):
return (
nkern is not None
and bsize is not None
and all(shp is not None for shp in imshp)
and all(shp is not None for shp in kshp)
)
@staticmethod
def getOutputShape(inshp, kshp, stride=(1, 1), mode="valid"):
"""
Computes the output dimensions of convolving an image of shape "inshp"
with kernels of shape "kshp". Accepts symbolic or integer shapes.
Propagates `None`s (for unknown shapes).
Parameters
----------
inshp
(rows,cols) of input image.
kshp
(rows,cols) of filters.
mode: {'valid', 'full'}
See 'border_mode' in conv2d's doc.
Returns
-------
object
(rows,cols) of output image.
"""
# The formula would be ceil((i + s * k - s * 1) / float(d)),
# with s=1 for mode=='full' and s=-1 for mode=='valid'.
# To support symbolic shapes, we express this with integer arithmetic.
warnings.warn(
"`getOutputShape` is deprecated; use `get_conv_output_shape` instead.",
DeprecationWarning,
stacklevel=2,
)
return tuple(
get_conv_shape_1axis(i, k, mode, d) for i, k, d in zip(inshp, kshp, stride)
)
    def __init__(
        self,
        imshp=None,
        kshp=None,
        nkern=None,
        bsize=None,
        dx=1,
        dy=1,
        output_mode="valid",
        unroll_batch=None,
        unroll_kern=None,
        unroll_patch=None,
        imshp_logical=None,
        kshp_logical=None,
        kshp_logical_top_aligned=True,
        verbose=False,
        openmp=None,
    ):
        """Validate shapes/strides, store the identifying attributes and
        auto-select a C-code unrolling strategy (unroll_batch/unroll_kern vs
        unroll_patch) from the benchmark tables when none is specified.
        See the class docstring for the meaning of every parameter.
        """
        # Expand unknown image / kernel shapes into tuples of Nones
        if imshp is None:
            imshp = (None, None, None)
        else:
            imshp = tuple(imshp)
        if kshp is None:
            kshp = (None, None)
        else:
            kshp = tuple(kshp)
        # Check imshp and kshp dimensionality
        if len(imshp) == 2:
            # A 2-tuple means a single-channel image: prepend stack size 1.
            imshp = (1,) + imshp
        elif len(imshp) != 3:
            raise ValueError(f"len(imshp) must be 2 or 3, got {len(imshp)}")
        if len(kshp) != 2:
            raise ValueError(f"len(kshp) must be 2, got {len(kshp)}")
        # We must continue to consider None as 1 for backward compatibility.
        if dx is None:
            dx = 1
        if dy is None:
            dy = 1
        if int(dx) != dx:
            raise TypeError("ConvOp.__init__ param dx must be an int", dx)
        dx = int(dx)
        if int(dy) != dy:
            raise TypeError("ConvOp.__init__ param dy must be an int", dy)
        dy = int(dy)
        all_shape = self.has_all_shape(imshp, kshp, nkern, bsize)
        if (unroll_batch or unroll_kern) and not all_shape:
            raise ValueError(
                "In ConvOp, when using unroll_batch and"
                " unroll_nkern, all shape are needed"
            )
        # Init the openmp attribute
        super().__init__(openmp=openmp)
        if not all_shape or self.openmp:
            # Only this version is parallelized
            unroll_patch = True
        self.verbose = verbose
        self.imshp = imshp
        self.kshp = kshp
        self.nkern = nkern
        self.bsize = bsize
        self.dx = dx
        self.dy = dy
        # a triple
        if imshp_logical is None:
            self.imshp_logical = self.imshp
        else:
            imshp_logical = tuple(imshp_logical)
            if len(imshp_logical) != 3:
                raise ValueError(
                    f"len(imshp_logical) must be 3, got {len(imshp_logical)}"
                )
            self.imshp_logical = imshp_logical
        # a pair
        if kshp_logical is None:
            self.kshp_logical = self.kshp
        else:
            kshp_logical = tuple(kshp_logical)
            if len(kshp_logical) != 2:
                raise ValueError(
                    f"len(kshp_logical) must be 2, got {len(kshp_logical)}"
                )
            self.kshp_logical = kshp_logical
        # a bool
        self.kshp_logical_top_aligned = kshp_logical_top_aligned
        self.unroll_batch = unroll_batch
        self.unroll_kern = unroll_kern
        self.unroll_patch = unroll_patch
        # unroll_batch and unroll_kern come as a pair: specifying only one
        # implies 1 for the other.
        if self.unroll_batch and not self.unroll_kern:
            self.unroll_kern = 1
        if self.unroll_kern and not self.unroll_batch:
            self.unroll_batch = 1
        # downcast unroll_batch if not a divisor of batch size
        if (
            self.unroll_batch is not None
            and self.unroll_batch > 0
            and self.bsize % self.unroll_batch != 0
        ):
            if self.bsize <= self.unroll_batch:
                self.unroll_batch = self.bsize
            else:
                # find the maximum value under unroll_batch that would work
                new = self.unroll_batch
                assert new >= 1
                while self.bsize % new != 0:
                    new -= 1
                warnstr = (
                    "In ConvOp.__init__(): "
                    f"unroll_batch({self.unroll_batch}) must be 0 or a divisor of"
                    f" bsize({self.bsize}). We revert it to {new}. This"
                    " won't change the result, but may make it slower."
                )
                _logger.warning(warnstr)
                self.unroll_batch = new
        # downcast unroll_kern if not a divisor of nb of kernel
        if (
            self.unroll_kern is not None
            and self.unroll_kern > 0
            and self.nkern % self.unroll_kern != 0
        ):
            if self.nkern <= self.unroll_kern:
                self.unroll_kern = self.nkern
            else:
                # find the maximum value under unroll_kern that would work
                new = self.unroll_kern
                assert new >= 1
                while self.nkern % new != 0:
                    new -= 1
                warnstr = (
                    "In ConvOp.__init__(): "
                    f"unroll_kern({self.unroll_kern}) must be 0 or a divisor of"
                    f" nkern({self.nkern}). We revert it to {new}. This"
                    " won't change the result, but may make it slower."
                )
                _logger.warning(warnstr)
                self.unroll_kern = new
        # Strided output shape (rows, cols) of the op's output...
        self.outshp = get_conv_output_shape(
            (None,) + self.imshp_logical,
            (
                None,
                None,
            )
            + self.kshp_logical,
            output_mode,
            (dx, dy),
        )[2:]
        # ...and the same shape without subsampling (stride 1x1).
        self.fulloutshp = get_conv_output_shape(
            (None,) + self.imshp_logical,
            (
                None,
                None,
            )
            + self.kshp_logical,
            output_mode,
            (1, 1),
        )[2:]
        self.out_mode = output_mode
        if self.out_mode not in ("valid", "full"):
            raise NotImplementedError(f"Mode {self.out_mode} not implemented")
        if any((shp is not None) and (shp <= 0) for shp in self.outshp):
            raise ValueError(
                "Bad size for the output shape. Verify that [post-"
                f"supersampling] input shape ({self.imshp_logical}) and kern"
                f" shape({self.kshp_logical}) are ok. (Hint: kerns must fit inside"
                " image in valid mode)"
            )
        if (
            self.unroll_kern is None
            and self.unroll_batch is None
            and self.unroll_patch is None
        ):
            # no version specified. Find the faster we have
            if self.bsize is None and self.nkern is None:
                self.unroll_patch = True
            elif self.bsize is not None and self.nkern is not None:
                bsize = self.bsize
                nkern = self.nkern
                mode_idx = 0
                if self.out_mode != "valid":
                    mode_idx = 1
                if self.has_all_shape(self.imshp, self.kshp):
                    time_unroll_patch = self.speed_unroll_patch_shape[mode_idx]
                else:
                    time_unroll_patch = self.speed_unroll_patch_noshape[mode_idx]
                # Scan the benchmark table for the fastest batch/kern
                # unrolling that divides both bsize and nkern; the (1, 1)
                # row always matches, so an index is always found.
                time_unroll_batch_kern = 9999999
                for i in range(len(self.speed_unroll_batch_kern)):
                    if (
                        bsize % self.speed_unroll_batch_kern[i][0] == 0
                        and nkern % self.speed_unroll_batch_kern[i][1] == 0
                    ):
                        if (
                            self.speed_unroll_batch_kern[i][2 + mode_idx]
                            < time_unroll_batch_kern
                        ):
                            time_unroll_batch_kern = self.speed_unroll_batch_kern[i][
                                2 + mode_idx
                            ]
                            time_unroll_batch_kern_idx = i
                if time_unroll_patch < time_unroll_batch_kern:
                    self.unroll_patch = True
                else:
                    self.unroll_batch = self.speed_unroll_batch_kern[
                        time_unroll_batch_kern_idx
                    ][0]
                    self.unroll_kern = self.speed_unroll_batch_kern[
                        time_unroll_batch_kern_idx
                    ][1]
                    self.unroll_patch = False
                _logger.debug(
                    "AUTO FIND VERSION OF C_CODE OF CONV OP %s %s %s %s %s %s %s",
                    self.unroll_batch,
                    self.unroll_kern,
                    self.unroll_patch,
                    self.bsize,
                    self.nkern,
                    time_unroll_patch,
                    time_unroll_batch_kern,
                )
        self._rehash()
def __eq__(self, other):
if type(self) != type(other):
return False
for a in self.__attrnames:
if getattr(self, a) != getattr(other, a):
return False
return True
def __setstate__(self, d):
    # Restore pickled state, then rebuild the cached hash value: it is
    # derived from the unpickled attributes and is not part of the pickle.
    super().__setstate__(d)
    self._rehash()
def _rehash(self):
hashval = hash(type(self))
for a in self.__attrnames:
hashval = hashval ^ hash(getattr(self, a))
self.__hashval = hashval
def __hash__(self):
    # Return the hash precomputed by `_rehash` (kept consistent with
    # `__eq__`, which compares the same `__attrnames` attributes).
    return self.__hashval
def __str__(self):
return (
"ConvOp{"
+ ",".join(str((a, getattr(self, a))) for a in self.__attrnames)
+ "}"
)
def flops(self, inputs, outputs):
"""
Useful with the hack in profiling to print the MFlops.
"""
images, kerns = inputs
(out,) = outputs
assert images[1] == kerns[1]
flops = 0
if self.out_mode == "valid":
# nb mul and add by output pixel
flops = kerns[2] * kerns[3] * 2
# nb flops by output image
flops *= out[2] * out[3]
# nb patch multiplied
flops *= images[1] * kerns[0] * images[0]
else:
flops = (
images[0]
* kerns[0]
* images[1]
* kerns[2]
* kerns[3]
* images[2]
* images[3]
* 2
)
return flops
def make_node(self, inputs, kerns):
    # TODO: find a way to make ConvOp work for N-D (after NIPS09)
    """
    Build the Apply node for a 2D convolution.

    Parameters
    ----------
    inputs
        4 dim: batches x stacksize x rows x cols.
    kerns
        4 dim: nkern x stackidx x rows x cols.

    Raises
    ------
    TypeError
        If either input is not a 4D tensor.
    NotImplementedError
        If the image and kernel dtypes differ.
    """
    _inputs = as_tensor_variable(inputs)
    _kerns = as_tensor_variable(kerns)
    # TODO: lift this restriction by upcasting either inputs or kerns
    if _inputs.ndim != 4:
        raise TypeError(
            "ConvOp (make_node) requires input be a 4D tensor;"
            f' received "{inputs}" ({_inputs.ndim} dims)'
        )
    if _kerns.ndim != 4:
        raise TypeError("make_node requires 4D tensor of kernels")
    if _inputs.type.dtype != _kerns.type.dtype:
        # BUG FIX: the second string was missing the ``f`` prefix, so the
        # placeholders were printed literally instead of interpolated.
        raise NotImplementedError(
            "The image and the kernel must have the same type. "
            f"inputs({_inputs.dtype}), kerns({_kerns.dtype})"
        )
    # Output is (batch, nkern, outrows, outcols); keep only the
    # broadcastable (== 1) entries of the static shape information.
    out_shape = (
        _inputs.type.shape[0],
        _kerns.type.shape[0],
        self.outshp[0],
        self.outshp[1],
    )
    out_shape = tuple(1 if s == 1 else None for s in out_shape)
    output = tensor(
        dtype=_inputs.type.dtype,
        shape=out_shape,
    )
    return Apply(self, [_inputs, _kerns], [output])
def infer_shape(self, fgraph, node, input_shapes):
    """
    Compute the 4D output shape from the input/filter shapes, preferring
    the (logical) shapes fixed at construction time over the symbolic
    run-time ones.
    """
    image_shape = input_shapes[0]  # 4D image shape
    filter_shape = input_shapes[1]  # 4D filter shape
    bsize = image_shape[0]
    imshp = list(image_shape[1:])
    nkern = filter_shape[0]
    kshp = list(filter_shape[2:])

    # replace symbolic shapes with known shapes
    if self.bsize is not None:
        bsize = self.bsize
    for i in range(3):
        if self.imshp_logical[i] is not None:
            imshp[i] = self.imshp_logical[i]
    if self.nkern is not None:
        nkern = self.nkern
    for i in range(2):
        if self.kshp_logical[i] is not None:
            kshp[i] = self.kshp_logical[i]

    # infer output shape from what we have
    res = get_conv_output_shape(
        (bsize, *imshp),
        (nkern, None, *kshp),
        self.out_mode,
        (self.dx, self.dy),
    )
    return [res]
def perform(self, node, inp, out):
    """
    Python implementation: 2D correlation via repeated scipy-style
    ``_convolve2d`` calls, summed over the input stack.

    By default if len(img2d.shape)==3, we TODO
    """
    img2d, filtersflipped = inp
    (z,) = out

    # TODO: move these back out to global scope when they no longer
    # cause an atexit error

    # Resolve each shape: use the value fixed at construction time when
    # available (and verify it against the runtime array), otherwise read
    # it off the runtime array.
    imshp = self.imshp
    if any(x is None for x in imshp):
        imshp = tuple(img2d.shape[1:])
    if imshp != img2d.shape[1:]:
        raise ValueError(
            "The image shape provided at build time "
            "is different from the one passed at run time",
            imshp,
            img2d.shape[1:],
        )
    kshp = self.kshp
    if any(x is None for x in kshp):
        kshp = tuple(filtersflipped.shape[2:])
    if kshp != filtersflipped.shape[2:]:
        raise ValueError(
            "The filter shape provided at build time "
            "is different from the one passed at run time",
            kshp,
            filtersflipped.shape[2:],
        )
    bsize = self.bsize
    if bsize is None:
        bsize = img2d.shape[0]
    elif bsize != img2d.shape[0]:
        raise ValueError(
            "The batch size provided at build time "
            "is different from the one passed at run time",
            bsize,
            img2d.shape[0],
        )
    nkern = self.nkern
    if nkern is None:
        nkern = filtersflipped.shape[0]
    elif nkern != filtersflipped.shape[0]:
        raise ValueError(
            "The number of filters provided at build time "
            "is different from the one passed at run time",
            nkern,
            filtersflipped.shape[0],
        )

    # Fill unknown entries of the logical (supersampled) shapes from the
    # physical ones.
    imshp_logical = self.imshp_logical
    if imshp_logical[0] is None:
        imshp_logical = (imshp[0],) + imshp_logical[1:]
    if imshp_logical[1] is None:
        imshp_logical = (imshp_logical[0], imshp[1], imshp_logical[2])
    if imshp_logical[2] is None:
        imshp_logical = imshp_logical[:2] + (imshp[2],)
    assert all(x is not None for x in imshp_logical)

    kshp_logical = self.kshp_logical
    if kshp_logical[0] is None:
        kshp_logical = (kshp[0], kshp_logical[1])
    if kshp_logical[1] is None:
        kshp_logical = (kshp_logical[0], kshp[1])
    assert all(x is not None for x in kshp_logical)

    # Full (stride (1, 1)) output shape; the (dx, dy) strides are applied
    # by subsampling at the very end.
    if all(shp is not None for shp in self.fulloutshp):
        fulloutshp = tuple(self.fulloutshp)
    else:
        fulloutshp = get_conv_output_shape(
            (None,) + imshp_logical,
            (
                None,
                None,
            )
            + kshp_logical,
            self.out_mode,
            (1, 1),
        )[2:]

    # Reuse the previous output buffer when its shape still matches.
    if (
        z[0] is None
        or z[0].shape
        != (
            bsize,
            nkern,
        )
        + fulloutshp
    ):
        z[0] = np.zeros(
            (
                bsize,
                nkern,
            )
            + fulloutshp,
            dtype=img2d.dtype,
        )
    zz = z[0]

    stacklen = imshp[0]

    img2d = img2d.reshape((bsize,) + imshp)
    filtersflipped = filtersflipped.reshape((nkern, stacklen) + kshp)

    if self.imshp != self.imshp_logical:
        # assuming that to get from imshp to imshp logical we insert zeros in missing spots
        rstride = int(np.ceil(imshp_logical[1] / float(imshp[1])))
        cstride = int(np.ceil(imshp_logical[2] / float(imshp[2])))
        buf = np.zeros((bsize,) + imshp_logical, dtype=img2d.dtype)
        buf[:, :, ::rstride, ::cstride] = img2d
        img2d = buf
        del buf, rstride, cstride

    if kshp != kshp_logical:
        # Scatter the physical kernel into the logical (supersampled) one.
        rstride = int(np.ceil(kshp_logical[0] / float(kshp[0])))
        cstride = int(np.ceil(kshp_logical[1] / float(kshp[1])))
        buf = np.zeros(
            (nkern, stacklen) + self.kshp_logical, dtype=filtersflipped.dtype
        )
        if self.kshp_logical_top_aligned:
            roffset = coffset = 0
        else:
            # NOTE(review): offset appears to align the sparse kernel
            # entries toward the bottom/right of the logical kernel —
            # confirm against the matching C-code path.
            roffset = (
                kshp_logical[0] - (kshp[0] * rstride) - 1 + rstride
            ) % rstride
            coffset = (
                kshp_logical[1] - (kshp[1] * cstride) - 1 + cstride
            ) % cstride
            assert roffset >= 0
            assert coffset >= 0
        buf[:, :, roffset::rstride, coffset::cstride] = filtersflipped
        filtersflipped = buf
        del buf, rstride, cstride

    val = _valfrommode(self.out_mode)
    bval = _bvalfromboundary("fill")

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", np.ComplexWarning)
        for b in range(bsize):
            for n in range(nkern):
                zz[b, n, ...].fill(0)
                for im0 in range(stacklen):
                    # some cast generates a warning here
                    zz[b, n, ...] += _convolve2d(
                        img2d[b, im0, ...],
                        filtersflipped[n, im0, ...],
                        1,
                        val,
                        bval,
                        0,
                    )

    if False:
        # Dead reference implementation, kept (disabled) as documentation
        # of the direct nested-loop algorithm.
        if False and self.out_mode == "full":
            img2d2 = np.zeros(
                (
                    bsize,
                    stacklen,
                    imshp[1] + 2 * kshp[0] - 2,
                    imshp[2] + 2 * kshp[1] - 2,
                )
            )
            img2d2[
                :,
                :,
                kshp[0] - 1 : kshp[0] - 1 + imshp[1],
                kshp[1] - 1 : kshp[1] - 1 + imshp[2],
            ] = img2d
            img2d = img2d2
        # N_image_shape = image_data.shape
        for b in range(bsize):
            for n in range(nkern):
                zz[b, n, ...].fill(0)
                for im0 in range(stacklen):
                    for row in range(0, zz.shape[2], self.dx):
                        for col in range(0, zz.shape[3], self.dy):
                            zz[b, n, row, col] += (
                                img2d[
                                    b, im0, row : row + kshp[0], col : col + kshp[1]
                                ]
                                * filtersflipped[n, im0, ::-1, ::-1]
                            ).sum()

    # We copy it to remove the Stride mismatch warning from DEBUG_MODE.
    # The copy make that we return an object with the same stride as the c version.
    # The copy don't affect the performance during our experience as in that case we
    # execute the c version which is much faster.
    if self.dx > 1 or self.dy > 1:
        zz = zz[:, :, 0 :: self.dx, 0 :: self.dy].copy()
    z[0] = zz
def R_op(self, inputs, eval_points):
rval = None
if eval_points[0] is not None:
rval = self.make_node(eval_points[0], inputs[1]).outputs[0]
if eval_points[1] is not None:
if rval is None:
rval = self.make_node(inputs[0], eval_points[1]).outputs[0]
else:
rval += self.make_node(inputs[0], eval_points[1]).outputs[0]
return [rval]
def grad(self, inp, grads):
    """
    Return the gradients w.r.t. the image (``din``) and the kernels (``dw``).

    Both gradients are themselves `ConvOp`\\s: the kernel gradient
    cross-correlates the (dimshuffled) input with the output gradient in
    "valid" mode, and the input gradient convolves the output gradient with
    the flipped, dimshuffled kernels in the opposite border mode.
    """
    inputs, kerns = inp
    (gz,) = grads

    if self.imshp != self.imshp_logical or self.kshp != self.kshp_logical:
        raise NotImplementedError("todo")

    if self.out_mode == "valid" and (self.dx, self.dy) != (1, 1):
        # BUG FIX: the message pointed at pytensor.tensor.nnet.conv2d,
        # which no longer exists; direct users to pytensor.tensor.conv.
        raise NotImplementedError(
            "ERROR: ConvOp.grad is now disabled for 'valid' convolutions with"
            " stride != (1, 1); call pytensor.tensor.conv.conv2d() instead."
        )

    if self.dx not in (1, 2) or self.dy not in (1, 2):
        raise NotImplementedError(
            "ERROR: We disable ConvOp.grad now when output_mode is not"
            " 'valid' and dx or dy are greater than 2, as there is a bug"
            " in it. See `abstract_conv2d <>`_ for a version that support this."
        )

    all_shape = self.has_all_shape(self.imshp, self.kshp, self.nkern, self.bsize)
    if not all_shape and (self.dx != 1 or self.dy != 1):
        raise ValueError(
            "ConvOp.grad when dx!=1 or dy!=1 we must have all "
            "the optional shape information"
        )

    # Determine gradient on kernels ########
    assert inputs.ndim == 4 and kerns.ndim == 4

    newin = inputs.dimshuffle((1, 0, 2, 3))
    newgz = gz.dimshuffle((1, 0, 2, 3))

    if self.out_mode == "valid":
        (img, filters) = (newin, newgz)
        kshp_logical = self.fulloutshp
        kshp_logical_top_aligned = False
        imshp_logical = None
        (bsize, nkern) = (self.imshp[0], self.nkern)
        imshp = (self.bsize, self.imshp[1], self.imshp[2])
        kshp = self.outshp
    elif self.out_mode == "full":
        (img, filters) = (newgz, newin)
        kshp_logical = None
        kshp_logical_top_aligned = True
        imshp_logical = (self.bsize, self.fulloutshp[0], self.fulloutshp[1])
        (bsize, nkern) = (self.nkern, self.imshp[0])
        imshp = (self.bsize, self.outshp[0], self.outshp[1])
        kshp = self.imshp[1:]
    else:
        raise NotImplementedError(
            "Only [full,valid] modes are currently supported."
        )

    filters = filters[:, :, ::-1, ::-1]  # flip them

    dw = ConvOp(
        imshp,
        kshp,
        nkern,
        bsize,
        1,
        1,
        output_mode="valid",
        unroll_batch=None,
        unroll_kern=None,
        unroll_patch=None,
        imshp_logical=imshp_logical,
        kshp_logical=kshp_logical,
        kshp_logical_top_aligned=kshp_logical_top_aligned,
        verbose=self.verbose,
    )
    dw = dw(img, filters)

    if all_shape:
        assert all(o == k for o, k in zip(dw.owner.op.outshp, self.kshp))
    if self.out_mode == "valid":
        # before DimShuffle, dw is of shape visdim x nkern x kshp[0] x kshp[1]
        dw = dw.dimshuffle((1, 0, 2, 3))
        dw = dw[:, :, ::-1, ::-1]

    # Determine gradient on inputs ########
    mode = "valid"
    if self.out_mode != "full":
        mode = "full"

    filters = kerns.dimshuffle((1, 0, 2, 3))
    filters = filters[:, :, ::-1, ::-1]

    nkern = self.imshp[0]
    imshp = (self.nkern, self.outshp[0], self.outshp[1])
    imshp_logical = (self.nkern, self.fulloutshp[0], self.fulloutshp[1])

    din = ConvOp(
        imshp,
        self.kshp,
        nkern,
        self.bsize,
        1,
        1,
        output_mode=mode,
        unroll_batch=None,
        unroll_kern=None,
        unroll_patch=None,
        imshp_logical=imshp_logical,
        kshp_logical=None,
        verbose=self.verbose,
    )
    din = din(gz, filters)

    assert all(
        o is None or o == i for o, i in zip(din.owner.op.outshp, self.imshp[1:])
    )

    # din and dw should have the same broadcasting pattern as the
    # parameters they are the gradient of (resp. inputs and kerns).
    if din.type.broadcastable != inputs.type.broadcastable:
        din = specify_broadcastable(
            din, *(ax for (ax, b) in enumerate(inputs.type.broadcastable) if b)
        )
    if dw.type.broadcastable != kerns.type.broadcastable:
        dw = specify_broadcastable(
            dw, *(ax for (ax, b) in enumerate(kerns.type.broadcastable) if b)
        )
    return [din, dw]
def c_headers(self, **kwargs):
    """C headers required by the generated implementation (NumPy C API plus C++ iostreams)."""
    required = ["<numpy/noprefix.h>", "<iostream>", "<sstream>"]
    return required
def c_code_cache_version(self):
    # Bump the leading integer whenever the generated C code changes; the
    # key also varies with the OpenMP setting and the BLAS header version.
    return (15, self.openmp, blas.blas_header_version())
def c_support_code(self, **kwargs):
    # Macros shared by every generated code path: FULL/SAME/VALID encode
    # the border mode, MOD aliases '%' so the templates survive Python
    # %-formatting, and the BLAS header text supports the gemm kernels.
    return (
        """
#define STRIDES(arr) (PyArray_STRIDES(arr))
#define FULL 2
#define SAME 1
#define VALID 0
#define MOD %
using namespace std;
"""
        + blas.blas_header_text()
    )
def use_blas(self):
"""Return True if we will generate code that use gemm."""
# the gemm version only support that case
if self.out_mode == "valid" and self.dx == 0 and self.dy == 0:
# We use a faster version in those case.
if (
self.imshp != self.imshp_logical
or self.kshp != self.kshp_logical
or self.unroll_patch
or self.unroll_batch > 0
or self.unroll_kern > 0
):
return False
return True
return False
def c_libraries(self, **kwargs):
    """Link against BLAS only when the gemm code path will be generated."""
    return blas.ldflags() if self.use_blas() else []
def c_no_compile_args(self, **kwargs):
    """
    Compiler flags to strip for this op.

    gcc 4.3.0 segfaults while compiling the generated code at -O3 when the
    kernel shape is (1, 1); this does not happen at -O2, so -O3 is removed
    here (and -O2 is re-added by `c_compile_args`).
    """
    if pytensor.link.c.cmodule.gcc_version() in ["4.3.0"] and self.kshp == (1, 1):
        return ["-O3"]
    return []
def c_compile_args(self, **kwargs):
    """
    Extra compiler flags: BLAS flags when the gemm path is used, the
    gcc-4.3.0 workaround (-O2 instead of the stripped -O3), and the
    parent's flags (e.g. -fopenmp).
    """
    flags = blas.ldflags(libs=False, flags=True) if self.use_blas() else []
    if pytensor.link.c.cmodule.gcc_version() in ["4.3.0"] and self.kshp == (1, 1):
        flags += ["-O2"]
    # Add the -fopenmp flags
    flags += super().c_compile_args(**kwargs)
    return flags
def c_lib_dirs(self, **kwargs):
    """BLAS library directories, needed only for the gemm code path."""
    return blas.ldflags(libs=False, libs_dir=True) if self.use_blas() else []
def c_header_dirs(self, **kwargs):
    """BLAS include directories, needed only for the gemm code path."""
    return blas.ldflags(libs=False, include_dir=True) if self.use_blas() else []
def c_code(self, node, name, inp, out, sub):
    """
    Assemble the C implementation by %-substituting shape information and
    runtime assertions into one of several C templates, chosen from the
    op's configuration (logical-shape path, unroll-patch, unrolled
    batch/kern, gemm, or the generic path).
    """
    img2d, filtersflipped = inp
    (z,) = out
    if node.inputs[0].type.dtype != node.inputs[1].type.dtype:
        raise NotImplementedError()
    assert node.inputs[0].type.dtype == node.inputs[1].type.dtype
    # `d` is the substitution dict for the C templates; it starts from the
    # local variable names and the compiler-provided `sub` entries.
    d = locals()
    d.update(sub)

    all_shape = self.has_all_shape(
        self.imshp, self.kshp, self.nkern, self.bsize
    ) and self.has_all_shape(self.imshp_logical, self.kshp_logical)

    d["self_out_mode"] = self.out_mode
    d["self_dx"] = self.dx
    d["self_dy"] = self.dy
    d["mode"] = self.out_mode.upper()
    d["affectation"] = "="

    # Default values, will be overridden if the shape info is provided
    d["self_bsize"] = f"PyArray_DIMS({d['img2d']})[0]"
    d["self_nkern"] = f"PyArray_DIMS({d['filtersflipped']})[0]"
    d["self_outshp0"] = "-1"
    d["self_outshp1"] = "-1"
    d["self_imshp0"] = f"PyArray_DIMS({d['img2d']})[1]"
    d["self_imshp1"] = f"PyArray_DIMS({d['img2d']})[2]"
    d["self_imshp2"] = f"PyArray_DIMS({d['img2d']})[3]"
    d["self_kshp0"] = f"PyArray_DIMS({d['filtersflipped']})[2]"
    d["self_kshp1"] = f"PyArray_DIMS({d['filtersflipped']})[3]"

    # `assert_size` accumulates C snippets that verify, at run time, that
    # each hard-coded compile-time shape matches the actual array shape.
    d["assert_size"] = ""

    # Override the default value if we have it
    if self.kshp[0] is not None:
        expected = d["self_kshp0"]
        value = self.kshp[0]
        d[
            "assert_size"
        ] += """
        if(%(value)s != %(expected)s){
            PyErr_Format(PyExc_ValueError,
                         "The hardcoded shape for the number of rows in the filter "
                         "(%%ld) isn't the run time shape (%%ld).",
                         (long)%(value)s, (long)%(expected)s);
            %(fail)s;
        }
        """ % dict(
            expected=expected, value=value, **sub
        )
        d["self_kshp0"] = self.kshp[0]
    if self.kshp[1] is not None:
        expected = d["self_kshp1"]
        value = self.kshp[1]
        d[
            "assert_size"
        ] += """
        if(%(value)s != %(expected)s){
            PyErr_Format(PyExc_ValueError,
                         "The hardcoded shape for the number of columns in the filter "
                         "(%%ld) isn't the run time shape (%%ld).",
                         (long)%(value)s, (long)%(expected)s);
            %(fail)s;
        }
        """ % dict(
            expected=expected, value=value, **sub
        )
        d["self_kshp1"] = self.kshp[1]
    if self.outshp[0] is not None:
        expected = "dim_zz[0]"
        value = self.outshp[0]
        d[
            "assert_size"
        ] += """
        if(%(value)s != %(expected)s){
            PyErr_Format(PyExc_ValueError,
                         "The hardcoded shape for the number of rows in the output "
                         "(%%ld) isn't the run time shape (%%ld).",
                         (long)%(value)s, (long)%(expected)s);
            %(fail)s;
        }
        """ % dict(
            expected=expected, value=value, **sub
        )
        d["self_outshp0"] = self.outshp[0]
    if self.outshp[1] is not None:
        expected = "dim_zz[1]"
        value = self.outshp[1]
        d[
            "assert_size"
        ] += """
        if(%(value)s != %(expected)s){
            PyErr_Format(PyExc_ValueError,
                         "The hardcoded shape for the number of columns in the output "
                         "(%%ld) isn't the run time shape (%%ld).",
                         (long)%(value)s, (long)%(expected)s);
            %(fail)s;
        }
        """ % dict(
            expected=expected, value=value, **sub
        )
        d["self_outshp1"] = self.outshp[1]
    if self.imshp[0] is not None:
        # The image stack size is checked against both the image and the
        # kernel arrays.
        expected = d["self_imshp0"]
        value = self.imshp[0]
        d[
            "assert_size"
        ] += """
        if(%(value)s != %(expected)s){
            PyErr_Format(PyExc_ValueError,
                         "The hardcoded shape for the image stack size (%%ld) "
                         "isn't the run time shape (%%ld).",
                         (long)%(value)s, (long)%(expected)s);
            %(fail)s;
        }
        """ % dict(
            expected=expected, value=value, **sub
        )
        expected = "kerns_dim[1]"
        value = self.imshp[0]
        d[
            "assert_size"
        ] += """
        if(%(value)s != %(expected)s){
            PyErr_Format(PyExc_ValueError,
                         "The hardcoded shape for the kernel stack size (%%ld) "
                         "isn't the run time shape (%%ld).",
                         (long)%(value)s, (long)%(expected)s);
            %(fail)s;
        }
        """ % dict(
            expected=expected, value=value, **sub
        )
        d["self_imshp0"] = self.imshp[0]
    if self.imshp[1] is not None:
        expected = d["self_imshp1"]
        value = self.imshp[1]
        d[
            "assert_size"
        ] += """
        if(%(value)s != %(expected)s){
            PyErr_Format(PyExc_ValueError,
                         "The hardcoded shape for the number of rows in the image "
                         "(%%ld) isn't the run time shape (%%ld).",
                         (long)%(value)s, (long)%(expected)s);
            %(fail)s;
        }
        """ % dict(
            expected=expected, value=value, **sub
        )
        d["self_imshp1"] = self.imshp[1]
    if self.imshp[2] is not None:
        expected = d["self_imshp2"]
        value = self.imshp[2]
        d[
            "assert_size"
        ] += """
        if(%(value)s != %(expected)s){
            PyErr_Format(PyExc_ValueError,
                         "The hardcoded shape for the number of columns in the image "
                         "(%%ld) isn't the run time shape (%%ld).",
                         (long)%(value)s, (long)%(expected)s);
            %(fail)s;
        }
        """ % dict(
            expected=expected, value=value, **sub
        )
        d["self_imshp2"] = self.imshp[2]
    if self.bsize is not None:
        expected = d["self_bsize"]
        value = self.bsize
        d[
            "assert_size"
        ] += """
        if(%(value)s != %(expected)s){
            PyErr_Format(PyExc_ValueError,
                         "The hardcoded shape for the batch size (%%ld) "
                         "isn't the run time shape (%%ld).",
                         (long)%(value)s, (long)%(expected)s);
            %(fail)s;
        }
        """ % dict(
            expected=expected, value=value, **sub
        )
        d["self_bsize"] = self.bsize
    if self.nkern is not None:
        expected = d["self_nkern"]
        value = self.nkern
        d[
            "assert_size"
        ] += """
        if(%(value)s != %(expected)s){
            PyErr_Format(PyExc_ValueError,
                         "The hardcoded shape for the number of kernels in the filter "
                         "(%%ld) isn't the run time shape (%%ld).",
                         (long)%(value)s, (long)%(expected)s);
            %(fail)s;
        }
        """ % dict(
            expected=expected, value=value, **sub
        )
        d["self_nkern"] = self.nkern

    # Other hard coded stuff only if we have all shapes
    if all_shape:
        d["self_kshp_logical_r"] = self.kshp_logical[0]
        d["self_kshp_logical_c"] = self.kshp_logical[1]
        d["self_kshp_logical_stride_r"] = int(
            np.ceil(self.kshp_logical[0] / float(self.kshp[0]))
        )
        d["self_kshp_logical_stride_c"] = int(
            np.ceil(self.kshp_logical[1] / float(self.kshp[1]))
        )
        d["self_imshp_logical_r"] = self.imshp_logical[1]
        # numpy.B. 1 not 0
        d["self_imshp_logical_c"] = self.imshp_logical[2]
        # numpy.B. 2 not 1
        d["self_imshp_logical_stride_r"] = int(
            np.ceil(self.imshp_logical[1] / float(self.imshp[1]))
        )
        d["self_imshp_logical_stride_c"] = int(
            np.ceil(self.imshp_logical[2] / float(self.imshp[2]))
        )
        # With a stack of images, output pixels accumulate (+=) across the
        # stack instead of being assigned once.
        if self.imshp[0] != 1:
            d["affectation"] = "+="
        d["all_shape"] = "1"
        d["dim_zz_const"] = "const"
        d["dim_zz_affect"] = ""
    else:
        # Shapes unknown at compile time: always accumulate and compute
        # the output dimensions at run time.
        d["affectation"] = "+="
        d["all_shape"] = "0"
        d["dim_zz_const"] = ""
        d["dim_zz_affect"] = (
            """
        if (mode == FULL) {
            dim_zz[0] = (int)ceil((dim_im[0]+dim_ker0-1)/float(%(self_dx)s));
            dim_zz[1] = (int)ceil((dim_im[1]+dim_ker1-1)/float(%(self_dy)s));
        } else {
            dim_zz[0] = (int)ceil((dim_im[0]-dim_ker0+1)/float(%(self_dx)s));
            dim_zz[1] = (int)ceil((dim_im[1]-dim_ker1+1)/float(%(self_dy)s));
        }
        """
            % d
        )
        d["assert_size"] += (
            """
        // Check the stack size of the filter and images are equals
        if(kerns_dim[1] != img2d_dim[1]){
            PyErr_Format(PyExc_ValueError,
                         "the filter stack size (%%ld) and image stack size (%%ld) differ",
                         (long)kerns_dim[1], (long)img2d_dim[1]);
            %(fail)s;
        }
        """
            % sub
        )

    if self.kshp_logical_top_aligned:
        d["self_kshp_logical_offset_r"] = 0
        d["self_kshp_logical_offset_c"] = 0
    elif all_shape:
        rstride = d["self_kshp_logical_stride_r"]
        cstride = d["self_kshp_logical_stride_c"]
        d["self_kshp_logical_offset_r"] = (
            self.kshp_logical[0] - (self.kshp[0] * rstride) - 1 + rstride
        ) % rstride
        d["self_kshp_logical_offset_c"] = (
            self.kshp_logical[1] - (self.kshp[1] * cstride) - 1 + cstride
        ) % cstride
        del rstride, cstride

    if node.inputs[0].type.dtype == "float32":
        d["type"] = "float"
    elif node.inputs[0].type.dtype == "float64":
        d["type"] = "double"
    else:
        raise NotImplementedError(
            f"Type {node.inputs[0].type.dtype} not implemented"
        )
    d["gemm"] = "dgemm_"
    if d["type"] != "double":
        d["gemm"] = "sgemm_"

    # Dispatch to the appropriate C template.
    if self.imshp != self.imshp_logical or self.kshp != self.kshp_logical:
        if self.verbose:
            _logger.debug(
                "return imshp!=imshp_logical or"
                " self.kshp != self.kshp_logical shape version"
            )
        return _conv_op_code_a % d

    if self.unroll_patch:
        if self.verbose:
            _logger.debug("return unroll patch version. all_shape=%s", all_shape)
        return _conv_op_code_unroll_patch % d
    if (self.unroll_batch is not None and self.unroll_batch > 0) or (
        self.unroll_kern is not None and self.unroll_kern > 0
    ):
        assert self.unroll_batch > 0
        assert self.unroll_kern > 0
        if self.verbose:
            _logger.debug(
                "return unrolled batch (%s) and kern code (%s)",
                str(self.unroll_batch),
                str(self.unroll_kern),
            )
        return gen_conv_code_unroll_batch_kern(
            d, self.unroll_batch, self.unroll_kern
        )

    # TODO: should we choose the unroll size automatically with the bigger divisor under 5?
    if self.out_mode == "valid" and self.dx == 0 and self.dy == 0:
        if self.verbose:
            _logger.debug("return gemm version")
        return _conv_op_code_valid_gemm % d
    else:
        if self.verbose:
            _logger.debug("return no gemm version")
        return _conv_op_code_a % d
_conv_op_code_a = """
const int mode=%(mode)s;
int typenum=0, typenum_f=0;
PyArrayObject *ain1=NULL, *ain2=NULL;
PyArrayObject *filtersflipped_arr=NULL, *img2d_arr=NULL, *z_arr=NULL;
const %(type)s fill_value = 0;
int type_im=PyArray_TYPE(%(img2d)s);
int type_ker=PyArray_TYPE(%(filtersflipped)s);
npy_intp dim_zz[2]={%(self_outshp0)s,%(self_outshp1)s};
npy_intp dim_im_phys[2]={%(self_imshp1)s,%(self_imshp2)s};
npy_intp dim_im_log[2]={%(self_imshp_logical_r)s,%(self_imshp_logical_c)s};
npy_intp dim_ker_phys[2]={%(self_kshp0)s,%(self_kshp1)s};
npy_intp dim_ker_log[2]={%(self_kshp_logical_r)s,%(self_kshp_logical_c)s};
PyArray_Dims img2d_shape;
npy_intp img2d_dim[4]={1,1,0,0};
img2d_shape.ptr=img2d_dim;
img2d_shape.len=4;
PyArray_Dims kerns_shape;
npy_intp kerns_dim[4]={1,1,0,0};
kerns_shape.ptr=kerns_dim;
kerns_shape.len=4;
PyObject *img2d=NULL, *contig, *filtersflipped=NULL;
if(PyArray_NDIM(%(img2d)s)==2){
img2d_dim[3]=PyArray_DIMS(%(img2d)s)[1];
img2d_dim[2]=PyArray_DIMS(%(img2d)s)[0];
}else if(PyArray_NDIM(%(img2d)s)==3){
img2d_dim[3]=PyArray_DIMS(%(img2d)s)[2];
img2d_dim[2]=PyArray_DIMS(%(img2d)s)[1];
img2d_dim[0]=PyArray_DIMS(%(img2d)s)[0];
}else if(PyArray_NDIM(%(img2d)s)==4){
img2d_dim[3]=PyArray_DIMS(%(img2d)s)[3];
img2d_dim[2]=PyArray_DIMS(%(img2d)s)[2];
img2d_dim[1]=PyArray_DIMS(%(img2d)s)[1];
img2d_dim[0]=PyArray_DIMS(%(img2d)s)[0];
}else {
PyErr_SetString(PyExc_ValueError, "img don't have a good shape");
%(fail)s;
}
if(PyArray_NDIM(%(filtersflipped)s)==3){
kerns_dim[3]=PyArray_DIMS(%(filtersflipped)s)[2];
kerns_dim[2]=PyArray_DIMS(%(filtersflipped)s)[1];
kerns_dim[0]=PyArray_DIMS(%(filtersflipped)s)[0];
}else if(PyArray_NDIM(%(filtersflipped)s)==4){
kerns_dim[3]=PyArray_DIMS(%(filtersflipped)s)[3];
kerns_dim[2]=PyArray_DIMS(%(filtersflipped)s)[2];
kerns_dim[1]=PyArray_DIMS(%(filtersflipped)s)[1];
kerns_dim[0]=PyArray_DIMS(%(filtersflipped)s)[0];
}else{
std::stringstream temp;
temp << "nddim="<<PyArray_NDIM(%(filtersflipped)s);
std::string param = temp.str();
PyErr_SetString(PyExc_ValueError,
("kernel don't have a good shape. " + param).c_str());
%(fail)s;
}
%(assert_size)s
img2d = PyArray_Newshape(%(img2d)s,&img2d_shape, NPY_CORDER);
img2d_arr = (PyArrayObject*)img2d;
if ((PyArray_STRIDES(img2d_arr)[3] != (npy_intp)sizeof(%(type)s))
|| (PyArray_STRIDES(img2d_arr)[2] != PyArray_DIMS(img2d_arr)[3]*(npy_intp)sizeof(%(type)s))){
contig = (PyObject*)(PyArray_GETCONTIGUOUS((PyArrayObject*)img2d));
Py_DECREF(img2d);
img2d = contig;
img2d_arr = (PyArrayObject*)img2d;
if (!PyArray_ISCONTIGUOUS(img2d_arr)){
PyErr_SetString(PyExc_ValueError, "img2d isn't contiguous");
%(fail)s;
}
}
filtersflipped = PyArray_Newshape(%(filtersflipped)s,&kerns_shape, NPY_CORDER);
filtersflipped_arr = (PyArrayObject*)filtersflipped;
if ((PyArray_STRIDES(filtersflipped_arr)[3] != (npy_intp)sizeof(%(type)s))
|| (PyArray_STRIDES(filtersflipped_arr)[2] != PyArray_DIMS(filtersflipped_arr)[3]*(npy_intp)sizeof(%(type)s))){
contig = (PyObject*)(PyArray_GETCONTIGUOUS((PyArrayObject*)filtersflipped));
Py_DECREF(filtersflipped);
filtersflipped = contig;
filtersflipped_arr = (PyArrayObject*)filtersflipped;
if (!PyArray_ISCONTIGUOUS(filtersflipped_arr)){
PyErr_SetString(PyExc_ValueError, "filtersflipped isn't contiguous");
%(fail)s;
}
}
if(mode != VALID && mode != FULL){
PyErr_SetString(PyExc_ValueError,
"invalid mode, only full and valid are supported");
%(fail)s;
}
typenum = PyArray_ObjectType((PyObject*)%(img2d)s, 0);
typenum_f = PyArray_ObjectType((PyObject*)%(filtersflipped)s, 0);
if (typenum < 0) {PyErr_SetString(PyExc_ValueError, "Invalid type"); %(fail)s;}
if (typenum != typenum_f) {
PyErr_SetString(PyExc_ValueError, "Input types must match");
%(fail)s;
}
if (!img2d)
{
PyErr_SetString(PyExc_AssertionError, "!img2d");
%(fail)s;
}
if (!filtersflipped)
{
PyErr_SetString(PyExc_AssertionError, "!filtersflipped");
%(fail)s;
}
if ((!%(z)s)
|| *PyArray_DIMS(%(z)s)!=4
||(PyArray_DIMS(%(z)s)[0] != %(self_bsize)s)
||(PyArray_DIMS(%(z)s)[1] != %(self_nkern)s)
||(PyArray_DIMS(%(z)s)[2] != dim_zz[0])
||(PyArray_DIMS(%(z)s)[3] != dim_zz[1])
||!PyArray_ISCONTIGUOUS(%(z)s)
)
{
{Py_XDECREF(%(z)s);}
npy_intp dims[4] = {0,0,0,0};
dims[0]=%(self_bsize)s;
dims[1]=%(self_nkern)s;
dims[2]=dim_zz[0];
dims[3]=dim_zz[1];
%(z)s = (PyArrayObject*) PyArray_ZEROS(4, dims, typenum,0);
}else{
//PyArray_FILLWBYTE((PyObject*)%(z)s,0);
}
z_arr = (PyArrayObject*) %(z)s;
int Os[2];
Os[0]=%(self_outshp0)s;
Os[1]=%(self_outshp1)s;
//assertions
if (!PyArray_ISCONTIGUOUS(%(z)s))
{
PyErr_SetString(PyExc_AssertionError, "Output (%(z)s) not contiguous");
%(fail)s;
}
for(int b=0;b< %(self_bsize)s;b++){
for(int n_kern=0;n_kern<%(self_nkern)s;n_kern++){
%(type)s * __restrict__ out=(%(type)s *)(PyArray_GETPTR2(z_arr,b,n_kern));
for (int i = 0; i < dim_zz[0]*dim_zz[1]; ++i) out[i] = 0;
for(int stack_size=0;stack_size<%(self_imshp0)s;stack_size++){
const %(type)s * __restrict__ in=(%(type)s *)(PyArray_GETPTR2(img2d_arr,b,stack_size));
const %(type)s * __restrict__ hvals=(%(type)s *)(PyArray_GETPTR2(filtersflipped_arr,n_kern,stack_size));
for (int iter_m=0; iter_m < Os[0]; iter_m++) {
// Reposition index into input image based on requested output size
//row position in logical output image
int pos_m = iter_m*%(self_dx)s;
//row anchor in logical input image (we will loop upward from here)
int new_m;
if (mode == FULL) new_m = pos_m ;
else new_m = (pos_m+dim_ker_log[0]-1);
for (int iter_n=0; iter_n < Os[1]; iter_n++) { // loop over columns
// current col position in logical output image
int pos_n=iter_n*%(self_dy)s;
%(type)s sum=0;
// Sum over kernel, if index into image is out of bounds
// fill with the value
// loop over logical rows in kernel
for (int j_log=0; j_log < %(self_kshp_logical_r)s; j_log++) {
// ind0_log: row position in logical input image
int ind0_log = (new_m-j_log);
if ((j_log < %(self_kshp_logical_offset_r)s) ||
(j_log - %(self_kshp_logical_offset_r)s) MOD %(self_kshp_logical_stride_r)s)
continue;
if (ind0_log MOD %(self_imshp_logical_stride_r)s)
continue;
int j_phys = ((j_log- %(self_kshp_logical_offset_r)s) /
%(self_kshp_logical_stride_r)s);
int ind0_phys = (ind0_log / %(self_imshp_logical_stride_r)s);
//std::cerr <<"j_log" << j_log << " j_phys " << j_phys << " " << ind0_phys << "\\n";
if(mode==FULL){
//This is a pointer to the current row of the kernel
const %(type)s * idx_hvals=&hvals[j_phys*dim_ker_phys[1]];
if(ind0_log < 0 || ind0_log >= dim_im_log[0]){
// the current row of the kernel is off the image
}else{
int k = max((int)(pos_n-dim_im_log[1])+1,0);
int max_k=min(pos_n+1,(int)dim_ker_log[1]);
const %(type)s * idx_in=&in[ind0_phys*dim_im_phys[1]];
for (int ind1_log=pos_n-k; k<max_k; k++,ind1_log--) {
if (1)
{
if ((k < %(self_kshp_logical_offset_c)s) ||
(k - %(self_kshp_logical_offset_c)s) MOD
%(self_kshp_logical_stride_c)s)
continue;
if (ind1_log MOD
%(self_imshp_logical_stride_c)s)
continue;
}
sum += idx_hvals[(k-%(self_kshp_logical_offset_c)s) /
%(self_kshp_logical_stride_c)s] *
idx_in[ind1_log / %(self_imshp_logical_stride_c)s];
}
}
}else{ // mode==VALID
//JB: should be dim_im[1] right? (was dim_im[0])
const %(type)s* idx_in=&in[ind0_phys*dim_im_phys[1]];
const %(type)s* idx_hvals=&hvals[j_phys*dim_ker_phys[1]];
int new_n = (pos_n+dim_ker_log[1]-1);
if (%(self_imshp_logical_stride_c)s != 1) // a general loop
{
for (int k=0,last=new_n; k < dim_ker_log[1]; k++,last--) {
if ((k < %(self_kshp_logical_offset_c)s) ||
(k - %(self_kshp_logical_offset_c)s) MOD
%(self_kshp_logical_stride_c)s)
continue;
else if (last MOD %(self_imshp_logical_stride_c)s)
continue;
else
{
sum+=idx_hvals[(k-%(self_kshp_logical_offset_c)s) /
%(self_kshp_logical_stride_c)s] *
idx_in[last/%(self_imshp_logical_stride_c)s];
}
}
}
else // self_imshp_stride_c == 1
{
int offset = %(self_kshp_logical_offset_c)s;
int k_phys=0;
for (int k_log=offset,last=new_n-offset;
k_log < dim_ker_log[1]; ) {
sum += idx_hvals[k_phys]*idx_in[last];
++k_phys;
last -= %(self_kshp_logical_stride_c)s;
k_log += %(self_kshp_logical_stride_c)s;
}
}
}
}//for j_log
out[iter_m*dim_zz[1]+iter_n] %(affectation)s sum;
}//for iter_n
}//for iter_m
}//for stack_size
if (0 && (mode==FULL)){
for (int i = 0; i < dim_zz[0]*dim_zz[1]; ++i)
std::cout << " " << out[i];
std::cout << "\\n";
}
}//for n_kern
}//for b
Py_XDECREF(img2d);
Py_XDECREF(filtersflipped);
"""
#########
# ConvOp c_code for valid mode (uses gemm)
#########
_conv_op_code_valid_gemm = """
int typenum=0, typenum_f=0;
PyArrayObject *ain1=NULL, *ain2=NULL, *img2d_arr=NULL, *z_arr=NULL;
const int NKERN = %(self_nkern)s;
int type_im=PyArray_TYPE(%(img2d)s);
int type_ker=PyArray_TYPE(%(filtersflipped)s);
npy_intp dim_zz[2]={%(self_outshp0)s,%(self_outshp1)s};
npy_intp dim_im[2]={%(self_imshp1)s,%(self_imshp2)s};
const npy_intp dim_ker0=%(self_kshp0)s;
const npy_intp dim_ker1=%(self_kshp1)s;
PyArray_Dims img2d_shape;
npy_intp img2d_dim[4]={1,1,0,0};
img2d_shape.ptr=img2d_dim;
img2d_shape.len=4;
PyArray_Dims kerns_shape;
npy_intp kerns_dim[4]={1,1,0,0};
kerns_shape.ptr=kerns_dim;
kerns_shape.len=4;
PyObject *img2d=NULL, *contig;
if(PyArray_NDIM(%(img2d)s)==2){
img2d_dim[3]=PyArray_DIMS(%(img2d)s)[1];
img2d_dim[2]=PyArray_DIMS(%(img2d)s)[0];
}else if(PyArray_NDIM(%(img2d)s)==3){
img2d_dim[3]=PyArray_DIMS(%(img2d)s)[2];
img2d_dim[2]=PyArray_DIMS(%(img2d)s)[1];
img2d_dim[0]=PyArray_DIMS(%(img2d)s)[0];
}else if(PyArray_NDIM(%(img2d)s)==4){
img2d_dim[3]=PyArray_DIMS(%(img2d)s)[3];
img2d_dim[2]=PyArray_DIMS(%(img2d)s)[2];
img2d_dim[1]=PyArray_DIMS(%(img2d)s)[1];
img2d_dim[0]=PyArray_DIMS(%(img2d)s)[0];
}else {
PyErr_SetString(PyExc_ValueError, "img don't have a good shape");
%(fail)s;
}
if(PyArray_NDIM(%(filtersflipped)s)==3){
kerns_dim[3]=PyArray_DIMS(%(filtersflipped)s)[2];
kerns_dim[2]=PyArray_DIMS(%(filtersflipped)s)[1];
kerns_dim[0]=PyArray_DIMS(%(filtersflipped)s)[0];
}else if(PyArray_NDIM(%(filtersflipped)s)==4){
kerns_dim[3]=PyArray_DIMS(%(filtersflipped)s)[3];
kerns_dim[2]=PyArray_DIMS(%(filtersflipped)s)[2];
kerns_dim[1]=PyArray_DIMS(%(filtersflipped)s)[1];
kerns_dim[0]=PyArray_DIMS(%(filtersflipped)s)[0];
}else{
std::stringstream temp;
temp << "nddim="<<PyArray_NDIM(%(filtersflipped)s);
std::string param = temp.str();
PyErr_SetString(PyExc_ValueError,
("kernel don't have a good shape. " + param).c_str());
%(fail)s;
}
if (NKERN != kerns_dim[0])
{
PyErr_SetString(PyExc_NotImplementedError, "nonsense nkern");
%(fail)s;
}
img2d = PyArray_Newshape(%(img2d)s,&img2d_shape, NPY_CORDER);
img2d_arr = (PyArrayObject*)img2d;
if ((PyArray_STRIDES(img2d_arr)[3] != (npy_intp)sizeof(%(type)s))
|| (PyArray_STRIDES(img2d_arr)[2] != PyArray_DIMS(img2d_arr)[3]*(npy_intp)sizeof(%(type)s))){
contig = (PyObject*)(PyArray_GETCONTIGUOUS((PyArrayObject*)img2d));
Py_DECREF(img2d);
img2d = contig;
img2d_arr = (PyArrayObject*)img2d;
if (!PyArray_ISCONTIGUOUS(img2d_arr)){
PyErr_SetString(PyExc_ValueError, "img2d isn't contiguous");
%(fail)s;
}
}
typenum = PyArray_ObjectType((PyObject*)%(img2d)s, 0);
typenum_f = PyArray_ObjectType((PyObject*)%(filtersflipped)s, 0);
if (typenum < 0) {PyErr_SetString(PyExc_ValueError, "Invalid type"); %(fail)s;}
if (typenum != typenum_f) {PyErr_SetString(PyExc_ValueError, "Input types must match"); %(fail)s;}
if (!img2d) {
PyErr_SetString(PyExc_ValueError, "Null argument img2d");
%(fail)s;
}
if ((!%(z)s)
|| *PyArray_DIMS(%(z)s)!=4
||(PyArray_DIMS(%(z)s)[0] != %(self_bsize)s)
||(PyArray_DIMS(%(z)s)[1] != %(self_nkern)s)
||(PyArray_DIMS(%(z)s)[2] != dim_zz[0])
|| (PyArray_DIMS(%(z)s)[3] != dim_zz[1])
)
{
{Py_XDECREF(%(z)s);}
npy_intp dims[4] = {0,0,0,0};
dims[0]=%(self_bsize)s;
dims[1]=%(self_nkern)s;
dims[2]=dim_zz[0];
dims[3]=dim_zz[1];
%(z)s = (PyArrayObject*) PyArray_ZEROS(4, dims, typenum,0);
}else{
PyArray_FILLWBYTE((PyObject*)%(z)s,0);
}
z_arr = (PyArrayObject*) %(z)s;
%(assert_size)s
int Os[2];
Os[0] = dim_im[0]-dim_ker0+1;
Os[1] = dim_im[1]-dim_ker1+1;
// allocate a temporary buffer for storing the inner product of each nth kernel row
// with each row of an image
{
%(type)s * kbuf = (%(type)s *)malloc((Os[0] * NKERN + PyArray_Size((PyObject*)%(filtersflipped)s))* (npy_intp)sizeof(%(type)s));
int kbufstride = NKERN;
%(type)s * myfilters = kbuf + Os[0] * NKERN;
//copy out filtersflipped into filters un-flipped format
//std::cerr << "__filling myfilters__\\n";
for(int i=0;i < kerns_dim[0];++i){
for(int j=0;j < kerns_dim[1];++j){
for(int k=0;k < kerns_dim[2];++k){
for(int l=0;l < kerns_dim[3];++l){
%(type)s * ff = ((PyArray_NDIM(%(filtersflipped)s)) == 3)
? (%(type)s *)PyArray_GETPTR3(%(filtersflipped)s, i, kerns_dim[2]-1-k, kerns_dim[3]-1-l)
: (%(type)s *)PyArray_GETPTR4(%(filtersflipped)s, i, j, kerns_dim[2]-1-k, kerns_dim[3]-1-l);
myfilters[i * (kerns_dim[1]*kerns_dim[2]*kerns_dim[3])
+ j * (kerns_dim[2]*kerns_dim[3])
+ k * (kerns_dim[3])
+ l] = ff[0];
//std::cerr << " " << ff[0];
}
//std::cerr << "\\n";
}
//std::cerr << "(end of stack/batch " <<j << "/" << i << " ) \\n";
}
}
//std::cerr << "-----new loop ----\\n";
for(int b=0;b< %(self_bsize)s;b++){
for (int img_col = 0; img_col < Os[1]; ++img_col){
for (int filter_row = 0; filter_row < kerns_dim[2]; ++filter_row){
for (int stackidx = 0; stackidx < %(self_imshp0)s; ++stackidx){
%(type)s * img_colview =
(%(type)s *)(PyArray_GETPTR4(img2d, b, stackidx, filter_row, img_col));
%(type)s * filter_rows = myfilters + stackidx * (kerns_dim[2]*kerns_dim[3]) +
filter_row * kerns_dim[3];
//std::cerr << "filterview offset: " << filter_rows - myfilters << "\\n";
char N = 'N'; char T = 'T';
int Nz0 = Os[0];
int Nz1 = NKERN;
int K = kerns_dim[3];
%(type)s alpha = 1.0;
%(type)s beta = stackidx ? 1.0 : 0.0;
int imgview_stride = dim_im[1];
int filter_rows_stride =kerns_dim[1]*kerns_dim[2]*kerns_dim[3];
//remember, Fortran wants a column-major interpretation
assert(PyArray_STRIDES(img2d)[3] == (npy_intp)sizeof(%(type)s));
if (0){
std::cerr << "b " << b << " img_col " << img_col << " filterrow " << filter_row << " stackidx " <<stackidx << "\\n";
std::cerr << "colview (physical layout) stride: " << imgview_stride << "\\n";
for (int ii = 0; ii < Nz0; ++ii){
for (int jj = 0; jj < K; ++jj){
std::cerr << " " << img_colview[ii * imgview_stride + jj];
}
std::cerr << "\\n";
}
std::cerr << "filterview ("<<filter_row<<"'th rows) stride: " << filter_rows_stride << "\\n";
for (int ii = 0; ii < Nz1; ++ii){
for (int jj = 0; jj < K; ++jj){
std::cerr << " " << filter_rows[ii * filter_rows_stride + jj];
}
std::cerr << "\\n";
}
std::cerr << Nz1 << " " << Nz0 << " " << K << "\\n" ;
}
%(gemm)s(&T, &N,
&Nz1, &Nz0, &K,
&alpha,
filter_rows, &filter_rows_stride,
img_colview, &imgview_stride,
&beta, kbuf, &kbufstride);
if (0){
std::cerr << "z (logical layout) beta" << beta << "\\n";
for (int ii = 0; ii < Nz0; ++ii){
for (int jj = 0; jj < Nz1; ++jj){
std::cerr << " " << kbuf[ii * kbufstride + jj];
}
std::cerr << "\\n";
}
}
}
// now kbuf the sum over the stack, put it into the outbuf
for (int img_row = 0; img_row < Os[0]; ++img_row) {
for (int kernel_idx = 0; kernel_idx < NKERN; ++kernel_idx) {
%(type)s * z_p = (%(type)s *)PyArray_GETPTR4(%(z)s, b, kernel_idx, img_row, img_col);
if (0)
{
if (b >= PyArray_DIMS(%(z)s)[0]) %(fail)s;
if (kernel_idx >= PyArray_DIMS(%(z)s)[1]) %(fail)s;
if (img_row >= PyArray_DIMS(%(z)s)[2]) %(fail)s;
if (img_col >= PyArray_DIMS(%(z)s)[3]) %(fail)s;
}
z_p[0] += kbuf[img_row * kbufstride + kernel_idx];
}
}
}
}
}
free(kbuf);
}
Py_XDECREF(img2d);
"""
def gen_conv_code_unroll_batch_kern(d, unroll_bsize=1, unroll_ksize=1):
    """
    c_code for ConvOp that unroll the batch size loop.

    Parameters
    ----------
    d : dict
        Substitution dictionary used to instantiate the C templates
        (keys such as ``img2d``, ``filtersflipped``, ``z``, ``type``,
        ``mode``, ``fail`` and the ``self_*`` shape entries).  It is
        copied, never mutated.
    unroll_bsize : int
        Number of iterations of the batch loop to unroll; must be > 0.
    unroll_ksize : int
        Number of iterations of the kernel loop to unroll; must be > 0.

    Returns
    -------
    str
        The generated C code.  NOTE(review): some ``%(...)s``
        placeholders (e.g. ``%(affectation)s``) are deliberately left
        unexpanded for the caller to substitute later.
    """
    assert unroll_bsize > 0 and unroll_ksize > 0
    # The helpers below store their loop counters into `d`; refuse a
    # dictionary that already uses those keys so caller data is not
    # silently clobbered.
    if (
        "unroll_bsize" in d
        or "unroll_ksize" in d
        or "unroll_iter" in d
        or "unroll_biter" in d
        or "unroll_kiter" in d
    ):
        raise ValueError(
            "We can't use this dictionary as we will overwrite some of its content"
        )
    # Work on a copy so the caller's dictionary is left untouched.
    d = d.copy()
    d["unroll_bsize"] = unroll_bsize
    d["unroll_ksize"] = unroll_ksize
    def my_dup(st, size):
        # Emit `size` copies of template `st`, with %(unroll_iter)s bound
        # to the copy index 0 .. size-1.
        s = ""
        for i in range(size):
            d["unroll_iter"] = i
            s += st % d
        return s + "\n"
    def my_dup2(st):
        # Emit one copy of template `st` per (batch, kernel) unroll pair,
        # binding %(unroll_biter)s, %(unroll_kiter)s and a flat
        # %(unroll_iter)s counter.
        s = ""
        iter = 0
        for i in range(unroll_bsize):
            d["unroll_biter"] = i
            for j in range(unroll_ksize):
                d["unroll_kiter"] = j
                d["unroll_iter"] = iter
                iter += 1
                s += st % d
        return s + "\n"
    # Prologue: validate/reshape image and kernel to 4d, force them
    # C-contiguous if needed, (re)allocate the output, then open the
    # unrolled batch/kernel loops.
    ret = (
        """
const int mode=%(mode)s;
int typenum=0, typenum_f=0;
PyArrayObject *ain1=NULL, *ain2=NULL, *filtersflipped_arr=NULL, *img2d_arr=NULL, *z_arr=NULL;;
const %(type)s fill_value = 0;
int type_im=PyArray_TYPE(%(img2d)s);
int type_ker=PyArray_TYPE(%(filtersflipped)s);
npy_intp dim_zz[2]={%(self_outshp0)s,%(self_outshp1)s};
npy_intp dim_im[2]={%(self_imshp1)s,%(self_imshp2)s};
const npy_intp dim_ker0=%(self_kshp0)s;
const npy_intp dim_ker1=%(self_kshp1)s;
PyArray_Dims img2d_shape;
npy_intp img2d_dim[4]={1,1,0,0};
img2d_shape.ptr=img2d_dim;
img2d_shape.len=4;
PyArray_Dims kerns_shape;
npy_intp kerns_dim[4]={1,1,0,0};
kerns_shape.ptr=kerns_dim;
kerns_shape.len=4;
PyObject *img2d=NULL, *contig, *filtersflipped=NULL;
if(PyArray_NDIM(%(img2d)s)==2){
img2d_dim[3]=PyArray_DIMS(%(img2d)s)[1];
img2d_dim[2]=PyArray_DIMS(%(img2d)s)[0];
}else if(PyArray_NDIM(%(img2d)s)==3){
img2d_dim[3]=PyArray_DIMS(%(img2d)s)[2];
img2d_dim[2]=PyArray_DIMS(%(img2d)s)[1];
img2d_dim[0]=PyArray_DIMS(%(img2d)s)[0];
}else if(PyArray_NDIM(%(img2d)s)==4){
img2d_dim[3]=PyArray_DIMS(%(img2d)s)[3];
img2d_dim[2]=PyArray_DIMS(%(img2d)s)[2];
img2d_dim[1]=PyArray_DIMS(%(img2d)s)[1];
img2d_dim[0]=PyArray_DIMS(%(img2d)s)[0];
}else {
std::stringstream temp;
temp << "nddim="<<PyArray_NDIM(%(img2d)s);
std::string param = temp.str();
PyErr_SetString(PyExc_ValueError,
("img don't have a good shape. " + param).c_str());
%(fail)s;
}
if(PyArray_NDIM(%(filtersflipped)s)==3){
kerns_dim[3]=PyArray_DIMS(%(filtersflipped)s)[2];
kerns_dim[2]=PyArray_DIMS(%(filtersflipped)s)[1];
kerns_dim[0]=PyArray_DIMS(%(filtersflipped)s)[0];
}else if(PyArray_NDIM(%(filtersflipped)s)==4){
kerns_dim[3]=PyArray_DIMS(%(filtersflipped)s)[3];
kerns_dim[2]=PyArray_DIMS(%(filtersflipped)s)[2];
kerns_dim[1]=PyArray_DIMS(%(filtersflipped)s)[1];
kerns_dim[0]=PyArray_DIMS(%(filtersflipped)s)[0];
}else{
PyErr_SetString(PyExc_ValueError, "kernel don't have a good shape");
%(fail)s;
}
%(assert_size)s
img2d = PyArray_Newshape(%(img2d)s,&img2d_shape, NPY_CORDER);
img2d_arr = (PyArrayObject*)img2d;
if ((PyArray_STRIDES(img2d_arr)[3] != (npy_intp)sizeof(%(type)s))
|| (PyArray_STRIDES(img2d_arr)[2] != PyArray_DIMS(img2d_arr)[3]*(npy_intp)sizeof(%(type)s))){
contig = (PyObject*)(PyArray_GETCONTIGUOUS((PyArrayObject*)img2d));
Py_DECREF(img2d);
img2d = contig;
img2d_arr = (PyArrayObject*)img2d;
if (!PyArray_ISCONTIGUOUS(img2d_arr)){
PyErr_SetString(PyExc_ValueError, "img2d isn't contiguous");
%(fail)s;
}
}
filtersflipped = PyArray_Newshape(%(filtersflipped)s,&kerns_shape, NPY_CORDER);
filtersflipped_arr = (PyArrayObject*)filtersflipped;
if ((PyArray_STRIDES(filtersflipped_arr)[3] != (npy_intp)sizeof(%(type)s))
|| (PyArray_STRIDES(filtersflipped_arr)[2] != PyArray_DIMS(filtersflipped_arr)[3]*(npy_intp)sizeof(%(type)s))){
contig = (PyObject*)(PyArray_GETCONTIGUOUS((PyArrayObject*)filtersflipped));
Py_DECREF(filtersflipped);
filtersflipped = contig;
filtersflipped_arr = (PyArrayObject*)filtersflipped;
if (!PyArray_ISCONTIGUOUS(filtersflipped_arr)){
PyErr_SetString(PyExc_ValueError, "filtersflipped isn't contiguous");
%(fail)s;
}
}
if(mode != VALID && mode != FULL){
PyErr_SetString(PyExc_ValueError, "invalid mode, only full and valid are supported"); %(fail)s;
}
typenum = PyArray_ObjectType((PyObject*)%(img2d)s, 0);
typenum_f = PyArray_ObjectType((PyObject*)%(filtersflipped)s, 0);
if (typenum < 0) {PyErr_SetString(PyExc_ValueError, "Invalid type"); %(fail)s;}
if (typenum != typenum_f) {PyErr_SetString(PyExc_ValueError, "Input types must match"); %(fail)s;}
if (!img2d)
{
PyErr_SetString(PyExc_AssertionError, "!img2d");
%(fail)s;
}
if (!filtersflipped)
{
PyErr_SetString(PyExc_AssertionError, "!filtersflipped");
%(fail)s;
}
if ((!%(z)s)
|| *PyArray_DIMS(%(z)s)!=4
||(PyArray_DIMS(%(z)s)[0] != %(self_bsize)s)
||(PyArray_DIMS(%(z)s)[1] != %(self_nkern)s)
||(PyArray_DIMS(%(z)s)[2] != dim_zz[0])
||(PyArray_DIMS(%(z)s)[3] != dim_zz[1])
||!PyArray_ISCONTIGUOUS(%(z)s)
)
{
{Py_XDECREF(%(z)s);}
npy_intp dims[4] = {0,0,0,0};
dims[0]=%(self_bsize)s;
dims[1]=%(self_nkern)s;
dims[2]=dim_zz[0];
dims[3]=dim_zz[1];
%(z)s = (PyArrayObject*) PyArray_ZEROS(4, dims, typenum,0);
}else{
//PyArray_FILLWBYTE((PyObject*)%(z)s,0);
}
z_arr = (PyArrayObject*) %(z)s;
int Os[2];
Os[0]=%(self_outshp0)s;
Os[1]=%(self_outshp1)s;
//assertions
if (!PyArray_ISCONTIGUOUS(%(z)s))
{
PyErr_SetString(PyExc_AssertionError, "Output (%(z)s) not contiguous");
%(fail)s;
}
for(int b=0;b< %(self_bsize)s ;b+=%(unroll_bsize)s){
for(int n_kern=0;n_kern<%(self_nkern)s;n_kern+=%(unroll_ksize)s){
"""
        % d
    )
    # One output pointer per unrolled (batch, kernel) pair, then zero each
    # output plane before accumulating over the input stack.
    ret += my_dup2(
        "%(type)s * __restrict__ out%(unroll_iter)s=(%(type)s *)(PyArray_GETPTR2(z_arr,b+%(unroll_biter)s,n_kern+%(unroll_kiter)s));"
    )
    ret += my_dup(
        "for (int i = 0; i < dim_zz[0]*dim_zz[1]; ++i) out%(unroll_iter)s[i] = 0;",
        unroll_bsize * unroll_ksize,
    )
    ret += (
        """
for(int stack_size=0;stack_size<%(self_imshp0)s;stack_size++){
"""
        % d
    )
    # Per-unroll input-image and kernel-row base pointers.
    ret += my_dup(
        "const %(type)s * __restrict__ in%(unroll_iter)d=(%(type)s *)(PyArray_GETPTR2(img2d_arr,b+%(unroll_iter)s,stack_size));",
        unroll_bsize,
    )
    ret += my_dup(
        "const %(type)s * __restrict__ hvals%(unroll_iter)s=(%(type)s *)(PyArray_GETPTR2(filtersflipped_arr,n_kern+%(unroll_iter)s,stack_size));",
        unroll_ksize,
    )
    ret += (
        """
int new_m;
for (int iter_m=0; iter_m < Os[0]; iter_m++) {
// Reposition index into input image based on requested output size
int pos_m = iter_m*%(self_dx)s;//The position of the patch in the image
if (mode == FULL) new_m = pos_m ;
else new_m = (pos_m+dim_ker0-1);
for (int iter_n=0; iter_n < Os[1]; iter_n++) { // loop over columns
int pos_n=iter_n*%(self_dy)s;
"""
        % d
    )
    # One accumulator per unrolled (batch, kernel) pair.
    ret += my_dup("%(type)s sum%(unroll_iter)s=0;", unroll_bsize * unroll_ksize)
    ret += (
        """
// Sum over kernel, if index into image is out of bounds
// fill with the value
for (int j=0; j < dim_ker0; j++) {
int ind0 = (new_m-j);
if(mode==FULL){
"""
        % d
    )
    ret += my_dup(
        "const %(type)s * idx_hvals%(unroll_iter)s=&hvals%(unroll_iter)s[j*dim_ker1];",
        unroll_ksize,
    )
    ret += (
        """
if(ind0 < 0 || ind0 >= dim_im[0]){
if(fill_value!=0)
for (int k=0; k < dim_ker1; k++) {
"""
        % d
    )
    ret += my_dup2("sum%(unroll_iter)s += idx_hvals%(unroll_kiter)s[k] * fill_value;")
    ret += (
        """
}
}else{
//do the part where kernel is to the right of the img
int k=0,max_k=max((int)(pos_n-dim_im[1])+1,0);
if(fill_value!=0){
for(k=0;k<max_k;k++){
"""
        % d
    )
    ret += my_dup2("sum%(unroll_iter)s += idx_hvals%(unroll_kiter)s[k] * fill_value;")
    ret += (
        """
}
}else {k=max_k;}
//do the part where the kernel is on the img
max_k=min(pos_n+1,(int)dim_ker1);
"""
        % d
    )
    ret += my_dup(
        "const %(type)s * idx_in%(unroll_iter)s=&in%(unroll_iter)s[ind0*dim_im[1]];",
        unroll_bsize,
    )
    ret += (
        """
for (int ind1=pos_n-k; k<max_k; k++,ind1--) {
"""
        % d
    )
    ret += my_dup2(
        "sum%(unroll_iter)s+= idx_hvals%(unroll_kiter)s[k] * idx_in%(unroll_biter)s[ind1];"
    )
    ret += (
        """
}
//do the part to the left of the img
if(fill_value!=0)
for(;k<dim_ker1;k++){
"""
        % d
    )
    ret += my_dup2("sum%(unroll_iter)s += idx_hvals%(unroll_kiter)s[k] * fill_value;")
    ret += (
        """
}
}
}else{//valid mode
"""
        % d
    )
    ret += my_dup(
        "const %(type)s* idx_in%(unroll_iter)s=&in%(unroll_iter)s[ind0*dim_im[1]];",
        unroll_bsize,
    )
    ret += my_dup(
        "const %(type)s* idx_hvals%(unroll_iter)s=&hvals%(unroll_iter)s[j*dim_ker1];",
        unroll_ksize,
    )
    ret += (
        """
int new_n = (pos_n+dim_ker1-1);
for (int k=0,last=new_n; k < dim_ker1; k++,last--) {
"""
        % d
    )
    ret += my_dup2(
        "sum%(unroll_iter)s+=idx_hvals%(unroll_kiter)s[k]*idx_in%(unroll_biter)s[last];"
    )
    ret += (
        """
}
}
}//for j
"""
        % d
    )
    # %(affectation)s (e.g. "=" or "+=") is substituted later by the caller.
    ret += my_dup(
        "out%(unroll_iter)s[iter_m*dim_zz[1]+iter_n] %(affectation)s sum%(unroll_iter)s;",
        unroll_bsize * unroll_ksize,
    )
    ret += """
}//for n
}//for m
}//for stack_size
}//for n_kern
}//for b
Py_XDECREF(img2d);
Py_XDECREF(filtersflipped);
"""
    return ret
_conv_op_code_unroll_patch = """
const int mode=%(mode)s;
int typenum=0, typenum_f=0;
PyArrayObject *ain1=NULL, *ain2=NULL, *filtersflipped_arr=NULL, *img2d_arr=NULL, *z_arr=NULL;
const %(type)s fill_value = 0;//only value of 0 are currently tested and correctly implemented
int type_im=PyArray_TYPE(%(img2d)s);
int type_ker=PyArray_TYPE(%(filtersflipped)s);
const npy_intp dim_im[2]={%(self_imshp1)s,%(self_imshp2)s};
//The following line caused gcc 4.3.0 20080428 (Red Hat 4.3.0-8) to crash
//const npy_intp dim_ker[2]={%(self_kshp0)s,%(self_kshp1)s};
// The next line had gcc don't crash.
const npy_intp dim_ker0=%(self_kshp0)s;
const npy_intp dim_ker1=%(self_kshp1)s;
%(dim_zz_const)s npy_intp dim_zz[2]={%(self_outshp0)s,%(self_outshp1)s};
%(dim_zz_affect)s
PyArray_Dims img2d_shape;
npy_intp img2d_dim[4]={1,1,0,0};
img2d_shape.ptr=img2d_dim;
img2d_shape.len=4;
PyArray_Dims kerns_shape;
npy_intp kerns_dim[4]={1,1,0,0};
kerns_shape.ptr=kerns_dim;
kerns_shape.len=4;
PyObject *img2d=NULL, *contig, *filtersflipped=NULL;
if(PyArray_NDIM(%(img2d)s)==2){
img2d_dim[3]=PyArray_DIMS(%(img2d)s)[1];
img2d_dim[2]=PyArray_DIMS(%(img2d)s)[0];
}else if(PyArray_NDIM(%(img2d)s)==3){
img2d_dim[3]=PyArray_DIMS(%(img2d)s)[2];
img2d_dim[2]=PyArray_DIMS(%(img2d)s)[1];
img2d_dim[0]=PyArray_DIMS(%(img2d)s)[0];
}else if(PyArray_NDIM(%(img2d)s)==4){
img2d_dim[3]=PyArray_DIMS(%(img2d)s)[3];
img2d_dim[2]=PyArray_DIMS(%(img2d)s)[2];
img2d_dim[1]=PyArray_DIMS(%(img2d)s)[1];
img2d_dim[0]=PyArray_DIMS(%(img2d)s)[0];
}else {
PyErr_Format(PyExc_ValueError,
"image don't have a good number of dimensions %%d. ", PyArray_NDIM(%(filtersflipped)s));
%(fail)s;
}
if(PyArray_NDIM(%(filtersflipped)s)==3){
kerns_dim[3]=PyArray_DIMS(%(filtersflipped)s)[2];
kerns_dim[2]=PyArray_DIMS(%(filtersflipped)s)[1];
kerns_dim[0]=PyArray_DIMS(%(filtersflipped)s)[0];
}else if(PyArray_NDIM(%(filtersflipped)s)==4){
kerns_dim[3]=PyArray_DIMS(%(filtersflipped)s)[3];
kerns_dim[2]=PyArray_DIMS(%(filtersflipped)s)[2];
kerns_dim[1]=PyArray_DIMS(%(filtersflipped)s)[1];
kerns_dim[0]=PyArray_DIMS(%(filtersflipped)s)[0];
}else{
PyErr_Format(PyExc_ValueError,
"kernel don't have a good number of dimensions %%d. ", PyArray_NDIM(%(filtersflipped)s));
%(fail)s;
}
%(assert_size)s
img2d = PyArray_Newshape(%(img2d)s,&img2d_shape, NPY_CORDER);
img2d_arr = (PyArrayObject*)img2d;
if ((PyArray_STRIDES(img2d_arr)[3] != sizeof(%(type)s))
|| (PyArray_STRIDES(img2d_arr)[2] != PyArray_DIMS(img2d_arr)[3]*sizeof(%(type)s))){
contig = (PyObject*)(PyArray_GETCONTIGUOUS((PyArrayObject*)img2d));
Py_DECREF(img2d);
img2d = contig;
img2d_arr = (PyArrayObject*)img2d;
if (!PyArray_ISCONTIGUOUS(img2d_arr)){
PyErr_SetString(PyExc_ValueError, "img2d isn't contiguous");
%(fail)s;
}
}
filtersflipped = PyArray_Newshape(%(filtersflipped)s,&kerns_shape, NPY_CORDER);
filtersflipped_arr = (PyArrayObject*)filtersflipped;
if ((PyArray_STRIDES(filtersflipped_arr)[3] != sizeof(%(type)s))
|| (PyArray_STRIDES(filtersflipped_arr)[2] != PyArray_DIMS(filtersflipped_arr)[3]*sizeof(%(type)s))){
contig = (PyObject*)(PyArray_GETCONTIGUOUS((PyArrayObject*)filtersflipped));
Py_DECREF(filtersflipped);
filtersflipped = contig;
filtersflipped_arr = (PyArrayObject*)filtersflipped;
if (!PyArray_ISCONTIGUOUS(filtersflipped_arr)){
PyErr_SetString(PyExc_ValueError, "filtersflipped isn't contiguous");
%(fail)s;
}
}
if(mode != VALID && mode != FULL){
PyErr_SetString(PyExc_ValueError, "invalid mode, only full and valid are supported"); %(fail)s;
}
if(dim_zz[0]<=0 || dim_zz[1]<=0){
PyErr_Format(PyExc_ValueError,
"Output dimensions are not valid %%ldx%%ld",(long int)dim_zz[0],(long int)dim_zz[1]);
%(fail)s;
}
typenum = PyArray_ObjectType((PyObject*)%(img2d)s, 0);
typenum_f = PyArray_ObjectType((PyObject*)%(filtersflipped)s, 0);
if (typenum < 0) {PyErr_SetString(PyExc_ValueError, "Invalid type"); %(fail)s;}
if (typenum != typenum_f) {PyErr_SetString(PyExc_ValueError, "Input types must match"); %(fail)s;}
if (!img2d) %(fail)s;
if (!filtersflipped) %(fail)s;
if ((!%(z)s)
|| *PyArray_DIMS(%(z)s)!=4
||(PyArray_DIMS(%(z)s)[0] != %(self_bsize)s)
||(PyArray_DIMS(%(z)s)[1] != %(self_nkern)s)
||(PyArray_DIMS(%(z)s)[2] != dim_zz[0])
|| (PyArray_DIMS(%(z)s)[3] != dim_zz[1])
)
{
if (%(z)s) Py_DECREF(%(z)s);
npy_intp dims[4] = {0,0,0,0};
if(!dims) %(fail)s;
dims[0]=%(self_bsize)s;
dims[1]=%(self_nkern)s;
dims[2]=dim_zz[0];
dims[3]=dim_zz[1];
%(z)s = (PyArrayObject*) PyArray_ZEROS(4, dims, typenum,0);
}else{
//PyArray_FILLWBYTE((PyObject*)%(z)s,0);
}
z_arr = (PyArrayObject*) %(z)s;
// assert the output is C-contiguous
if (!PyArray_ISCONTIGUOUS(%(z)s))
{
PyErr_SetString(PyExc_AssertionError, "Output (%(z)s) not contiguous");
%(fail)s;
}
//The if on the number of loop make a speed up for small array.
//with g++ 4.5.1. The compiler should be smart enough to do this himself!
#pragma omp parallel for schedule(static) if(%(self_bsize)s * %(self_nkern)s > 1)
// We merge the 2 loop into one to make it easier to parallelize on both
// This is the equivalent of those 2 lines.
//for(int b=0;b< %(self_bsize)s;b++){
// for(int n_kern=0;n_kern<%(self_nkern)s;n_kern++){
for(int batch_kern_idx=0;
batch_kern_idx < %(self_bsize)s * %(self_nkern)s;
batch_kern_idx++){
int b = batch_kern_idx / %(self_nkern)s;
int n_kern = batch_kern_idx %% %(self_nkern)s;
%(type)s * __restrict__ out=(%(type)s *)(PyArray_GETPTR2(z_arr,b,n_kern));
for (int i = 0; i < dim_zz[0]*dim_zz[1]; ++i) out[i] = 0;
for(int stack_size=0;stack_size<%(self_imshp0)s;stack_size++){
const %(type)s * __restrict__ in=(%(type)s *)(PyArray_GETPTR2(img2d_arr,b,stack_size));
const %(type)s * __restrict__ hvals=(%(type)s *)(PyArray_GETPTR2(filtersflipped_arr,n_kern,stack_size));
int new_m;
for (int iter_m=0; iter_m < dim_zz[0]; iter_m++) {
// Reposition index into input image based on requested output size
int pos_m = iter_m*%(self_dx)s;//The position of the patch in the image
if (mode == FULL) new_m = pos_m ;
else new_m = (pos_m+dim_ker0-1);
for (int iter_n=0; iter_n < dim_zz[1]; iter_n++) { // loop over columns
int pos_n=iter_n*%(self_dy)s;
%(type)s sum=0;
%(type)s sum2=0;
%(type)s sum3=0;
%(type)s sum4=0;
int nb_sum=0;
// Sum over kernel, if index into image is out of bounds
// fill with the value
for (int j=0; j < dim_ker0; j++) {
int ind0 = (new_m-j);
if(mode==FULL){
const %(type)s * idx_hvals=&hvals[j*dim_ker1];
if(ind0 < 0 || ind0 >= dim_im[0]){
if(fill_value!=0)
for (int k=0; k < dim_ker1; k++) {
sum+= idx_hvals[k] * fill_value;
}
}else{
//do the part where kernel is to the right of the img
int k=0,max_k=max((int)(pos_n-dim_im[1])+1,0);
if(fill_value!=0){
for(k=0;k<max_k;k++){
sum+= idx_hvals[k]*fill_value;
}
}else {k=max_k;}
//do the part where the kernel is on the img
max_k=min(pos_n+1,(int)dim_ker1);
const %(type)s * idx_in=&in[ind0*dim_im[1]];
if(iter_n + 4*%(self_dy)s < dim_zz[1]
&& iter_n>dim_ker1-1
&& iter_n<dim_im[1]-dim_ker1+1-3){
nb_sum=4;
for (int ind1=pos_n-k; k<max_k; k++,ind1--) {
sum+=idx_hvals[k]*idx_in[ind1];
sum2+=idx_hvals[k]*idx_in[ind1+%(self_dy)s];
sum3+=idx_hvals[k]*idx_in[ind1+2*%(self_dy)s];
sum4+=idx_hvals[k]*idx_in[ind1+3*%(self_dy)s];
}
}else if(iter_n + 2*%(self_dy)s < dim_zz[1]
&& iter_n>dim_ker1-1
&& iter_n<dim_im[1]-dim_ker1+1){
nb_sum=2;
for (int ind1=pos_n-k; k<max_k; k++,ind1--) {
sum+=idx_hvals[k]*idx_in[ind1];
sum2+=idx_hvals[k]*idx_in[ind1+%(self_dy)s];
}
}else{
nb_sum=1;
/*
%(type)s sum_=0;
if((k-max_k) & 0x1 != 0){
sum+= idx_hvals[k] * idx_in[pos_n-k];
}
for (int ind1=pos_n-k; k<max_k; k+=2,ind1-=2) {
sum+= idx_hvals[k] * idx_in[ind1];
sum_+= idx_hvals[k+1] * idx_in[ind1-1];
}
sum+=sum_;
*/
for (int ind1=pos_n-k; k<max_k; k++,ind1--) {
sum+=idx_hvals[k]*idx_in[ind1];
}
}
//do the part to the left of the img
if(fill_value!=0)
for(;k<dim_ker1;k++) sum+= idx_hvals[k]*fill_value;
}
}else{//valid mode
const %(type)s* idx_in=&in[ind0*dim_im[1]];
const %(type)s* idx_hvals=&hvals[j*dim_ker1];
if(iter_n + 4*%(self_dy)s < dim_zz[1]){
nb_sum=4;
for (int k=dim_ker1-1,im_idx=pos_n; k >=0; k--,im_idx++) {
sum+=idx_hvals[k]*idx_in[im_idx];
sum2+=idx_hvals[k]*idx_in[im_idx+%(self_dy)s];
sum3+=idx_hvals[k]*idx_in[im_idx+2*%(self_dy)s];
sum4+=idx_hvals[k]*idx_in[im_idx+3*%(self_dy)s];
}
}else if(iter_n + 2*%(self_dy)s < dim_zz[1]){
nb_sum=2;
for (int k=dim_ker1-1,im_idx=pos_n; k >=0; k--,im_idx++) {
sum+=idx_hvals[k]*idx_in[im_idx];
sum2+=idx_hvals[k]*idx_in[im_idx+%(self_dy)s];
}
}else{
nb_sum=1;
for (int k=dim_ker1-1,im_idx=pos_n; k >=0; k--,im_idx++) {
sum+=idx_hvals[k]*idx_in[im_idx];
}
}
}//else valid mode
}//for j
switch(nb_sum){
case 4: out[iter_m*dim_zz[1]+iter_n+3] %(affectation)s sum4;
case 3: out[iter_m*dim_zz[1]+iter_n+2] %(affectation)s sum3;
case 2: out[iter_m*dim_zz[1]+iter_n+1] %(affectation)s sum2;
case 1: out[iter_m*dim_zz[1]+iter_n] %(affectation)s sum;
}
iter_n+=nb_sum-1;
}//for iter_n
}//for iter_m
}//for stack_size
}//for b and n_kern
Py_XDECREF(img2d);
Py_XDECREF(filtersflipped);
"""
import pytensor
from pytensor import tensor as at
from pytensor.gradient import DisconnectedType
from pytensor.graph.basic import Apply
from pytensor.graph.op import Op
from pytensor.graph.rewriting.basic import (
WalkingGraphRewriter,
copy_stack_trace,
node_rewriter,
)
def get_diagonal_subtensor_view(x, i0, i1):
    """
    Helper function for DiagonalSubtensor and IncDiagonalSubtensor.

    Notes
    -----
    It returns a partial view of x, not a partial copy.
    """
    # Plain Python ints are required because indexing with 0-dim integer
    # ndarrays is not supported.
    axis0 = int(i0)
    axis1 = int(i1)
    if x.shape[axis0] < x.shape[axis1]:
        raise NotImplementedError("is this allowed?")
    # Drop the first shape[axis1]-1 entries along axis0, then skew the
    # view by subtracting axis0's stride from axis1's stride so each step
    # along axis1 also moves one step back along axis0 (a diagonal band).
    selector = [slice(None)] * x.ndim
    selector[axis0] = slice(x.shape[axis1] - 1, None, None)
    view = x[tuple(selector)]
    skewed_strides = list(view.strides)
    if x.shape[axis1] != 1:
        skewed_strides[axis1] -= skewed_strides[axis0]
    view.strides = skewed_strides
    return view
class DiagonalSubtensor(Op):
    """Extract a thick diagonal "stripe" across two axes of an n-d tensor.

    Parameters
    ----------
    x
        n-d tensor.
    i0
        Axis index in x (plays the "row" role of the stripe).
    i1
        Axis index in x (plays the "column" role of the stripe).

    Extended summary
    ----------------
    Viewing the slice of ``x`` along axes ``i0``/``i1`` as a matrix ``u``,
    the output keeps only a thick diagonal band of ``u`` and discards the
    lower-left and upper-right triangles.  For a 7x4 ``u`` the kept
    entries (``x``) are::

        [ x 0 0 0 ]
        [ x x 0 0 ]
        [ x x x 0 ]
        [ 0 x x x ]
        [ 0 0 x x ]
        [ 0 0 0 x ]

    The output has the same number of dimensions as ``x``; only the
    length along ``i0`` shrinks by ``shape[i1] - 1`` (the chopped
    triangles).  ``get_diagonal_subtensor_view`` raises
    NotImplementedError when ``shape[i0]`` is too small for the stripe to
    reach across the matrix, since the result is ill-defined there.
    """

    __props__ = ("inplace",)

    def __init__(self, inplace=False):
        self.inplace = inplace
        if inplace:
            # Output is a view of input 0; tell the graph machinery.
            self.view_map = {0: [0]}

    def __str__(self):
        cls_name = self.__class__.__name__
        return f"{cls_name}{{inplace}}" if self.inplace else cls_name

    def make_node(self, x, i0, i1):
        _i0 = at.as_tensor_variable(i0)
        _i1 = at.as_tensor_variable(i1)
        # TODO: We could produce a more precise static shape output type
        type_shape = tuple(1 if s == 1 else None for s in x.type.shape)
        out_type = at.TensorType(x.type.dtype, shape=type_shape)
        return Apply(self, [x, _i0, _i1], [out_type()])

    def perform(self, node, inputs, output_storage):
        stripe_view = get_diagonal_subtensor_view(*inputs)
        output_storage[0][0] = stripe_view if self.inplace else stripe_view.copy()

    def grad(self, inputs, g_outputs):
        # Scatter the output gradient back into a zero tensor shaped
        # like the input; the axis arguments carry no gradient.
        zeros = at.zeros_like(inputs[0])
        gx = inc_diagonal_subtensor(zeros, inputs[1], inputs[2], g_outputs[0])
        return [gx, DisconnectedType()(), DisconnectedType()()]

    def connection_pattern(self, node):
        return [[True], [False], [False]]
diagonal_subtensor = DiagonalSubtensor(False)
class IncDiagonalSubtensor(Op):
    """
    The gradient of DiagonalSubtensor: add `amt` into the diagonal
    stripe of `x` selected by axes `i0` and `i1`.
    """

    __props__ = ("inplace",)

    def __init__(self, inplace=False):
        self.inplace = inplace
        if inplace:
            # Input 0 is overwritten when running inplace.
            self.destroy_map = {0: [0]}

    def __str__(self):
        cls_name = self.__class__.__name__
        return f"{cls_name}{{inplace}}" if self.inplace else cls_name

    def make_node(self, x, i0, i1, amt):
        _i0 = at.as_tensor_variable(i0)
        _i1 = at.as_tensor_variable(i1)
        return Apply(self, [x, _i0, _i1, amt], [x.type()])

    def perform(self, node, inputs, output_storage):
        x, i0, i1, amt = inputs
        if not self.inplace:
            x = x.copy()
        # Accumulating through the strided view touches exactly the
        # stripe that DiagonalSubtensor would have read.
        stripe_view = get_diagonal_subtensor_view(x, i0, i1)
        stripe_view += amt
        output_storage[0][0] = x

    def grad(self, inputs, g_outputs):
        x, i0, i1, amt = inputs
        gy = g_outputs[0]
        # d/dx passes gy straight through; d/damt reads the stripe back.
        return [
            gy,
            DisconnectedType()(),
            DisconnectedType()(),
            diagonal_subtensor(gy, i0, i1),
        ]

    def connection_pattern(self, node):
        return [[True], [False], [False], [True]]
inc_diagonal_subtensor = IncDiagonalSubtensor(False)
def conv3d(
    signals, filters, signals_shape=None, filters_shape=None, border_mode="valid"
):
    """
    Convolve spatio-temporal filters with a movie.
    It flips the filters.

    Parameters
    ----------
    signals
        Timeseries of images whose pixels have color channels.
        Shape: [Ns, Ts, C, Hs, Ws].
    filters
        Spatio-temporal filters.
        Shape: [Nf, Tf, C, Hf, Wf].
    signals_shape
        None or a tuple/list with the shape of signals.
    filters_shape
        None or a tuple/list with the shape of filters.
    border_mode
        One of 'valid', 'full' or 'half'; may also be a 3-tuple of such
        strings, one per (time, height, width) dimension.  The height
        and width modes must be equal.

    Notes
    -----
    Another way to define signals: (batch, time, in channel, row, column)
    Another way to define filters: (out channel,time,in channel, row, column)

    See Also
    --------
    Someone made a script that shows how to swap the axes between
    both 3d convolution implementations in PyTensor. See the last
    `attachment <https://groups.google.com/d/msg/pytensor-users/1S9_bZgHxVw/0cQR9a4riFUJ>`_
    """
    if isinstance(border_mode, str):
        border_mode = (border_mode, border_mode, border_mode)
    if signals_shape is None:
        _signals_shape_5d = signals.shape
    else:
        _signals_shape_5d = signals_shape
    if filters_shape is None:
        _filters_shape_5d = filters.shape
    else:
        _filters_shape_5d = filters_shape
    Ns, Ts, C, Hs, Ws = _signals_shape_5d
    Nf, Tf, C, Hf, Wf = _filters_shape_5d
    # Fold the time axis into the batch/filter axis so a single 2d
    # convolution computes every (Ts x Tf) pairing at once.
    _signals_shape_4d = (Ns * Ts, C, Hs, Ws)
    _filters_shape_4d = (Nf * Tf, C, Hf, Wf)
    if border_mode[1] != border_mode[2]:
        raise NotImplementedError("height and width bordermodes must match")
    conv2d_signal_shape = _signals_shape_4d
    conv2d_filter_shape = _filters_shape_4d
    if signals_shape is None:
        conv2d_signal_shape = None
    if filters_shape is None:
        conv2d_filter_shape = None
    # BUG FIX: this used to call `pytensor.tensor.nnet.conv2d`, but the
    # deprecated `pytensor.tensor.nnet` module has been removed; conv2d
    # is now exposed from `pytensor.tensor.conv`.
    out_4d = pytensor.tensor.conv.conv2d(
        signals.reshape(_signals_shape_4d),
        filters.reshape(_filters_shape_4d),
        input_shape=conv2d_signal_shape,
        filter_shape=conv2d_filter_shape,
        border_mode=border_mode[1],
    )  # ignoring border_mode[2]
    # compute the intended output size
    if border_mode[1] == "valid":
        Hout = Hs - Hf + 1
        Wout = Ws - Wf + 1
    elif border_mode[1] == "full":
        Hout = Hs + Hf - 1
        Wout = Ws + Wf - 1
    elif border_mode[1] == "half":
        Hout = Hs - (Hf % 2) + 1
        Wout = Ws - (Wf % 2) + 1
    elif border_mode[1] == "same":
        raise NotImplementedError()
    else:
        raise ValueError("invalid border mode", border_mode[1])
    # reshape the temporary output to restore its original size
    out_tmp = out_4d.reshape((Ns, Ts, Nf, Tf, Hout, Wout))
    # now sum out along the Tf to get the output
    # but we have to sum on a diagonal through the Tf and Ts submatrix.
    if Tf == 1:
        # for Tf==1, no sum along Tf, the Ts-axis of the output is unchanged!
        out_5d = out_tmp.reshape((Ns, Ts, Nf, Hout, Wout))
    else:
        # for some types of convolution, pad out_tmp with zeros
        if border_mode[0] == "valid":
            Tpad = 0
        elif border_mode[0] == "full":
            Tpad = Tf - 1
        elif border_mode[0] == "half":
            Tpad = Tf // 2
        elif border_mode[0] == "same":
            raise NotImplementedError()
        else:
            raise ValueError("invalid border mode", border_mode[0])
        if Tpad == 0:
            # Summing along the skewed diagonal of the (Ts, Tf) plane is
            # exactly the temporal "valid" convolution.
            out_5d = diagonal_subtensor(out_tmp, 1, 3).sum(axis=3)
        else:
            # pad out_tmp with zeros before summing over the diagonal
            out_tmp_padded = at.zeros(
                dtype=out_tmp.dtype, shape=(Ns, Ts + 2 * Tpad, Nf, Tf, Hout, Wout)
            )
            out_tmp_padded = pytensor.tensor.subtensor.set_subtensor(
                out_tmp_padded[:, Tpad : (Ts + Tpad), :, :, :, :], out_tmp
            )
            out_5d = diagonal_subtensor(out_tmp_padded, 1, 3).sum(axis=3)
    return out_5d
@node_rewriter([DiagonalSubtensor, IncDiagonalSubtensor])
def local_inplace_DiagonalSubtensor(fgraph, node):
    """Also work for IncDiagonalSubtensor."""
    op = node.op
    # Guard clauses: only rewrite non-inplace instances of the two ops.
    if not isinstance(op, (DiagonalSubtensor, IncDiagonalSubtensor)):
        return False
    if op.inplace:
        return False
    replacement = type(op)(inplace=True)(*node.inputs)
    copy_stack_trace(node.outputs[0], replacement)
    return [replacement]
# Register the inplace substitution under the "fast_run"/"inplace" tags so
# that, late in the rewrite pipeline (position=60), DiagonalSubtensor and
# IncDiagonalSubtensor nodes are swapped for their inplace variants.
pytensor.compile.optdb.register(
    "local_inplace_DiagonalSubtensor",
    WalkingGraphRewriter(
        local_inplace_DiagonalSubtensor,
        failure_callback=WalkingGraphRewriter.warn_inplace,
    ),
    "fast_run",
    "inplace",
    position=60,
)
import os
import sys
import pytensor.tensor as at
from pytensor.configdefaults import config
from pytensor.gradient import grad_undefined
from pytensor.graph.basic import Apply
from pytensor.graph.rewriting.basic import node_rewriter
from pytensor.link.c.cmodule import GCC_compiler
from pytensor.link.c.op import ExternalCOp, OpenMPOp
from pytensor.tensor.blas import batched_dot
from pytensor.tensor.extra_ops import cpu_contiguous
from pytensor.tensor.rewriting.basic import register_canonicalize
from pytensor.tensor.type import ftensor3, fvector
def _ctc_find_lib():
    """
    Find the directory that contains libwarpctc.so

    Returns
    -------
    str or None
        The first of ``build``, ``lib`` or ``lib64`` under
        ``config.ctc__root`` that contains ``libwarpctc.so``, or None
        when no root is configured or the library is absent.
    """
    if config.ctc__root != "":
        for lib_dir in ("build", "lib", "lib64"):
            lib_path = os.path.join(config.ctc__root, lib_dir)
            # FIX: os.path.isdir(lib_path) already implies the path
            # exists, so the extra os.path.exists(lib_path) check was
            # redundant and has been dropped.
            if os.path.isdir(lib_path) and os.path.exists(
                os.path.join(lib_path, "libwarpctc.so")
            ):
                return lib_path
    return None
def _ctc_check_compile(ctc_lib_path):
    """
    Check that a minimal program using warp-ctc compiles.

    Parameters
    ----------
    ctc_lib_path : str or None
        Directory containing libwarpctc.so, as found by _ctc_find_lib.

    Returns
    -------
    (bool, str or None)
        (True, None) on success, else (False, error message).
    """
    preamble = """
#include <string.h>
#include "ctc.h"
"""
    body = """
ctcOptions options;
memset(&options, 0, sizeof(ctcOptions));
options.loc = CTC_CPU;
options.num_threads = 1;
"""
    # The directory of this module is always on the include path (for the
    # bundled ctc.h fallback).
    params = [f"-I{os.path.dirname(__file__)}"]
    if ctc_lib_path is not None:
        params.extend([f"-I{os.path.join(config.ctc__root, 'include')}"])
        params.extend([f"-L{ctc_lib_path}"])
    # Always try to link against warpctc ("-l warpctc" as two argv
    # entries is accepted by gcc).
    params.extend(["-l", "warpctc"])
    compiler_res = GCC_compiler.try_flags(
        params, preamble=preamble, body=body, try_run=False, output=True
    )
    # try_flags may return either a bare bool or an (avail, out, err)
    # triple depending on `output`; normalize to the triple.
    avail, out, err = (
        compiler_res if isinstance(compiler_res, tuple) else (compiler_res, None, None)
    )
    if not avail:
        return (
            False,
            ("cannot compile with warp-ctc. " "We got this error:\n" + str(err)),
        )
    return True, None
def ctc_present():
    """Return whether warp-ctc is installed and compilable (cached)."""
    # Result is memoized on the function object; only probe once.
    if ctc_present.avail is not None:
        return ctc_present.avail
    ctc_present.path = _ctc_find_lib()
    ctc_present.avail, ctc_present.msg = _ctc_check_compile(ctc_present.path)
    return ctc_present.avail
ctc_present.avail = None
ctc_present.msg = None
ctc_present.path = None
def ctc_available():
    """Return whether the warp-ctc Op can be used on this platform.

    On failure the reason is stored in ``ctc_available.msg``; on success
    the library directory is stored in ``ctc_available.path``.
    """
    if os.name == "nt":
        # BUG FIX: the message used to be assigned as a 1-tuple
        # ("...supported ",) with the rest of the sentence stranded on a
        # following no-op statement line; it is now one whole string.
        ctc_available.msg = (
            "Windows platforms are currently not supported "
            "by underlying CTC library (warp-ctc)."
        )
        return False
    elif not ctc_present():
        ctc_available.msg = ctc_present.msg
        return False
    ctc_available.path = ctc_present.path
    return True
ctc_available.msg = None
ctc_available.path = None
class ConnectionistTemporalClassification(ExternalCOp, OpenMPOp):
    """
    CTC loss function wrapper.
    Notes
    -----
    Using the wrapper requires that Baidu's warp-ctc library is installed.
    If the warp-ctc library is not on your compiler's default library path,
    you must set the configuration variable ``config.ctc__root`` appropriately.
    Parameters
    ----------
    compute_grad
        If set to True, enables the computation of gradients of the CTC loss function.
    """
    __props__ = ("compute_grad",)
    # Number of inputs/outputs the external C implementation expects.
    _cop_num_inputs = 3
    _cop_num_outputs = 2
    # External C implementation loaded by ExternalCOp.
    func_file = os.path.join("c_code", "ctc_wrapper.c")
    func_name = "APPLY_SPECIFIC(ctc_cost_cpu)"
    def __init__(self, compute_grad=True, openmp=None):
        if not ctc_available():
            raise RuntimeError(
                "Baidu CTC is not available and "
                "ConnectionistTemporalClassification Op "
                "can not be constructed."
            )
        super().__init__(self.func_file, self.func_name)
        OpenMPOp.__init__(self, openmp=openmp)
        self.compute_grad = compute_grad
        # Return only the cost. Gradient will be returned by grad()
        self.default_output = 0
    def c_lib_dirs(self, **kwargs):
        # Directory holding the warp-ctc shared library, when it was found.
        lib_dirs = []
        if ctc_available.path is not None:
            lib_dirs += [ctc_available.path]
        return lib_dirs
    def c_compile_args(self, **kwargs):
        # Embed an rpath so the warp-ctc library is found at load time.
        # Outside macOS, quote the path if it contains spaces.
        if ctc_available.path is not None:
            if sys.platform != "darwin" and " " in ctc_available.path:
                return ['-Wl,-rpath,"' + ctc_available.path + '"']
            else:
                return ["-Wl,-rpath," + ctc_available.path]
        return []
    def c_libraries(self, **kwargs):
        # Link against Baidu's warp-ctc.
        return ["warpctc"]
    def c_header_dirs(self, **kwargs):
        header_dirs = []
        if config.ctc__root != "":
            # We assume here that the header is available at the include directory
            # of the CTC root directory.
            header_dirs += [os.path.join(config.ctc__root, "include")]
        return header_dirs
    def c_headers(self, **kwargs):
        return ["ctc.h"] + super().c_headers(**kwargs)
    def make_node(self, activations, labels, input_lengths):
        """Validate input dtypes/ranks and build the Apply node.

        activations: float32 tensor3; labels: int32 matrix;
        input_lengths: int32 vector.  Outputs the cost vector and,
        when ``compute_grad`` is set, the gradient tensor3.
        """
        t_activations = at.as_tensor_variable(activations)
        # Ensure activations array is C-contiguous
        t_activations = cpu_contiguous(t_activations)
        t_labels = at.as_tensor_variable(labels)
        t_input_lengths = at.as_tensor_variable(input_lengths)
        if t_activations.type.dtype != "float32":
            raise TypeError("activations must use the float32 type!")
        if t_activations.ndim != 3:
            raise ValueError("activations must have 3 dimensions.")
        if t_labels.type.dtype != "int32":
            raise TypeError("labels must use the int32 type!")
        if t_labels.ndim != 2:
            raise ValueError("labels must have 2 dimensions.")
        if t_input_lengths.type.dtype != "int32":
            raise TypeError("input_lengths must use the int32 type!")
        if t_input_lengths.ndim != 1:
            raise ValueError("input_lengths must have 1 dimension.")
        costs = fvector(name="ctc_cost")
        outputs = [costs]
        if self.compute_grad:
            gradients = ftensor3(name="ctc_grad")
            outputs += [gradients]
        return Apply(
            self, inputs=[t_activations, t_labels, t_input_lengths], outputs=outputs
        )
    def L_op(self, inputs, outputs, output_grads):
        # Gradients are only available when the op was built with
        # compute_grad=True (second output present).
        assert self.compute_grad and len(outputs) == 2
        gradients = outputs[1]
        assert gradients is not None
        grad_op = output_grads[0]
        # Scale the precomputed gradient by the incoming cost gradient;
        # dimshuffles move the minibatch axis first for batched_dot and back.
        total_grad = batched_dot(grad_op, gradients.dimshuffle(1, 0, 2)).dimshuffle(
            1, 0, 2
        )
        return [
            total_grad,
            grad_undefined(self, 1, inputs[1]),
            grad_undefined(self, 2, inputs[2]),
        ]
def ctc(activations, labels, input_lengths):
    r"""Compute the Connectionist Temporal Classification (CTC) loss.

    Notes
    -----
    Requires Baidu's warp-ctc library to be installed.  If the library is
    not on the compiler's default library path, the configuration variable
    ``config.ctc__root`` must be set accordingly.

    Parameters
    ----------
    activations
        A 3-D tensor of shape ``(t, m, p)``, where ``t`` indexes time,
        ``m`` the minibatch, and ``p`` the per-symbol probabilities.
        The memory layout is assumed to be C order (slowest to fastest
        changing dimension from left to right), so ``p`` changes fastest.
    labels
        A 2-D tensor with one target label sequence per row.  Negative
        entries are treated as padding and ignored.  The blank symbol is
        assumed to have index 0 in the alphabet.
    input_lengths
        A 1-D tensor giving the number of time steps of each sequence in
        the minibatch.

    Returns
    -------
    1-D array
        The CTC cost of each example in the minibatch.
    """
    return ConnectionistTemporalClassification()(activations, labels, input_lengths)
# Disable gradient computation if not needed
@register_canonicalize("fast_compile")
@node_rewriter([ConnectionistTemporalClassification])
def local_ctc_no_grad(fgraph, node):
    """Replace a gradient-computing CTC op with a cost-only one when the
    gradient output has no clients in the graph."""
    op = node.op
    if not isinstance(op, ConnectionistTemporalClassification):
        return False
    if len(node.outputs) <= 1:
        # Already a cost-only node; nothing to do.
        return False
    if fgraph.clients[node.outputs[1]]:
        # The gradient output is actually used; keep the node as is.
        return False
    cost_only = ConnectionistTemporalClassification(compute_grad=False)
    return [cost_only(*node.inputs), None]
"""
TODO: implement Images2Neibs.infer_shape() methods
"""
import numpy as np
import pytensor
from pytensor.gradient import grad_not_implemented, grad_undefined
from pytensor.graph.basic import Apply
from pytensor.link.c.op import COp
from pytensor.link.c.type import EnumList
from pytensor.tensor.basic import arange, as_tensor_variable, concatenate, stack, zeros
from pytensor.tensor.math import ceil_intdiv
from pytensor.tensor.subtensor import inc_subtensor, set_subtensor
from pytensor.tensor.type import matrix
class Images2Neibs(COp):
    """
    Reshapes the input as a 2D tensor where each row is a pooling
    example.
    Parameters
    ----------
    mode : {'valid', 'half', 'full', 'ignore_borders', 'wrap_centered'}
        - 'valid' :
          Requires an input that is a multiple of the pooling factor
          (in each direction).
        - 'half' :
          Equivalent to 'valid' if we pre-pad with zeros the input on
          each side by (neib_shape[0]//2, neib_shape[1]//2)
        - 'full' :
          Equivalent to 'valid' if we pre-pad with zeros the input on
          each side by (neib_shape[0] - 1, neib_shape[1] - 1)
        - 'ignore_borders' :
          Same as valid, but will ignore the borders if the shape(s)
          of the input is not a multiple of the pooling factor(s).
        - 'wrap_centered' :
          Patches are centered on the step grid and indices wrap around
          at the image borders; requires odd patch shapes.
    """
    __props__ = ("mode",)
    # C-level enum mapping the Python mode strings onto the MODE_* constants
    # used inside c_code().
    BORDER_MODE = EnumList(
        ("MODE_VALID", "valid"),
        ("MODE_HALF", "half"),
        ("MODE_FULL", "full"),
        ("MODE_WRAP_CENTERED", "wrap_centered"),
        ("MODE_IGNORE_BORDERS", "ignore_borders"),
    )
    params_type = BORDER_MODE
    def get_params(self, node):
        # The mode string is handed to perform()/c_code() as the op params
        # (converted through the BORDER_MODE enum).
        return self.mode
    def __init__(self, mode="valid"):
        implemented_modes = self.BORDER_MODE.get_aliases()
        if mode not in implemented_modes:
            raise NotImplementedError(
                f"Only modes {', '.join(implemented_modes)} have been implemented for {type(self).__name__}"
            )
        self.mode = mode
    def __str__(self):
        return self.__class__.__name__ + "{%s}" % self.mode
    def __setstate__(self, d):
        # Unpickling back-compat: instances pickled before the "mode"
        # attribute existed default to "valid".
        self.__dict__.update(d)
        if not hasattr(self, "mode"):
            self.mode = "valid"
    def make_node(self, ten4, neib_shape, neib_step=None):
        """
        Parameters
        ----------
        ten4 : a list of lists of images
            ten4 is of shape (list 1 dim, list 2 dim, row, col).
        neib_shape
            (r,c) where r is the height of the neighborhood in rows and c is
            the width of the neighborhood in columns.
        neib_step
            (dr,dc) where dr is the number of rows to skip between patch and dc
            is the number of columns. When None, this is the same as neib_shape
            (patch are disjoint).
        Returns
        -------
        matrix
            A 2D matrix, written using the following pattern::
                idx = 0
                for i in range(list 1 dim)
                    for j in range(list 2 dim)
                        for k in <image column coordinates>
                            for l in <image row coordinates>
                                output[idx,:]
                                     = flattened version of ten4[i,j,l:l+r,k:k+c]
                                idx += 1
        .. note:: The op isn't necessarily implemented internally with these
            for loops, they're just the easiest way to describe the output
            pattern.
        """
        ten4 = as_tensor_variable(ten4)
        neib_shape = as_tensor_variable(neib_shape)
        if neib_step is None:
            # Default: disjoint patches (step equals the patch shape).
            neib_step = neib_shape
        else:
            neib_step = as_tensor_variable(neib_step)
        assert ten4.ndim == 4
        assert neib_shape.ndim == 1
        assert neib_step.ndim == 1
        return Apply(
            self, [ten4, neib_shape, neib_step], [matrix(dtype=ten4.type.dtype)]
        )
    def grad(self, inp, grads):
        x, neib_shape, neib_step = inp
        (gz,) = grads
        if self.mode in ("valid", "ignore_borders"):
            # Fast path: when neib_step equals neib_shape the patches are
            # disjoint, so the gradient is just the inverse op (neibs2images).
            if (
                neib_shape is neib_step
                or neib_shape == neib_step
                or
                # PyTensor Constant == do not compare the data
                # the equals function do that.
                (hasattr(neib_shape, "equals") and neib_shape.equals(neib_step))
            ):
                return [
                    neibs2images(gz, neib_shape, x.shape, mode=self.mode),
                    grad_undefined(self, 1, neib_shape),
                    grad_undefined(self, 2, neib_step),
                ]
        if self.mode in ["valid"]:
            # Iterate over neighborhood positions, summing contributions.
            def pos2map(pidx, pgz, prior_result, neib_shape, neib_step):
                """
                Helper function that adds gradient contribution from a single
                neighborhood position i,j.
                pidx = Index of position within neighborhood.
                pgz = Gradient of shape (batch_size*num_channels*neibs)
                prior_result = Shape (batch_size, num_channnels, rows, cols)
                neib_shape = Number of rows, cols in a neighborhood.
                neib_step = Step sizes from image2neibs.
                """
                nrows, ncols = neib_shape
                rstep, cstep = neib_step
                batch_size, num_channels, rows, cols = prior_result.shape
                i = pidx // ncols
                j = pidx - (i * ncols)
                # This position does not touch some img pixels in valid mode.
                result_indices = prior_result[
                    :,
                    :,
                    i : (rows - nrows + i + 1) : rstep,
                    j : (cols - ncols + j + 1) : cstep,
                ]
                newshape = (
                    (batch_size, num_channels)
                    + ((rows - nrows) // rstep + 1,)
                    + ((cols - ncols) // cstep + 1,)
                )
                return inc_subtensor(result_indices, pgz.reshape(newshape))
            indices = arange(neib_shape[0] * neib_shape[1])
            pgzs = gz.dimshuffle((1, 0))
            result, _ = pytensor.scan(
                fn=pos2map,
                sequences=[indices, pgzs],
                outputs_info=zeros(x.shape),
                non_sequences=[neib_shape, neib_step],
            )
            grad_input = result[-1]
            return [
                grad_input,
                grad_undefined(self, 1, neib_shape),
                grad_undefined(self, 2, neib_step),
            ]
        return [
            grad_not_implemented(self, 0, x),
            grad_undefined(self, 1, neib_shape),
            grad_undefined(self, 2, neib_step),
        ]
    def c_code_cache_version(self):
        # Bump whenever the C implementation in c_code() changes.
        return (10,)
    def perform(self, node, inp, out_, params):
        # Pure-Python implementation of the op (see c_code for the C version).
        ten4, neib_shape, neib_step = inp
        (z,) = out_
        # XXX: GpuImages2Neibs should not run this perform in DebugMode
        if not isinstance(self, Images2Neibs):
            raise pytensor.graph.utils.MethodNotDefined()
        def CEIL_INTDIV(a, b):
            # Integer division rounded toward +inf.
            if a % b:
                return (a // b) + 1
            else:
                return a // b
        grid_c = -1  # number of patch in height
        grid_d = -1  # number of patch in width
        assert ten4.ndim == 4
        assert neib_shape.ndim == 1
        assert neib_shape.shape[0] == 2
        assert neib_step.ndim == 1
        assert neib_step.shape[0] == 2
        c, d = neib_shape
        step_x, step_y = neib_step
        mode = self.mode
        if step_x <= 0 or step_y <= 0:
            raise ValueError(
                "neib_step wrong step ; values <= 0. Got " + str(neib_step)
            )
        if c <= 0 or d <= 0:
            raise ValueError("neib_shape values <=0. Got " + str(neib_shape))
        if mode == "wrap_centered":
            if (c % 2 != 1) or (d % 2 != 1):
                raise TypeError(
                    "Images2Neibs: in mode wrap_centered need patch with odd shapes"
                )
            if (ten4.shape[2] < c) or (ten4.shape[3] < d):
                raise TypeError(
                    "Images2Neibs: in wrap_centered mode, don't support"
                    " image shapes smaller then the patch shapes:"
                    f" neib_shape=({int(c)},{int(d)}), ten4[2:]=[{int(ten4.shape[2])},{int(ten4.shape[3])}]"
                )
            grid_c = CEIL_INTDIV(ten4.shape[2], step_x)
            grid_d = CEIL_INTDIV(ten4.shape[3], step_y)
        elif mode == "valid":
            if (ten4.shape[2] < c) or (((ten4.shape[2] - c) % step_x) != 0):
                raise TypeError(
                    f"neib_shape[0]={int(c)}, neib_step[0]={int(step_x)} and"
                    f" ten4.shape[2]={int(ten4.shape[2])} not consistent"
                )
            if (ten4.shape[3] < d) or (((ten4.shape[3] - d) % step_y) != 0):
                raise TypeError(
                    f"neib_shape[1]={int(d)}, neib_step[1]={int(step_y)} and"
                    f" ten4.shape[3]={int(ten4.shape[3])} not consistent"
                )
            # number of patch in height
            grid_c = 1 + ((ten4.shape[2] - c) // step_x)
            # number of patch in width
            grid_d = 1 + ((ten4.shape[3] - d) // step_y)
        elif mode == "ignore_borders":
            # number of patch in height
            grid_c = 1 + ((ten4.shape[2] - c) // step_x)
            # number of patch in width
            grid_d = 1 + ((ten4.shape[3] - d) // step_y)
        elif mode == "half":
            # This is equivalent to 'valid' with padding (c // 2, d // 2) on both sides
            # Thus the expanded image will have size (h + 2 * (c // 2), w + 2 * (d // 2))
            # Plugging these in the equation for 'valid' we get
            # h + 2 * (c // 2) - c = h - (c % 2)
            # w + 2 * (d // 2) - c = w - (d % 2)
            if (ten4.shape[2] < c) or (((ten4.shape[2] - (c % 2)) % step_x) != 0):
                raise TypeError(
                    f"neib_shape[0]={int(c)}, neib_step[0]={int(step_x)} and"
                    f" ten4.shape[2]={int(ten4.shape[2])} not consistent"
                )
            # NOTE(review): the message below says neib_shape[0]/neib_step[0]
            # but the check is on dimension 3 — likely a copy-paste slip in
            # the message text only.
            if (ten4.shape[3] < d) or (((ten4.shape[3] - (d % 2)) % step_y) != 0):
                raise TypeError(
                    f"neib_shape[0]={int(d)}, neib_step[0]={int(step_y)} and"
                    f" ten4.shape[3]={int(ten4.shape[3])} not consistent"
                )
            # number of patch in height
            grid_c = 1 + ((ten4.shape[2] - (c % 2)) // step_x)
            # number of patch in width
            grid_d = 1 + ((ten4.shape[3] - (d % 2)) // step_y)
        elif mode == "full":
            # This is equivalent to 'valid' with padding (c - 1, d - 1) on both sides
            # Thus the expanded image will have size (h + 2 * (c - 1), w + 2 * (d - 1))
            # Plugging these in the equation for 'valid' we get
            # h + 2 * (c - 1) - c = h + c - 2
            # w + 2 * (d - 1) - c = w + d - 2
            if (ten4.shape[2] < c) or (((ten4.shape[2] + c - 2) % step_x) != 0):
                raise TypeError(
                    f"neib_shape[0]={int(c)}, neib_step[0]={int(step_x)} and"
                    f" ten4.shape[2]={int(ten4.shape[2])} not consistent"
                )
            # NOTE(review): same message slip as in "half" — says index 0
            # while checking dimension 3.
            if (ten4.shape[3] < d) or (((ten4.shape[3] + d - 2) % step_y) != 0):
                raise TypeError(
                    f"neib_shape[0]={int(d)}, neib_step[0]={int(step_y)} and"
                    f" ten4.shape[3]={int(ten4.shape[3])} not consistent"
                )
            # number of patch in height
            grid_c = 1 + ((ten4.shape[2] + c - 2) // step_x)
            # number of patch in width
            grid_d = 1 + ((ten4.shape[3] + d - 2) // step_y)
        else:
            raise TypeError(f"Images2Neibs: unknown mode '{mode}'")
        # One output row per patch; one output column per patch element.
        z_dim0 = grid_c * grid_d * ten4.shape[1] * ten4.shape[0]
        z_dim1 = c * d
        z[0] = np.empty((z_dim0, z_dim1), dtype=node.outputs[0].dtype)
        nb_batch = ten4.shape[0]
        nb_stack = ten4.shape[1]
        height = ten4.shape[2]
        width = ten4.shape[3]
        wrap_centered_half_idx_shift_x = c // 2
        wrap_centered_half_idx_shift_y = d // 2
        for n in range(nb_batch):
            for s in range(nb_stack):
                # loop over the number of patch in height
                for a in range(grid_c):
                    # loop over the number of patch in width
                    for b in range(grid_d):
                        z_row = b + grid_d * (a + grid_c * (s + nb_stack * n))
                        for i in range(c):
                            ten4_2 = i + a * step_x
                            if mode == "wrap_centered":
                                ten4_2 -= wrap_centered_half_idx_shift_x
                                if ten4_2 < 0:
                                    ten4_2 += height
                                elif ten4_2 >= height:
                                    ten4_2 -= height
                            elif mode == "half":
                                ten4_2 -= wrap_centered_half_idx_shift_x
                            elif mode == "full":
                                ten4_2 -= c - 1
                            if ten4_2 < 0 or ten4_2 >= height:
                                # Out-of-image rows are zero-padded.
                                z[0][z_row, d * i : d * i + d] = 0
                            else:
                                for j in range(d):
                                    ten4_3 = j + b * step_y
                                    if mode == "wrap_centered":
                                        ten4_3 -= wrap_centered_half_idx_shift_y
                                        if ten4_3 < 0:
                                            ten4_3 += width
                                        elif ten4_3 >= width:
                                            ten4_3 -= width
                                    elif mode == "half":
                                        ten4_3 -= wrap_centered_half_idx_shift_y
                                    elif mode == "full":
                                        ten4_3 -= d - 1
                                    z_col = j + d * i
                                    if ten4_3 < 0 or ten4_3 >= width:
                                        # Out-of-image columns are zero-padded.
                                        z[0][z_row, z_col] = 0
                                    else:
                                        z[0][z_row, z_col] = ten4[n, s, ten4_2, ten4_3]
    def infer_shape(self, fgraph, node, input_shape):
        # Symbolic counterpart of the grid_c/grid_d computation in perform().
        in_shape = input_shape[0]
        c, d = node.inputs[1]
        step_x, step_y = node.inputs[2]
        if self.mode == "wrap_centered":
            grid_c = ceil_intdiv(in_shape[2], step_x)
            grid_d = ceil_intdiv(in_shape[3], step_y)
        elif self.mode == "valid":
            grid_c = 1 + ((in_shape[2] - c) // step_x)
            grid_d = 1 + ((in_shape[3] - d) // step_y)
        elif self.mode == "ignore_borders":
            grid_c = 1 + ((in_shape[2] - c) // step_x)
            grid_d = 1 + ((in_shape[3] - d) // step_y)
        elif self.mode == "half":
            grid_c = 1 + ((in_shape[2] - (c % 2)) // step_x)
            grid_d = 1 + ((in_shape[3] - (d % 2)) // step_y)
        elif self.mode == "full":
            grid_c = 1 + ((in_shape[2] + c - 2) // step_x)
            grid_d = 1 + ((in_shape[3] + d - 2) // step_y)
        else:
            raise TypeError(f"Images2Neibs: unknown mode '{self.mode}'")
        z_dim0 = grid_c * grid_d * in_shape[1] * in_shape[0]
        z_dim1 = c * d
        return [(z_dim0, z_dim1)]
    def c_code(self, node, name, inp, out, sub):
        # %(mode)s is substituted with the BORDER_MODE enum params; the other
        # %(...)s placeholders are the input/output variable names and the
        # failure macro provided by the COp machinery.
        return """
        #ifndef CEIL_INTDIV
        #define CEIL_INTDIV(a, b) ((a/b) + ((a %% b) ? 1: 0))
        #endif
        int grid_c = -1; //number of patch in height
        int grid_d = -1; //number of patch in width
        {
            if (PyArray_NDIM(%(ten4)s) != 4)
            {
                PyErr_Format(PyExc_TypeError, "ten4 wrong rank");
                %(fail)s;
            }
            if (PyArray_NDIM(%(neib_shape)s) != 1)
            {
                PyErr_Format(PyExc_TypeError, "neib_shape wrong rank");
                %(fail)s;
            }
            if ( (PyArray_DIMS(%(neib_shape)s))[0] != 2)
            {
                PyErr_Format(PyExc_TypeError, "neib_shape wrong shape ; has to"
                                              " contain 2 elements");
                %(fail)s;
            }
            if (PyArray_NDIM(%(neib_step)s) != 1)
            {
                PyErr_Format(PyExc_TypeError, "neib_step wrong rank");
                %(fail)s;
            }
            if ( (PyArray_DIMS(%(neib_step)s))[0] != 2)
            {
                PyErr_Format(PyExc_TypeError,
                             "neib_step wrong step ; has to contain 2 elements");
                %(fail)s;
            }
            // (c,d) = neib_shape
            const npy_intp c = (npy_intp) *(dtype_%(neib_shape)s*) PyArray_GETPTR1(%(neib_shape)s, 0);
            const npy_intp d = (npy_intp) *(dtype_%(neib_shape)s*) PyArray_GETPTR1(%(neib_shape)s, 1);
            // (step_x,step_y) = neib_step
            const dtype_%(neib_step)s step_x = *(dtype_%(neib_step)s*) PyArray_GETPTR1(%(neib_step)s, 0);
            const dtype_%(neib_step)s step_y = *(dtype_%(neib_step)s*) PyArray_GETPTR1(%(neib_step)s, 1);
            if (step_x <=0 || step_y <=0)
            {
                PyErr_Format(PyExc_ValueError,
                             "neib_step wrong step ; values <= 0. Got %%lld %%lld.",
                             (long long) step_x, (long long) step_y);
                %(fail)s;
            }
            if (c <=0 || d <=0)
            {
                PyErr_Format(PyExc_ValueError,
                             "neib_shape values <= 0. Got %%lld %%lld.",
                             (long long)c, (long long)d);
                %(fail)s;
            }
            if (%(mode)s == MODE_WRAP_CENTERED) {
                if (c%%2!=1 || d%%2!=1){
                    PyErr_Format(PyExc_TypeError,
                                 "Images2Neibs: in mode wrap_centered"
                                 " need patch with odd shapes");
                    %(fail)s;
                }
                if ( (PyArray_DIMS(%(ten4)s))[2] < c ||
                     (PyArray_DIMS(%(ten4)s))[3] < d)
                {
                    PyErr_Format(PyExc_TypeError,
                        "Images2Neibs: in wrap_centered mode, don't support image"
                        " shapes smaller then the patch shapes:"
                        " neib_shape=(%%ld,%%ld), ten4[2:]=[%%ld,%%ld]",
                        (long int)c, (long int)d,
                        (long int)(PyArray_DIMS(%(ten4)s)[2]),
                        (long int)(PyArray_DIMS(%(ten4)s)[3]));
                    %(fail)s;
                }
                grid_c = CEIL_INTDIV(((PyArray_DIMS(%(ten4)s))[2]),step_x);
                grid_d = CEIL_INTDIV(((PyArray_DIMS(%(ten4)s))[3]),step_y);
            } else if (%(mode)s == MODE_VALID) {
                if ( ((PyArray_DIMS(%(ten4)s))[2] < c) ||
                     ( (((PyArray_DIMS(%(ten4)s))[2]-c) %% step_x)!=0))
                {
                    PyErr_Format(PyExc_TypeError,
                                 "neib_shape[0]=%%ld, neib_step[0]=%%ld and"
                                 " ten4.shape[2]=%%ld not consistent",
                                 (long int)c, (long int)step_x,
                                 (long int)(PyArray_DIMS(%(ten4)s)[2]));
                    %(fail)s;
                }
                if ( ((PyArray_DIMS(%(ten4)s))[3] < d) ||
                     ( (((PyArray_DIMS(%(ten4)s))[3]-d) %% step_y)!=0))
                {
                    PyErr_Format(PyExc_TypeError,
                                 "neib_shape[1]=%%ld, neib_step[1]=%%ld and"
                                 " ten4.shape[3]=%%ld not consistent",
                                 (long int)d, (long int)step_y,
                                 (long int)(PyArray_DIMS(%(ten4)s)[3]));
                    %(fail)s;
                }
                //number of patch in height
                grid_c = 1+(((PyArray_DIMS(%(ten4)s))[2]-c)/step_x);
                //number of patch in width
                grid_d = 1+(((PyArray_DIMS(%(ten4)s))[3]-d)/step_y);
            } else if (%(mode)s == MODE_IGNORE_BORDERS) {
                //number of patch in height
                grid_c = 1+(((PyArray_DIMS(%(ten4)s))[2]-c)/step_x);
                //number of patch in width
                grid_d = 1+(((PyArray_DIMS(%(ten4)s))[3]-d)/step_y);
            } else if (%(mode)s == MODE_HALF) {
                if ( ((PyArray_DIMS(%(ten4)s))[2] < c) ||
                     ( (((PyArray_DIMS(%(ten4)s))[2]-(c%%2)) %% step_x)!=0))
                {
                    PyErr_Format(PyExc_TypeError,
                                 "neib_shape[0]=%%ld, neib_step[0]=%%ld and"
                                 " ten4.shape[2]=%%ld not consistent",
                                 (long int)c, (long int)step_x,
                                 (long int)(PyArray_DIMS(%(ten4)s)[2]));
                    %(fail)s;
                }
                if ( ((PyArray_DIMS(%(ten4)s))[3] < d) ||
                     ( (((PyArray_DIMS(%(ten4)s))[3]-(d%%2)) %% step_y)!=0))
                {
                    PyErr_Format(PyExc_TypeError,
                                 "neib_shape[1]=%%ld, neib_step[1]=%%ld and"
                                 " ten4.shape[3]=%%ld not consistent",
                                 (long int)d, (long int)step_y,
                                 (long int)(PyArray_DIMS(%(ten4)s)[3]));
                    %(fail)s;
                }
                //number of patch in height
                grid_c = 1+(((PyArray_DIMS(%(ten4)s))[2]-(c%%2))/step_x);
                //number of patch in width
                grid_d = 1+(((PyArray_DIMS(%(ten4)s))[3]-(d%%2))/step_y);
            } else if (%(mode)s == MODE_FULL) {
                if ( ((PyArray_DIMS(%(ten4)s))[2] < c) ||
                     ( (((PyArray_DIMS(%(ten4)s))[2]+c-2) %% step_x)!=0))
                {
                    PyErr_Format(PyExc_TypeError,
                                 "neib_shape[0]=%%ld, neib_step[0]=%%ld and"
                                 " ten4.shape[2]=%%ld not consistent",
                                 (long int)c, (long int)step_x,
                                 (long int)(PyArray_DIMS(%(ten4)s)[2]));
                    %(fail)s;
                }
                if ( ((PyArray_DIMS(%(ten4)s))[3] < d) ||
                     ( (((PyArray_DIMS(%(ten4)s))[3]+d-2) %% step_y)!=0))
                {
                    PyErr_Format(PyExc_TypeError,
                                 "neib_shape[1]=%%ld, neib_step[1]=%%ld and"
                                 " ten4.shape[3]=%%ld not consistent",
                                 (long int)d, (long int)step_y,
                                 (long int)(PyArray_DIMS(%(ten4)s)[3]));
                    %(fail)s;
                }
                //number of patch in height
                grid_c = 1+(((PyArray_DIMS(%(ten4)s))[2]+c-2)/step_x);
                //number of patch in width
                grid_d = 1+(((PyArray_DIMS(%(ten4)s))[3]+d-2)/step_y);
            } else {
                PyErr_Format(PyExc_TypeError,
                             "Images2Neibs: unknown mode %%d", %(mode)s);
                %(fail)s;
            }
            // new dimensions for z
            const npy_intp z_dim1 = c * d;
            const npy_intp z_dim0 =  grid_c
                            * grid_d
                            * (PyArray_DIMS(%(ten4)s))[1]
                            * (PyArray_DIMS(%(ten4)s))[0];
            if ((NULL == %(z)s)
                || ((PyArray_DIMS(%(z)s))[0] != z_dim0 )
                || ((PyArray_DIMS(%(z)s))[1] != z_dim1 )
            )
            {
                Py_XDECREF(%(z)s);
                npy_intp dims[2];
                dims[0] = z_dim0;
                dims[1] = z_dim1;
                %(z)s = (PyArrayObject*) PyArray_EMPTY(2,
                    dims,
                    PyArray_TYPE((PyArrayObject*) py_%(ten4)s),
                    0);
                if (!%(z)s)
                {
                    PyErr_SetString(PyExc_MemoryError, "failed to alloc z output");
                    %(fail)s;
                }
            }
        }
        { // NESTED SCOPE
        const int nb_batch = (PyArray_DIMS(%(ten4)s))[0];
        const int nb_stack = (PyArray_DIMS(%(ten4)s))[1];
        const int height = (PyArray_DIMS(%(ten4)s))[2];
        const int width = (PyArray_DIMS(%(ten4)s))[3];
        // (c,d) = neib_shape
        const npy_intp c = (npy_intp) *(dtype_%(neib_shape)s*) PyArray_GETPTR1(%(neib_shape)s, 0);
        const npy_intp d = (npy_intp) *(dtype_%(neib_shape)s*) PyArray_GETPTR1(%(neib_shape)s, 1);
        // (step_x,step_y) = neib_step
        const npy_intp step_x = (npy_intp) *(dtype_%(neib_step)s*) PyArray_GETPTR1(%(neib_step)s, 0);
        const npy_intp step_y = (npy_intp) *(dtype_%(neib_step)s*) PyArray_GETPTR1(%(neib_step)s, 1);
        const int wrap_centered_half_idx_shift_x = c/2;
        const int wrap_centered_half_idx_shift_y = d/2;
        // Oh this is messed up...
        for (int n = 0; n < nb_batch; n++)              // loop over batches
            for (int s = 0; s < nb_stack; s++)          // loop over stacks
                for (int a = 0; a < grid_c; a++)        // loop over the number of patch in height
                    for (int b = 0; b < grid_d; b++)    // loop over the number of patch in width
                    {
                        int z_row = b + grid_d*(a + grid_c*(s + nb_stack*n));
                        for (int i = 0; i < c; i++)     // loop over c
                        {
                            int ten4_2 = i + a * step_x;
                            if (%(mode)s == MODE_WRAP_CENTERED) {
                                ten4_2 -= wrap_centered_half_idx_shift_x;
                                if ( ten4_2 < 0 ) ten4_2 += height;
                                else if (ten4_2 >= height) ten4_2 -= height;
                            } else if (%(mode)s == MODE_HALF) {
                                ten4_2 -= wrap_centered_half_idx_shift_x;
                            } else if (%(mode)s == MODE_FULL) {
                                ten4_2 -= c - 1;
                            }
                            if (ten4_2 < 0 | ten4_2 >= height) {
                                dtype_%(z)s* curr_z = (dtype_%(z)s*) PyArray_GETPTR2(%(z)s, z_row, d * i);
                                memset(curr_z, 0, d*sizeof(*curr_z));
                            } else {
                                for (int j = 0; j < d; j++)  // loop over d
                                {
                                    int ten4_3 = j + b * step_y;
                                    if (%(mode)s == MODE_WRAP_CENTERED) {
                                        ten4_3 -= wrap_centered_half_idx_shift_y;
                                        if ( ten4_3 < 0 ) ten4_3 += width;
                                        else if (ten4_3 >= width) ten4_3 -= width;
                                    } else if (%(mode)s == MODE_HALF) {
                                        ten4_3 -= wrap_centered_half_idx_shift_y;
                                    } else if (%(mode)s == MODE_FULL) {
                                        ten4_3 -= d - 1;
                                    }
                                    int z_col = j + d * i;
                                    dtype_%(z)s* curr_z = (dtype_%(z)s*) PyArray_GETPTR2(%(z)s, z_row, z_col);
                                    if (ten4_3 < 0 | ten4_3 >= width) {
                                        *curr_z = 0;
                                    } else {
                                        *curr_z = *( (dtype_%(ten4)s*) PyArray_GETPTR4(%(ten4)s, n, s, ten4_2, ten4_3));
                                    }
                                }
                            }
                        }
                    }
        } // END NESTED SCOPE
        """ % dict(
            ten4=inp[0],
            neib_shape=inp[1],
            neib_step=inp[2],
            z=out[0],
            fail=sub["fail"],
            mode=sub["params"],
        )
def images2neibs(ten4, neib_shape, neib_step=None, mode="valid"):
    r"""
    Function :func:`images2neibs`
    allows to apply a sliding window operation to a tensor containing
    images or other two-dimensional objects.
    The sliding window operation loops over points in input data and stores
    a rectangular neighbourhood of each point.
    It is possible to assign a step of selecting patches (parameter `neib_step`).
    Parameters
    ----------
    ten4 : A 4d tensor-like
        A 4-dimensional tensor which represents a list of lists of images.
        It should have shape (list 1 dim, list 2 dim, row, col). The first
        two dimensions can be useful to store different channels and batches.
    neib_shape : A 1d tensor-like of 2 values
        A tuple containing two values: height and width of the neighbourhood.
        It should have shape (r,c) where r is the height of the neighborhood
        in rows and c is the width of the neighborhood in columns.
    neib_step : A 1d tensor-like of 2 values
        (dr,dc) where dr is the number of rows to skip between patch and dc is
        the number of columns. The parameter should be a tuple of two elements:
        number of rows and number of columns to skip each iteration.
        Basically, when the step is 1, the neighbourhood of every first element
        is taken and every possible rectangular subset is returned.
        By default it is equal to `neib_shape` in other words, the patches are
        disjoint. When the step is greater than `neib_shape`, some elements are
        omitted. When None, this is the same as neib_shape (patch are disjoint).
    mode : {'valid', 'ignore_borders', 'wrap_centered', 'half'}
        ``valid``
            Requires an input that is a multiple of the
            pooling factor (in each direction).
        ``half``
            Equivalent to 'valid' if we pre-pad with zeros the input on
            each side by (neib_shape[0]//2, neib_shape[1]//2)
        ``full``
            Equivalent to 'valid' if we pre-pad with zeros the input on
            each side by (neib_shape[0] - 1, neib_shape[1] - 1)
        ``ignore_borders``
            Same as valid, but will ignore the borders if the shape(s) of
            the input is not a multiple of the pooling factor(s).
        ``wrap_centered``
            Patches are centered on the step grid and wrap around at the
            image borders; requires odd patch shapes.
    Returns
    -------
    object
        Reshapes the input as a 2D tensor where each row is a
        pooling example. Pseudo-code of the output:
        .. code-block:: python
            idx = 0
            for i in range(list 1 dim):
                for j in range(list 2 dim):
                    for k in <image column coordinates>:
                        for l in <image row coordinates>:
                            output[idx,:]
                                 = flattened version of ten4[i,j,l:l+r,k:k+c]
                            idx += 1
    .. note:: The operation isn't necessarily implemented internally with
        these for loops, they're just the easiest way to describe the
        output pattern.
    Notes
    -----
    .. note::
        Currently the step size should be chosen in the way that the
        corresponding dimension :math:`i` (width or height) is equal
        to :math:`n * step\_size_i + neib\_shape_i` for some :math:`n`.
    Examples
    --------
    .. code-block:: python
        # Defining variables
        images = pytensor.tensor.type.tensor4('images')
        neibs = images2neibs(images, neib_shape=(5, 5))
        # Constructing pytensor function
        window_function = pytensor.function([images], neibs)
        # Input tensor (one image 10x10)
        im_val = np.arange(100.).reshape((1, 1, 10, 10))
        # Function application
        neibs_val = window_function(im_val)
    .. note:: The underlying code will construct a 2D tensor of disjoint
        patches 5x5. The output has shape 4x25.
    """
    return Images2Neibs(mode)(ten4, neib_shape, neib_step)
def neibs2images(neibs, neib_shape, original_shape, mode="valid"):
    """
    Function :func:`neibs2images`
    performs the inverse operation of
    :func:`images2neibs`. It inputs
    the output of :func:`images2neibs`
    and reconstructs its input.
    Parameters
    ----------
    neibs : 2d tensor
        Like the one obtained by
        :func:`images2neibs`.
    neib_shape
        `neib_shape` that was used in
        :func:`images2neibs`.
    original_shape
        Original shape of the 4d tensor given to
        :func:`images2neibs`
    Returns
    -------
    object
        Reconstructs the input of
        :func:`images2neibs`,
        a 4d tensor of shape `original_shape`.
    Notes
    -----
    Currently, the function doesn't support tensors created with
    `neib_step` different from default value. This means that it may be
    impossible to compute the gradient of a variable gained by
    :func:`images2neibs` w.r.t.
    its inputs in this case, because it uses
    :func:`images2neibs` for
    gradient computation.
    Examples
    --------
    Example, which uses a tensor gained in example for
    :func:`images2neibs`:
    .. code-block:: python
        im_new = neibs2images(neibs, (5, 5), im_val.shape)
        # PyTensor function definition
        inv_window = pytensor.function([neibs], im_new)
        # Function application
        im_new_val = inv_window(neibs_val)
    .. note:: The code will output the initial image array.
    """
    neibs = as_tensor_variable(neibs)
    neib_shape = as_tensor_variable(neib_shape)
    original_shape = as_tensor_variable(original_shape)
    # Treat each row of `neibs` as one image row split into patches of
    # width neib_shape[1].
    new_neib_shape = stack([original_shape[-1] // neib_shape[1], neib_shape[1]])
    output_2d = images2neibs(
        neibs.dimshuffle("x", "x", 0, 1), new_neib_shape, mode=mode
    )
    if mode == "ignore_borders":
        # Use set_subtensor so that an `original_shape` whose static shape
        # cannot be inferred is still accepted, while a wrong runtime shape
        # still raises an error.
        valid_shape = original_shape
        valid_shape = set_subtensor(
            valid_shape[2], (valid_shape[2] // neib_shape[0]) * neib_shape[0]
        )
        valid_shape = set_subtensor(
            valid_shape[3], (valid_shape[3] // neib_shape[1]) * neib_shape[1]
        )
        output_4d = output_2d.reshape(valid_shape, ndim=4)
        # padding the borders with zeros
        for d in (2, 3):
            pad_shape = list(output_4d.shape)
            pad_shape[d] = original_shape[d] - valid_shape[d]
            output_4d = concatenate([output_4d, zeros(pad_shape)], axis=d)
    elif mode == "valid":
        # TODO: we do not implement all mode with this code.
        # Add a check for the good cases.
        output_4d = output_2d.reshape(original_shape, ndim=4)
    else:
        raise NotImplementedError(f"neibs2images do not support mode={mode}")
    return output_4d
import warnings

# Deprecation shim: keep `pytensor.tensor.nnet.opt` importable by warning and
# re-exporting everything from the renamed `rewriting` module.
warnings.warn(
    "The module `pytensor.tensor.nnet.opt` is deprecated; use `pytensor.tensor.nnet.rewriting` instead.",
    DeprecationWarning,
    stacklevel=2,
)
from pytensor.tensor.nnet.rewriting import *  # noqa: F401 E402 F403
"""
Optimizations addressing the ops in nnet root directory
"""
import pytensor
from pytensor import compile
from pytensor.compile import optdb
from pytensor.configdefaults import config
from pytensor.graph.rewriting.basic import (
MetaNodeRewriterSkip,
WalkingGraphRewriter,
copy_stack_trace,
in2out,
node_rewriter,
)
from pytensor.tensor.nnet.abstract_conv import (
AbstractConv2d,
AbstractConv2d_gradInputs,
AbstractConv2d_gradWeights,
AbstractConv3d,
AbstractConv3d_gradInputs,
AbstractConv3d_gradWeights,
get_conv_output_shape,
)
from pytensor.tensor.nnet.blocksparse import (
SparseBlockGemv,
SparseBlockOuter,
sparse_block_gemv_inplace,
sparse_block_outer_inplace,
)
# Cpu implementation
from pytensor.tensor.nnet.conv import ConvOp, conv2d
from pytensor.tensor.nnet.corr import CorrMM, CorrMM_gradInputs, CorrMM_gradWeights
from pytensor.tensor.nnet.corr3d import (
Corr3dMM,
Corr3dMMGradInputs,
Corr3dMMGradWeights,
)
from pytensor.tensor.rewriting.basic import register_specialize_device
from pytensor.tensor.type import TensorType
@node_rewriter([SparseBlockGemv], inplace=True)
def local_inplace_sparse_block_gemv(fgraph, node):
    """
    SparseBlockGemv(inplace=False) -> SparseBlockGemv(inplace=True)
    """
    op = node.op
    if not isinstance(op, SparseBlockGemv) or op.inplace:
        return False
    replacement = sparse_block_gemv_inplace(*node.inputs)
    copy_stack_trace(node.outputs[0], replacement)
    return [replacement]
# Register the in-place SparseBlockGemv rewrite in the rewrite database with
# the "fast_run"/"inplace" tags at position 60.
compile.optdb.register(
    "local_inplace_sparse_block_gemv",
    WalkingGraphRewriter(
        local_inplace_sparse_block_gemv,
        failure_callback=WalkingGraphRewriter.warn_inplace,
    ),
    "fast_run",
    "inplace",
    position=60,
)
@node_rewriter([SparseBlockOuter], inplace=True)
def local_inplace_sparse_block_outer(fgraph, node):
    """
    SparseBlockOuter(inplace=False) -> SparseBlockOuter(inplace=True)
    """
    op = node.op
    if not isinstance(op, SparseBlockOuter) or op.inplace:
        return False
    replacement = sparse_block_outer_inplace(*node.inputs)
    copy_stack_trace(node.outputs[0], replacement)
    return [replacement]
# Register the in-place SparseBlockOuter rewrite in the rewrite database with
# the "fast_run"/"inplace" tags at position 60.
compile.optdb.register(
    "local_inplace_sparse_block_outer",
    WalkingGraphRewriter(
        local_inplace_sparse_block_outer,
        failure_callback=WalkingGraphRewriter.warn_inplace,
    ),
    "fast_run",
    "inplace",
    position=60,
)
# Conv opts
@node_rewriter([AbstractConv2d])
def local_abstractconv_gemm(fgraph, node):
    """Lower an AbstractConv2d node to the GEMM-based CorrMM implementation."""
    # If config.blas__ldflags is empty, PyTensor will use
    # a NumPy C implementation of [sd]gemm_.
    if config.cxx == "" or node.inputs[0].dtype == "float16":
        return
    op = node.op
    if not isinstance(op, AbstractConv2d):
        return None
    img, kern = node.inputs
    if not (isinstance(img.type, TensorType) and isinstance(kern.type, TensorType)):
        return None
    if op.filter_flip:
        # Reverse the last two kernel axes when the abstract op asks for
        # flipped filters.
        reverse_last_two = (slice(None),) * (kern.ndim - 2) + (
            slice(None, None, -1),
        ) * 2
        kern = kern[reverse_last_two]
    corr_out = CorrMM(
        border_mode=op.border_mode,
        subsample=op.subsample,
        filter_dilation=op.filter_dilation,
        num_groups=op.num_groups,
        unshared=op.unshared,
    )(img, kern)
    copy_stack_trace(node.outputs[0], corr_out)
    return [corr_out]
@node_rewriter([AbstractConv3d])
def local_abstractconv3d_gemm(fgraph, node):
    """Lower an ``AbstractConv3d`` to the GEMM-based ``Corr3dMM`` op."""
    op = node.op
    # Without a C++ compiler PyTensor will use a NumPy C implementation of
    # [sd]gemm_; also bail out on float16 inputs.
    if config.cxx == "" or node.inputs[0].dtype == "float16":
        return
    if not isinstance(op, AbstractConv3d):
        return None
    img, kern = node.inputs
    if not (isinstance(img.type, TensorType) and isinstance(kern.type, TensorType)):
        return None
    if op.filter_flip:
        # Reverse the three spatial axes of the kernel, since the abstract
        # op uses flipped filters.
        kern = kern[:, :, ::-1, ::-1, ::-1]
    new_out = Corr3dMM(
        border_mode=op.border_mode,
        subsample=op.subsample,
        filter_dilation=op.filter_dilation,
        num_groups=op.num_groups,
    )(img, kern)
    copy_stack_trace(node.outputs[0], new_out)
    return [new_out]
@node_rewriter([AbstractConv2d_gradWeights])
def local_abstractconv_gradweight_gemm(fgraph, node):
    """Lower ``AbstractConv2d_gradWeights`` to ``CorrMM_gradWeights``."""
    op = node.op
    # Without a C++ compiler PyTensor will use a NumPy C implementation of
    # [sd]gemm_; also bail out on float16 inputs.
    if config.cxx == "" or node.inputs[0].dtype == "float16":
        return
    if not isinstance(op, AbstractConv2d_gradWeights):
        return None
    img, topgrad, shape = node.inputs
    if not (isinstance(img.type, TensorType) and isinstance(topgrad.type, TensorType)):
        return None
    grad_w = CorrMM_gradWeights(
        border_mode=op.border_mode,
        subsample=op.subsample,
        filter_dilation=op.filter_dilation,
        num_groups=op.num_groups,
        unshared=op.unshared,
    )(img, topgrad, shape)
    copy_stack_trace(node.outputs[0], grad_w)
    if op.filter_flip:
        # Reverse the two spatial axes of the computed weight gradient when
        # the abstract op uses flipped filters.
        grad_w = grad_w[
            (slice(None),) * (grad_w.ndim - 2) + (slice(None, None, -1),) * 2
        ]
        copy_stack_trace(node.outputs[0], grad_w)
    return [grad_w]
@node_rewriter([AbstractConv3d_gradWeights])
def local_abstractconv3d_gradweight_gemm(fgraph, node):
    """Lower ``AbstractConv3d_gradWeights`` to ``Corr3dMMGradWeights``."""
    op = node.op
    # Without a C++ compiler PyTensor will use a NumPy C implementation of
    # [sd]gemm_; also bail out on float16 inputs.
    if config.cxx == "" or node.inputs[0].dtype == "float16":
        return
    if not isinstance(op, AbstractConv3d_gradWeights):
        return None
    img, topgrad, shape = node.inputs
    if not (isinstance(img.type, TensorType) and isinstance(topgrad.type, TensorType)):
        return None
    grad_w = Corr3dMMGradWeights(
        border_mode=op.border_mode,
        subsample=op.subsample,
        filter_dilation=op.filter_dilation,
        num_groups=op.num_groups,
    )(img, topgrad, shape)
    copy_stack_trace(node.outputs[0], grad_w)
    if op.filter_flip:
        # Reverse the three spatial axes of the computed weight gradient when
        # the abstract op uses flipped filters.
        grad_w = grad_w[:, :, ::-1, ::-1, ::-1]
        copy_stack_trace(node.outputs[0], grad_w)
    return [grad_w]
@node_rewriter([AbstractConv2d_gradInputs])
def local_abstractconv_gradinputs_gemm(fgraph, node):
    """Lower ``AbstractConv2d_gradInputs`` to ``CorrMM_gradInputs``."""
    op = node.op
    # Without a C++ compiler PyTensor will use a NumPy C implementation of
    # [sd]gemm_; also bail out on float16 inputs.
    if config.cxx == "" or node.inputs[0].dtype == "float16":
        return
    if not isinstance(op, AbstractConv2d_gradInputs):
        return None
    kern, topgrad, shape = node.inputs
    if not (isinstance(kern.type, TensorType) and isinstance(topgrad.type, TensorType)):
        return None
    if op.filter_flip:
        # Reverse the last two (spatial) axes of the kernel, since the
        # abstract op uses flipped filters.
        kern = kern[(slice(None),) * (kern.ndim - 2) + (slice(None, None, -1),) * 2]
    grad_in = CorrMM_gradInputs(
        border_mode=op.border_mode,
        subsample=op.subsample,
        filter_dilation=op.filter_dilation,
        num_groups=op.num_groups,
        unshared=op.unshared,
    )(kern, topgrad, shape)
    copy_stack_trace(node.outputs[0], grad_in)
    return [grad_in]
@node_rewriter([AbstractConv3d_gradInputs])
def local_abstractconv3d_gradinputs_gemm(fgraph, node):
    """Lower ``AbstractConv3d_gradInputs`` to ``Corr3dMMGradInputs``."""
    op = node.op
    # Without a C++ compiler PyTensor will use a NumPy C implementation of
    # [sd]gemm_; also bail out on float16 inputs.
    if config.cxx == "" or node.inputs[0].dtype == "float16":
        return
    if not isinstance(op, AbstractConv3d_gradInputs):
        return None
    kern, topgrad, shape = node.inputs
    if not (isinstance(kern.type, TensorType) and isinstance(topgrad.type, TensorType)):
        return None
    if op.filter_flip:
        # Reverse the three spatial axes of the kernel, since the abstract
        # op uses flipped filters.
        kern = kern[:, :, ::-1, ::-1, ::-1]
    grad_in = Corr3dMMGradInputs(
        border_mode=op.border_mode,
        subsample=op.subsample,
        filter_dilation=op.filter_dilation,
        num_groups=op.num_groups,
    )(kern, topgrad, shape)
    copy_stack_trace(node.outputs[0], grad_in)
    return [grad_in]
@node_rewriter([AbstractConv2d])
def local_conv2d_cpu(fgraph, node):
    """Lower an ``AbstractConv2d`` to the legacy CPU ``ConvOp`` via ``conv2d``."""
    op = node.op
    if not isinstance(op, AbstractConv2d) or node.inputs[0].dtype == "float16":
        return None
    img, kern = node.inputs
    if not (isinstance(img.type, TensorType) and isinstance(kern.type, TensorType)):
        return None
    # The legacy path only handles full/valid modes, flipped filters
    # (non-flipped is not tested yet), a single shared filter group, and
    # no filter dilation.
    if (
        op.border_mode not in ("full", "valid")
        or not op.filter_flip
        or op.num_groups > 1
        or op.unshared
        or op.filter_dilation != (1, 1)
    ):
        return None
    new_out = conv2d(
        img,
        kern,
        op.imshp,
        op.kshp,
        border_mode=op.border_mode,
        subsample=op.subsample,
    )
    copy_stack_trace(node.outputs[0], new_out)
    return [new_out]
@node_rewriter([AbstractConv2d_gradWeights])
def local_conv2d_gradweight_cpu(fgraph, node):
    """
    Lower ``AbstractConv2d_gradWeights`` to the legacy CPU ``ConvOp``.

    The weight gradient is expressed as a forward valid-mode ``ConvOp``
    over dimshuffled image / output-gradient tensors (the classic
    "bprop weights" formulation).  Returns ``None`` whenever the legacy
    op cannot handle the requested configuration.
    """
    if (
        not isinstance(node.op, AbstractConv2d_gradWeights)
        or node.inputs[0].dtype == "float16"
    ):
        return None
    img, topgrad, shape = node.inputs
    if not isinstance(img.type, TensorType) or not isinstance(topgrad.type, TensorType):
        return None
    if node.op.border_mode not in ("full", "valid"):
        return None
    if not node.op.filter_flip:
        # Not tested yet
        return
    if node.op.num_groups > 1 or node.op.unshared:
        return None
    if node.op.border_mode == "valid" and (node.op.subsample != (1, 1)):
        return None
    dx, dy = node.op.subsample
    if dx not in (1, 2) or dy not in (1, 2):
        # Not implemented in the gradient of ConvOp
        return None
    # Missing static shape info is treated as "all four dims unknown".
    if node.op.imshp is None:
        op_imshp = (None, None, None, None)
    else:
        op_imshp = node.op.imshp
    if node.op.kshp is None:
        op_kshp = (None, None, None, None)
    else:
        op_kshp = node.op.kshp
    if None in op_imshp or None in op_kshp:
        if (dx, dy) != (1, 1):
            # We cannot infer the shapes
            return None
    # Determine gradient on kernels
    assert len(op_imshp) == 4 and len(op_kshp) == 4
    # Spatial output shape with the actual subsampling/dilation ...
    outshp = get_conv_output_shape(
        op_imshp,
        op_kshp,
        node.op.border_mode,
        node.op.subsample,
        node.op.filter_dilation,
    )[2:]
    # ... and without subsampling, used for the "logical" shapes below.
    fulloutshp = get_conv_output_shape(op_imshp, op_kshp, node.op.border_mode, (1, 1))[
        2:
    ]
    # Swap batch and channel axes so that the batch dimension plays the
    # role of input channels in the forward ConvOp computing the gradient.
    newimg = img.dimshuffle((1, 0, 2, 3))
    newtopgrad = topgrad.dimshuffle((1, 0, 2, 3))
    if node.op.border_mode == "valid":
        # valid mode: correlate the image with the output gradient.
        (img, filters) = (newimg, newtopgrad)
        kshp_logical = fulloutshp
        kshp_logical_top_aligned = False
        imshp_logical = None
        (bsize, nkern) = (op_imshp[1], op_kshp[0])
        imshp = (op_imshp[0], op_imshp[2], op_imshp[3])
        kshp = outshp
    elif node.op.border_mode == "full":
        # full mode: the roles of image and output gradient are swapped.
        (img, filters) = (newtopgrad, newimg)
        kshp_logical = None
        kshp_logical_top_aligned = True
        imshp_logical = (op_imshp[0], fulloutshp[0], fulloutshp[1])
        (bsize, nkern) = (op_kshp[0], op_imshp[1])
        imshp = (op_imshp[0], outshp[0], outshp[1])
        kshp = op_imshp[2:]
    else:
        raise NotImplementedError("Only [full,valid] modes are currently supported.")
    # Flip the kernels
    filters = filters[:, :, ::-1, ::-1]
    dw = ConvOp(
        imshp,
        kshp,
        nkern,
        bsize,
        1,
        1,
        output_mode="valid",
        unroll_batch=None,
        unroll_kern=None,
        unroll_patch=None,
        imshp_logical=imshp_logical,
        kshp_logical=kshp_logical,
        kshp_logical_top_aligned=kshp_logical_top_aligned,
        direction_hint="bprop weights",
    )
    res = dw(img, filters)
    copy_stack_trace(node.outputs[0], res)
    if node.op.border_mode == "valid":
        # Restore the (out_channels, in_channels, h, w) layout and undo
        # the spatial flip applied above.
        res = res.dimshuffle((1, 0, 2, 3))
        res = res[:, :, ::-1, ::-1]
        copy_stack_trace(node.outputs[0], res)
    return [res]
@node_rewriter([AbstractConv2d_gradInputs])
def local_conv2d_gradinputs_cpu(fgraph, node):
    """
    Lower ``AbstractConv2d_gradInputs`` to the legacy CPU ``ConvOp``.

    The input gradient is computed by convolving the output gradient with
    dimshuffled, spatially flipped filters, using the opposite border mode
    ("bprop inputs").  Returns ``None``/``False`` when the legacy op
    cannot handle the requested configuration.
    """
    if (
        not isinstance(node.op, AbstractConv2d_gradInputs)
        or node.inputs[0].dtype == "float16"
    ):
        return None
    kern, topgrad, shape = node.inputs
    if not isinstance(kern.type, TensorType) or not isinstance(
        topgrad.type, TensorType
    ):
        return None
    if node.op.border_mode not in ("full", "valid"):
        return None
    if not node.op.filter_flip:
        # Not tested yet
        return None
    if node.op.num_groups > 1 or node.op.unshared:
        return None
    # Conv 3d implementation, needed when subsample > 2
    if node.op.border_mode == "valid" and node.op.subsample != (1, 1):
        # The op don't support that anymore.
        return False
    # Conv2d Implementation
    dx, dy = node.op.subsample
    if dx not in (1, 2) or dy not in (1, 2):
        # Not implemented in the gradient of ConvOp
        return None
    # Missing static shape info is treated as "all four dims unknown".
    if node.op.imshp is None:
        op_imshp = (None, None, None, None)
    else:
        op_imshp = node.op.imshp
    if node.op.kshp is None:
        op_kshp = (None, None, None, None)
    else:
        op_kshp = node.op.kshp
    if None in op_imshp or None in op_kshp:
        if (dx, dy) != (1, 1):
            # We cannot infer the shapes
            return None
    # The backward pass uses the opposite border mode: the gradient of a
    # "valid" convolution is a "full" one and vice versa.
    mode = "valid"
    if node.op.border_mode != "full":
        mode = "full"
    # Swap in/out channel axes of the filters and flip them spatially.
    filters = kern.dimshuffle((1, 0, 2, 3))
    filters = filters[:, :, ::-1, ::-1]
    # Spatial output shape with the actual subsampling/dilation ...
    outshp = get_conv_output_shape(
        op_imshp,
        op_kshp,
        node.op.border_mode,
        node.op.subsample,
        node.op.filter_dilation,
    )[2:]
    # ... and without subsampling, used for the "logical" image shape.
    fulloutshp = get_conv_output_shape(op_imshp, op_kshp, node.op.border_mode, (1, 1))[
        2:
    ]
    nkern = op_imshp[1]
    imshp = (op_kshp[0], outshp[0], outshp[1])
    imshp_logical = (op_kshp[0], fulloutshp[0], fulloutshp[1])
    din = ConvOp(
        imshp,
        op_kshp[2:],
        nkern,
        op_imshp[0],
        1,
        1,
        output_mode=mode,
        unroll_batch=None,
        unroll_kern=None,
        unroll_patch=None,
        imshp_logical=imshp_logical,
        kshp_logical=None,
        version=-1,
        direction_hint="bprop inputs",
    )
    din = din(topgrad, filters)
    copy_stack_trace(node.outputs[0], din)
    return [din]
# Register Cpu Optimization
# All CPU conv lowerings live in one local group DB that is activated for
# both fast_compile and fast_run via register_specialize_device.
conv_groupopt = pytensor.graph.rewriting.db.LocalGroupDB()
conv_groupopt.__name__ = "conv_opts"
register_specialize_device(conv_groupopt, "fast_compile", "fast_run")

# GEMM-based convolution
# It can be disabled by excluding 'conv_gemm'.
# The GEMM lowerings are registered at position=30, before the legacy
# ConvOp lowerings at position=40, which therefore act as a fallback.
conv_groupopt.register(
    "local_abstractconv_gemm",
    local_abstractconv_gemm,
    "conv_gemm",
    "fast_compile",
    "fast_run",
    position=30,
)
conv_groupopt.register(
    "local_abstractconv_gradweight_gemm",
    local_abstractconv_gradweight_gemm,
    "conv_gemm",
    "fast_compile",
    "fast_run",
    position=30,
)
conv_groupopt.register(
    "local_abstractconv_gradinputs_gemm",
    local_abstractconv_gradinputs_gemm,
    "conv_gemm",
    "fast_compile",
    "fast_run",
    position=30,
)
conv_groupopt.register(
    "local_abstractconv3d_gemm",
    local_abstractconv3d_gemm,
    "conv_gemm",
    "fast_compile",
    "fast_run",
    position=30,
)
conv_groupopt.register(
    "local_abstractconv3d_gradweight_gemm",
    local_abstractconv3d_gradweight_gemm,
    "conv_gemm",
    "fast_compile",
    "fast_run",
    position=30,
)
conv_groupopt.register(
    "local_abstractconv3d_gradinputs_gemm",
    local_abstractconv3d_gradinputs_gemm,
    "conv_gemm",
    "fast_compile",
    "fast_run",
    position=30,
)

# Legacy convolution
# Fallback lowerings to the old ConvOp (2D only), tried after the GEMM ones.
conv_groupopt.register(
    "local_conv2d_cpu", local_conv2d_cpu, "fast_compile", "fast_run", position=40
)
conv_groupopt.register(
    "local_conv2d_gradweight_cpu",
    local_conv2d_gradweight_cpu,
    "fast_compile",
    "fast_run",
    position=40,
)
conv_groupopt.register(
    "local_conv2d_gradinputs_cpu",
    local_conv2d_gradinputs_cpu,
    "fast_compile",
    "fast_run",
    position=40,
)
# Verify that no AbstractConv are present in the graph
@node_rewriter(
    [
        AbstractConv2d,
        AbstractConv2d_gradWeights,
        AbstractConv2d_gradInputs,
        AbstractConv3d,
        AbstractConv3d_gradWeights,
        AbstractConv3d_gradInputs,
    ]
)
def local_abstractconv_check(fgraph, node):
    """Raise if any abstract convolution op is still present in the graph.

    Every abstract conv should have been replaced by a concrete
    implementation by the time this runs; a surviving one means no
    lowering supported the requested options.
    """
    abstract_conv_ops = (
        AbstractConv2d,
        AbstractConv2d_gradWeights,
        AbstractConv2d_gradInputs,
        AbstractConv3d,
        AbstractConv3d_gradWeights,
        AbstractConv3d_gradInputs,
    )
    if isinstance(node.op, abstract_conv_ops):
        raise MetaNodeRewriterSkip(
            f"{node.op.__class__.__name__} PyTensor rewriting failed: there is no implementation "
            "available supporting the requested options. If on CPU, "
            "do you have a BLAS library installed PyTensor can link against? "
            "On the CPU we do not support float16."
        )
# Registered at position=48.7 so the check runs after the conv lowering
# rewrites registered above; if an abstract conv op survives until here,
# local_abstractconv_check raises with an actionable error message.
optdb.register(
    "AbstractConvCheck",
    in2out(local_abstractconv_check, name="AbstractConvCheck"),
    "fast_compile",
    "fast_run",
    position=48.7,
)
"""
These functions implement special cases of exp and log to improve numerical
stability.
"""
import pytensor
from pytensor import printing
from pytensor import scalar as aes
from pytensor.graph.rewriting.basic import copy_stack_trace, node_rewriter
from pytensor.printing import pprint
from pytensor.scalar import sigmoid as scalar_sigmoid
from pytensor.scalar.math import Sigmoid
from pytensor.tensor.basic import constant
from pytensor.tensor.elemwise import Elemwise
from pytensor.tensor.math import clip, sigmoid
from pytensor.tensor.type import TensorType
class UltraFastScalarSigmoid(aes.UnaryScalarOp):
    """
    Scalar sigmoid computed with a cheap piecewise approximation of tanh.

    This is just speed opt. Not for stability.
    """

    # NumPy-mode evaluation uses the exact scipy.special.expit.
    nfunc_spec = ("scipy.special.expit", 1, 1)

    @staticmethod
    def st_impl(x):
        """Pure-Python reference implementation (mirrors the C code below)."""
        # sigmoid(x) = 0.5 * (tanh(x / 2) + 1), with tanh approximated
        # piecewise on [0, 1.7), [1.7, 3) and [3, inf).
        x = 0.5 * x
        # The if is a tanh approximate.
        if x >= 0:
            if x < 1.7:
                z = 1.5 * x / (1 + x)
            elif x < 3:
                z = 0.935409070603099 + 0.0458812946797165 * (x - 1.7)
            else:
                z = 0.99505475368673
        else:
            # tanh is odd: evaluate on -x and negate the result.
            xx = -x
            if xx < 1.7:
                z = 1.5 * xx / (1 + xx)
            elif xx < 3:
                z = 0.935409070603099 + 0.0458812946797165 * (xx - 1.7)
            else:
                z = 0.99505475368673
            z = -z
        return 0.5 * (z + 1.0)

    def impl(self, x):
        """Scalar Python implementation; delegates to :meth:`st_impl`."""
        return UltraFastScalarSigmoid.st_impl(x)

    def c_code(self, node, name, inp, out, sub):
        """Emit C code computing the same piecewise approximation as st_impl."""
        (x,) = inp
        (z,) = out
        # dtype_specs()[1] gives the C type name of the output.
        dtype = node.outputs[0].type.dtype_specs()[1]

        return (
            """{
        %(dtype)s x = 0.5 * %(x)s;
   // The if is a tanh approximate.
   if(x>=0) {
        %(z)s = (x<1.7 ? (1.5*x/(1+x)) :
                   (x<3 ? (0.935409070603099 + 0.0458812946797165*(x-1.7)):
                   0.99505475368673));
    } else {
        %(dtype)s xx = -x;
        %(z)s = -(xx<1.7 ? (1.5*xx/(1+xx)) :
                   (xx<3 ? (0.935409070603099 + 0.0458812946797165*(xx-1.7)):
                   0.99505475368673));
    }
        //%(z)s = 0.5*(ultrafasttanh(0.5*x)+1.);
        %(z)s = 0.5*(%(z)s+1.);
        }"""
            % locals()
        )

    @staticmethod
    def c_code_cache_version():
        # Bump when the generated C code changes.
        return (5,)
# Scalar op instance; integer inputs are upcast to float.
ultra_fast_scalar_sigmoid = UltraFastScalarSigmoid(
    aes.upgrade_to_float, name="ultra_fast_scalar_sigmoid"
)
# Elementwise wrapper used by the local_ultra_fast_sigmoid rewrite.
ultra_fast_sigmoid = Elemwise(ultra_fast_scalar_sigmoid, name="ultra_fast_sigmoid")
# In-place variant: output 0 overwrites input 0 (inplace_pattern={0: 0}).
ultra_fast_sigmoid_inplace = Elemwise(
    UltraFastScalarSigmoid(aes.transfer_type(0)),
    inplace_pattern={0: 0},
    name="ultra_fast_sigmoid_inplace",
)

# Pretty-print the op under its own name.
pprint.assign(ultra_fast_sigmoid, printing.FunctionPrinter(["ultra_fast_sigmoid"]))
@node_rewriter(None)
def local_ultra_fast_sigmoid(fgraph, node):
    """
    When enabled, replace every sigmoid with ultra_fast_sigmoid.

    Enable it with ``mode.including('local_ultra_fast_sigmoid')`` or the
    PyTensor flag ``optimizer_including=local_ultra_fast_sigmoid``.  This
    speeds up the sigmoid op with an approximation, and is applied after
    the stabilization and specialize phases to avoid interacting with them.
    """
    if not (isinstance(node.op, Elemwise) and isinstance(node.op.scalar_op, Sigmoid)):
        return
    replacement_op = (
        ultra_fast_sigmoid_inplace if node.op.inplace_pattern else ultra_fast_sigmoid
    )
    out = replacement_op(node.inputs[0])
    copy_stack_trace(node.outputs[0], out)

    def values_eq_approx_remove_low_prec(a, b):
        # atol found by trial and error; stricter tolerances would make
        # other tests fail without good reason.
        return TensorType.values_eq_approx(a, b, atol=0.02)

    # Let DebugMode know that this rewrite only approximates the values.
    out.tag.values_eq_approx = values_eq_approx_remove_low_prec
    return [out]
# Registered in the late "uncanonicalize" phase without any speed-mode tags,
# so it only runs when explicitly included (see the rewrite's docstring).
pytensor.compile.optdb["uncanonicalize"].register(
    "local_ultra_fast_sigmoid", local_ultra_fast_sigmoid
)
def hard_sigmoid(x):
    """
    A very coarse, fast approximation of the sigmoid.

    More approximate and faster than ``ultra_fast_sigmoid``: the output is
    piecewise in three parts — 0, a scaled linear segment, and 1.
    Removing the slope and shift does not make it faster.
    """
    # Perform the arithmetic in the dtype that "upgrade_to_float" would
    # select for x's dtype.
    out_dtype = aes.upgrade_to_float(aes.ScalarType(dtype=x.dtype))[0].dtype
    slope = constant(0.2, dtype=out_dtype)
    shift = constant(0.5, dtype=out_dtype)
    linear = x * slope + shift
    return clip(linear, 0, 1)
@node_rewriter([sigmoid])
def local_hard_sigmoid(fgraph, node):
    """Replace ``sigmoid`` with the coarse piecewise-linear ``hard_sigmoid``."""
    if not (isinstance(node.op, Elemwise) and node.op.scalar_op == scalar_sigmoid):
        return
    out = hard_sigmoid(node.inputs[0])
    copy_stack_trace(node.outputs[0], out)

    def values_eq_approx_remove_low_prec(a, b):
        # atol found by trial and error; stricter tolerances would make
        # other tests fail without good reason.
        return TensorType.values_eq_approx(a, b, atol=0.1)

    # Let DebugMode know that this rewrite only approximates the values.
    out.tag.values_eq_approx = values_eq_approx_remove_low_prec
    return [out]
# Like local_ultra_fast_sigmoid: registered in the late "uncanonicalize"
# phase without speed-mode tags, so it is opt-in only.
pytensor.compile.optdb["uncanonicalize"].register(
    "local_hard_sigmoid", local_hard_sigmoid
)
...@@ -13,7 +13,7 @@ from pytensor.scalar import int64 ...@@ -13,7 +13,7 @@ from pytensor.scalar import int64
from pytensor.tensor import blas_headers from pytensor.tensor import blas_headers
from pytensor.tensor.basic import as_tensor_variable from pytensor.tensor.basic import as_tensor_variable
from pytensor.tensor.blas import blas_header_version, ldflags from pytensor.tensor.blas import blas_header_version, ldflags
from pytensor.tensor.nnet.abstract_conv import get_conv_output_shape from pytensor.tensor.conv.abstract_conv import get_conv_output_shape
from pytensor.tensor.type import TensorType from pytensor.tensor.type import TensorType
......
...@@ -13,7 +13,7 @@ from pytensor.scalar import int8, int64 ...@@ -13,7 +13,7 @@ from pytensor.scalar import int8, int64
from pytensor.tensor import blas_headers from pytensor.tensor import blas_headers
from pytensor.tensor.basic import as_tensor_variable from pytensor.tensor.basic import as_tensor_variable
from pytensor.tensor.blas import blas_header_version, ldflags from pytensor.tensor.blas import blas_header_version, ldflags
from pytensor.tensor.nnet.abstract_conv import get_conv_output_shape from pytensor.tensor.conv.abstract_conv import get_conv_output_shape
from pytensor.tensor.type import TensorType from pytensor.tensor.type import TensorType
......
...@@ -6,9 +6,8 @@ import pytensor.tensor as at ...@@ -6,9 +6,8 @@ import pytensor.tensor as at
from pytensor.compile.mode import Mode from pytensor.compile.mode import Mode
from pytensor.configdefaults import config from pytensor.configdefaults import config
from pytensor.graph.rewriting.basic import check_stack_trace from pytensor.graph.rewriting.basic import check_stack_trace
from pytensor.tensor.nnet import abstract_conv as conv from pytensor.tensor.conv import abstract_conv
from pytensor.tensor.nnet import conv2d_transpose, corr, corr3d from pytensor.tensor.conv.abstract_conv import (
from pytensor.tensor.nnet.abstract_conv import (
AbstractConv2d, AbstractConv2d,
AbstractConv2d_gradInputs, AbstractConv2d_gradInputs,
AbstractConv2d_gradWeights, AbstractConv2d_gradWeights,
...@@ -19,18 +18,13 @@ from pytensor.tensor.nnet.abstract_conv import ( ...@@ -19,18 +18,13 @@ from pytensor.tensor.nnet.abstract_conv import (
bilinear_upsampling, bilinear_upsampling,
causal_conv1d, causal_conv1d,
check_conv_gradinputs_shape, check_conv_gradinputs_shape,
conv2d_transpose,
get_conv_gradinputs_shape, get_conv_gradinputs_shape,
get_conv_gradweights_shape, get_conv_gradweights_shape,
get_conv_output_shape, get_conv_output_shape,
separable_conv2d, separable_conv2d,
separable_conv3d, separable_conv3d,
) )
from pytensor.tensor.nnet.corr import CorrMM, CorrMM_gradInputs, CorrMM_gradWeights
from pytensor.tensor.nnet.corr3d import (
Corr3dMM,
Corr3dMMGradInputs,
Corr3dMMGradWeights,
)
from pytensor.tensor.type import ( from pytensor.tensor.type import (
TensorType, TensorType,
ftensor4, ftensor4,
...@@ -41,6 +35,7 @@ from pytensor.tensor.type import ( ...@@ -41,6 +35,7 @@ from pytensor.tensor.type import (
tensor5, tensor5,
) )
from tests import unittest_tools as utt from tests import unittest_tools as utt
from tests.tensor.conv import c_conv3d_corr3d_ref, c_conv_corr_ref
def conv2d_corr( def conv2d_corr(
...@@ -53,7 +48,9 @@ def conv2d_corr( ...@@ -53,7 +48,9 @@ def conv2d_corr(
): ):
if conv_mode == "conv": if conv_mode == "conv":
filters = filters[:, :, ::-1, ::-1] filters = filters[:, :, ::-1, ::-1]
return corr.CorrMM(border_mode, subsample, filter_dilation)(inputs, filters) return c_conv_corr_ref.CorrMM(border_mode, subsample, filter_dilation)(
inputs, filters
)
def conv2d_corr_gw( def conv2d_corr_gw(
...@@ -65,7 +62,7 @@ def conv2d_corr_gw( ...@@ -65,7 +62,7 @@ def conv2d_corr_gw(
conv_mode="conv", conv_mode="conv",
filter_dilation=(1, 1), filter_dilation=(1, 1),
): ):
rval = corr.CorrMM_gradWeights(border_mode, subsample, filter_dilation)( rval = c_conv_corr_ref.CorrMM_gradWeights(border_mode, subsample, filter_dilation)(
inputs, topgrad, filters_shape[2:] inputs, topgrad, filters_shape[2:]
) )
if conv_mode == "conv": if conv_mode == "conv":
...@@ -84,7 +81,7 @@ def conv2d_corr_gi( ...@@ -84,7 +81,7 @@ def conv2d_corr_gi(
): ):
if conv_mode == "conv": if conv_mode == "conv":
filters = filters[:, :, ::-1, ::-1] filters = filters[:, :, ::-1, ::-1]
return corr.CorrMM_gradInputs(border_mode, subsample, filter_dilation)( return c_conv_corr_ref.CorrMM_gradInputs(border_mode, subsample, filter_dilation)(
filters, topgrad, inputs_shape[2:] filters, topgrad, inputs_shape[2:]
) )
...@@ -99,7 +96,9 @@ def conv3d_corr( ...@@ -99,7 +96,9 @@ def conv3d_corr(
): ):
if conv_mode == "conv": if conv_mode == "conv":
filters = filters[:, :, ::-1, ::-1, ::-1] filters = filters[:, :, ::-1, ::-1, ::-1]
return corr3d.Corr3dMM(border_mode, subsample, filter_dilation)(inputs, filters) return c_conv3d_corr3d_ref.Corr3dMM(border_mode, subsample, filter_dilation)(
inputs, filters
)
def conv3d_corr_gw( def conv3d_corr_gw(
...@@ -111,9 +110,9 @@ def conv3d_corr_gw( ...@@ -111,9 +110,9 @@ def conv3d_corr_gw(
conv_mode="conv", conv_mode="conv",
filter_dilation=(1, 1, 1), filter_dilation=(1, 1, 1),
): ):
rval = corr3d.Corr3dMMGradWeights(border_mode, subsample, filter_dilation)( rval = c_conv3d_corr3d_ref.Corr3dMMGradWeights(
inputs, topgrad, filters_shape[2:] border_mode, subsample, filter_dilation
) )(inputs, topgrad, filters_shape[2:])
if conv_mode == "conv": if conv_mode == "conv":
rval = rval[:, :, ::-1, ::-1, ::-1] rval = rval[:, :, ::-1, ::-1, ::-1]
return rval return rval
...@@ -130,9 +129,9 @@ def conv3d_corr_gi( ...@@ -130,9 +129,9 @@ def conv3d_corr_gi(
): ):
if conv_mode == "conv": if conv_mode == "conv":
filters = filters[:, :, ::-1, ::-1, ::-1] filters = filters[:, :, ::-1, ::-1, ::-1]
return corr3d.Corr3dMMGradInputs(border_mode, subsample, filter_dilation)( return c_conv3d_corr3d_ref.Corr3dMMGradInputs(
filters, topgrad, inputs_shape[2:] border_mode, subsample, filter_dilation
) )(filters, topgrad, inputs_shape[2:])
class TestGetConvOutShape: class TestGetConvOutShape:
...@@ -338,7 +337,7 @@ class TestAssertShape: ...@@ -338,7 +337,7 @@ class TestAssertShape:
input = tensor4() input = tensor4()
filters = tensor4() filters = tensor4()
out = conv.abstract_conv2d( out = abstract_conv.abstract_conv2d(
input, filters, input_shape=(3, 5, 7, 11), filter_shape=(7, 5, 3, 3) input, filters, input_shape=(3, 5, 7, 11), filter_shape=(7, 5, 3, 3)
) )
f = pytensor.function([input, filters], out) f = pytensor.function([input, filters], out)
...@@ -361,7 +360,7 @@ class TestAssertShape: ...@@ -361,7 +360,7 @@ class TestAssertShape:
input = tensor5() input = tensor5()
filters = tensor5() filters = tensor5()
out = conv.conv3d( out = abstract_conv.conv3d(
input, filters, input_shape=(3, 5, 7, 11, 13), filter_shape=(7, 5, 3, 3, 3) input, filters, input_shape=(3, 5, 7, 11, 13), filter_shape=(7, 5, 3, 3, 3)
) )
f = pytensor.function([input, filters], out) f = pytensor.function([input, filters], out)
...@@ -383,7 +382,7 @@ class TestAssertShape: ...@@ -383,7 +382,7 @@ class TestAssertShape:
output_grad = tensor4() output_grad = tensor4()
filters = tensor4() filters = tensor4()
out = conv.conv2d_grad_wrt_inputs( out = abstract_conv.conv2d_grad_wrt_inputs(
output_grad, output_grad,
filters, filters,
input_shape=(None, None, 7, 11), input_shape=(None, None, 7, 11),
...@@ -403,7 +402,7 @@ class TestAssertShape: ...@@ -403,7 +402,7 @@ class TestAssertShape:
output_grad = tensor5() output_grad = tensor5()
filters = tensor5() filters = tensor5()
out = conv.conv3d_grad_wrt_inputs( out = abstract_conv.conv3d_grad_wrt_inputs(
output_grad, output_grad,
filters, filters,
input_shape=(None, None, 7, 11, 13), input_shape=(None, None, 7, 11, 13),
...@@ -422,7 +421,7 @@ class TestAssertShape: ...@@ -422,7 +421,7 @@ class TestAssertShape:
input = tensor4() input = tensor4()
output_grad = tensor4() output_grad = tensor4()
out = conv.conv2d_grad_wrt_weights( out = abstract_conv.conv2d_grad_wrt_weights(
input, input,
output_grad, output_grad,
filter_shape=(None, None, 3, 3), filter_shape=(None, None, 3, 3),
...@@ -442,7 +441,7 @@ class TestAssertShape: ...@@ -442,7 +441,7 @@ class TestAssertShape:
input = tensor5() input = tensor5()
output_grad = tensor5() output_grad = tensor5()
out = conv.conv3d_grad_wrt_weights( out = abstract_conv.conv3d_grad_wrt_weights(
input, input,
output_grad, output_grad,
filter_shape=(None, None, 3, 3, 3), filter_shape=(None, None, 3, 3, 3),
...@@ -892,8 +891,8 @@ class BaseTestConv2d(BaseTestConv): ...@@ -892,8 +891,8 @@ class BaseTestConv2d(BaseTestConv):
self, self,
inputs_shape, inputs_shape,
filters_shape, filters_shape,
conv_fn=conv.abstract_conv2d, conv_fn=abstract_conv.abstract_conv2d,
conv_op=conv.AbstractConv2d, conv_op=abstract_conv.AbstractConv2d,
ref=conv2d_corr, ref=conv2d_corr,
**kwargs, **kwargs,
): ):
...@@ -911,7 +910,7 @@ class BaseTestConv2d(BaseTestConv): ...@@ -911,7 +910,7 @@ class BaseTestConv2d(BaseTestConv):
inputs_shape, inputs_shape,
filters_shape, filters_shape,
output_shape, output_shape,
gradWeights_fn=conv.AbstractConv2d_gradWeights, gradWeights_fn=abstract_conv.AbstractConv2d_gradWeights,
ref=conv2d_corr_gw, ref=conv2d_corr_gw,
**kwargs, **kwargs,
): ):
...@@ -929,7 +928,7 @@ class BaseTestConv2d(BaseTestConv): ...@@ -929,7 +928,7 @@ class BaseTestConv2d(BaseTestConv):
inputs_shape, inputs_shape,
filters_shape, filters_shape,
output_shape, output_shape,
gradInputs_fn=conv.AbstractConv2d_gradInputs, gradInputs_fn=abstract_conv.AbstractConv2d_gradInputs,
ref=conv2d_corr_gi, ref=conv2d_corr_gi,
**kwargs, **kwargs,
): ):
...@@ -943,96 +942,6 @@ class BaseTestConv2d(BaseTestConv): ...@@ -943,96 +942,6 @@ class BaseTestConv2d(BaseTestConv):
) )
@pytest.mark.skipif(
not config.cxx or config.mode == "FAST_COMPILE",
reason="Need blas to test conv2d",
)
class TestCorrConv2d(BaseTestConv2d):
@classmethod
def setup_class(cls):
# This tests can run even when config.blas__ldflags is empty.
super().setup_class()
def run_test_case(self, i, f, s, b, flip, provide_shape, fd=(1, 1)):
o = self.get_output_shape(i, f, s, b, fd)
self.run_fwd(
inputs_shape=i,
filters_shape=f,
subsample=s,
verify_grad=True,
provide_shape=provide_shape,
border_mode=b,
filter_flip=flip,
target_op=CorrMM,
check_trace=True,
filter_dilation=fd,
)
self.run_gradweight(
inputs_shape=i,
filters_shape=f,
output_shape=o,
subsample=s,
verify_grad=True,
provide_shape=provide_shape,
border_mode=b,
filter_flip=flip,
target_op=CorrMM_gradWeights,
check_trace=True,
filter_dilation=fd,
)
self.run_gradinput(
inputs_shape=i,
filters_shape=f,
output_shape=o,
subsample=s,
verify_grad=True,
provide_shape=provide_shape,
border_mode=b,
filter_flip=flip,
target_op=CorrMM_gradInputs,
check_trace=True,
filter_dilation=fd,
)
def run_test_case_gi(
self, i, f, o, s, b, flip, provide_shape, fd=(1, 1), expect_error=False
):
if not expect_error:
self.run_gradinput(
inputs_shape=i,
filters_shape=f,
output_shape=o,
subsample=s,
verify_grad=True,
provide_shape=provide_shape,
border_mode=b,
filter_flip=flip,
target_op=CorrMM_gradInputs,
check_trace=True,
filter_dilation=fd,
)
else:
with pytest.raises(ValueError):
self.run_gradinput(
inputs_shape=i,
filters_shape=f,
output_shape=o,
subsample=s,
verify_grad=False,
provide_shape=provide_shape,
border_mode=b,
filter_flip=flip,
target_op=CorrMM_gradInputs,
ref=None,
check_trace=True,
filter_dilation=fd,
)
@pytest.mark.slow
def test_all(self):
super().test_all()
@pytest.mark.skipif( @pytest.mark.skipif(
config.cxx == "", config.cxx == "",
reason="SciPy and cxx needed", reason="SciPy and cxx needed",
...@@ -1252,8 +1161,8 @@ class BaseTestConv3d(BaseTestConv): ...@@ -1252,8 +1161,8 @@ class BaseTestConv3d(BaseTestConv):
self, self,
inputs_shape, inputs_shape,
filters_shape, filters_shape,
conv_fn=conv.conv3d, conv_fn=abstract_conv.conv3d,
conv_op=conv.AbstractConv3d, conv_op=abstract_conv.AbstractConv3d,
ref=conv3d_corr, ref=conv3d_corr,
**kwargs, **kwargs,
): ):
...@@ -1271,7 +1180,7 @@ class BaseTestConv3d(BaseTestConv): ...@@ -1271,7 +1180,7 @@ class BaseTestConv3d(BaseTestConv):
inputs_shape, inputs_shape,
filters_shape, filters_shape,
output_shape, output_shape,
gradWeights_fn=conv.AbstractConv3d_gradWeights, gradWeights_fn=abstract_conv.AbstractConv3d_gradWeights,
ref=conv3d_corr_gw, ref=conv3d_corr_gw,
**kwargs, **kwargs,
): ):
...@@ -1289,7 +1198,7 @@ class BaseTestConv3d(BaseTestConv): ...@@ -1289,7 +1198,7 @@ class BaseTestConv3d(BaseTestConv):
inputs_shape, inputs_shape,
filters_shape, filters_shape,
output_shape, output_shape,
gradInputs_fn=conv.AbstractConv3d_gradInputs, gradInputs_fn=abstract_conv.AbstractConv3d_gradInputs,
ref=conv3d_corr_gi, ref=conv3d_corr_gi,
**kwargs, **kwargs,
): ):
...@@ -1303,94 +1212,6 @@ class BaseTestConv3d(BaseTestConv): ...@@ -1303,94 +1212,6 @@ class BaseTestConv3d(BaseTestConv):
) )
@pytest.mark.skipif(
not config.cxx or config.mode == "FAST_COMPILE",
reason="Need blas to test conv3d",
)
class TestCorrConv3d(BaseTestConv3d):
@classmethod
def setup_class(cls):
# This tests can run even when config.blas__ldflags is empty.
super().setup_class()
def run_test_case(self, i, f, s, b, flip, provide_shape, fd=(1, 1, 1)):
o = self.get_output_shape(i, f, s, b, fd)
# This test can run even when config.blas__ldflags is empty.
self.run_fwd(
inputs_shape=i,
filters_shape=f,
subsample=s,
verify_grad=True,
provide_shape=provide_shape,
border_mode=b,
filter_flip=flip,
target_op=Corr3dMM,
check_trace=True,
filter_dilation=fd,
)
self.run_gradweight(
inputs_shape=i,
filters_shape=f,
output_shape=o,
subsample=s,
verify_grad=True,
provide_shape=provide_shape,
border_mode=b,
filter_flip=flip,
target_op=Corr3dMMGradWeights,
check_trace=True,
filter_dilation=fd,
)
self.run_gradinput(
inputs_shape=i,
filters_shape=f,
output_shape=o,
subsample=s,
verify_grad=True,
provide_shape=provide_shape,
border_mode=b,
filter_flip=flip,
target_op=Corr3dMMGradInputs,
check_trace=True,
filter_dilation=fd,
)
def run_test_case_gi(
self, i, f, o, s, b, flip, provide_shape, fd=(1, 1, 1), expect_error=False
):
# This test can run even when config.blas__ldflags is empty.
if not expect_error:
self.run_gradinput(
inputs_shape=i,
filters_shape=f,
output_shape=o,
subsample=s,
verify_grad=True,
provide_shape=provide_shape,
border_mode=b,
filter_flip=flip,
target_op=Corr3dMMGradInputs,
check_trace=True,
filter_dilation=fd,
)
else:
with pytest.raises(ValueError):
self.run_gradinput(
inputs_shape=i,
filters_shape=f,
output_shape=o,
subsample=s,
verify_grad=False,
provide_shape=provide_shape,
border_mode=b,
filter_flip=flip,
target_op=Corr3dMMGradInputs,
ref=None,
check_trace=True,
filter_dilation=fd,
)
def test_constant_shapes(): def test_constant_shapes():
# Check that the `imshp` and `kshp` parameters of the AbstractConv Ops # Check that the `imshp` and `kshp` parameters of the AbstractConv Ops
# are rejected if not constant or None # are rejected if not constant or None
...@@ -1451,7 +1272,7 @@ class TestConvTypes: ...@@ -1451,7 +1272,7 @@ class TestConvTypes:
out_shape = lvector() out_shape = lvector()
output = conv.abstract_conv2d(input, filters) output = abstract_conv.abstract_conv2d(input, filters)
grad_input, grad_filters = pytensor.grad(output.sum(), wrt=(input, filters)) grad_input, grad_filters = pytensor.grad(output.sum(), wrt=(input, filters))
assert grad_input.type == input.type, ( assert grad_input.type == input.type, (
grad_input, grad_input,
...@@ -1466,7 +1287,9 @@ class TestConvTypes: ...@@ -1466,7 +1287,9 @@ class TestConvTypes:
filters.type, filters.type,
) )
grad_filters = conv.AbstractConv2d_gradWeights()(input, topgrad, out_shape) grad_filters = abstract_conv.AbstractConv2d_gradWeights()(
input, topgrad, out_shape
)
grad_input, grad_topgrad = pytensor.grad( grad_input, grad_topgrad = pytensor.grad(
grad_filters.sum(), wrt=(input, topgrad) grad_filters.sum(), wrt=(input, topgrad)
) )
...@@ -1484,7 +1307,9 @@ class TestConvTypes: ...@@ -1484,7 +1307,9 @@ class TestConvTypes:
topgrad.type, topgrad.type,
) )
grad_input = conv.AbstractConv2d_gradInputs()(filters, topgrad, out_shape) grad_input = abstract_conv.AbstractConv2d_gradInputs()(
filters, topgrad, out_shape
)
grad_filters, grad_topgrad = pytensor.grad( grad_filters, grad_topgrad = pytensor.grad(
grad_input.sum(), wrt=(filters, topgrad) grad_input.sum(), wrt=(filters, topgrad)
) )
...@@ -1511,7 +1336,7 @@ class TestConvTypes: ...@@ -1511,7 +1336,7 @@ class TestConvTypes:
out_shape = lvector() out_shape = lvector()
# Check the forward Op # Check the forward Op
output = conv.abstract_conv2d(constant_tensor, filters) output = abstract_conv.abstract_conv2d(constant_tensor, filters)
grad_filters = pytensor.grad(output.sum(), wrt=filters) grad_filters = pytensor.grad(output.sum(), wrt=filters)
assert filters.type.is_super(grad_filters.type), ( assert filters.type.is_super(grad_filters.type), (
grad_filters, grad_filters,
...@@ -1520,7 +1345,7 @@ class TestConvTypes: ...@@ -1520,7 +1345,7 @@ class TestConvTypes:
filters.type, filters.type,
) )
output = conv.abstract_conv2d(input, constant_tensor) output = abstract_conv.abstract_conv2d(input, constant_tensor)
grad_input = pytensor.grad(output.sum(), wrt=input) grad_input = pytensor.grad(output.sum(), wrt=input)
assert input.type.is_super(grad_input.type), ( assert input.type.is_super(grad_input.type), (
grad_input, grad_input,
...@@ -1530,7 +1355,7 @@ class TestConvTypes: ...@@ -1530,7 +1355,7 @@ class TestConvTypes:
) )
# Check grad wrt weights # Check grad wrt weights
grad_filters = conv.AbstractConv2d_gradWeights()( grad_filters = abstract_conv.AbstractConv2d_gradWeights()(
constant_tensor, topgrad, out_shape constant_tensor, topgrad, out_shape
) )
grad_topgrad = pytensor.grad(grad_filters.sum(), wrt=topgrad) grad_topgrad = pytensor.grad(grad_filters.sum(), wrt=topgrad)
...@@ -1541,7 +1366,7 @@ class TestConvTypes: ...@@ -1541,7 +1366,7 @@ class TestConvTypes:
topgrad.type, topgrad.type,
) )
grad_filters = conv.AbstractConv2d_gradWeights()( grad_filters = abstract_conv.AbstractConv2d_gradWeights()(
input, constant_tensor, out_shape input, constant_tensor, out_shape
) )
grad_input = pytensor.grad(grad_filters.sum(), wrt=input) grad_input = pytensor.grad(grad_filters.sum(), wrt=input)
...@@ -1553,7 +1378,7 @@ class TestConvTypes: ...@@ -1553,7 +1378,7 @@ class TestConvTypes:
) )
# Check grad wrt inputs # Check grad wrt inputs
grad_input = conv.AbstractConv2d_gradInputs()( grad_input = abstract_conv.AbstractConv2d_gradInputs()(
constant_tensor, topgrad, out_shape constant_tensor, topgrad, out_shape
) )
grad_topgrad = pytensor.grad(grad_input.sum(), wrt=topgrad) grad_topgrad = pytensor.grad(grad_input.sum(), wrt=topgrad)
...@@ -1564,7 +1389,7 @@ class TestConvTypes: ...@@ -1564,7 +1389,7 @@ class TestConvTypes:
topgrad.type, topgrad.type,
) )
grad_input = conv.AbstractConv2d_gradInputs()( grad_input = abstract_conv.AbstractConv2d_gradInputs()(
filters, constant_tensor, out_shape filters, constant_tensor, out_shape
) )
grad_filters = pytensor.grad(grad_input.sum(), wrt=filters) grad_filters = pytensor.grad(grad_input.sum(), wrt=filters)
...@@ -1923,13 +1748,13 @@ class TestConv2dGrads: ...@@ -1923,13 +1748,13 @@ class TestConv2dGrads:
self.output_grad = tensor4() self.output_grad = tensor4()
self.output_grad_wrt = tensor4() self.output_grad_wrt = tensor4()
self.x = tensor4("x", config.floatX) # inputs self.x = tensor4("x", dtype=config.floatX) # inputs
self.w = tensor4("w", config.floatX) # filter weights self.w = tensor4("w", dtype=config.floatX) # filter weights
def test_conv2d_grad_wrt_inputs(self): def test_conv2d_grad_wrt_inputs(self):
# Compares calculated abstract grads wrt inputs with the fwd grads # Compares calculated abstract grads wrt inputs with the fwd grads
# This method checks the outputs of `conv2_grad_wrt_inputs` against # This method checks the outputs of `conv2_grad_wrt_inputs` against
# the outputs of `pytensor.tensor.nnet.conv` forward grads to make sure the # the outputs of `pytensor.tensor.conv` forward grads to make sure the
# results are the same. # results are the same.
for (in_shape, fltr_shape) in zip(self.inputs_shapes, self.filters_shapes): for (in_shape, fltr_shape) in zip(self.inputs_shapes, self.filters_shapes):
...@@ -1942,18 +1767,16 @@ class TestConv2dGrads: ...@@ -1942,18 +1767,16 @@ class TestConv2dGrads:
filter_val = self.random_stream.random(fltr_shape).astype( filter_val = self.random_stream.random(fltr_shape).astype(
config.floatX config.floatX
) )
out_grad_shape = ( out_grad_shape = abstract_conv.get_conv_output_shape(
pytensor.tensor.nnet.abstract_conv.get_conv_output_shape( image_shape=in_shape,
image_shape=in_shape, kernel_shape=fltr_shape,
kernel_shape=fltr_shape, border_mode=bm,
border_mode=bm, subsample=ss,
subsample=ss,
)
) )
out_grad_val = self.random_stream.random(out_grad_shape).astype( out_grad_val = self.random_stream.random(out_grad_shape).astype(
config.floatX config.floatX
) )
conv_out = pytensor.tensor.nnet.conv2d( conv_out = abstract_conv.conv2d(
self.x, self.x,
filters=self.w, filters=self.w,
border_mode=bm, border_mode=bm,
...@@ -1971,16 +1794,14 @@ class TestConv2dGrads: ...@@ -1971,16 +1794,14 @@ class TestConv2dGrads:
[self.x, self.w, self.output_grad], conv_grad [self.x, self.w, self.output_grad], conv_grad
) )
conv_wrt_i_out = ( conv_wrt_i_out = abstract_conv.conv2d_grad_wrt_inputs(
pytensor.tensor.nnet.abstract_conv.conv2d_grad_wrt_inputs( output_grad=self.output_grad_wrt,
output_grad=self.output_grad_wrt, filters=self.w,
filters=self.w, border_mode=bm,
border_mode=bm, subsample=ss,
subsample=ss, input_shape=in_shape,
input_shape=in_shape, filter_shape=fltr_shape,
filter_shape=fltr_shape, filter_flip=ff,
filter_flip=ff,
)
) )
f_new = pytensor.function( f_new = pytensor.function(
[self.w, self.output_grad_wrt], conv_wrt_i_out [self.w, self.output_grad_wrt], conv_wrt_i_out
...@@ -1995,7 +1816,7 @@ class TestConv2dGrads: ...@@ -1995,7 +1816,7 @@ class TestConv2dGrads:
def test_conv2d_grad_wrt_weights(self): def test_conv2d_grad_wrt_weights(self):
# Compares calculated abstract grads wrt weights with the fwd grads # Compares calculated abstract grads wrt weights with the fwd grads
# This method checks the outputs of `conv2_grad_wrt_weights` against # This method checks the outputs of `conv2_grad_wrt_weights` against
# the outputs of `pytensor.tensor.nnet.conv` forward grads to make sure the # the outputs of `pytensor.tensor.conv` forward grads to make sure the
# results are the same. # results are the same.
for (in_shape, fltr_shape) in zip(self.inputs_shapes, self.filters_shapes): for (in_shape, fltr_shape) in zip(self.inputs_shapes, self.filters_shapes):
...@@ -2008,18 +1829,16 @@ class TestConv2dGrads: ...@@ -2008,18 +1829,16 @@ class TestConv2dGrads:
filter_val = self.random_stream.random(fltr_shape).astype( filter_val = self.random_stream.random(fltr_shape).astype(
config.floatX config.floatX
) )
out_grad_shape = ( out_grad_shape = abstract_conv.get_conv_output_shape(
pytensor.tensor.nnet.abstract_conv.get_conv_output_shape( image_shape=in_shape,
image_shape=in_shape, kernel_shape=fltr_shape,
kernel_shape=fltr_shape, border_mode=bm,
border_mode=bm, subsample=ss,
subsample=ss,
)
) )
out_grad_val = self.random_stream.random(out_grad_shape).astype( out_grad_val = self.random_stream.random(out_grad_shape).astype(
config.floatX config.floatX
) )
conv_out = pytensor.tensor.nnet.conv2d( conv_out = abstract_conv.conv2d(
self.x, self.x,
filters=self.w, filters=self.w,
border_mode=bm, border_mode=bm,
...@@ -2037,16 +1856,14 @@ class TestConv2dGrads: ...@@ -2037,16 +1856,14 @@ class TestConv2dGrads:
[self.x, self.w, self.output_grad], conv_grad [self.x, self.w, self.output_grad], conv_grad
) )
conv_wrt_w_out = ( conv_wrt_w_out = abstract_conv.conv2d_grad_wrt_weights(
pytensor.tensor.nnet.abstract_conv.conv2d_grad_wrt_weights( self.x,
self.x, output_grad=self.output_grad_wrt,
output_grad=self.output_grad_wrt, border_mode=bm,
border_mode=bm, subsample=ss,
subsample=ss, input_shape=in_shape,
input_shape=in_shape, filter_shape=fltr_shape,
filter_shape=fltr_shape, filter_flip=ff,
filter_flip=ff,
)
) )
f_new = pytensor.function( f_new = pytensor.function(
[self.x, self.output_grad_wrt], conv_wrt_w_out [self.x, self.output_grad_wrt], conv_wrt_w_out
...@@ -2062,12 +1879,12 @@ class TestConv2dGrads: ...@@ -2062,12 +1879,12 @@ class TestConv2dGrads:
reason="SciPy and cxx needed", reason="SciPy and cxx needed",
) )
class TestGroupedConvNoOptim: class TestGroupedConvNoOptim:
conv = pytensor.tensor.nnet.abstract_conv.AbstractConv2d conv = abstract_conv.AbstractConv2d
conv_gradw = pytensor.tensor.nnet.abstract_conv.AbstractConv2d_gradWeights conv_gradw = abstract_conv.AbstractConv2d_gradWeights
conv_gradi = pytensor.tensor.nnet.abstract_conv.AbstractConv2d_gradInputs conv_gradi = abstract_conv.AbstractConv2d_gradInputs
conv_op = pytensor.tensor.nnet.abstract_conv.AbstractConv2d conv_op = abstract_conv.AbstractConv2d
conv_gradw_op = pytensor.tensor.nnet.abstract_conv.AbstractConv2d_gradWeights conv_gradw_op = abstract_conv.AbstractConv2d_gradWeights
conv_gradi_op = pytensor.tensor.nnet.abstract_conv.AbstractConv2d_gradInputs conv_gradi_op = abstract_conv.AbstractConv2d_gradInputs
mode = Mode(optimizer=None) mode = Mode(optimizer=None)
is_dnn = False is_dnn = False
...@@ -2266,12 +2083,12 @@ class TestGroupedConvNoOptim: ...@@ -2266,12 +2083,12 @@ class TestGroupedConvNoOptim:
reason="SciPy and cxx needed", reason="SciPy and cxx needed",
) )
class TestGroupedConv3dNoOptim(TestGroupedConvNoOptim): class TestGroupedConv3dNoOptim(TestGroupedConvNoOptim):
conv = pytensor.tensor.nnet.abstract_conv.AbstractConv3d conv = abstract_conv.AbstractConv3d
conv_gradw = pytensor.tensor.nnet.abstract_conv.AbstractConv3d_gradWeights conv_gradw = abstract_conv.AbstractConv3d_gradWeights
conv_gradi = pytensor.tensor.nnet.abstract_conv.AbstractConv3d_gradInputs conv_gradi = abstract_conv.AbstractConv3d_gradInputs
conv_op = pytensor.tensor.nnet.abstract_conv.AbstractConv3d conv_op = abstract_conv.AbstractConv3d
conv_gradw_op = pytensor.tensor.nnet.abstract_conv.AbstractConv3d_gradWeights conv_gradw_op = abstract_conv.AbstractConv3d_gradWeights
conv_gradi_op = pytensor.tensor.nnet.abstract_conv.AbstractConv3d_gradInputs conv_gradi_op = abstract_conv.AbstractConv3d_gradInputs
mode = Mode(optimizer=None) mode = Mode(optimizer=None)
def setup_method(self): def setup_method(self):
...@@ -2505,12 +2322,12 @@ class TestSeparableConv: ...@@ -2505,12 +2322,12 @@ class TestSeparableConv:
reason="SciPy and cxx needed", reason="SciPy and cxx needed",
) )
class TestUnsharedConv: class TestUnsharedConv:
conv2d = pytensor.tensor.nnet.abstract_conv.AbstractConv2d conv2d = abstract_conv.AbstractConv2d
conv2d_gradw = pytensor.tensor.nnet.abstract_conv.AbstractConv2d_gradWeights conv2d_gradw = abstract_conv.AbstractConv2d_gradWeights
conv2d_gradi = pytensor.tensor.nnet.abstract_conv.AbstractConv2d_gradInputs conv2d_gradi = abstract_conv.AbstractConv2d_gradInputs
conv2d_op = pytensor.tensor.nnet.abstract_conv.AbstractConv2d conv2d_op = abstract_conv.AbstractConv2d
conv2d_gradw_op = pytensor.tensor.nnet.abstract_conv.AbstractConv2d_gradWeights conv2d_gradw_op = abstract_conv.AbstractConv2d_gradWeights
conv2d_gradi_op = pytensor.tensor.nnet.abstract_conv.AbstractConv2d_gradInputs conv2d_gradi_op = abstract_conv.AbstractConv2d_gradInputs
mode = Mode(optimizer="None") mode = Mode(optimizer="None")
...@@ -2733,12 +2550,12 @@ class TestUnsharedConv: ...@@ -2733,12 +2550,12 @@ class TestUnsharedConv:
class TestAsymmetricPadding: class TestAsymmetricPadding:
conv2d = pytensor.tensor.nnet.abstract_conv.AbstractConv2d conv2d = abstract_conv.AbstractConv2d
conv2d_gradw = pytensor.tensor.nnet.abstract_conv.AbstractConv2d_gradWeights conv2d_gradw = abstract_conv.AbstractConv2d_gradWeights
conv2d_gradi = pytensor.tensor.nnet.abstract_conv.AbstractConv2d_gradInputs conv2d_gradi = abstract_conv.AbstractConv2d_gradInputs
conv2d_op = pytensor.tensor.nnet.abstract_conv.AbstractConv2d conv2d_op = abstract_conv.AbstractConv2d
conv2d_gradw_op = pytensor.tensor.nnet.abstract_conv.AbstractConv2d_gradWeights conv2d_gradw_op = abstract_conv.AbstractConv2d_gradWeights
conv2d_gradi_op = pytensor.tensor.nnet.abstract_conv.AbstractConv2d_gradInputs conv2d_gradi_op = abstract_conv.AbstractConv2d_gradInputs
mode = Mode(optimizer="None") mode = Mode(optimizer="None")
......
import time
import numpy as np
from pytensor import function
from pytensor.compile.mode import Mode
from pytensor.tensor.nnet.conv import ConvOp
from pytensor.tensor.type import TensorType, dmatrix
def flip(kern, kshp):
    """Return `kern` flipped along its last two (spatial) axes.

    This reproduces the kernel flip that ``scipy.signal.convolve2d``
    applies, so correlating with the flipped kernel matches a true
    convolution.

    Parameters
    ----------
    kern : ndarray
        2-, 3- or 4-dimensional kernel array.
    kshp : sequence of int
        Spatial shape (rows, cols) of the kernel; expected to equal
        ``kern.shape[-2:]`` (all callers in this file guarantee this).

    Returns
    -------
    ndarray
        ``float64`` copy of `kern` with the last two axes reversed.

    Raises
    ------
    NotImplementedError
        If `kern` does not have 2, 3 or 4 dimensions.
    """
    if kern.ndim not in (2, 3, 4):
        raise NotImplementedError()
    # The original implementation filled the output row-major from a
    # reversed flattening of the two trailing axes; that is exactly the
    # same as reversing each of those two axes individually.
    return np.asarray(kern, dtype="float64")[..., ::-1, ::-1].copy()
# Module-level RNG with a fixed seed so repeated benchmark runs see the
# same random images and kernels.
global_rng = np.random.default_rng(3423489)

# Shorthand for a 4-d float64 tensor variable type; used below for the
# (batch, channel, row, col) image and kernel inputs.
dmatrix4 = TensorType("float64", shape=(None, None, None, None))
def exec_multilayer_conv_nnet_old(
    conv_mode,
    ss,
    bsize,
    imshp,
    kshps,
    nkerns,
    unroll_batch=0,
    unroll_kern=0,
    img=None,
    validate=True,
    conv_op_py=False,
    do_print=True,
    repeat=1,
    unroll_patch=False,
    unroll_patch_size=False,
    verbose=0,
):
    """Time a stack of `ConvOp` layers and optionally validate the results.

    Each layer is exercised up to three ways: a manual scipy
    ``_convolve2d`` reference (when `validate`), the C-linker-compiled
    `ConvOp` graph, and the same graph under the Python linker (when
    `conv_op_py`).  Outputs of the different paths are asserted equal to
    within 1e-5.

    Parameters
    ----------
    conv_mode : str
        ``"valid"`` or ``"full"`` output mode.
    ss : tuple of int
        Subsampling (stride) per spatial dimension.
    bsize : int
        Batch size.
    imshp : tuple
        Input shape ``(channels, rows, cols)`` of the first layer.
    kshps, nkerns : sequences
        Per-layer kernel spatial shapes and output-filter counts.
    unroll_batch, unroll_kern, unroll_patch, unroll_patch_size
        Loop-unrolling options forwarded to `ConvOp`.
    repeat : int
        Number of timed calls per layer.

    Returns
    -------
    (tctot, tpytot, ntot) : floats
        Cumulative times for the C path, the Python path and the scipy
        reference, respectively.
    """
    if img is None:
        img = dmatrix()
    # build actual input images
    imgval = global_rng.random((bsize, imshp[0], imshp[1], imshp[2]))

    a = dmatrix()
    kerns = [a for i in nkerns]
    inputs4 = dmatrix4()
    kerns4 = dmatrix4()

    # for each layer
    ntot = 0
    tctot = 0
    tpytot = 0
    for kshp, kern, nkern, n_layer in zip(kshps, kerns, nkerns, range(len(nkerns))):
        if do_print:
            print("************* layer %i ***************" % n_layer)
            print(conv_mode, ss, n_layer, kshp, nkern)
        # actual values
        w = global_rng.random(np.r_[nkern, imshp[0], kshp])
        w_flip = flip(w, kshp).reshape(w.shape)

        # manual implementation
        # check first stage
        # `padimg` zero-pads the image for "full" mode so the reference
        # scipy correlation sees the same extent as ConvOp.
        padimg = imgval
        if conv_mode == "full":
            padimg_shp = np.array(imshp[1:]) + 2 * (np.array(kshp) - np.array([1, 1]))
            padimg = np.zeros(np.r_[bsize, imshp[0], padimg_shp])
            padimg[
                :, :, kshp[0] - 1 : -kshp[0] + 1, kshp[1] - 1 : -kshp[1] + 1
            ] = imgval

        outshp = np.hstack(
            (nkern, ConvOp.getOutputShape(imshp[1:], kshp, ss, conv_mode))
        )

        time1 = time.perf_counter()
        outval = np.zeros(np.r_[bsize, outshp])
        if validate:
            # causes an atexit problem
            # Private scipy helpers; their module moved in newer scipy,
            # hence the ImportError fallback.
            try:
                from scipy.signal.signaltools import _bvalfromboundary, _valfrommode
                from scipy.signal.sigtools import _convolve2d
            except ImportError:
                from scipy.signal._signaltools import _bvalfromboundary, _valfrommode
                from scipy.signal._sigtools import _convolve2d

            val = _valfrommode(conv_mode)
            bval = _bvalfromboundary("fill")
            for b in range(bsize):  # loop over batches
                for n in range(nkern):  # loop over filters
                    for i in range(imshp[0]):  # loop over input feature maps
                        outval[b, n, ...] += _convolve2d(
                            imgval[b, i, ...], w_flip[n, i, ...], 1, val, bval, 0
                        )[0 :: ss[0], 0 :: ss[1]]
            ntot += time.perf_counter() - time1

        # ConvOp
        if unroll_patch and not unroll_patch_size:
            # Patch unrolling without compile-time shapes.
            conv_op = ConvOp(
                dx=ss[0],
                dy=ss[1],
                output_mode=conv_mode,
                unroll_patch=unroll_patch,
                verbose=verbose,
            )(inputs4, kerns4)
        else:
            # Full shape information given at construction time.
            conv_op = ConvOp(
                imshp,
                kshp,
                nkern,
                bsize,
                ss[0],
                ss[1],
                conv_mode,
                unroll_batch=unroll_batch,
                unroll_kern=unroll_kern,
                unroll_patch=unroll_patch,
                verbose=verbose,
            )(inputs4, kerns4)
        # l1shp = np.hstack((nkern,
        #                    ConvOp.getOutputShape(imshp[1:], kshp, ss, conv_mode)))
        propup2 = function([inputs4, kerns4], conv_op)
        propup3 = function([inputs4, kerns4], conv_op, mode=Mode(linker="py"))

        time1 = time.perf_counter()
        for i in range(repeat):
            hidval2_ = propup2(imgval, w_flip)
        hidval2 = hidval2_  # [:,:,0::ss[0],0::ss[1]]
        tctot += time.perf_counter() - time1

        if conv_op_py:
            time1 = time.perf_counter()
            for i in range(repeat):
                hidval3_ = propup3(imgval, w_flip)
            hidval3 = hidval3_  # [:,:,0::ss[0],0::ss[1]]
            tpytot += time.perf_counter() - time1
            assert (np.abs(hidval2 - hidval3) < 1e-5).all()
        else:
            tpytot += 0

        if validate:
            temp = np.abs(outval - hidval2)
            assert (temp < 1e-5).all()
        if validate and conv_op_py:
            temp = np.abs(outval - hidval3)
            assert (temp < 1e-5).all()

        # NOTE(review): the next layer's input is the scipy reference
        # output, which stays all-zero when `validate` is False —
        # presumably acceptable for pure timing runs; confirm if reused.
        imshp = tuple(outshp)
        imgval = outval.reshape(bsize, outshp[0], outshp[1], outshp[2])
    return tctot, tpytot, ntot
def exec_multilayer_conv_nnet(
    conv_mode,
    ss,
    bsize,
    imshp,
    kshps,
    nkerns,
    unroll_batch=0,
    unroll_kern=0,
    img=None,
    do_print=True,
    repeat=1,
    unroll_patch=False,
    unroll_patch_size=False,
    verbose=0,
):
    """Time a stack of `ConvOp` layers compiled with the default linker.

    Timing-only variant of `exec_multilayer_conv_nnet_old`: the scipy
    reference and the Python-linker comparison are removed, so only
    `tctot` is meaningful; `tpytot` and `ntot` are returned as 0 to keep
    the same result shape.

    Parameters
    ----------
    conv_mode : str
        ``"valid"`` or ``"full"`` output mode.
    ss : tuple of int
        Subsampling (stride) per spatial dimension.
    bsize : int
        Batch size.
    imshp : tuple
        Input shape ``(channels, rows, cols)`` of the first layer.
    kshps, nkerns : sequences
        Per-layer kernel spatial shapes and output-filter counts.
    unroll_batch, unroll_kern, unroll_patch, unroll_patch_size
        Loop-unrolling options forwarded to `ConvOp`.
    repeat : int
        Number of timed calls per layer.

    Returns
    -------
    (tctot, tpytot, ntot) : tuple
        Cumulative compiled-graph time, then two zeros.
    """
    if img is None:
        img = dmatrix()
    # build actual input images
    imgval = global_rng.random((bsize, imshp[0], imshp[1], imshp[2]))
    a = dmatrix()
    kerns = [a for i in nkerns]
    inputs4 = dmatrix4()
    kerns4 = dmatrix4()
    # for each layer
    ntot = 0
    tctot = 0
    tpytot = 0
    for kshp, kern, nkern, n_layer in zip(kshps, kerns, nkerns, range(len(nkerns))):
        if do_print:
            print("************* layer %i ***************" % n_layer)
            print(conv_mode, ss, n_layer, kshp, nkern)
        # actual values
        w = global_rng.random(np.r_[nkern, imshp[0], kshp])
        w_flip = flip(w, kshp).reshape(w.shape)
        outshp = np.hstack(
            (nkern, ConvOp.getOutputShape(imshp[1:], kshp, ss, conv_mode))
        )
        time1 = time.perf_counter()
        # outval = np.zeros(np.r_[bsize, outshp])

        # ConvOp
        if unroll_patch and not unroll_patch_size:
            # Patch unrolling without compile-time shapes.
            conv_op = ConvOp(
                dx=ss[0],
                dy=ss[1],
                output_mode=conv_mode,
                unroll_patch=unroll_patch,
                verbose=verbose,
            )(inputs4, kerns4)
        else:
            # Full shape information given at construction time.
            conv_op = ConvOp(
                imshp,
                kshp,
                nkern,
                bsize,
                ss[0],
                ss[1],
                conv_mode,
                unroll_batch=unroll_batch,
                unroll_kern=unroll_kern,
                unroll_patch=unroll_patch,
                verbose=verbose,
            )(inputs4, kerns4)
        # l1shp = np.hstack((nkern,
        #                    ConvOp.getOutputShape(imshp[1:], kshp, ss, conv_mode)))
        propup2 = function([inputs4, kerns4], conv_op)

        time1 = time.perf_counter()
        for i in range(repeat):
            propup2(imgval, w_flip)
        tctot += time.perf_counter() - time1

        imshp = tuple(outshp)
        # imgval = outval.reshape(bsize, outshp[0], outshp[1], outshp[2])
    return tctot, tpytot, ntot
def speed_multilayer_conv():
    """Benchmark `ConvOp` over a grid of loop-unrolling settings.

    Runs `exec_multilayer_conv_nnet` for every ``unroll_batch`` x
    ``unroll_kern`` combination, then the non-unrolled baseline, then
    the ``unroll_patch`` variants (with and without compile-time
    shapes), and prints the measured times and relative speed-ups.
    Results are printed only; nothing is returned.
    """
    # calculate the speed up of different combination of unroll
    # put the parameter to the same you will try.
    # validate = False  # we don't validate the result to have it much faster!
    repeat = 3
    verbose = 1
    unroll_batch = [1, 2, 3, 4, 5, 6, 10]  # 15, 30, 60 always much slower
    unroll_kern = [1, 2, 3, 4, 5, 6, 10]  # 15, 30, 60 always much slower
    # unroll_batch = [1,4,5]
    # unroll_kern = [1,4,5]
    # unroll_batch = [1,4]
    # unroll_kern = [1,4]
    # unroll_patch = [True, False]

    bsize = 60  # batch size
    imshp_start = (1, 48, 48)  # un square shape to test more corner case.
    kshps = ([11, 12],)  # un square shape to test more corner case.
    nkerns = [60]  # per output pixel
    ssizes = [
        (1, 1),
    ]  # (1,1)]#(2,2) bugged
    convmodes = ["valid", "full"]
    # do_convolve2 = False
    a = dmatrix()
    kerns = [a for i in nkerns]

    assert len(kshps) == len(nkerns) == len(kerns)
    # timing[n_b, n_k] holds (C time, Python time, reference time) per
    # (conv mode, stride) combination.
    timing = np.zeros(
        (len(unroll_batch), len(unroll_kern), 3, len(convmodes) * len(ssizes))
    )
    t_b_k = []
    # calculate the timing with unrolling
    print("time unroll batch kern")
    best = []
    worst = []
    t_ = []
    for unroll_b, n_b in zip(unroll_batch, range(len(unroll_batch))):
        for unroll_k, n_k in zip(unroll_kern, range(len(unroll_kern))):
            t_b_k.append(str(unroll_b) + "/" + str(unroll_k))
            # `t_` starts empty, so this branch always runs; it is a
            # leftover toggle for replaying pre-recorded timings.
            if not t_:
                tctot, tpytot, ntot = [], [], []
                for conv_mode, n_mode in zip(convmodes, range(len(convmodes))):
                    for ss, n_ss in zip(ssizes, range(len(ssizes))):
                        # tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet_old(conv_mode, ss, bsize, imshp_start, kshps, nkerns, unroll_batch=unroll_b, unroll_kern=unroll_k, validate=validate, verbose=verbose,do_print=False)
                        tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet(
                            conv_mode,
                            ss,
                            bsize,
                            imshp_start,
                            kshps,
                            nkerns,
                            unroll_batch=unroll_b,
                            unroll_kern=unroll_k,
                            verbose=verbose,
                            do_print=False,
                            repeat=repeat,
                        )
                        tctot += [tctot_]
                        tpytot += [tpytot_]
                        ntot += [ntot_]
                if unroll_b == 4 and unroll_k == 4:
                    # print "unroll 4/4",tctot
                    best = tctot
                if unroll_b == 1 and unroll_k == 1:
                    # print "unroll 1/1",tctot
                    worst = tctot
                timing[n_b, n_k] = [
                    tctot,
                    tpytot,
                    ntot,
                ]  # [sum(tctot), sum(tpytot), sum(ntot)]
    if not t_:
        t = timing[:, :, 0, :]  # We select only the c timing.
    else:
        t = t_
    t = np.asarray(t)

    # calculate the old timing
    print("time old version")
    tctot, tpytot, ntot = [], [], []
    tctot_ = []
    # `tctot_` was just reset, so this branch always runs as well.
    if not tctot_:
        for conv_mode, n_mode in zip(convmodes, range(len(convmodes))):
            for ss, n_ss in zip(ssizes, range(len(ssizes))):
                # tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet_old(conv_mode, ss, bsize, imshp_start, kshps, nkerns, unroll_batch=0, unroll_kern=0, validate=validate, verbose=verbose,do_print=False)
                tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet(
                    conv_mode,
                    ss,
                    bsize,
                    imshp_start,
                    kshps,
                    nkerns,
                    unroll_batch=0,
                    unroll_kern=0,
                    verbose=verbose,
                    do_print=False,
                    repeat=repeat,
                )
                tctot += [tctot_]
                tpytot += [tpytot_]
                ntot += [ntot_]
    else:
        tctot = np.asarray(tctot_)
    print(f"old code timing {sum(tctot):.3f}s", tctot)
    best = np.asarray(best)
    worst = np.asarray(worst)
    print("timing for unrolled version")
    print("unroll_batch/unroll_kern valid_mode full_mode")
    for n_b in range(len(unroll_batch)):
        for n_k in range(len(unroll_kern)):
            print((unroll_batch[n_b], unroll_kern[n_k]) + tuple(t[n_b, n_k]), ",")
    # t_detail = t
    t = t.sum(axis=2)
    print(
        f"max {t.max():.3f}s",
        "max param(batch unloop size/kernel unloop size)",
        t_b_k[t.argmax()],
    )
    print(
        f"min {t.min():.3f}s",
        "min param(batch unloop size/kernel unloop size)",
        t_b_k[t.argmin()],
    )
    print(
        f"speedup vs (1/1){t.max() / t.min():.3f}x, vs old {sum(tctot) / t.min():.3f}x"
    )
    print(worst / best, tctot / best)

    # calculate the timing of unroll_patch
    print("time unroll_patch")
    tctot_patch = []
    tctot_patch_size = []
    for conv_mode, n_mode in zip(convmodes, range(len(convmodes))):
        for ss, n_ss in zip(ssizes, range(len(ssizes))):
            # tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet_old(conv_mode, ss, bsize, imshp_start, kshps, nkerns, unroll_batch=0, unroll_kern=0, validate=validate,unroll_patch=True,verbose=verbose,do_print=False)
            tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet(
                conv_mode,
                ss,
                bsize,
                imshp_start,
                kshps,
                nkerns,
                unroll_batch=0,
                unroll_kern=0,
                unroll_patch=True,
                verbose=verbose,
                do_print=False,
                repeat=repeat,
            )
            tctot_patch += [tctot_]
            # tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet_old(conv_mode, ss, bsize, imshp_start, kshps, nkerns, unroll_batch=0, unroll_kern=0, validate=validate,unroll_patch=True,verbose=verbose,do_print=False,unroll_patch_size=True)
            tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet(
                conv_mode,
                ss,
                bsize,
                imshp_start,
                kshps,
                nkerns,
                unroll_batch=0,
                unroll_kern=0,
                unroll_patch=True,
                verbose=verbose,
                do_print=False,
                unroll_patch_size=True,
                repeat=repeat,
            )
            tctot_patch_size += [tctot_]

    t_patch = sum(tctot_patch)
    print("unroll_patch without shape time", tctot_patch)
    print(
        f"speedup vs (1/1){t.max() / t_patch:.3f}x, vs old {sum(tctot) / t_patch:.3f}x"
    )
    print(best / tctot_patch, worst / tctot_patch)
    t_patch_size = sum(tctot_patch_size)
    print("unroll_patch with shape time", tctot_patch_size)
    print(
        "speedup vs (1/1)%.3fx, vs old %.3fx"
        % (t.max() / t_patch_size, sum(tctot) / t_patch_size)
    )
    print(best / tctot_patch_size, worst / tctot_patch_size)
    return
# Allow running the benchmark directly from the command line.
if __name__ == "__main__":
    speed_multilayer_conv()
from contextlib import ExitStack as does_not_raise
import numpy as np
import pytest
import scipy.special as sp
import pytensor
import pytensor.tensor as at
from pytensor.compile.mode import OPT_FAST_RUN, optdb
from pytensor.configdefaults import config
from pytensor.gradient import grad
from pytensor.graph.fg import FunctionGraph
from pytensor.graph.rewriting.basic import check_stack_trace
from pytensor.tensor.elemwise import CAReduce, DimShuffle, Elemwise
from pytensor.tensor.math import (
Argmax,
add,
argmax,
dot,
exp,
log,
max_and_argmax,
mean,
sigmoid,
)
from pytensor.tensor.math import sum as at_sum
from pytensor.tensor.math import tanh
from pytensor.tensor.nnet.basic import (
CrossentropyCategorical1Hot,
CrossentropyCategorical1HotGrad,
CrossentropySoftmax1HotWithBiasDx,
CrossentropySoftmaxArgmax1HotWithBias,
Prepend_scalar_constant_to_each_row,
Prepend_scalar_to_each_row,
Softmax,
SoftmaxGrad,
SoftmaxWithBias,
binary_crossentropy,
categorical_crossentropy,
confusion_matrix,
crossentropy_categorical_1hot,
crossentropy_softmax_1hot,
crossentropy_softmax_1hot_with_bias,
crossentropy_softmax_1hot_with_bias_dx,
crossentropy_softmax_argmax_1hot_with_bias,
elu,
h_softmax,
relu,
selu,
sigmoid_binary_crossentropy,
softmax,
softmax_grad_legacy,
softmax_legacy,
softmax_with_bias,
softsign,
)
from pytensor.tensor.shape import shape_padleft
from pytensor.tensor.subtensor import AdvancedSubtensor
from pytensor.tensor.type import (
dmatrix,
dvector,
fmatrix,
fvector,
ivector,
lvector,
matrix,
scalar,
tensor3,
tensor4,
vector,
vectors,
)
from tests import unittest_tools as utt
from tests.tensor.utils import (
_good_broadcast_unary_normal_float_no_complex,
check_floatX,
makeBroadcastTester,
upcast_int8_nfunc,
)
def softmax_graph(c):
    """Reference softmax built from elementary ops, normalized over the last axis."""
    denominator = exp(c).sum(axis=-1, keepdims=True)
    return exp(c) / denominator
def valid_axis_tester(Op):
    """Check that `Op` validates its axis argument.

    A non-integer axis must raise ``TypeError``; integer axes outside
    the rank of a 3-d input must raise ``ValueError``, while in-range
    axes (positive or negative) must be accepted.
    """
    with pytest.raises(TypeError):
        Op(1.5)

    inputs = [tensor3()] * Op.nin
    cases = [
        (2, does_not_raise()),
        (3, pytest.raises(ValueError)),
        (-3, does_not_raise()),
        (-4, pytest.raises(ValueError)),
    ]
    for axis, expectation in cases:
        with expectation:
            Op(axis)(*inputs)
class TestSoftmaxWithBias(utt.InferShapeTester):
    """Tests for the `SoftmaxWithBias` op and its rewrites."""

    def test_basic(self):
        """Gradient of each output column of `softmax_with_bias` is correct."""
        rng = np.random.default_rng(utt.fetch_seed())
        for col in range(4):

            def grad_target(a, b, col=col):
                return softmax_with_bias(a, b)[:, col]

            utt.verify_grad(grad_target, [rng.random((3, 4)), rng.random(4)])

    def test_broadcast(self):
        """
        Test that we don't raise an error during rewriting for no good reason
        as `softmax_with_bias` don't support correctly some/all broadcasted
        inputs pattern.
        """
        weights = np.asarray(
            [[0.1, 0.1, 0.1], [0.1, 0.1, 0.1], [0.1, 0.1, 0.1]],
            dtype=config.floatX,
        )
        W = pytensor.shared(value=weights, name="W")
        vbias = pytensor.shared(value=0.1, name="vbias")  # 0.01
        hid = vector("hid")
        f = pytensor.function([hid], softmax_legacy(dot(hid, W.T) + vbias))
        applied_ops = [node.op for node in f.maker.fgraph.toposort()]
        assert softmax_with_bias not in applied_ops
        assert softmax_legacy in applied_ops
        f([0, 1, 0])

    def test_softmax_with_bias_trace(self):
        """The rewritten graph keeps a stack trace on its final node."""
        rng = np.random.default_rng(utt.fetch_seed())
        a = pytensor.shared(rng.standard_normal((3,)).astype(config.floatX))
        b = pytensor.shared(np.float32(rng.standard_normal()))
        fn = pytensor.function([], softmax(a + b))
        assert check_stack_trace(fn, ops_to_check="last")

    def test_infer_shape(self):
        """Inferred shape of `SoftmaxWithBias` matches the computed output."""
        mat = matrix()
        vec = vector()
        rng = np.random.default_rng(utt.fetch_seed())
        mat_val = rng.random((3, 4)).astype(config.floatX)
        vec_val = rng.random(4).astype(config.floatX)
        self._compile_and_check(
            [mat, vec],
            [SoftmaxWithBias()(mat, vec)],
            [mat_val, vec_val],
            SoftmaxWithBias,
        )
class TestCrossEntropySoftmax1Hot:
    """Gradient checks for the fused cross-entropy/softmax helpers."""

    def test_basic(self):
        """NLL gradients, with and without a bias term."""
        targets = [0, 1, 3]
        rng = np.random.default_rng(utt.fetch_seed())

        def nll_with_bias(a, b):
            return crossentropy_softmax_1hot_with_bias(a, b, targets)[0]

        utt.verify_grad(nll_with_bias, [rng.random((3, 4)), rng.random(4)])

        def nll(a):
            return crossentropy_softmax_1hot(a, targets)[0]

        utt.verify_grad(nll, [rng.random((3, 4))])

    def test_vector(self):
        """A 1-d input padded to a batch of one row."""
        rng = np.random.default_rng(utt.fetch_seed())

        def nll(a):
            return crossentropy_softmax_1hot(shape_padleft(a), [3])[0]

        utt.verify_grad(nll, [rng.random((4,))])

    def test_vectors(self):
        """A padded 1-d input combined with a bias vector."""
        rng = np.random.default_rng(utt.fetch_seed())

        def nll(a, b):
            return crossentropy_softmax_1hot(shape_padleft(a) + b, [3])[0]

        utt.verify_grad(nll, [rng.random((4,)), rng.random(4)])
class TestCrossEntropySoftmax1HotWithBiasDx(utt.InferShapeTester):
    """Tests for the `CrossentropySoftmax1HotWithBiasDx` gradient op."""

    def test_basic(self):
        # Verify the gradient w.r.t. the softmax output, for every
        # supported integer dtype of the class-index vector.
        rng = np.random.default_rng(utt.fetch_seed())

        def ff(class_dtype):
            # Bind the index dtype; `verify_grad` differentiates only
            # w.r.t. the single argument `sm`.
            def f(sm):
                # Class indices
                y = rng.integers(low=0, high=5, size=10).astype(class_dtype)
                return crossentropy_softmax_1hot_with_bias_dx(
                    rng.random(10),  # Gradient w.r.t. NLL.
                    sm,  # Softmax output.
                    y,
                )

            return f

        # Build a random softmax output whose rows sum to 1.
        softmax_output = rng.random((10, 5))
        softmax_output /= softmax_output.sum(axis=1).reshape(10, 1)
        for dtype in ["uint8", "int8", "uint64", "int64"]:
            utt.verify_grad(ff(dtype), [softmax_output])

    def test_basic_2(self):
        # Gradient w.r.t. the NLL-gradient input `dy`, with the softmax
        # output held fixed.
        rng = np.random.default_rng(utt.fetch_seed())
        softmax_output = rng.random((10, 5))
        softmax_output /= softmax_output.sum(axis=1).reshape(10, 1)

        def f(dy):
            return crossentropy_softmax_1hot_with_bias_dx(
                dy, softmax_output, rng.integers(low=0, high=5, size=10)
            )

        utt.verify_grad(f, [rng.random(10)])

    def test_infer_shape(self):
        # Inferred output shape must agree with the computed output.
        admat = matrix()
        advec = vector()
        alvec = lvector()
        rng = np.random.default_rng(utt.fetch_seed())
        admat_val = rng.random((10, 5)).astype(config.floatX)
        admat_val /= admat_val.sum(axis=1).reshape(10, 1)
        advec_val = rng.random(10).astype(config.floatX)
        alvec_val = rng.integers(low=0, high=5, size=10)
        self._compile_and_check(
            [advec, admat, alvec],
            [CrossentropySoftmax1HotWithBiasDx()(advec, admat, alvec)],
            [advec_val, admat_val, alvec_val],
            CrossentropySoftmax1HotWithBiasDx,
        )

    def test_neg_idx(self):
        # A negative class index is invalid and must raise at run time.
        admat = matrix()
        advec = vector()
        alvec = lvector()
        rng = np.random.default_rng(utt.fetch_seed())
        admat_val = rng.random((10, 5)).astype(config.floatX)
        admat_val /= admat_val.sum(axis=1).reshape(10, 1)
        advec_val = rng.random(10).astype(config.floatX)
        alvec_val = rng.integers(low=0, high=5, size=10)
        alvec_val[1] = -1
        out = CrossentropySoftmax1HotWithBiasDx()(advec, admat, alvec)
        f = pytensor.function([advec, admat, alvec], out)
        with pytest.raises(ValueError):
            f(advec_val, admat_val, alvec_val)
class TestCrossEntropySoftmaxArgmax1HotWithBias(utt.InferShapeTester):
    """Tests for the fused `CrossentropySoftmaxArgmax1HotWithBias` op."""

    def setup_method(self):
        # The op under test, as its user-facing helper function.
        self.op = crossentropy_softmax_argmax_1hot_with_bias
        super().setup_method()

    def test_grads(self):
        n_classes = 5
        n_samples = 3

        rng = np.random.default_rng(utt.fetch_seed())

        # First test gradient when getting a gradient on the NLL output.
        def grad_on_nll_dtype(dtype):
            # Bind the class-index dtype for `verify_grad`.
            def grad_on_nll(x, b):
                y_idx = rng.integers(low=0, high=n_classes, size=n_samples).astype(
                    dtype
                )
                return self.op(x, b, y_idx=y_idx)[0]

            return grad_on_nll

        for dtype in ["uint8", "int8", "uint64", "int64"]:
            utt.verify_grad(
                grad_on_nll_dtype(dtype),
                [
                    rng.random((n_samples, n_classes)),
                    rng.random(n_classes),
                ],
            )

        # Then test gradient when getting a gradient on the softmax output.
        def grad_on_softmax(x, b):
            return self.op(
                x,
                b,
                y_idx=rng.integers(low=0, high=n_classes, size=n_samples),
            )[1]

        utt.verify_grad(
            grad_on_softmax,
            [rng.random((n_samples, n_classes)), rng.random(n_classes)],
        )

    def test_infer_shape(self):
        # Inferred shapes of all three outputs must match computed ones.
        admat = matrix()
        advec = vector()
        alvec = lvector()
        rng = np.random.default_rng(utt.fetch_seed())
        admat_val = rng.random((3, 5)).astype(config.floatX)
        advec_val = rng.random(5).astype(config.floatX)
        alvec_val = rng.integers(low=0, high=5, size=3)
        self._compile_and_check(
            [admat, advec, alvec],
            CrossentropySoftmaxArgmax1HotWithBias()(admat, advec, alvec),
            [admat_val, advec_val, alvec_val],
            CrossentropySoftmaxArgmax1HotWithBias,
        )

    def test_neg_idx(self):
        # A negative class index is invalid and must raise at run time.
        admat = matrix()
        advec = vector()
        alvec = lvector()
        rng = np.random.default_rng(utt.fetch_seed())
        admat_val = rng.random((3, 5)).astype(config.floatX)
        advec_val = rng.random(5).astype(config.floatX)
        alvec_val = rng.integers(low=0, high=5, size=3)
        alvec_val[1] = -1
        out = CrossentropySoftmaxArgmax1HotWithBias()(admat, advec, alvec)
        f = pytensor.function([admat, advec, alvec], out)
        with pytest.raises(ValueError):
            f(admat_val, advec_val, alvec_val)
class TestPrepend(utt.InferShapeTester):
    """Tests for the Ops that prepend a scalar column to each row of a matrix."""

    def test_prepend_constant(self):
        # The scalar is fixed at Op-construction time.
        x = matrix("x")
        y = Prepend_scalar_constant_to_each_row(4.0)(x)
        f = pytensor.function([x], y)
        rng = np.random.default_rng(utt.fetch_seed())
        m = rng.random((3, 5)).astype(config.floatX)
        my = f(m)
        # One extra column, filled with the constant.
        assert my.shape == (3, 6)
        assert np.all(my[:, 0] == 4.0)

    def test_prepend_basic(self):
        """Test basic functionality."""
        # Here the scalar is a runtime input rather than a constant.
        x = matrix("x")
        y = Prepend_scalar_to_each_row()(5.0, x)
        f = pytensor.function([x], y)
        m = np.ones((3, 5), dtype="float32")
        my = f(m)
        assert my.shape == (3, 6)
        assert np.all(my[:, 0] == 5.0)

    def test_infer_shape(self):
        admat = matrix()
        adscal = scalar()
        rng = np.random.default_rng(utt.fetch_seed())
        admat_val = rng.random((3, 5)).astype(config.floatX)
        adscal_val = np.asarray(rng.random(), dtype=config.floatX).item()
        self._compile_and_check(
            [admat],
            [Prepend_scalar_constant_to_each_row(adscal_val)(admat)],
            [admat_val],
            Prepend_scalar_constant_to_each_row,
        )
        self._compile_and_check(
            [adscal, admat],
            [Prepend_scalar_to_each_row()(adscal, admat)],
            [adscal_val, admat_val],
            Prepend_scalar_to_each_row,
        )
class TestCrossEntropyCategorical1HotGrad(utt.InferShapeTester):
    """Shape-inference test for the gradient Op of categorical crossentropy."""

    def test_infer_shape(self):
        grad_in = vector()
        probs_in = matrix()
        labels_in = lvector()
        rng = np.random.default_rng(utt.fetch_seed())
        grad_val = rng.random(3).astype(config.floatX)
        probs_val = rng.random((3, 2)).astype(config.floatX)
        labels_val = [0, 1, 0]
        self._compile_and_check(
            [grad_in, probs_in, labels_in],
            [CrossentropyCategorical1HotGrad()(grad_in, probs_in, labels_in)],
            [grad_val, probs_val, labels_val],
            CrossentropyCategorical1HotGrad,
        )
class TestCrossEntropyCategorical1Hot(utt.InferShapeTester):
    """Tests for `crossentropy_categorical_1hot` and the graph rewrites that
    replace softmax + crossentropy patterns with the fused Ops."""

    def test_input_validation(self):
        # The Op requires a matrix of probabilities and an integer label vector.
        with pytest.raises(TypeError, match="Matrix.*"):
            crossentropy_categorical_1hot(vector(), lvector())
        with pytest.raises(TypeError, match="Integer.*"):
            crossentropy_categorical_1hot(matrix(), vector())

    def test_grad(self):
        x = matrix("x")
        one_of_n = lvector("one_of_n")
        op = crossentropy_categorical_1hot
        xe = op(x, one_of_n)
        f = pytensor.function([x, one_of_n], xe)
        x_val = np.asarray([[0.4, 0.6, 0.0], [0.1, 0.8, 0.1]], dtype=config.floatX)
        xe_val = f(x_val, [0, 1])
        # Crossentropy against a one-hot target is -log of the selected entry.
        assert np.allclose(xe_val, -np.log([0.4, 0.8]))

        def oplike(x):
            return op(x, [0, 1])

        rng = np.random.default_rng(utt.fetch_seed())
        utt.verify_grad(oplike, [x_val], rng=rng)

    def test_infer_shape(self):
        admat = matrix()
        alvec = lvector()
        rng = np.random.default_rng(utt.fetch_seed())
        admat_val = rng.random((3, 2)).astype(config.floatX)
        alvec_val = [0, 1, 0]
        self._compile_and_check(
            [admat, alvec],
            [CrossentropyCategorical1Hot()(admat, alvec)],
            [admat_val, alvec_val],
            CrossentropyCategorical1Hot,
        )

    def test_softmax_rewrites(self):
        x = matrix("x")
        one_of_n = lvector("one_of_n")
        op = crossentropy_categorical_1hot
        # xe = op(x, one_of_n)
        fgraph = FunctionGraph([x, one_of_n], [op(softmax_legacy(x), one_of_n)])
        assert fgraph.outputs[0].owner.op == op
        optdb.query(OPT_FAST_RUN).rewrite(fgraph)
        # softmax + crossentropy should fuse into the single argmax-with-bias Op.
        assert fgraph.outputs[0].owner.op == crossentropy_softmax_argmax_1hot_with_bias

    def test_softmax_rewrites_w_bias(self):
        # Same fusion, with the bias absorbed from the `x + b` addition.
        x = matrix("x")
        b = vector("b")
        one_of_n = lvector("one_of_n")
        op = crossentropy_categorical_1hot
        fgraph = FunctionGraph([x, b, one_of_n], [op(softmax_legacy(x + b), one_of_n)])
        assert fgraph.outputs[0].owner.op == op
        optdb.query(OPT_FAST_RUN).rewrite(fgraph)
        # The whole pattern collapses into a single node.
        assert len(fgraph.toposort()) == 1
        assert fgraph.outputs[0].owner.op == crossentropy_softmax_argmax_1hot_with_bias

    def test_softmax_rewrites_w_bias2(self):
        # With two bias vectors, one addition remains plus the fused Op.
        x = matrix("x")
        b = vector("b")
        c = vector("c")
        one_of_n = lvector("one_of_n")
        op = crossentropy_categorical_1hot
        fgraph = FunctionGraph(
            [x, b, c, one_of_n], [op(softmax_legacy(add(x, b, c)), one_of_n)]
        )
        assert fgraph.outputs[0].owner.op == op
        optdb.query(OPT_FAST_RUN).rewrite(fgraph)
        assert len(fgraph.toposort()) == 2
        assert fgraph.outputs[0].owner.op == crossentropy_softmax_argmax_1hot_with_bias

    def test_softmax_grad_rewrites(self):
        # The gradient of the fused loss should use the dedicated dx Op
        # rather than the generic softmax gradient.
        x = matrix("x")
        one_of_n = lvector("one_of_n")
        op = crossentropy_categorical_1hot
        xe = op(softmax_legacy(x), one_of_n)
        sum_xe = at_sum(xe)
        g_x = grad(sum_xe, x)
        fgraph = FunctionGraph([x, one_of_n], [g_x])
        assert check_stack_trace(
            fgraph,
            ops_to_check=[crossentropy_softmax_1hot_with_bias_dx, softmax_legacy],
        )
        optdb.query(OPT_FAST_RUN).rewrite(fgraph)
        ops = {node.op for node in fgraph.toposort()}
        assert crossentropy_softmax_argmax_1hot_with_bias not in ops
        assert crossentropy_softmax_1hot_with_bias_dx in ops
        assert softmax_legacy in ops
        assert softmax_grad_legacy not in ops

    def test_get_rid_of_advanced_indexing_version_of_xent(self):
        # Crossentropy written via advanced indexing,
        # e.g. -log(softmax(x))[arange(n), y], should be rewritten into the
        # fused Op with no AdvancedSubtensor left in the graph.
        x = matrix("x")
        b = vector("b")
        y = lvector("y")
        # Basic case
        expressions = [
            at_sum(-log(softmax(x)[at.arange(y.shape[0]), y])),
            -at_sum(log(softmax(x)[at.arange(y.shape[0]), y])),
            -at_sum(log(softmax(x))[at.arange(y.shape[0]), y]),
            at_sum(-log(softmax(x))[at.arange(y.shape[0]), y]),
        ]
        for expr in expressions:
            fgraph = FunctionGraph([x, y], [expr])
            optdb.query(OPT_FAST_RUN).rewrite(fgraph)
            ops = [node.op for node in fgraph.toposort()]
            assert len(ops) == 4
            assert crossentropy_softmax_argmax_1hot_with_bias in ops
            assert not [1 for o in ops if isinstance(o, AdvancedSubtensor)]
            # Also verify the gradient wrt x
            fgraph = FunctionGraph([x, y], [grad(expr, x)])
            optdb.query(OPT_FAST_RUN).rewrite(fgraph)
            ops = [node.op for node in fgraph.toposort()]
            assert len(ops) == 2
            assert crossentropy_softmax_1hot_with_bias_dx in ops
            assert softmax_legacy in ops
            assert softmax_grad_legacy not in ops
        # Test that a biased softmax is rewritten correctly
        bias_expressions = [
            at_sum(-log(softmax(x + b)[at.arange(y.shape[0]), y])),
            -at_sum(log(softmax(b + x)[at.arange(y.shape[0]), y])),
            -at_sum(log(softmax(x + b))[at.arange(y.shape[0]), y]),
            at_sum(-log(softmax(b + x))[at.arange(y.shape[0]), y]),
        ]
        for expr in bias_expressions:
            fgraph = FunctionGraph([x, b, y], [expr, x])
            optdb.query(OPT_FAST_RUN).rewrite(fgraph)
            ops = [node.op for node in fgraph.toposort()]
            assert len(ops) == 2  # [big_op, sum]
            assert crossentropy_softmax_argmax_1hot_with_bias in ops
            fgraph = FunctionGraph([x, b, y], [grad(expr, x)])
            optdb.query(OPT_FAST_RUN).rewrite(fgraph)
            ops = [node.op for node in fgraph.toposort()]
            assert len(ops) == 2
            assert crossentropy_softmax_1hot_with_bias_dx in ops
            assert softmax_with_bias in ops
            assert softmax_grad_legacy not in ops
        # Test that using "mean" instead of sum works, too
        mean_expressions = [
            mean(-log(softmax(x)[at.arange(y.shape[0]), y])),
            -mean(log(softmax(x)[at.arange(y.shape[0]), y])),
            -mean(log(softmax(x))[at.arange(y.shape[0]), y]),
            mean(-log(softmax(x))[at.arange(y.shape[0]), y]),
        ]
        for expr in mean_expressions:
            fgraph = FunctionGraph([x, y], [expr])
            optdb.query(OPT_FAST_RUN).rewrite(fgraph)
            ops = [node.op for node in fgraph.toposort()]
            assert len(ops) == 6
            assert crossentropy_softmax_argmax_1hot_with_bias in ops
            assert not [1 for o in ops if isinstance(o, AdvancedSubtensor)]
            fgraph = FunctionGraph([x, y], [grad(expr, x)])
            optdb.query(OPT_FAST_RUN).rewrite(fgraph)
            ops = [node.op for node in fgraph.toposort()]
            assert len(ops) == 5
            # there's an extra dimshuffle in there
            # but I can't think of a good rule to get rid of it
            assert crossentropy_softmax_1hot_with_bias_dx in ops
            assert softmax_legacy in ops
            assert softmax_grad_legacy not in ops
        mean_bias_expressions = [
            mean(-log(softmax(x + b)[at.arange(y.shape[0]), y])),
            -mean(log(softmax(b + x)[at.arange(y.shape[0]), y])),
            -mean(log(softmax(x + b))[at.arange(y.shape[0]), y]),
            mean(-log(softmax(b + x))[at.arange(y.shape[0]), y]),
        ]
        for expr in mean_bias_expressions:
            fgraph = FunctionGraph([x, b, y], [expr])
            optdb.query(OPT_FAST_RUN).rewrite(fgraph)
            ops = [node.op for node in fgraph.toposort()]
            assert len(ops) == 4
            assert crossentropy_softmax_argmax_1hot_with_bias in ops
            assert not [1 for o in ops if isinstance(o, AdvancedSubtensor)]
            fgraph = FunctionGraph([x, b, y], [grad(expr, x)])
            optdb.query(OPT_FAST_RUN).rewrite(fgraph)
            ops = [node.op for node in fgraph.toposort()]
            assert len(ops) == 5
            assert crossentropy_softmax_1hot_with_bias_dx in ops
            assert softmax_with_bias in ops
            assert softmax_grad_legacy not in ops

    def test_xent_thing_int32(self):
        # Same rewrite must trigger when labels are cast to int32.
        x = matrix("x")
        y = lvector("y")
        yi = at.cast(y, "int32")
        expressions = [
            at_sum(-log(softmax(x)[at.arange(yi.shape[0]), yi])),
            -at_sum(log(softmax(x)[at.arange(yi.shape[0]), yi])),
            -at_sum(log(softmax(x))[at.arange(yi.shape[0]), yi]),
            at_sum(-log(softmax(x))[at.arange(yi.shape[0]), yi]),
        ]
        for expr in expressions:
            fgraph = FunctionGraph([x, y], [expr])
            optdb.query(OPT_FAST_RUN).rewrite(fgraph)
            ops = [node.op for node in fgraph.toposort()]
            assert len(ops) == 5
            assert crossentropy_softmax_argmax_1hot_with_bias in ops
            assert not [1 for o in ops if isinstance(o, AdvancedSubtensor)]
            # Also verify the gradient wrt x
            fgraph = FunctionGraph([x, y], [grad(expr, x)])
            optdb.query(OPT_FAST_RUN).rewrite(fgraph)
            ops = [node.op for node in fgraph.toposort()]
            assert len(ops) == 3
            assert crossentropy_softmax_1hot_with_bias_dx in ops
            assert softmax_legacy in ops
            assert softmax_grad_legacy not in ops

    def test_crossentropy_softmax_1hot_with_bias_dxcale_cost(self):
        # Rewrites must also trigger when the loss is scaled by a variable `a`.
        x = matrix("x")
        y = lvector("y")
        a = scalar("a")

        # NOTE(review): defined but never called in this test — kept as-is.
        def validate_grad_graph(func):
            # The graph of the gradient should not have softmaxgrad anymore
            has_cx1hotdx = False
            has_softmax = False
            has_softmaxdx = False
            for node in func.maker.fgraph.toposort():
                if node.op == crossentropy_softmax_1hot_with_bias_dx:
                    has_cx1hotdx = True
                if node.op == softmax_legacy:
                    has_softmax = True
                if node.op == softmax_grad_legacy:
                    has_softmaxdx = True
            assert has_cx1hotdx
            assert has_softmax
            assert not has_softmaxdx

        # Cases to test
        expressions = [
            a * at_sum(-log(softmax(x)[at.arange(y.shape[0]), y])),
            -a * at_sum(log(softmax(x)[at.arange(y.shape[0]), y])),
            a * (-at_sum(log(softmax(x)[at.arange(y.shape[0]), y]))),
            a * at_sum(log(softmax(x)[at.arange(y.shape[0]), y])),
            a * at_sum(-log(softmax(x))[at.arange(y.shape[0]), y]),
            -a * at_sum(log(softmax(x))[at.arange(y.shape[0]), y]),
            a * (-at_sum(log(softmax(x))[at.arange(y.shape[0]), y])),
            a * at_sum(log(softmax(x))[at.arange(y.shape[0]), y]),
            a * mean(-log(softmax(x)[at.arange(y.shape[0]), y])),
            -a * mean(log(softmax(x)[at.arange(y.shape[0]), y])),
            a * (-mean(log(softmax(x)[at.arange(y.shape[0]), y]))),
            a * mean(log(softmax(x)[at.arange(y.shape[0]), y])),
            a * mean(-log(softmax(x))[at.arange(y.shape[0]), y]),
            -a * mean(log(softmax(x))[at.arange(y.shape[0]), y]),
            a * (-mean(log(softmax(x))[at.arange(y.shape[0]), y])),
            a * mean(log(softmax(x))[at.arange(y.shape[0]), y]),
        ]
        for expr in expressions:
            fgraph = FunctionGraph([x, y, a], [expr])
            optdb.query(OPT_FAST_RUN).rewrite(fgraph)
            assert 5 <= len(fgraph.toposort()) <= 10
            ops = {node.op for node in fgraph.toposort()}
            assert crossentropy_softmax_argmax_1hot_with_bias in ops
            assert softmax_legacy not in ops
            # Verify the gradient wrt x
            fgraph = FunctionGraph([x, y, a], [grad(expr, x)])
            optdb.query(OPT_FAST_RUN).rewrite(fgraph)
            assert 3 <= len(fgraph.toposort()) <= 6
            ops = {node.op for node in fgraph.toposort()}
            assert crossentropy_softmax_1hot_with_bias_dx in ops
            assert softmax_legacy in ops
            assert softmax_grad_legacy not in ops
            # Verify the gradient when providing output gradient
            fgraph = FunctionGraph(
                [x, y, a], [grad(expr, x, known_grads={expr: a * x.sum()})]
            )
            optdb.query(OPT_FAST_RUN).rewrite(fgraph)
            assert 6 <= len(fgraph.toposort()) <= 8
            ops = {node.op for node in fgraph.toposort()}
            assert crossentropy_softmax_1hot_with_bias_dx in ops
            assert softmax_legacy in ops
            assert softmax_grad_legacy not in ops
def test_argmax_pushdown():
    """The argmax-pushdown rewrite should drop monotonic elemwise wrappers
    (softmax, exp, tanh, sigmoid) when only the argmax is needed."""
    x = matrix()
    for sm in [softmax_graph, softmax_legacy]:
        # test that the max_and_argmax is pushed down if the max is not used
        out = max_and_argmax(sm(exp(tanh(sigmoid(x)))), axis=-1)[1]
        fgraph = FunctionGraph([x], [out])
        optdb.query(OPT_FAST_RUN).rewrite(fgraph)
        # print 'AFTER'
        # for node in fgraph.toposort():
        # print node.op
        # Everything collapses to a single Argmax node.
        assert len(fgraph.toposort()) == 1
        assert isinstance(fgraph.toposort()[0].op, Argmax)
        assert check_stack_trace(fgraph, ops_to_check=Argmax)
        x = matrix()
        # test that the max_and_argmax is not pushed down if the max is used
        out = max_and_argmax(sm(exp(tanh(sigmoid(x)))), axis=-1)[0]
        fgraph = FunctionGraph([x], [out])
        assert hasattr(fgraph.outputs[0].tag, "trace")
        optdb.query(OPT_FAST_RUN).rewrite(fgraph)
        # print 'AFTER'
        # for node in fgraph.toposort():
        # print node.op
        # The elemwise chain, the softmax, and the max reduction all remain.
        assert len(fgraph.toposort()) == 3
        assert isinstance(fgraph.toposort()[0].op, Elemwise)
        assert isinstance(fgraph.toposort()[1].op, Softmax)
        assert isinstance(fgraph.toposort()[2].op, CAReduce)
        assert isinstance(
            fgraph.toposort()[2].op.scalar_op, pytensor.scalar.ScalarMaximum
        )
def test_argmax_pushdown_bias():
    """Argmax pushdown must also work through `softmax_with_bias`: the softmax
    disappears when only the argmax is needed, and stays when the max is used."""
    x = matrix()
    b = vector()
    out = argmax(softmax_with_bias(x, b), axis=-1)
    fgraph = FunctionGraph([x, b], [out])
    optdb.query(OPT_FAST_RUN).rewrite(fgraph)
    types_to_check = (DimShuffle, Elemwise, Argmax)
    assert len(fgraph.toposort()) == 3
    # Renamed the loop variable from `type` to avoid shadowing the builtin.
    for i, expected_type in enumerate(types_to_check):
        assert isinstance(fgraph.toposort()[i].op, expected_type)
    assert check_stack_trace(fgraph, ops_to_check=types_to_check)
    x = matrix()
    b = vector()
    # When the max itself is requested, the softmax computation must remain.
    out = max_and_argmax(softmax_with_bias(x, b), axis=-1)[0]
    fgraph = FunctionGraph([x, b], [out])
    optdb.query(OPT_FAST_RUN).rewrite(fgraph)
    assert len(fgraph.toposort()) == 2
    assert isinstance(fgraph.toposort()[0].op, SoftmaxWithBias)
    assert isinstance(fgraph.toposort()[1].op, CAReduce)
    assert isinstance(fgraph.toposort()[1].op.scalar_op, pytensor.scalar.ScalarMaximum)
    assert check_stack_trace(fgraph, ops_to_check=(SoftmaxWithBias, CAReduce))
def test_asymptotic_32():
    """Test that our functions behave sensibly when huge values are present."""
    # TODO: consider adding the rewrite of crossentropy into the current
    # mode for the purpose of running this test
    for dtype in "float32", "float64":
        if dtype == "float32":
            x = fmatrix()
            x2 = fvector()
        else:
            x = dmatrix()
            x2 = dvector()
        y = lvector()
        c = categorical_crossentropy(softmax(x + x2), y)
        f = pytensor.function([x, y, x2], [c.sum(), grad(c.sum(), x)], mode="FAST_RUN")
        xval = np.zeros((5, 5), dtype=dtype).astype(dtype)
        x2val = np.zeros(5, dtype=xval.dtype).astype(dtype)
        for i in range(100):
            # Gradient descent: loss should be driven to exactly zero.
            cval, gxval = f(xval, np.arange(5), x2val)
            xval -= 100.3 * gxval
        assert cval == 0  # no problem going to zero error
        # what about when x gets really big?
        xval = np.zeros((5, 5), dtype=dtype)
        x2val = np.zeros(5, dtype=xval.dtype)
        for i in range(100):
            # Gradient ascent drives the loss to huge values; the gradient
            # must stay finite and saturate at the expected values.
            cval, gxval = f(xval, np.arange(5), x2val)
            xval += 100000.3 * gxval
        assert cval > 61750000
        assert gxval[0, 0] == -1.0
        assert gxval[0, 1] == 0.25
class TestSoftmaxRewrite:
    """
    Test that expressions of softmax in terms of exponentiated things
    divided by row sums are replaced by softmax expressions.

    `Softmax_grad` isn't that interesting as an Op, but it has the signature
    we look for when trying to insert `CrossEntropySoftmax` grad. So, for
    now, we add `softmax_grad` to graphs. In the future, we may modify the
    `CrossEntropySoftmax` grad to look for the more basic pattern.
    """

    def setup_method(self):
        # The softmax rewrite lives in the "canonicalize" pass.
        self.mode = pytensor.compile.mode.get_default_mode()
        self.mode = self.mode.including("canonicalize")

    @pytest.mark.parametrize("axis", [None, 0, 1, -1, (0, 1)])
    def test_basic(self, axis):
        c = matrix()
        # Build exp(c) / sum(exp(c)) with the dimshuffle that restores the
        # summed-out dimensions for the given axis.
        if axis is None:
            p_y = exp(c) / exp(c).sum(axis=axis).dimshuffle("x", "x")
        elif axis == 0:
            p_y = exp(c) / exp(c).sum(axis=axis).dimshuffle("x", 0)
        elif axis == (0, 1):
            p_y = exp(c) / exp(c).sum(axis=axis).dimshuffle("x", "x")
        else:
            p_y = exp(c) / exp(c).sum(axis=axis).dimshuffle(0, "x")
        # test that function contains softmax and no div.
        f = pytensor.function([c], p_y, mode=self.mode)
        assert check_stack_trace(f, ops_to_check=Softmax)
        f_ops = [n.op for n in f.maker.fgraph.toposort()]
        assert len(f_ops) == 1
        assert isinstance(f_ops[0], Softmax)
        rng = np.random.default_rng(utt.fetch_seed())
        c_val = rng.random((3, 4)).astype(config.floatX)
        assert np.allclose(f(c_val), sp.softmax(c_val, axis=axis))

    @pytest.mark.parametrize("axis", [None, 0, 1, 2, -1, -2, -3, (0, 1, 2)])
    def test_basic_keepdims(self, axis):
        # Same rewrite via sum(..., keepdims=True) instead of a dimshuffle.
        c = tensor3()
        p_y = exp(c) / exp(c).sum(axis=axis, keepdims=True)
        # test that function contains softmax and no div.
        f = pytensor.function([c], p_y, mode=self.mode)
        assert check_stack_trace(f, ops_to_check=Softmax)
        f_ops = [n.op for n in f.maker.fgraph.toposort()]
        assert len(f_ops) == 1
        assert isinstance(f_ops[0], Softmax)
        rng = np.random.default_rng(utt.fetch_seed())
        c_val = rng.random((3, 4, 5)).astype(config.floatX)
        assert np.allclose(f(c_val), sp.softmax(c_val, axis=axis))

    @pytest.mark.skip(reason="Rewrite not enabled for the moment")
    def test_grad(self):
        c = matrix()
        p_y = exp(c) / exp(c).sum(axis=1).dimshuffle(0, "x")
        # test that function contains softmax and softmaxgrad
        w = matrix()
        g = pytensor.function([c, w], grad((p_y * w).sum(), c), mode=self.mode)
        g_ops = [n.op for n in g.maker.fgraph.toposort()]
        assert len(g_ops) == 2, g_ops
        assert isinstance(g_ops[0], Softmax)
        assert isinstance(g_ops[1], SoftmaxGrad)
        rng = np.random.default_rng(utt.fetch_seed())
        g(rng.random((3, 4)), rng.uniform(0.5, 1, (3, 4)))

    def test_transpose_basic(self):
        # this should be a transposed softmax
        c = matrix()
        p_y = exp(c) / exp(c).sum(axis=0)
        # test that function contains softmax and no div.
        f = pytensor.function([c], p_y, mode=self.mode)
        f_ops = [n.op for n in f.maker.fgraph.toposort()]
        assert len(f_ops) == 1
        assert isinstance(f_ops[0], Softmax)

    @pytest.mark.skip(reason="Rewrite not enabled for the moment")
    def test_transpose_grad(self):
        # this should be a transposed softmax
        c = matrix()
        p_y = exp(c) / exp(c).sum(axis=0)
        # test that function contains softmax and no div.
        g = pytensor.function([c], grad(p_y.sum(), c), mode=self.mode)
        g_ops = [n.op for n in g.maker.fgraph.toposort()]
        assert len(g_ops) == 2
        assert isinstance(g_ops[0], Softmax)
        assert isinstance(g_ops[1], SoftmaxGrad)

    def test_1d_basic(self):
        c = vector()
        p_y = exp(c) / exp(c).sum()
        # test that function contains softmax and no div.
        f = pytensor.function([c], p_y, mode=self.mode)
        f_ops = [n.op for n in f.maker.fgraph.toposort()]
        assert len(f_ops) == 1
        assert isinstance(f_ops[0], Softmax)

    @pytest.mark.skip(reason="Rewrite not enabled for the moment")
    def test_1D_grad(self):
        c = vector()
        p_y = exp(c) / exp(c).sum()
        # test that function contains softmax and no div.
        g = pytensor.function([c], grad(p_y.sum(), c), mode=self.mode)
        g_ops = [n.op for n in g.maker.fgraph.toposort()]
        assert len(g_ops) == 2
        assert isinstance(g_ops[0], Softmax)
        assert isinstance(g_ops[1], SoftmaxGrad)

    @pytest.mark.parametrize(
        "f",
        [
            lambda c: exp(c) / exp(c).sum(axis=0).dimshuffle(0, 1, "x"),
            lambda c: exp(c) / exp(c).sum(axis=0).dimshuffle("x", 0, 1, "x"),
            lambda c: exp(c) / exp(c).sum(axis=0).dimshuffle("x", 1, 0),
            lambda c: exp(c) / exp(c).sum(axis=(0, 1), keepdims=True),
        ],
    )
    def test_invalid_softmax_expressions(self, f):
        # Test that graphs are not rewritten into a softmax when a dimshuffle
        # swaps or adds extra dimensions, or when more than one but not all axis
        # are summed over (which is not allowed by the Softmax Op but otherwise
        # valid)
        c = tensor3("c")
        out = f(c)
        f = pytensor.function([c], out, mode=self.mode)
        f_ops = [n.op for n in f.maker.fgraph.toposort()]
        assert len(f_ops) > 1
        assert not any(isinstance(op, Softmax) for op in f_ops)
def test_softmax_graph():
    """Gradient check for `softmax_graph` driven through `known_grads`."""
    rng = np.random.default_rng(utt.fetch_seed())
    x = pytensor.shared(rng.normal(size=(3, 4)))

    def backprop_through_softmax(output_grads):
        sm = softmax_graph(x)
        return pytensor.grad(None, x, known_grads={sm: output_grads})

    utt.verify_grad(backprop_through_softmax, [rng.random((3, 4))])
def test_grad_softmax_grad():
    """Gradient check for `softmax_legacy` driven through `known_grads`."""
    rng = np.random.default_rng(utt.fetch_seed())
    x = pytensor.shared(rng.normal(size=(3, 4)))

    def backprop_through_softmax(output_grads):
        sm = softmax_legacy(x)
        return pytensor.grad(None, x, known_grads={sm: output_grads})

    utt.verify_grad(backprop_through_softmax, [rng.random((3, 4))])
def test_relu():
    """Check `relu` against NumPy references for constant, symbolic, and
    ndarray alpha values."""
    x = matrix("x")
    rng = np.random.default_rng(utt.fetch_seed())
    data = rng.standard_normal((20, 30)).astype(config.floatX)
    # Base case: default alpha behaves as max(x, 0).
    out = relu(x).eval({x: data})
    assert np.allclose(out, np.maximum(data, 0))
    # Constant alpha values, including values outside [0, 1].
    for alpha in [0, 0.3, 1, 2, -0.3, -1, -2]:
        out = relu(x, alpha).eval({x: data})
        assert np.allclose(out, np.where(data > 0, data, alpha * data))
    # Symbolic alpha of increasing rank: scalar, vector, matrix.
    for alpha in [scalar(), vector(), matrix()]:
        # Value for alpha with the right ndim, broadcastable against `data`.
        alpha_val = np.array(
            rng.standard_normal(data.shape[::-1][: alpha.ndim][::-1]),
            dtype=config.floatX,
        )
        out = relu(x, alpha).eval({x: data, alpha: alpha_val})
        assert np.allclose(
            out, np.where(data > 0, data, alpha_val * data), rtol=3e-5
        )
    # An ndarray alpha must not trigger an upcast of the result dtype.
    x = matrix("x", dtype="float32")
    data = rng.standard_normal((20, 30)).astype("float32")
    alpha = np.asarray(0.123, dtype="float32")
    out = relu(x, alpha).eval({x: data})
    assert np.allclose(out, np.where(data > 0, data, alpha * data))
    assert out.dtype == "float32"
def test_h_softmax():
    """Tests the output dimensions of the `h_softmax` when a target is provided or not."""
    input_size = 4
    batch_size = 2
    h_softmax_level1_size = 5
    h_softmax_level2_size = 3
    # The full output space is the product of the two tree levels.
    output_size = h_softmax_level1_size * h_softmax_level2_size
    rng = np.random.default_rng(utt.fetch_seed())
    # First level of h_softmax
    W1 = np.asarray(
        rng.normal(size=(input_size, h_softmax_level1_size)), dtype=config.floatX
    )
    W1 = pytensor.shared(W1)
    b1 = pytensor.shared(
        np.asarray(np.zeros((h_softmax_level1_size,)), dtype=config.floatX)
    )
    # Second level of h_softmax
    W2 = np.asarray(
        rng.normal(size=(h_softmax_level1_size, input_size, h_softmax_level2_size)),
        dtype=config.floatX,
    )
    W2 = pytensor.shared(W2)
    b2 = pytensor.shared(
        np.asarray(
            np.zeros((h_softmax_level1_size, h_softmax_level2_size)),
            dtype=config.floatX,
        )
    )
    x = matrix("x")
    y = ivector("y")
    # This only computes the output corresponding to the target
    y_hat_tg = h_softmax(
        x,
        batch_size,
        output_size,
        h_softmax_level1_size,
        h_softmax_level2_size,
        W1,
        b1,
        W2,
        b2,
        y,
    )
    # This computes all the outputs
    y_hat_all = h_softmax(
        x,
        batch_size,
        output_size,
        h_softmax_level1_size,
        h_softmax_level2_size,
        W1,
        b1,
        W2,
        b2,
    )
    fun_output_tg = pytensor.function([x, y], y_hat_tg)
    fun_output = pytensor.function([x], y_hat_all)
    x_mat = rng.normal(size=(batch_size, input_size)).astype(config.floatX)
    y_mat = rng.integers(0, output_size, batch_size).astype("int32")
    tg_output = fun_output_tg(x_mat, y_mat)
    all_outputs = fun_output(x_mat)
    # With a target: one probability per sample; without: full distribution.
    assert tg_output.shape == (batch_size,)
    assert all_outputs.shape == (batch_size, output_size)
    # Verifies that the outputs computed by fun_output_tg are the same as those
    # computed by fun_output.
    utt.assert_allclose(all_outputs[np.arange(0, batch_size), y_mat], tg_output)
def test_elu():
    """Check `elu` against the NumPy reference for the default and custom alphas."""
    x = matrix("x")
    rng = np.random.default_rng(utt.fetch_seed())
    data = rng.standard_normal((20, 30)).astype(config.floatX)
    # Default alpha (1): elu(x) = x for x > 0, exp(x) - 1 otherwise.
    out = elu(x).eval({x: data})
    utt.assert_allclose(out, np.where(data > 0, data, np.exp(data) - 1))
    # Custom constant alphas, including negative ones.
    for alpha in [1.5, 2, -1, -1.5, -2]:
        out = elu(x, alpha).eval({x: data})
        expected = np.where(data > 0, data, alpha * (np.exp(data) - 1))
        utt.assert_allclose(out, expected)
def test_selu():
    """Check `selu` against the NumPy reference with the canonical constants."""
    # Fixed constants from the SELU paper (self-normalizing networks).
    alpha = 1.6732632423543772848170429916717
    scale = 1.0507009873554804934193349852946
    x = matrix("x")
    rng = np.random.default_rng(utt.fetch_seed())
    data = rng.standard_normal((20, 30)).astype(config.floatX)
    out = selu(x).eval({x: data})
    expected = np.where(data > 0, scale * data, scale * alpha * (np.exp(data) - 1))
    utt.assert_allclose(out, expected)
def test_binary_crossentropy_reshape():
    # Reported as https://github.com/Theano/Theano/issues/4086
    # The gradient through sigmoid+binary_crossentropy must stay numerically
    # stable when a reshape sits before or after the sigmoid.
    a = tensor4("a")
    for c in (
        binary_crossentropy(sigmoid(a.reshape((-1, 1))), 1).sum(),
        binary_crossentropy(sigmoid(a).reshape((-1, 1)), 1).sum(),
    ):
        ga = pytensor.grad(c, a)
        # This only works when "specialize" options are included
        mode = pytensor.compile.get_default_mode().including("fast_run")
        fga = pytensor.function([a], ga, mode=mode)
        # With target 1 and a large positive logit the gradient is ~0 — the
        # assertion fails if the stabilizing rewrite did not apply.
        utt.assert_allclose(
            fga(np.array([[[[30.0]]]], dtype=config.floatX)),
            np.zeros((1, 1, 1, 1), dtype=config.floatX),
        )
# Broadcast test suite checking `softsign` against the elementwise reference
# x / (1 + |x|) over the standard non-complex float unary-op inputs.
TestSoftsign = makeBroadcastTester(
    op=softsign,
    expected=upcast_int8_nfunc(
        lambda inputs: check_floatX(inputs, inputs / (1.0 + np.fabs(inputs)))
    ),
    good=_good_broadcast_unary_normal_float_no_complex,
    name="SoftsignTester",
)
class TestSigmoidBinaryCrossentropy:
    """`sigmoid_binary_crossentropy(p, t)` must behave like
    `binary_crossentropy(sigmoid(p), t)` in value and gradient."""

    def test_matches_binary_crossentropy(self):
        # Test sigmoid_binary_crossentropy(p, t) ==
        # binary_crossentropy(sigmoid(p), t).
        pred, target = inputs = vectors("pt")
        reference_val = binary_crossentropy(sigmoid(pred), target)
        f_reference = pytensor.function(inputs, reference_val)
        test_val = sigmoid_binary_crossentropy(pred, target)
        f_test = pytensor.function(inputs, test_val)
        rng = np.random.default_rng(utt.fetch_seed())
        pred, target = rng.standard_normal((2, 50)).astype(config.floatX)
        # Squash targets through a sigmoid so they lie in (0, 1).
        test_inputs = [pred, 1 / (1 + np.exp(-target))]
        utt.assert_allclose(f_reference(*test_inputs), f_test(*test_inputs))

    def test_grad(self):
        rng = np.random.default_rng(utt.fetch_seed())
        pred, target = rng.standard_normal((2, 50)).astype(config.floatX)
        test_inputs = [pred, 1 / (1 + np.exp(-target))]
        utt.verify_grad(sigmoid_binary_crossentropy, test_inputs)
def test_confusion_matrix():
    """`confusion_matrix` must match a NumPy reference implementation."""

    # Defining numpy implementation of confusion matrix
    def numpy_conf_mat(actual, pred):
        # Union of observed labels fixes the row/column ordering.
        order = np.union1d(actual, pred)
        # One-hot encode via array broadcasting instead of the deprecated
        # `np.matrix` class (slated for removal by NumPy).
        colA = np.asarray(actual).reshape(-1, 1)
        colP = np.asarray(pred).reshape(-1, 1)
        oneHotA = (colA == order).astype("int64")
        oneHotP = (colP == order).astype("int64")
        # Entry (i, j) counts samples with actual label order[i] predicted
        # as order[j].
        conf_mat = np.dot(oneHotA.T, oneHotP)
        return [conf_mat, order]

    x = vector()
    y = vector()
    f = pytensor.function([x, y], confusion_matrix(x, y))
    list_inputs = [
        [[0, 1, 2, 1, 0], [0, 0, 2, 1, 2]],
        [[2, 0, 2, 2, 0, 1], [0, 0, 2, 2, 0, 2]],
    ]
    for case in list_inputs:
        a = np.asarray(case[0])
        b = np.asarray(case[1])
        out_exp = numpy_conf_mat(a, b)
        outs = f(case[0], case[1])
        for exp_res, out in zip(out_exp, outs):
            utt.assert_allclose(exp_res, out)
from collections import OrderedDict
import numpy as np
import pytest
import pytensor
import pytensor.tensor as at
from pytensor.configdefaults import config
from pytensor.tensor.math import sum as at_sum
from pytensor.tensor.nnet import batchnorm
from pytensor.tensor.shape import specify_broadcastable
from pytensor.tensor.type import (
TensorType,
matrix,
scalar,
tensor3,
tensor4,
tensor5,
vector,
)
from tests import unittest_tools as utt
def test_BNComposite():
    # Run with compute_test_value="raise" so shape mismatches surface at
    # graph-construction time.
    with config.change_flags(compute_test_value="raise"):

        def bn_ref(x, G, B, M, V):
            # Reference batch normalization: normalize by (M, V), then
            # scale by G and shift by B.
            n = (x - M) / V
            return n * G + B

        rng = np.random.default_rng(1234)
        X = 1 + rng.random([10, 20]).astype("float32")
        B = 1 + rng.random([20]).astype("float32")
        G = 1 + rng.random([20]).astype("float32")
        M = 1 + rng.random([20]).astype("float32")
        V = 1 + rng.random([20]).astype("float32")
        x = matrix("x")
        b = vector("b")
        g = vector("g")
        m = vector("m")
        v = vector("v")
        x.tag.test_value = rng.random((2, 2)).astype(pytensor.config.floatX)
        b.tag.test_value = rng.random(2).astype(pytensor.config.floatX)
        g.tag.test_value = rng.random(2).astype(pytensor.config.floatX)
        m.tag.test_value = rng.random(2).astype(pytensor.config.floatX)
        v.tag.test_value = rng.random(2).astype(pytensor.config.floatX)
        bn_ref_op = bn_ref(x, g, b, m, v)
        # NOTE(review): inputs are declared as [x, b, g, ...] but called with
        # (X, G, B, ...), so `b` receives G's value and `g` receives B's.
        # The same swap is applied to both functions below, so the
        # reference-vs-op comparison remains valid.
        f_ref = pytensor.function([x, b, g, m, v], [bn_ref_op])
        res_ref = f_ref(X, G, B, M, V)
        for mode in ["low_mem", "high_mem"]:
            bn_op = batchnorm.batch_normalization(x, g, b, m, v, mode=mode)
            f = pytensor.function([x, b, g, m, v], [bn_op])
            res = f(X, G, B, M, V)
            utt.assert_allclose(res_ref, res)
def test_batch_normalization():
    """Compare `batchnorm.batch_normalization` against a hand-written
    reference, with given and with data-derived mean/std, in both
    memory modes, including gradient checks."""

    def bn_ref(x, G, B, M, V):
        # Reference: normalize by (M, V), then scale by G and shift by B.
        n = (x - M) / V
        return n * G + B

    rng = np.random.default_rng(1234)
    X = 1 + rng.random([10, 20]).astype("float32")
    B = 1 + rng.random([20]).astype("float32")
    G = 1 + rng.random([20]).astype("float32")
    M = 1 + rng.random([20]).astype("float32")
    V = 1 + rng.random([20]).astype("float32")
    x = matrix("x")
    b = vector("b")
    g = vector("g")
    m = vector("m")
    v = vector("v")
    bn_ref_op = bn_ref(x, g, b, m, v)
    f_ref = pytensor.function([x, g, b, m, v], [bn_ref_op])
    res_ref = f_ref(X, G, B, M, V)
    for mode in ["low_mem", "high_mem"]:
        bn_op = batchnorm.batch_normalization(x, g, b, m, v, mode=mode)
        f = pytensor.function([x, g, b, m, v], [bn_op])
        res = f(X, G, B, M, V)
        utt.assert_allclose(res_ref, res)

        def bn_f(inputs, gamma, beta, mean, std):
            # Closure over `mode` for the gradient check.
            return batchnorm.batch_normalization(
                inputs, gamma, beta, mean, std, mode=mode
            )

        utt.verify_grad(bn_f, [X, G, B, M, V])
    # Second pass: mean/std computed from the data instead of supplied.
    bn_ref_op = bn_ref(
        x, g, b, x.mean(axis=0, keepdims=True), x.std(axis=0, keepdims=True)
    )
    f_ref = pytensor.function([x, b, g], [bn_ref_op])
    res_ref = f_ref(X, G, B)
    for mode in ["low_mem", "high_mem"]:
        bn_op = batchnorm.batch_normalization(
            x,
            g,
            b,
            x.mean(axis=0, keepdims=True),
            x.std(axis=0, keepdims=True),
            mode=mode,
        )
        f = pytensor.function([x, b, g], [bn_op])
        res = f(X, G, B)
        utt.assert_allclose(res_ref, res)

        def bn_f(inputs, gamma, beta, mean, std):
            return batchnorm.batch_normalization(
                inputs, gamma, beta, mean, std, mode=mode
            )

        utt.verify_grad(
            bn_f, [X, G, B, X.mean(axis=0)[np.newaxis], X.std(axis=0)[np.newaxis]]
        )
def test_bn_feature_maps():
    """Batch normalization over 4-D feature maps: the per-channel parameter
    vectors are broadcast over batch and spatial dimensions via dimshuffle."""

    def bn_ref(x, G, B, M, V):
        # Reference: normalize by (M, V), then scale by G and shift by B.
        n = (x - M) / V
        return n * G + B

    rng = np.random.default_rng(1234)
    X = 1 + rng.random([2, 3, 4, 4]).astype("float32")
    B = 1 + rng.random([3]).astype("float32")
    G = 1 + rng.random([3]).astype("float32")
    M = 1 + rng.random([3]).astype("float32")
    V = 1 + rng.random([3]).astype("float32")
    x = tensor4("x")
    b = vector("b")
    g = vector("g")
    m = vector("m")
    v = vector("v")
    # dimshuffle("x", 0, "x", "x") maps the channel vector onto axis 1.
    bn_ref_op = bn_ref(
        x,
        g.dimshuffle("x", 0, "x", "x"),
        b.dimshuffle("x", 0, "x", "x"),
        m.dimshuffle("x", 0, "x", "x"),
        v.dimshuffle("x", 0, "x", "x"),
    )
    f_ref = pytensor.function([x, b, g, m, v], [bn_ref_op])
    res_ref = f_ref(X, G, B, M, V)
    for mode in ["low_mem", "high_mem"]:
        bn_op = batchnorm.batch_normalization(
            x,
            g.dimshuffle("x", 0, "x", "x"),
            b.dimshuffle("x", 0, "x", "x"),
            m.dimshuffle("x", 0, "x", "x"),
            v.dimshuffle("x", 0, "x", "x"),
            mode=mode,
        )
        f = pytensor.function([x, b, g, m, v], [bn_op])
        res = f(X, G, B, M, V)
        utt.assert_allclose(res_ref, res)

        def conv_bn(inputs, gamma, beta, mean, std):
            # Closure over `mode` for the gradient check.
            return batchnorm.batch_normalization(
                inputs,
                gamma.dimshuffle("x", 0, "x", "x"),
                beta.dimshuffle("x", 0, "x", "x"),
                mean.dimshuffle("x", 0, "x", "x"),
                std.dimshuffle("x", 0, "x", "x"),
                mode=mode,
            )

        utt.verify_grad(conv_bn, [X, G, B, M, V])
@pytest.mark.slow
def test_batch_normalization_train():
    """End-to-end check of `batch_normalization_train` against a manual
    reference graph.

    For several axis specifications and input ranks this builds both the
    fused op and an explicit reference expression, takes first- and
    second-order gradients of both, compiles everything into one function,
    and compares all results numerically.  The compiled outputs are laid
    out as::

        [0:5]    op forward   (out, mean, invstd, running_mean, running_var)
        [5:10]   reference forward (same order)
        [10:13]  op gradients          [13:16]  reference gradients
        [16:19]  op 2nd-order grads    [19:22]  reference 2nd-order grads
    """
    for axes in ("per-activation", "spatial", (1, 2, 3, 4)):
        for vartype in (tensor5, tensor3, vector):
            x, scale, bias, running_mean, running_var = (
                vartype(n)
                for n in ("x", "scale", "bias", "running_mean", "running_var")
            )
            ndim = x.ndim
            eps = 5e-3  # some non-standard value to test if it's used
            running_average_factor = 0.3
            # remove non-existing axes
            if isinstance(axes, tuple):
                axes = tuple(i for i in axes if i < ndim)
                if len(axes) == 0:
                    continue

            # forward pass
            (
                out,
                x_mean,
                x_invstd,
                out_running_mean,
                out_running_var,
            ) = batchnorm.batch_normalization_train(
                x,
                scale,
                bias,
                axes,
                eps,
                running_average_factor,
                running_mean,
                running_var,
            )
            # reference forward pass
            if axes == "per-activation":
                axes2 = (0,)
            elif axes == "spatial":
                axes2 = (0,) + tuple(range(2, ndim))
            else:
                axes2 = axes
            x_mean2 = x.mean(axis=axes2, keepdims=True)
            x_var2 = x.var(axis=axes2, keepdims=True)
            x_invstd2 = at.reciprocal(at.sqrt(x_var2 + eps))
            scale2 = specify_broadcastable(scale, *axes2)
            bias2 = specify_broadcastable(bias, *axes2)
            out2 = (x - x_mean2) * (scale2 * x_invstd2) + bias2
            # `m` is the number of elements averaged per statistic; the
            # running variance uses the unbiased m/(m-1) correction.
            m = at.cast(at.prod(x.shape) / at.prod(scale.shape), pytensor.config.floatX)
            out_running_mean2 = (
                running_mean * (1 - running_average_factor)
                + x_mean2 * running_average_factor
            )
            out_running_var2 = (
                running_var * (1 - running_average_factor)
                + (m / (m - 1)) * x_var2 * running_average_factor
            )
            # backward pass
            dy = vartype("dy")
            grads = at.grad(None, wrt=[x, scale, bias], known_grads={out: dy})
            # reference backward pass
            grads2 = at.grad(None, wrt=[x, scale, bias], known_grads={out2: dy})
            # second-order backward pass
            dx = vartype("dinputs")
            dscale = vartype("dscale")
            dbias = vartype("dbias")
            grad_grads = at.grad(
                None,
                wrt=[x, dy, scale],
                known_grads=OrderedDict(
                    {grads[0]: dx, grads[1]: dscale, grads[2]: dbias}
                ),
                consider_constant=[
                    x,
                    dy,
                    scale,
                    bias,
                    x_mean,
                    x_invstd,
                    running_mean,
                    running_var,
                ],
                return_disconnected="zero",
            )
            # reference second-order backward pass
            grad_grads2 = at.grad(
                None,
                wrt=[x, dy, scale],
                known_grads=OrderedDict(
                    {grads2[0]: dx, grads2[1]: dscale, grads2[2]: dbias}
                ),
                consider_constant=[
                    x,
                    dy,
                    scale,
                    bias,
                    x_mean2,
                    x_var2,
                    running_mean,
                    running_var,
                ],
                return_disconnected="zero",
            )
            # compile
            f = pytensor.function(
                [x, scale, bias, running_mean, running_var, dy, dx, dscale, dbias],
                [
                    out,
                    x_mean,
                    x_invstd,
                    out_running_mean,
                    out_running_var,
                    out2,
                    x_mean2,
                    x_invstd2,
                    out_running_mean2,
                    out_running_var2,
                ]
                + grads
                + grads2
                + grad_grads
                + grad_grads2,
            )
            # check if the abstract Ops have been replaced
            assert not any(
                isinstance(
                    n.op,
                    (
                        batchnorm.AbstractBatchNormTrain,
                        batchnorm.AbstractBatchNormInference,
                        batchnorm.AbstractBatchNormTrainGrad,
                    ),
                )
                for n in f.maker.fgraph.toposort()
            )
            # run
            for data_shape in ((5, 10, 30, 40, 10), (4, 3, 1, 1, 1), (2, 3, 5, 5, 5)):
                data_shape = data_shape[:ndim]
                # parameters are size-1 along the normalized axes
                param_shape = tuple(
                    1 if d in axes2 else s for d, s in enumerate(data_shape)
                )
                rng = np.random.default_rng(1234)
                X = 4 + 3 * rng.random(data_shape).astype(pytensor.config.floatX)
                Dy = -1 + 2 * rng.random(data_shape).astype(pytensor.config.floatX)
                Scale = rng.random(param_shape).astype(pytensor.config.floatX)
                Bias = rng.random(param_shape).astype(pytensor.config.floatX)
                Running_mean = rng.random(param_shape).astype(pytensor.config.floatX)
                Running_var = rng.random(param_shape).astype(pytensor.config.floatX)
                Dx = 4 + 3 * rng.random(data_shape).astype(pytensor.config.floatX)
                Dscale = -1 + 2 * rng.random(param_shape).astype(pytensor.config.floatX)
                Dbias = rng.random(param_shape).astype(pytensor.config.floatX)

                outputs = f(
                    X, Scale, Bias, Running_mean, Running_var, Dy, Dx, Dscale, Dbias
                )
                # compare outputs
                utt.assert_allclose(outputs[0], outputs[0 + 5])  # out
                utt.assert_allclose(outputs[1], outputs[1 + 5])  # mean
                utt.assert_allclose(outputs[2], outputs[2 + 5])  # invstd
                utt.assert_allclose(outputs[3], outputs[3 + 5])  # running_mean
                # nan_to_num guards against NaN/inf from the m/(m-1)
                # correction — presumably when m == 1; confirm if needed.
                utt.assert_allclose(
                    np.nan_to_num(outputs[4]), np.nan_to_num(outputs[4 + 5])
                )  # running_var
                # compare gradients
                utt.assert_allclose(outputs[10], outputs[10 + 3], atol=1e-4)  # dx
                utt.assert_allclose(
                    outputs[11], outputs[11 + 3], rtol=2e-4, atol=1e-4
                )  # dscale
                utt.assert_allclose(outputs[12], outputs[12 + 3])  # dbias
                # compare second-order gradients
                utt.assert_allclose(outputs[16], outputs[16 + 3], atol=1e-4)  # ddx
                utt.assert_allclose(outputs[17], outputs[17 + 3])  # ddy
                utt.assert_allclose(
                    outputs[18], outputs[18 + 3], rtol=3e-4, atol=1e-4
                )  # ddscale
@pytest.mark.slow
def test_batch_normalization_train_grad_grad():
    """Numerically verify the gradients of `AbstractBatchNormTrainGrad`,
    i.e. the second-order gradients of batch-normalization training.

    For each axis specification and input rank, `utt.verify_grad` checks
    the gradient of the train-grad op through each of its three outputs
    (w.r.t. inputs, scale, and bias) on several data shapes.
    """
    for axes in ("per-activation", "spatial", (1, 2, 3, 4)):
        for vartype in (tensor5, tensor4, tensor3, matrix, vector):
            # run these experiments with float64 for sufficient numerical stability
            x, dy, scale, x_mean, x_invstd = (
                vartype(n, dtype="float64")
                for n in ("x", "dy", "scale", "x_mean", "x_invstd")
            )
            ndim = x.ndim

            # Resolve the axis specification into an explicit tuple.  Use a
            # separate name (`axes2`, as in the sibling tests) instead of
            # rebinding `axes`: rebinding clobbered the outer loop variable,
            # so later `vartype` iterations silently fell into the tuple
            # branch with the previously-filtered value.
            if axes == "per-activation":
                axes2 = (0,)
            elif axes == "spatial":
                axes2 = (0,) + tuple(range(2, ndim))
            else:
                # remove non-existing axes
                axes2 = tuple(i for i in axes if i < ndim)
                if len(axes2) == 0:
                    continue

            def bn_grad_wrt_inputs_f(x, dy, scale, x_mean, x_invstd):
                # gradient flowing through g_inputs only
                g_inputs, g_scale, g_bias = batchnorm.AbstractBatchNormTrainGrad(
                    axes2
                )(x, dy, scale, x_mean, x_invstd)
                return g_inputs

            def bn_grad_wrt_scale_f(x, dy, scale, x_mean, x_invstd):
                # gradient flowing through g_scale only
                g_inputs, g_scale, g_bias = batchnorm.AbstractBatchNormTrainGrad(
                    axes2
                )(x, dy, scale, x_mean, x_invstd)
                return g_scale

            def bn_grad_wrt_bias_f(x, dy, scale, x_mean, x_invstd):
                # gradient flowing through g_bias only
                g_inputs, g_scale, g_bias = batchnorm.AbstractBatchNormTrainGrad(
                    axes2
                )(x, dy, scale, x_mean, x_invstd)
                return g_bias

            # run
            for data_shape in ((4, 3, 3, 3, 3), (4, 3, 1, 1, 1), (2, 3, 5, 3, 2)):
                data_shape = data_shape[:ndim]
                # parameters are size-1 along the normalized axes
                param_shape = tuple(
                    1 if d in axes2 else s for d, s in enumerate(data_shape)
                )
                rng = np.random.default_rng(1234)
                # force float64 for sufficient numerical stability
                x_val = 4 + 3 * rng.random(data_shape).astype("float64")
                dy_val = -1 + 2 * rng.random(data_shape).astype("float64")
                scale_val = rng.random(param_shape).astype("float64")
                x_mean_val = rng.random(param_shape).astype("float64")
                x_invstd_val = rng.random(param_shape).astype("float64")

                utt.verify_grad(
                    bn_grad_wrt_inputs_f,
                    [x_val, dy_val, scale_val, x_mean_val, x_invstd_val],
                    abs_tol=5e-4,
                    rel_tol=5e-4,
                )
                utt.verify_grad(
                    bn_grad_wrt_scale_f,
                    [x_val, dy_val, scale_val, x_mean_val, x_invstd_val],
                )
                utt.verify_grad(
                    bn_grad_wrt_bias_f,
                    [x_val, dy_val, scale_val, x_mean_val, x_invstd_val],
                )
def test_batch_normalization_train_without_running_averages():
    """`batch_normalization_train` must compile and run in its three-output
    form, i.e. when no running averages are requested."""
    data_shape = (5, 10, 30, 25)
    param_shape = (1, 10, 30, 25)
    x = tensor4("x")
    scale = tensor4("scale")
    bias = tensor4("bias")
    dy = tensor4("dy")

    # forward pass: only (out, mean, invstd), no running statistics
    out, x_mean, x_invstd = batchnorm.batch_normalization_train(
        x, scale, bias, "per-activation"
    )
    # backward pass
    grads = at.grad(None, wrt=[x, scale, bias], known_grads={out: dy})
    # compile
    f = pytensor.function([x, scale, bias, dy], [out, x_mean, x_invstd] + grads)

    # every abstract batch-norm op must have been rewritten away
    abstract_ops = (
        batchnorm.AbstractBatchNormTrain,
        batchnorm.AbstractBatchNormInference,
        batchnorm.AbstractBatchNormTrainGrad,
    )
    assert all(
        not isinstance(node.op, abstract_ops) for node in f.maker.fgraph.toposort()
    )

    # run on random data; success == no exception
    rng = np.random.default_rng(1234)
    x_val = 4 + 3 * rng.random(data_shape).astype(pytensor.config.floatX)
    dy_val = -1 + 2 * rng.random(data_shape).astype(pytensor.config.floatX)
    scale_val = rng.random(param_shape).astype(pytensor.config.floatX)
    bias_val = rng.random(param_shape).astype(pytensor.config.floatX)
    f(x_val, scale_val, bias_val, dy_val)
def test_batch_normalization_train_broadcast():
    """Broadcasted and non-broadcasted parameters must yield equal graphs.

    Builds the train/test ops once with plain parameter tensors and once
    with the same parameters dimshuffled to the full input rank, then
    checks the rewriter collapses the sum of absolute differences of all
    paired outputs to a constant zero.
    """
    for axes in ("per-activation", "spatial", (1, 2, 3, 4)):
        for vartype in (tensor5, tensor4, tensor3, matrix, vector):
            x = vartype("x")
            ndim = x.ndim
            eps = 5e-3  # some non-standard value to test if it's used
            running_average_factor = 0.3

            # remove non-existing axes
            if isinstance(axes, tuple):
                axes = tuple(i for i in axes if i < ndim)
                if len(axes) == 0:
                    continue

            # convert axes to explicit list
            if axes == "per-activation":
                axes2 = (0,)
            elif axes == "spatial":
                axes2 = (0,) + tuple(range(2, ndim))
            else:
                axes2 = axes

            # compute axes for parameter tensors
            non_bc_axes = tuple(i for i in range(ndim) if i not in axes2)
            params_dimshuffle = ["x"] * ndim
            for i, axis in enumerate(non_bc_axes):
                params_dimshuffle[axis] = i

            # construct non-broadcasted parameter variables
            param_type = TensorType(x.dtype, shape=(None,) * len(non_bc_axes))
            scale, bias, running_mean, running_var = (
                param_type(n) for n in ("scale", "bias", "running_mean", "running_var")
            )

            # broadcast parameter variables
            scale_bc = scale.dimshuffle(params_dimshuffle)
            bias_bc = bias.dimshuffle(params_dimshuffle)
            running_mean_bc = running_mean.dimshuffle(params_dimshuffle)
            running_var_bc = running_var.dimshuffle(params_dimshuffle)

            # batch_normalization_train with original, non-broadcasted variables
            train_non_bc = batchnorm.batch_normalization_train(
                x,
                scale,
                bias,
                axes,
                eps,
                running_average_factor,
                running_mean,
                running_var,
            )
            # batch_normalization_train with broadcasted variables
            train_bc = batchnorm.batch_normalization_train(
                x,
                scale_bc,
                bias_bc,
                axes,
                eps,
                running_average_factor,
                running_mean_bc,
                running_var_bc,
            )
            # drop the broadcast dims from the statistics outputs so shapes
            # match the non-broadcasted results pairwise
            train_bc = tuple(
                [train_bc[0]] + [r.dimshuffle(non_bc_axes) for r in train_bc[1:]]  # out
            )

            # batch_normalization_test with original, non-broadcasted variables
            test_non_bc = batchnorm.batch_normalization_test(
                x, scale, bias, running_mean, running_var, axes, eps
            )
            # batch_normalization_test with broadcasted variables
            test_bc = batchnorm.batch_normalization_test(
                x, scale_bc, bias_bc, running_mean_bc, running_var_bc, axes, eps
            )

            # subtract the results of the non-broadcasted and broadcasted calls
            results_non_bc = train_non_bc + (test_non_bc,)
            results_bc = train_bc + (test_bc,)
            results = [abs(r - r_bc) for (r, r_bc) in zip(results_non_bc, results_bc)]

            # compile to compute all differences
            f = pytensor.function(
                [x, scale, bias, running_mean, running_var], at_sum(sum(results))
            )

            # the paired ops are exactly the same, so the optimizer should have
            # collapsed the sum of differences to a constant zero
            nodes = f.maker.fgraph.toposort()
            if pytensor.config.mode != "FAST_COMPILE":
                assert len(nodes) == 1
                assert isinstance(nodes[0].op, pytensor.compile.DeepCopyOp)
            inputs = [
                np.asarray(np.random.random((4,) * n), x.dtype)
                for n in [
                    x.ndim,
                    scale.ndim,
                    bias.ndim,
                    running_mean.ndim,
                    running_var.ndim,
                ]
            ]
            assert 0.0 == f(*inputs)
@pytest.mark.slow
def test_batch_normalization_test():
    """Check `batch_normalization_test` (inference mode) against a manual
    reference expression, including gradients w.r.t. all five inputs.

    The compiled outputs are ``[out, out2] + grads + grads2``; gradient
    index ``k`` (op) pairs with index ``k + 5`` (reference).
    """
    for axes in ("per-activation", "spatial", (1, 2, 3, 4)):
        for vartype in (tensor5, tensor3, vector):
            x, scale, bias, mean, var = (
                vartype(n) for n in ("x", "scale", "bias", "mean", "var")
            )
            ndim = x.ndim
            eps = 5e-3  # some non-standard value to test if it's used

            # remove non-existing axes
            if isinstance(axes, tuple):
                axes = tuple(i for i in axes if i < ndim)
                if len(axes) == 0:
                    continue

            # forward pass
            out = batchnorm.batch_normalization_test(
                x, scale, bias, mean, var, axes, eps
            )
            # reference forward pass
            if axes == "per-activation":
                axes2 = (0,)
            elif axes == "spatial":
                axes2 = (0,) + tuple(range(2, ndim))
            else:
                axes2 = axes
            scale2, bias2, mean2, var2 = (
                specify_broadcastable(t, *axes2) for t in (scale, bias, mean, var)
            )
            out2 = (x - mean2) * (scale2 / at.sqrt(var2 + eps)) + bias2
            # backward pass
            dy = vartype("dy")
            grads = at.grad(
                None, wrt=[x, scale, bias, mean, var], known_grads={out: dy}
            )
            # reference backward pass
            grads2 = at.grad(
                None, wrt=[x, scale, bias, mean, var], known_grads={out2: dy}
            )
            # compile
            f = pytensor.function(
                [x, scale, bias, mean, var, dy], [out, out2] + grads + grads2
            )
            # check if the abstract Ops have been replaced
            assert not any(
                isinstance(
                    n.op,
                    (
                        batchnorm.AbstractBatchNormTrain,
                        batchnorm.AbstractBatchNormInference,
                        batchnorm.AbstractBatchNormTrainGrad,
                    ),
                )
                for n in f.maker.fgraph.toposort()
            )
            # run
            for data_shape in ((10, 20, 30, 40, 10), (4, 3, 1, 1, 1), (1, 1, 5, 5, 5)):
                data_shape = data_shape[:ndim]
                # parameters are size-1 along the normalized axes
                param_shape = tuple(
                    1 if d in axes2 else s for d, s in enumerate(data_shape)
                )
                rng = np.random.default_rng(1234)
                X = 4 + 3 * rng.random(data_shape).astype(pytensor.config.floatX)
                Dy = -1 + 2 * rng.random(data_shape).astype(pytensor.config.floatX)
                Scale = rng.random(param_shape).astype(pytensor.config.floatX)
                Bias = rng.random(param_shape).astype(pytensor.config.floatX)
                Mean = rng.random(param_shape).astype(pytensor.config.floatX)
                Var = rng.random(param_shape).astype(pytensor.config.floatX)
                outputs = f(X, Scale, Bias, Mean, Var, Dy)
                # compare outputs
                utt.assert_allclose(outputs[0], outputs[1])  # out
                # compare gradients
                utt.assert_allclose(outputs[2], outputs[2 + 5], atol=4e-5)  # dx
                utt.assert_allclose(outputs[3], outputs[3 + 5], atol=4e-5)  # dscale
                utt.assert_allclose(outputs[4], outputs[4 + 5])  # dbias
                utt.assert_allclose(outputs[5], outputs[5 + 5])  # dmean
                utt.assert_allclose(
                    outputs[6], outputs[6 + 5], rtol=2e-3, atol=4e-5
                )  # dvar
def test_batch_normalization_broadcastable():
    """A fully-broadcastable (all-size-1) input pattern must survive the
    rewrites that replace the abstract batch-norm ops."""
    # 5d variables whose every dimension is broadcastable
    names = ("x", "dy", "scale", "bias", "mean", "var")
    x, dy, scale, bias, mean, var = [
        scalar(name).dimshuffle(["x"] * 5) for name in names
    ]

    # forward passes (training and inference)
    out_train, x_mean, x_invstd = batchnorm.batch_normalization_train(
        x, scale, bias, "spatial"
    )
    out_test = batchnorm.batch_normalization_test(x, scale, bias, mean, var, "spatial")

    # backward passes
    grads_train = at.grad(None, wrt=[x, scale, bias], known_grads={out_train: dy})
    grads_test = at.grad(None, wrt=[x, scale, bias], known_grads={out_test: dy})

    # compile everything into one function
    f = pytensor.function(
        [x, scale, bias, mean, var, dy],
        [out_train, x_mean, x_invstd, out_test] + grads_train + grads_test,
    )

    # every abstract batch-norm op must have been rewritten away
    abstract_ops = (
        batchnorm.AbstractBatchNormTrain,
        batchnorm.AbstractBatchNormInference,
        batchnorm.AbstractBatchNormTrainGrad,
    )
    assert all(
        not isinstance(node.op, abstract_ops) for node in f.maker.fgraph.toposort()
    )
"""
Tests for block sparse dot
"""
import numpy as np
import pytensor
import pytensor.tensor as at
import tests.unittest_tools as utt
from pytensor.tensor.elemwise import DimShuffle
from pytensor.tensor.nnet.blocksparse import (
SparseBlockGemv,
SparseBlockOuter,
sparse_block_dot,
sparse_block_gemv,
sparse_block_outer,
)
from pytensor.tensor.type import fmatrix, ftensor3, ftensor4, imatrix
class TestBlockSparseGemvAndOuter(utt.InferShapeTester):
    """Tests for the block-sparse gemv/outer ops.

    Each op is compared against a straightforward NumPy reference
    implementation; gradients and shape inference are also checked.
    """

    def setup_method(self):
        mode = None
        if pytensor.config.mode == "FAST_COMPILE":
            mode = "FAST_RUN"
        # Exclude constant_folding so the ops under test stay in the
        # compiled graph even when their index inputs are constants.
        self.mode = pytensor.compile.get_mode(mode).excluding("constant_folding")
        self.gemv_op = sparse_block_gemv
        self.outer_op = sparse_block_outer
        self.gemv_class = SparseBlockGemv
        self.outer_class = SparseBlockOuter
        super().setup_method()

    @staticmethod
    def gemv_data():
        """Return (weight, input, inputIndice, bias, outputIndice) test data."""
        nInputBlock = 8
        nOutputBlock = 7
        inputSize = 6
        outputSize = 5
        inputWindowSize = 4
        outputWindowSize = 3
        batchSize = 2

        rng = np.random.default_rng(230920)
        input = rng.standard_normal((batchSize, inputWindowSize, inputSize)).astype(
            "float32"
        )
        # np.vstack requires a sequence of arrays; passing a generator is an
        # error in modern NumPy, so the per-batch rows are built as lists.
        inputIndice = np.vstack(
            [rng.permutation(nInputBlock)[:inputWindowSize] for _ in range(batchSize)]
        ).astype("int32")
        outputIndice = np.vstack(
            [
                rng.permutation(nOutputBlock)[:outputWindowSize]
                for _ in range(batchSize)
            ]
        ).astype("int32")
        weight = rng.standard_normal(
            (nInputBlock, nOutputBlock, inputSize, outputSize)
        ).astype("float32")
        bias = rng.standard_normal((nOutputBlock, outputSize)).astype("float32")

        return weight, input, inputIndice, bias, outputIndice

    @staticmethod
    def outer_data():
        """Return (o, x, y, xIdx, yIdx) test data for the outer op."""
        nInputBlock = 8
        nOutputBlock = 7
        xSize = 6
        ySize = 5
        xWindowSize = 4
        yWindowSize = 3
        batchSize = 2

        rng = np.random.default_rng(230920)
        o = rng.standard_normal((nInputBlock, nOutputBlock, xSize, ySize)).astype(
            "float32"
        )
        x = rng.standard_normal((batchSize, xWindowSize, xSize)).astype("float32")
        y = rng.standard_normal((batchSize, yWindowSize, ySize)).astype("float32")
        # np.vstack needs a sequence (not a generator) — see gemv_data.
        xIdx = np.vstack(
            [rng.integers(0, nInputBlock, size=xWindowSize) for _ in range(batchSize)]
        ).astype("int32")
        yIdx = np.vstack(
            [rng.integers(0, nOutputBlock, size=yWindowSize) for _ in range(batchSize)]
        ).astype("int32")

        return o, x, y, xIdx, yIdx

    @staticmethod
    def gemv_numpy(o, W, h, iIdx, oIdx):
        """Reference gemv: accumulate the indexed block dot products into `o`."""
        for b in range(o.shape[0]):
            for j in range(o.shape[1]):
                outputIdx = oIdx[b, j]
                for i in range(h.shape[1]):
                    inputIdx = iIdx[b, i]
                    w = W[inputIdx, outputIdx]
                    o[b, j, :] += np.dot(h[b, i], w)
        return o

    @staticmethod
    def gemv_numpy2(o, W, h, iIdx, oIdx):
        """
        Other implementation
        """
        from numpy import ix_

        for b in range(o.shape[0]):
            w = W[ix_(iIdx[b], oIdx[b])].swapaxes(1, 2)
            w = w.reshape((w.shape[0] * w.shape[1], w.shape[2] * w.shape[3]))
            o[b] += np.dot(h[b].ravel(), w).reshape(o.shape[1:])
        return o

    @staticmethod
    def gemv_numpy3(o, W, h, iIdx, oIdx):
        """
        Other implementation
        """
        from numpy import ix_

        for b in range(o.shape[0]):
            w = W[ix_(iIdx[b], oIdx[b])]
            # The next three lines do the same operation. The last one is the
            # fastest
            # o[b] += (h[b][:, None, :, None] * w).sum(axis=(0, 2))
            # o[b] += np.tensordot(h[b], w, [(0,1),(0,2)])
            o[b] += np.einsum("ik,ijkl", h[b], w)
        return o

    @staticmethod
    def outer_numpy(o, x, y, xIdx, yIdx):
        """Reference outer: accumulate outer products into the indexed blocks."""
        for b in range(x.shape[0]):
            for i in range(xIdx.shape[1]):
                for j in range(yIdx.shape[1]):
                    o[xIdx[b, i], yIdx[b, j]] += np.outer(x[b, i, :], y[b, j, :])
        return o

    def test_sparseblockdot(self):
        # Compares the numpy version of sparseblockgemv to sparse_block_dot.
        b = fmatrix()
        W = ftensor4()
        h = ftensor3()
        iIdx = imatrix()
        oIdx = imatrix()

        o = sparse_block_dot(W, h, iIdx, b, oIdx)

        f = pytensor.function([W, h, iIdx, b, oIdx], o, mode=self.mode)

        W_val, h_val, iIdx_val, b_val, oIdx_val = self.gemv_data()

        th_out = f(W_val, h_val, iIdx_val, b_val, oIdx_val)
        ref_out = self.gemv_numpy(
            b_val.take(oIdx_val, axis=0), W_val, h_val, iIdx_val, oIdx_val
        )

        utt.assert_allclose(ref_out, th_out)

    def test_sparseblockgemv(self):
        # Compares the numpy and pytensor versions of sparseblockgemv.
        b = fmatrix()
        W = ftensor4()
        h = ftensor3()
        iIdx = imatrix()
        oIdx = imatrix()

        o = self.gemv_op(b.take(oIdx, axis=0), W, h, iIdx, oIdx)

        f = pytensor.function([W, h, iIdx, b, oIdx], o, mode=self.mode)

        W_val, h_val, iIdx_val, b_val, oIdx_val = self.gemv_data()

        th_out = f(W_val, h_val, iIdx_val, b_val, oIdx_val)
        ref_out = self.gemv_numpy(
            b_val.take(oIdx_val, axis=0), W_val, h_val, iIdx_val, oIdx_val
        )

        utt.assert_allclose(ref_out, th_out)

    def test_sparseblockgemvF(self):
        # Test the fortran order for W (which can happen in the grad for some
        # graphs).
        b = fmatrix()
        W = ftensor4()
        h = ftensor3()
        iIdx = imatrix()
        oIdx = imatrix()

        o = self.gemv_op(
            b.take(oIdx, axis=0),
            DimShuffle((False, False, False, False), (0, 1, 3, 2))(
                at.as_tensor_variable(W)
            ),
            h,
            iIdx,
            oIdx,
        )

        f = pytensor.function([W, h, iIdx, b, oIdx], o, mode=self.mode)

        W_val, h_val, iIdx_val, b_val, oIdx_val = self.gemv_data()

        th_out = f(np.swapaxes(W_val, 2, 3), h_val, iIdx_val, b_val, oIdx_val)
        ref_out = self.gemv_numpy(
            b_val.take(oIdx_val, axis=0), W_val, h_val, iIdx_val, oIdx_val
        )

        utt.assert_allclose(ref_out, th_out)

    def test_sparseblockgemv_grad(self):
        # Verify the gradients of both the meta-op and the raw gemv op.
        W_val, h_val, iIdx_val, b_val, oIdx_val = self.gemv_data()

        iIdx = at.constant(iIdx_val)
        oIdx = at.constant(oIdx_val)

        def metaop(b, h, W):
            return sparse_block_dot(W, h, iIdx, b, oIdx)

        def op(b, h, W):
            return self.gemv_op(b.take(oIdx, axis=0), W, h, iIdx, oIdx)

        eps = 3e-3
        utt.verify_grad(metaop, [b_val, h_val, W_val], mode=self.mode, eps=eps)
        utt.verify_grad(op, [b_val, h_val, W_val], mode=self.mode, eps=eps)

    def test_sparseblockgemv_grad_1(self):
        # Test that we correctly handle cases where dimensions are 1.
        rng = np.random.default_rng(230920)
        h_val = rng.standard_normal((1, 1, 1)).astype("float32")
        iIdx_val = rng.permutation(1)[:1][None, :]
        oIdx_val = rng.permutation(1)[:1][None, :]
        W_val = rng.standard_normal((1, 1, 1, 1)).astype("float32")
        b_val = rng.standard_normal((1, 1)).astype("float32")

        iIdx = at.constant(iIdx_val)
        oIdx = at.constant(oIdx_val)

        def metaop(b, h, W):
            return sparse_block_dot(W, h, iIdx, b, oIdx)

        def op(b, h, W):
            return self.gemv_op(b.take(oIdx, axis=0), W, h, iIdx, oIdx)

        utt.verify_grad(metaop, [b_val, h_val, W_val], mode=self.mode)
        utt.verify_grad(op, [b_val, h_val, W_val], mode=self.mode)

    def test_sparseblockgemv_grad_shape(self):
        # The gradient outputs must match the shapes of the inputs.
        b = fmatrix()
        W = ftensor4()
        h = ftensor3()
        iIdx = imatrix()
        oIdx = imatrix()

        o = self.gemv_op(b.take(oIdx, axis=0), W, h, iIdx, oIdx)
        go = pytensor.grad(o.sum(), [b, W, h])

        f = pytensor.function([W, h, iIdx, b, oIdx], go, mode=self.mode)

        W_val, h_val, iIdx_val, b_val, oIdx_val = self.gemv_data()

        # just make sure that it runs correctly and all the shapes are ok.
        b_g, W_g, h_g = f(W_val, h_val, iIdx_val, b_val, oIdx_val)

        assert b_g.shape == b_val.shape
        assert h_g.shape == h_val.shape
        assert W_g.shape == W_val.shape

    def test_sparseblockouter(self):
        # Compares the numpy and pytensor versions of sparseblockouter.
        o = ftensor4()
        x = ftensor3()
        y = ftensor3()
        xIdx = imatrix()
        yIdx = imatrix()

        out = self.outer_op(o, x, y, xIdx, yIdx)

        f = pytensor.function(
            [o, x, y, xIdx, yIdx], out, on_unused_input="warn", mode=self.mode
        )

        (
            o_val,
            x_val,
            y_val,
            xIdx_val,
            yIdx_val,
        ) = self.outer_data()

        th_out = f(o_val, x_val, y_val, xIdx_val, yIdx_val)
        ref_out = self.outer_numpy(o_val, x_val, y_val, xIdx_val, yIdx_val)

        utt.assert_allclose(ref_out, th_out)

    def test_dot_infershape(self):
        # Shape inference for the sparse_block_dot meta-op.
        b = fmatrix()
        W = ftensor4()
        h = ftensor3()
        iIdx = imatrix()
        oIdx = imatrix()

        self._compile_and_check(
            [W, h, iIdx, b, oIdx],
            [sparse_block_dot(W, h, iIdx, b, oIdx)],
            self.gemv_data(),
            self.gemv_class,
        )

    def test_gemv_infershape(self):
        # Shape inference for the raw gemv op.
        b = fmatrix()
        W = ftensor4()
        h = ftensor3()
        iIdx = imatrix()
        oIdx = imatrix()

        self._compile_and_check(
            [W, h, iIdx, b, oIdx],
            [self.gemv_op(b.take(oIdx, axis=0), W, h, iIdx, oIdx)],
            self.gemv_data(),
            self.gemv_class,
        )

    def test_outer_infershape(self):
        # Shape inference for the outer op.
        o = ftensor4()
        x = ftensor3()
        y = ftensor3()
        xIdx = imatrix()
        yIdx = imatrix()

        self._compile_and_check(
            [o, x, y, xIdx, yIdx],
            [self.outer_op(o, x, y, xIdx, yIdx)],
            self.outer_data(),
            self.outer_class,
        )
import time
import numpy as np
import pytest
import pytensor
import pytensor.tensor as at
from pytensor.compile.mode import Mode
from pytensor.tensor.exceptions import NotScalarConstantError
from pytensor.tensor.math import _allclose, exp
from pytensor.tensor.nnet import conv, conv2d
from pytensor.tensor.type import dmatrix, dtensor3, dtensor4, dvector, scalar, tensor4
from tests import unittest_tools as utt
@pytest.mark.skipif(
pytensor.config.cxx == "",
reason="conv2d tests need SciPy or a c++ compiler",
)
class TestConv2D(utt.InferShapeTester):
# This class contains tests for the legacy 2d convolution,
# but will also be inherited from for other implementations
mode = None
dtype = pytensor.config.floatX
# This will be set to the appropriate function in the inherited classes.
# The call to `staticmethod` is necessary to prevent Python from passing
# `self` as the first argument.
conv2d = staticmethod(conv2d)
def setup_method(self):
    """Create fresh symbolic input/filter variables for every test."""
    inp = tensor4("input", dtype=self.dtype)
    inp.name = "default_V"
    self.input = inp
    flt = tensor4("filters", dtype=self.dtype)
    flt.name = "default_filters"
    self.filters = flt
    super().setup_method()
def validate(
    self,
    image_shape,
    filter_shape,
    border_mode="valid",
    subsample=(1, 1),
    N_image_shape=None,
    N_filter_shape=None,
    input=None,
    filters=None,
    unroll_batch=None,
    unroll_kern=None,
    unroll_patch=None,
    verify_grad=True,
    should_raise=False,
):
    """
    Build a conv2d graph, run it, and compare against a pure-NumPy
    reference convolution; optionally verify the gradient numerically.

    :param image_shape: The constant shape info passed to conv2d.
    :param filter_shape: The constant shape info passed to conv2d.
    :param border_mode: "valid" or "full", forwarded to conv2d.
    :param subsample: output striding, forwarded to conv2d.
    :param N_image_shape: None(default to image_shape) or tuple of
                          4 elements with the shape of the input image
    :param N_filter_shape: None(default to filter_shape) or tuple
                           of 4 elements with the shape of the
                           input filter
    :param verify_grad: if True, also run `utt.verify_grad`.
    :param should_raise: if True, running the compiled function is
                         expected to fail with ValueError; a successful
                         run then raises.
    """
    if N_image_shape is None:
        N_image_shape = [
            at.get_scalar_constant_value(at.as_tensor_variable(x))
            for x in image_shape
        ]
    if N_filter_shape is None:
        N_filter_shape = [
            at.get_scalar_constant_value(at.as_tensor_variable(x))
            for x in filter_shape
        ]

    if input is None:
        input = self.input
    if not filters:
        filters = self.filters

    # PYTENSOR IMPLEMENTATION

    # we create a symbolic function so that verify_grad can work
    def sym_conv2d(input, filters):
        # define pytensor graph and function
        input.name = "input"
        filters.name = "filters"
        # the legacy conv.conv2d is deprecated; assert it warns
        with pytest.warns(DeprecationWarning):
            rval = conv.conv2d(
                input,
                filters,
                image_shape,
                filter_shape,
                border_mode,
                subsample,
                unroll_batch=unroll_batch,
                unroll_kern=unroll_kern,
                unroll_patch=unroll_patch,
            )
        rval.name = "conv_output"
        return rval

    output = sym_conv2d(input, filters)
    output.name = f"conv2d({input.name},{filters.name})"
    pytensor_conv = pytensor.function([input, filters], output, mode=self.mode)

    # initialize input and compute result
    image_data = np.random.random(N_image_shape).astype(self.dtype)
    filter_data = np.random.random(N_filter_shape).astype(self.dtype)
    try:
        pytensor_output = pytensor_conv(image_data, filter_data)
    except ValueError:
        if not should_raise:
            raise
        return
    else:
        if should_raise:
            raise Exception("ConvOp should have generated an error")

    # REFERENCE IMPLEMENTATION
    # output size: valid -> in - filter + 1, full -> in + filter - 1,
    # then divided (ceil) by the subsampling factor.
    s = 1.0
    orig_image_data = image_data
    if border_mode != "full":
        s = -1.0
    out_shape2d = (
        np.array(N_image_shape[-2:]) + s * np.array(N_filter_shape[-2:]) - s
    )
    out_shape2d = np.ceil(out_shape2d / np.array(subsample))
    # avoid numpy deprecation
    out_shape2d = out_shape2d.astype("int32")
    out_shape = (N_image_shape[0], N_filter_shape[0]) + tuple(out_shape2d)
    ref_output = np.zeros(out_shape)

    # loop over output feature maps
    ref_output.fill(0)
    if border_mode == "full":
        # zero-pad the image so "full" reduces to a "valid" convolution
        image_data2 = np.zeros(
            (
                N_image_shape[0],
                N_image_shape[1],
                N_image_shape[2] + 2 * N_filter_shape[2] - 2,
                N_image_shape[3] + 2 * N_filter_shape[3] - 2,
            )
        )
        image_data2[
            :,
            :,
            N_filter_shape[2] - 1 : N_filter_shape[2] - 1 + N_image_shape[2],
            N_filter_shape[3] - 1 : N_filter_shape[3] - 1 + N_image_shape[3],
        ] = image_data
        image_data = image_data2
        N_image_shape = image_data.shape
    for bb in range(N_image_shape[0]):
        for nn in range(N_filter_shape[0]):
            for im0 in range(N_image_shape[1]):
                filter2d = filter_data[nn, im0, :, :]
                image2d = image_data[bb, im0, :, :]
                for row in range(ref_output.shape[2]):
                    irow = row * subsample[0]  # image row
                    for col in range(ref_output.shape[3]):
                        icol = col * subsample[1]  # image col
                        # true convolution: the kernel is flipped in both dims
                        ref_output[bb, nn, row, col] += (
                            image2d[
                                irow : irow + N_filter_shape[2],
                                icol : icol + N_filter_shape[3],
                            ]
                            * filter2d[::-1, ::-1]
                        ).sum()

    assert _allclose(pytensor_output, ref_output)

    # TEST GRADIENT
    if verify_grad:
        utt.verify_grad(sym_conv2d, [orig_image_data, filter_data])
def test_basic1(self):
    """Smallest smoke test: a 2x2 kernel over a 3x3 image, valid mode."""
    self.validate(
        image_shape=(2, 2, 3, 3),
        filter_shape=(2, 2, 2, 2),
        border_mode="valid",
        verify_grad=False,
    )
def test_basic(self):
    """Basic convolutions for odd/even and rectangular image/filter
    shapes, in both border modes."""
    # (image_shape, filter_shape, border_mode, check_gradient)
    cases = [
        ((3, 2, 8, 8), (4, 2, 5, 5), "valid", False),
        ((3, 2, 7, 5), (5, 2, 2, 3), "valid", True),
        ((3, 2, 7, 5), (5, 2, 3, 2), "valid", False),
        ((3, 2, 8, 8), (4, 2, 5, 5), "full", False),
        ((3, 2, 7, 5), (5, 2, 2, 3), "full", True),
    ]
    for image_shape, filter_shape, border_mode, check_grad in cases:
        self.validate(image_shape, filter_shape, border_mode, verify_grad=check_grad)

# test filter same size as input
def test_uint_image_shape_datatype(self):
    """Unsigned integer entries in image_shape must be accepted."""
    for image_shape in (
        (2, 2, 3, np.uint8(3)),
        (np.uint16(2), 2, 3, 3),
        (2, np.uint32(2), 3, 3),
    ):
        self.validate(image_shape, (3, 2, 3, 3), "valid", verify_grad=False)
def test_uint_filter_shape_datatype(self):
    """Unsigned integer entries in filter_shape must be accepted."""
    for filter_shape in (
        (2, 2, 3, np.uint8(3)),
        (np.uint16(2), 2, 3, 3),
        (2, np.uint32(2), 3, 3),
    ):
        self.validate((3, 2, 3, 3), filter_shape, "valid", verify_grad=False)
def test_img_kernel_same_shape(self):
    """A filter exactly the size of the image, in both border modes."""
    for border_mode in ("full", "valid"):
        self.validate((3, 2, 3, 3), (4, 2, 3, 3), border_mode)
def test_unroll_patch_true(self):
    """Basic convolutions with unroll_patch=True."""
    # (image_shape, filter_shape, border_mode, check_gradient)
    cases = [
        ((3, 2, 7, 5), (5, 2, 2, 3), "valid", True),
        ((3, 2, 7, 5), (5, 2, 2, 3), "full", True),
        ((3, 2, 3, 3), (4, 2, 3, 3), "valid", False),
    ]
    for image_shape, filter_shape, border_mode, check_grad in cases:
        self.validate(
            image_shape,
            filter_shape,
            border_mode,
            unroll_patch=True,
            verify_grad=check_grad,
        )
def test_unroll_patch_false(self):
    """Basic convolutions with unroll_patch explicitly False."""
    # (image_shape, filter_shape, border_mode, check_gradient)
    cases = [
        ((3, 2, 7, 5), (5, 2, 2, 3), "valid", True),
        ((3, 2, 7, 5), (5, 2, 2, 3), "full", True),
        ((3, 2, 3, 3), (4, 2, 3, 3), "valid", False),
    ]
    for image_shape, filter_shape, border_mode, check_grad in cases:
        self.validate(
            image_shape,
            filter_shape,
            border_mode,
            unroll_patch=False,
            verify_grad=check_grad,
        )
def test_unroll_patch_true_fail(self):
    """With unroll_patch=True, run-time shapes contradicting the
    build-time shapes must make the run fail."""
    cases = [
        ((3, 2, 7, 5), (5, 2, 2, 3), "valid"),
        ((3, 2, 7, 5), (5, 2, 2, 3), "full"),
        ((3, 2, 3, 3), (4, 2, 3, 3), "valid"),
    ]
    for image_shape, filter_shape, border_mode in cases:
        self.validate(
            image_shape,
            filter_shape,
            border_mode,
            unroll_patch=True,
            N_image_shape=(1, 3, 3, 3),
            N_filter_shape=(6, 3, 2, 2),
            should_raise=True,
        )
def test_unroll_special(self):
    """(unroll_kern, unroll_batch) in (0,1),(1,0) is special case."""
    self.validate(
        image_shape=(6, 2, 3, 3),
        filter_shape=(3, 2, 2, 2),
        border_mode="valid",
        unroll_batch=1,
    )
def test_unroll_batch(self):
    """Mini-batch unrolling for legal divisors: batch size 6 is a
    multiple of both 2 and 3."""
    for unroll in (2, 3):
        self.validate(
            (6, 2, 3, 3), (3, 2, 2, 2), "valid", unroll_batch=unroll, verify_grad=False
        )
def test_unroll_kern(self):
    """Kernel unrolling for legal divisors: 6 filters is a multiple of
    both 2 and 3."""
    for unroll in (2, 3):
        self.validate(
            (2, 3, 3, 3), (6, 3, 2, 2), "valid", unroll_kern=unroll, verify_grad=False
        )
def test_unroll_batch_kern(self):
    """Combined mini-batch and kernel unrolling for legal divisor values."""
    # (image_shape, filter_shape, unroll_batch, unroll_kern)
    cases = [
        # mini-batch of size 6 is a multiple of 2 and 3
        ((6, 2, 3, 3), (3, 2, 2, 2), 2, 3),
        ((6, 2, 3, 3), (3, 2, 2, 2), 3, 3),
        # 6 filters is a multiple of 2 and 3
        ((2, 3, 3, 3), (6, 3, 2, 2), 2, 2),
        ((2, 3, 3, 3), (6, 3, 2, 2), 2, 3),
    ]
    for image_shape, filter_shape, ub, uk in cases:
        self.validate(
            image_shape,
            filter_shape,
            "valid",
            unroll_batch=ub,
            unroll_kern=uk,
            verify_grad=False,
        )
def test_unroll_batch_kern_fail(self):
    """Legal unroll values but mismatched run-time shapes: every case
    must make the run fail."""
    # (image_shape, filter_shape, unroll_batch, unroll_kern,
    #  N_image_shape, N_filter_shape)
    cases = [
        ((6, 2, 3, 3), (3, 2, 2, 2), 2, 3, (7, 2, 3, 3), (3, 2, 2, 2)),
        ((6, 2, 3, 3), (3, 2, 2, 2), 3, 3, (6, 2, 3, 3), (4, 2, 2, 2)),
        ((2, 3, 3, 3), (6, 3, 2, 2), 2, 2, (1, 3, 3, 3), (6, 3, 2, 2)),
        ((2, 3, 3, 3), (6, 3, 2, 2), 2, 3, (2, 3, 3, 3), (5, 3, 2, 2)),
    ]
    for image_shape, filter_shape, ub, uk, n_image, n_filter in cases:
        self.validate(
            image_shape,
            filter_shape,
            "valid",
            unroll_batch=ub,
            unroll_kern=uk,
            N_image_shape=n_image,
            N_filter_shape=n_filter,
            should_raise=True,
        )
def test_subsample(self):
    """Convolution with subsampling != (1, 1): only "full" (2, 2) works;
    every other combination must raise NotImplementedError."""
    self.validate((3, 2, 7, 5), (5, 2, 2, 3), "full", subsample=(2, 2))
    # (image_shape, filter_shape, border_mode, subsample)
    unimplemented = [
        ((1, 1, 6, 6), (1, 1, 3, 3), "full", (3, 3)),  # fails as of 2012-07-11
        ((3, 2, 7, 5), (5, 2, 2, 3), "valid", (2, 2)),  # fails as of 2017-08-10
        ((3, 2, 7, 5), (5, 2, 2, 3), "valid", (2, 1)),
        ((1, 1, 6, 6), (1, 1, 3, 3), "valid", (3, 3)),
    ]
    for image_shape, filter_shape, border_mode, ss in unimplemented:
        with pytest.raises(NotImplementedError):
            self.validate(image_shape, filter_shape, border_mode, subsample=ss)
def test_shape_Constant_tensor(self):
# Tests convolution where the {image,filter}_shape is a Constant tensor.
as_t = at.as_tensor_variable
self.validate((as_t(3), as_t(2), as_t(7), as_t(5)), (5, 2, 2, 3), "valid")
self.validate(as_t([3, 2, 7, 5]), (5, 2, 2, 3), "valid")
self.validate(as_t((3, 2, 7, 5)), (5, 2, 2, 3), "valid")
self.validate((3, 2, 7, 5), (as_t(5), as_t(2), as_t(2), as_t(3)), "valid")
self.validate((3, 2, 7, 5), as_t([5, 2, 2, 3]), "valid")
self.validate((3, 2, 7, 5), as_t((5, 2, 2, 3)), "valid")
self.validate(as_t([3, 2, 7, 5]), as_t([5, 2, 2, 3]), "full")
def test_invalid_filter_shape(self):
# Tests scenario where filter_shape[1] != input_shape[1]
with pytest.raises(AssertionError):
self.validate((3, 2, 8, 8), (4, 3, 5, 5), "valid")
def test_invalid_input_shape(self):
# Tests that when the shape given at build time is not the same as
# run time we raise an error
for unroll_batch in [None, 1, 3]:
for unroll_kern in [None, 2, 4]:
for unroll_patch in [None, True, False]:
for mode in ["valid", "full"]:
with pytest.raises(ValueError):
self.validate(
(3, 2, 8, 8),
(4, 2, 5, 5),
mode,
N_image_shape=(2, 2, 8, 8),
unroll_batch=unroll_batch,
unroll_kern=unroll_kern,
unroll_patch=unroll_patch,
)
with pytest.raises(ValueError):
self.validate(
(3, 2, 8, 8),
(4, 2, 5, 5),
mode,
N_image_shape=(3, 1, 8, 8),
unroll_batch=unroll_batch,
unroll_kern=unroll_kern,
unroll_patch=unroll_patch,
)
with pytest.raises(ValueError):
self.validate(
(3, 2, 8, 8),
(4, 2, 5, 5),
mode,
N_image_shape=(3, 2, 7, 8),
unroll_batch=unroll_batch,
unroll_kern=unroll_kern,
unroll_patch=unroll_patch,
)
with pytest.raises(ValueError):
self.validate(
(3, 2, 8, 8),
(4, 2, 5, 5),
mode,
N_image_shape=(3, 2, 8, 7),
unroll_batch=unroll_batch,
unroll_kern=unroll_kern,
unroll_patch=unroll_patch,
)
with pytest.raises(ValueError):
self.validate(
(3, 2, 8, 8),
(4, 2, 5, 5),
mode,
N_filter_shape=(3, 2, 5, 5),
unroll_batch=unroll_batch,
unroll_kern=unroll_kern,
unroll_patch=unroll_patch,
)
with pytest.raises(ValueError):
self.validate(
(3, 2, 8, 8),
(4, 2, 5, 5),
mode,
N_filter_shape=(4, 1, 5, 5),
unroll_batch=unroll_batch,
unroll_kern=unroll_kern,
unroll_patch=unroll_patch,
)
with pytest.raises(ValueError):
self.validate(
(3, 2, 8, 8),
(4, 2, 5, 5),
mode,
N_filter_shape=(4, 2, 6, 5),
unroll_batch=unroll_batch,
unroll_kern=unroll_kern,
unroll_patch=unroll_patch,
)
with pytest.raises(ValueError):
self.validate(
(3, 2, 8, 8),
(4, 2, 5, 5),
mode,
N_filter_shape=(4, 2, 5, 6),
unroll_batch=unroll_batch,
unroll_kern=unroll_kern,
unroll_patch=unroll_patch,
)
def test_missing_info(self):
# Test convolutions for various pieces of missing info.
self.validate(
None, None, N_image_shape=(3, 2, 8, 8), N_filter_shape=(4, 2, 5, 5)
)
self.validate(
(3, 2, None, None),
None,
N_image_shape=(3, 2, 8, 8),
N_filter_shape=(4, 2, 5, 5),
)
self.validate(
(None, 2, None, None),
(None, 2, 5, 5),
N_image_shape=(3, 2, 8, 8),
N_filter_shape=(4, 2, 5, 5),
)
self.validate(
(3, 2, 8, 8),
(4, 2, None, 5),
N_image_shape=(3, 2, 8, 8),
N_filter_shape=(4, 2, 5, 5),
)
self.validate(
(3, 2, 8, 8),
(4, 2, 5, None),
N_image_shape=(3, 2, 8, 8),
N_filter_shape=(4, 2, 5, 5),
)
def test_wrong_info(self):
# Test convolutions when we don't give a constant as shape information
i = pytensor.scalar.basic.int32()
with pytest.raises(NotScalarConstantError):
self.validate(
(3, 2, 8, i),
(4, 2, 5, 5),
N_image_shape=(3, 2, 8, 8),
N_filter_shape=(4, 2, 5, 5),
)
with pytest.raises(NotScalarConstantError):
self.validate(
(3, 2, 8, 8),
(4, 2, 5, i),
N_image_shape=(3, 2, 8, 8),
N_filter_shape=(4, 2, 5, 5),
)
def test_full_mode(self):
# Tests basic convolution in full mode and case where filter
# is larger than the input image.
self.validate((3, 2, 5, 5), (4, 2, 8, 8), "full")
def f():
self.validate((3, 2, 5, 5), (4, 2, 8, 8), "valid")
with pytest.raises(Exception):
f()
def test_wrong_input(self):
# Make sure errors are raised when image and kernel are not 4D tensors
with pytest.raises(Exception):
self.validate((3, 2, 8, 8), (4, 2, 5, 5), "valid", input=dmatrix())
with pytest.raises(Exception):
self.validate((3, 2, 8, 8), (4, 2, 5, 5), "valid", filters=dvector())
with pytest.raises(Exception):
self.validate((3, 2, 8, 8), (4, 2, 5, 5), "valid", input=dtensor3())
    def test_gcc_crash(self):
        # Regression test: gcc 4.3.0 20080428 (Red Hat 4.3.0-8) crashed while
        # compiling the C code generated for this shape combination.  The
        # generated C code was changed to avoid the gcc bug, so this must now
        # compile and run without crashing (gradient check skipped for speed).
        self.validate((1, 10, 213, 129), (46, 10, 212, 1), "valid", verify_grad=False)
    def speed(self):
        """Crude conv2d benchmark (no `test_` prefix, so pytest skips it).

        Times `n_calls` executions of the compiled function for each
        combination of border mode, OpenMP setting, image shape, and filter
        shape, printing elapsed wall-clock seconds.
        """
        n_calls = 20000
        print("n_calls", n_calls)
        for border_mode in ["valid", "full"]:
            print()
            print(border_mode)
            for openmp in [False, True]:
                print("OpenMP", openmp)
                image_shapes = [
                    (1, 5, 6, 6),
                    (10, 5, 6, 6)
                    # (10, 10, 16, 16),
                    # (10, 10, 32, 32)]
                ]
                print("image_shape", image_shapes)
                for image_shape in image_shapes:
                    filter_shapes = [(1, 5, 4, 4), (2, 5, 4, 4), (5, 5, 4, 4)]
                    print("filter_shapes", filter_shapes)
                    for filter_shape in filter_shapes:
                        input = pytensor.shared(np.random.random(image_shape))
                        filters = pytensor.shared(np.random.random(filter_shape))
                        # conv.conv2d is deprecated; the warning is expected.
                        with pytest.warns(DeprecationWarning):
                            output = conv.conv2d(
                                input,
                                filters,
                                image_shape,
                                filter_shape,
                                border_mode,
                                unroll_patch=True,
                                openmp=openmp,
                            )
                        # Disable GC and use the C loop so timings measure the
                        # convolution itself, not the VM overhead.
                        mode = Mode(
                            linker=pytensor.link.vm.VMLinker(
                                allow_gc=False, use_cloop=True
                            )
                        )
                        pytensor_conv = pytensor.function([], output, mode=mode)
                        t1 = time.perf_counter()
                        pytensor_conv.vm(n_calls=n_calls)
                        t2 = time.perf_counter()
                        print(t2 - t1, end=" ")
                print()
def test_infer_shape(self):
# Note: infer_shape is incomplete and thus input and filter shapes
# must be provided explicitly
rng = np.random.default_rng(280284)
def rand(*shape):
r = np.asarray(rng.random(shape), dtype="float64")
return r * 2 - 1
adtens = dtensor4()
bdtens = dtensor4()
aivec_val = [4, 5, 6, 3]
bivec_val = [7, 5, 3, 2]
adtens_val = rand(*aivec_val)
bdtens_val = rand(*bivec_val)
with pytest.warns(DeprecationWarning):
self._compile_and_check(
[adtens, bdtens],
[
conv.conv2d(
adtens, bdtens, aivec_val, bivec_val, border_mode="valid"
)
],
[adtens_val, bdtens_val],
conv.ConvOp,
excluding=["conv_gemm"],
)
with pytest.warns(DeprecationWarning):
self._compile_and_check(
[adtens, bdtens],
[conv.conv2d(adtens, bdtens, aivec_val, bivec_val, border_mode="full")],
[adtens_val, bdtens_val],
conv.ConvOp,
excluding=["conv_gemm"],
)
aivec_val = [6, 2, 8, 3]
bivec_val = [4, 2, 5, 3]
adtens_val = rand(*aivec_val)
bdtens_val = rand(*bivec_val)
with pytest.warns(DeprecationWarning):
self._compile_and_check(
[adtens, bdtens],
[
conv.conv2d(
adtens, bdtens, aivec_val, bivec_val, border_mode="valid"
)
],
[adtens_val, bdtens_val],
conv.ConvOp,
excluding=["conv_gemm"],
)
with pytest.warns(DeprecationWarning):
self._compile_and_check(
[adtens, bdtens],
[conv.conv2d(adtens, bdtens, aivec_val, bivec_val, border_mode="full")],
[adtens_val, bdtens_val],
conv.ConvOp,
excluding=["conv_gemm"],
)
aivec_val = [3, 6, 7, 5]
bivec_val = [5, 6, 3, 2]
adtens_val = rand(*aivec_val)
bdtens_val = rand(*bivec_val)
with pytest.warns(DeprecationWarning):
self._compile_and_check(
[adtens, bdtens],
[
conv.conv2d(
adtens, bdtens, aivec_val, bivec_val, border_mode="valid"
)
],
[adtens_val, bdtens_val],
conv.ConvOp,
excluding=["conv_gemm"],
)
with pytest.warns(DeprecationWarning):
self._compile_and_check(
[adtens, bdtens],
[conv.conv2d(adtens, bdtens, aivec_val, bivec_val, border_mode="full")],
[adtens_val, bdtens_val],
conv.ConvOp,
excluding=["conv_gemm"],
)
aivec_val = [3, 6, 7, 5]
bivec_val = [5, 6, 2, 3]
adtens_val = rand(*aivec_val)
bdtens_val = rand(*bivec_val)
with pytest.warns(DeprecationWarning):
self._compile_and_check(
[adtens, bdtens],
[
conv.conv2d(
adtens, bdtens, aivec_val, bivec_val, border_mode="valid"
)
],
[adtens_val, bdtens_val],
conv.ConvOp,
excluding=["conv_gemm"],
)
with pytest.warns(DeprecationWarning):
self._compile_and_check(
[adtens, bdtens],
[conv.conv2d(adtens, bdtens, aivec_val, bivec_val, border_mode="full")],
[adtens_val, bdtens_val],
conv.ConvOp,
excluding=["conv_gemm"],
)
aivec_val = [5, 2, 4, 3]
bivec_val = [6, 2, 4, 3]
adtens_val = rand(*aivec_val)
bdtens_val = rand(*bivec_val)
with pytest.warns(DeprecationWarning):
self._compile_and_check(
[adtens, bdtens],
[
conv.conv2d(
adtens, bdtens, aivec_val, bivec_val, border_mode="valid"
)
],
[adtens_val, bdtens_val],
conv.ConvOp,
excluding=["conv_gemm"],
)
with pytest.warns(DeprecationWarning):
self._compile_and_check(
[adtens, bdtens],
[conv.conv2d(adtens, bdtens, aivec_val, bivec_val, border_mode="full")],
[adtens_val, bdtens_val],
conv.ConvOp,
excluding=["conv_gemm"],
)
# Test that broadcasting of gradients works correctly when using the
# nnet.conv2d() interface. This was reported in #3763, and uses the example
# code from that ticket.
def test_broadcast_grad():
    """Gradient through conv2d with a broadcasted (dimshuffled) filter.

    Regression test for issue #3763 (broadcasting of gradients through the
    nnet.conv2d() interface); uses the example code from that ticket.
    """
    x1 = tensor4("x")
    sigma = scalar("sigma")
    window_radius = 3

    # Build a 1-D Gaussian over the taps [-radius, radius] ...
    taps = at.arange(-window_radius, window_radius + 1)
    taps = taps.astype(pytensor.config.floatX)
    kernel_1d = exp(-0.5 * taps**2 / sigma**2)
    kernel_1d = kernel_1d / kernel_1d.sum()
    # ... and broadcast it to a 4-D (1, 1, width, 1) filter bank.
    filter_W = kernel_1d.dimshuffle(["x", "x", 0, "x"])

    y = conv2d(x1, filter_W, border_mode="full", filter_shape=[1, 1, None, None])
    # TODO FIXME: Make this a real test and `assert` something
    pytensor.grad(y.sum(), sigma)
import numpy as np
import pytest
import pytensor
try:
from scipy import ndimage
except ImportError:
ndimage = None
import tests.unittest_tools as utt
from pytensor.compile.sharedvalue import shared
from pytensor.graph.rewriting.basic import check_stack_trace
from pytensor.tensor.nnet.conv3d2d import (
DiagonalSubtensor,
IncDiagonalSubtensor,
conv3d,
get_diagonal_subtensor_view,
)
def test_get_diagonal_subtensor_view(wrap=lambda a: a):
    """Diagonal-subtensor views of 2d and 3d arrays match known values."""
    # 2d case: view over axes (0, 1).
    mat = wrap(np.arange(20).reshape(5, 4).astype("float32"))
    view01 = get_diagonal_subtensor_view(mat, 0, 1)
    assert np.array_equal(np.asarray(view01), [[12, 9, 6, 3], [16, 13, 10, 7]])

    # 3d case: check the (0, 1) and (0, 2) views against hand-computed
    # reference values.
    cube = np.arange(24).reshape(4, 3, 2)
    view01 = get_diagonal_subtensor_view(cube, 0, 1)
    view02 = get_diagonal_subtensor_view(cube, 0, 2)
    view12 = get_diagonal_subtensor_view(cube, 1, 2)
    assert np.array_equal(
        np.asarray(view01),
        [[[12, 13], [8, 9], [4, 5]], [[18, 19], [14, 15], [10, 11]]],
    )
    assert np.array_equal(
        np.asarray(view02),
        [
            [[6, 1], [8, 3], [10, 5]],
            [[12, 7], [14, 9], [16, 11]],
            [[18, 13], [20, 15], [22, 17]],
        ],
    )
    # The diagonal view of each leading matrix equals the corresponding
    # slice of the diagonal view of the whole 3d tensor.
    for leading, view_slice in zip(cube, view12):
        assert np.array_equal(view_slice, get_diagonal_subtensor_view(leading, 0, 1))
def pyconv3d(signals, filters, border_mode="valid"):
Ns, Ts, C, Hs, Ws = signals.shape
Nf, Tf, C, Hf, Wf = filters.shape
# if border_mode is not 'valid', the signals need zero-padding
if border_mode == "full":
Tpad = Tf - 1
Hpad = Hf - 1
Wpad = Wf - 1
elif border_mode == "half":
Tpad = Tf // 2
Hpad = Hf // 2
Wpad = Wf // 2
else:
Tpad = 0
Hpad = 0
Wpad = 0
if Tpad > 0 or Hpad > 0 or Wpad > 0:
# zero-pad signals
signals_padded = np.zeros(
(Ns, Ts + 2 * Tpad, C, Hs + 2 * Hpad, Ws + 2 * Wpad), "float32"
)
signals_padded[
:, Tpad : (Ts + Tpad), :, Hpad : (Hs + Hpad), Wpad : (Ws + Wpad)
] = signals
Ns, Ts, C, Hs, Ws = signals_padded.shape
signals = signals_padded
Tf2 = Tf // 2
Hf2 = Hf // 2
Wf2 = Wf // 2
rval = np.zeros((Ns, Ts - Tf + 1, Nf, Hs - Hf + 1, Ws - Wf + 1))
for ns in range(Ns):
for nf in range(Nf):
for c in range(C):
s_i = signals[ns, :, c, :, :]
f_i = filters[nf, :, c, :, :]
r_i = rval[ns, :, nf, :, :]
o_i = ndimage.convolve(s_i, f_i, mode="constant", cval=1)
o_i_sh0 = o_i.shape[0]
# print s_i.shape, f_i.shape, r_i.shape, o_i.shape
r_i += o_i[Tf2 : o_i_sh0 - Tf2, Hf2:-Hf2, Wf2:-Wf2]
return rval
def check_diagonal_subtensor_view_traces(fn):
    """Assert fn's graph preserves stack traces for the diagonal-subtensor ops."""
    tracked_ops = (DiagonalSubtensor, IncDiagonalSubtensor)
    assert check_stack_trace(fn, ops_to_check=tracked_ops)
@pytest.mark.skipif(
    ndimage is None or not pytensor.config.cxx,
    reason="conv3d2d tests need SciPy and a c++ compiler",
)
@pytest.mark.parametrize("border_mode", ("valid", "full", "half"))
def test_conv3d(border_mode):
    """Compare conv3d against the pure-scipy reference and verify gradients.

    Runs the forward pass and gradient graph on a large deterministic input,
    numerically verifies gradients on a small random input, and repeats the
    forward/gradient check for the special case of filters with Tf == 1.
    """
    if pytensor.config.mode == "FAST_COMPILE":
        mode = pytensor.compile.mode.get_mode("FAST_RUN")
    else:
        mode = pytensor.compile.mode.get_default_mode()

    Ns, Ts, C, Hs, Ws = 3, 10, 3, 32, 32
    Nf, Tf, C, Hf, Wf = 32, 5, 3, 5, 5

    # Deterministic ramp inputs make the forward comparison reproducible.
    signals = (
        np.arange(Ns * Ts * C * Hs * Ws).reshape(Ns, Ts, C, Hs, Ws).astype("float32")
    )
    filters = (
        np.arange(Nf * Tf * C * Hf * Wf).reshape(Nf, Tf, C, Hf, Wf).astype("float32")
    )

    # t0 = time.perf_counter()
    pyres = pyconv3d(signals, filters, border_mode)
    # print(time.perf_counter() - t0)

    s_signals = shared(signals)
    s_filters = shared(filters)
    s_output = shared(signals * 0)

    out = conv3d(
        s_signals,
        s_filters,
        signals_shape=signals.shape,
        filters_shape=filters.shape,
        border_mode=border_mode,
    )

    newconv3d = pytensor.function([], [], updates={s_output: out}, mode=mode)

    # The compiled graphs must keep stack traces for the diagonal ops.
    check_diagonal_subtensor_view_traces(newconv3d)
    # t0 = time.perf_counter()
    newconv3d()
    # print(time.perf_counter() - t0)
    utt.assert_allclose(pyres, s_output.get_value(borrow=True))
    gsignals, gfilters = pytensor.grad(out.sum(), [s_signals, s_filters])
    gnewconv3d = pytensor.function(
        [],
        [],
        updates=[(s_filters, gfilters), (s_signals, gsignals)],
        mode=mode,
        name="grad",
    )
    check_diagonal_subtensor_view_traces(gnewconv3d)
    # t0 = time.perf_counter()
    gnewconv3d()
    # print("grad", time.perf_counter() - t0)

    # Numerical gradient verification on a small random problem.
    Ns, Ts, C, Hs, Ws = 3, 3, 3, 5, 5
    Nf, Tf, C, Hf, Wf = 4, 2, 3, 2, 2

    rng = np.random.default_rng(280284)
    signals = rng.random((Ns, Ts, C, Hs, Ws)).astype("float32")
    filters = rng.random((Nf, Tf, C, Hf, Wf)).astype("float32")
    utt.verify_grad(
        lambda s, f: conv3d(s, f, border_mode=border_mode),
        [signals, filters],
        eps=1e-1,
        mode=mode,
    )

    # Additional Test that covers the case of patched implementation for filter with Tf=1
    Ns, Ts, C, Hs, Ws = 3, 10, 3, 32, 32
    Nf, Tf, C, Hf, Wf = 32, 1, 3, 5, 5

    signals = (
        np.arange(Ns * Ts * C * Hs * Ws).reshape(Ns, Ts, C, Hs, Ws).astype("float32")
    )
    filters = (
        np.arange(Nf * Tf * C * Hf * Wf).reshape(Nf, Tf, C, Hf, Wf).astype("float32")
    )

    # t0 = time.perf_counter()
    pyres = pyconv3d(signals, filters, border_mode)
    # print(time.perf_counter() - t0)

    s_signals = shared(signals)
    s_filters = shared(filters)
    s_output = shared(signals * 0)

    out = conv3d(
        s_signals,
        s_filters,
        signals_shape=signals.shape,
        filters_shape=filters.shape,
        border_mode=border_mode,
    )

    newconv3d = pytensor.function([], [], updates={s_output: out}, mode=mode)

    # t0 = time.perf_counter()
    newconv3d()
    # print(time.perf_counter() - t0)
    utt.assert_allclose(pyres, s_output.get_value(borrow=True))
    gsignals, gfilters = pytensor.grad(out.sum(), [s_signals, s_filters])
    gnewconv3d = pytensor.function(
        [],
        [],
        updates=[(s_filters, gfilters), (s_signals, gsignals)],
        mode=mode,
        name="grad",
    )
    # t0 = time.perf_counter()
    gnewconv3d()
    # print("grad", time.perf_counter() - t0)

    Ns, Ts, C, Hs, Ws = 3, 3, 3, 5, 5
    Nf, Tf, C, Hf, Wf = 4, 1, 3, 2, 2

    signals = rng.random((Ns, Ts, C, Hs, Ws)).astype("float32")
    filters = rng.random((Nf, Tf, C, Hf, Wf)).astype("float32")
    utt.verify_grad(
        lambda s, f: conv3d(s, f, border_mode=border_mode),
        [signals, filters],
        eps=1e-1,
        mode=mode,
    )
import numpy as np
import pytest
import pytensor
import pytensor.tensor as at
from pytensor.tensor.nnet import corr
from pytensor.tensor.type import dmatrix, dtensor3, dtensor4, dvector, tensor4
from tests import unittest_tools as utt
from tests.tensor.nnet.test_abstract_conv import (
TestAsymmetricPadding,
TestCausalConv,
TestGroupedConvNoOptim,
TestUnsharedConv,
)
@pytest.mark.skipif(
    pytensor.config.cxx == "",
    reason="SciPy and cxx needed",
)
class TestCorr2D(utt.InferShapeTester):
    """Tests for the CorrMM 2d correlation Op.

    The `validate` helper runs CorrMM on random data and compares the result
    against a slow pure-NumPy reference implementation, optionally verifying
    gradients numerically.
    """

    if pytensor.config.mode == "FAST_COMPILE":
        mode = pytensor.compile.get_mode("FAST_RUN")
    else:
        mode = None
    dtype = pytensor.config.floatX

    def setup_method(self):
        # Default symbolic inputs; individual tests may override them via
        # the `input` / `filters` arguments of `validate`.
        self.input = tensor4("input", dtype=self.dtype)
        self.input.name = "default_V"
        self.filters = tensor4("filters", dtype=self.dtype)
        self.filters.name = "default_filters"
        # These tests can run even when pytensor.config.blas__ldflags is empty.
        super().setup_method()

    def validate(
        self,
        image_shape,
        filter_shape,
        border_mode="valid",
        subsample=(1, 1),
        input=None,
        filters=None,
        verify_grad=True,
        non_contiguous=False,
        filter_dilation=(1, 1),
    ):
        """Run CorrMM and compare against a pure-NumPy reference.

        :param image_shape: The constant shape info passed to corrMM.
        :param filter_shape: The constant shape info passed to corrMM.
        """
        if not pytensor.config.cxx:
            pytest.skip("Need cxx to test conv2d")
        N_image_shape = [
            at.get_scalar_constant_value(at.as_tensor_variable(x)) for x in image_shape
        ]
        N_filter_shape = [
            at.get_scalar_constant_value(at.as_tensor_variable(x)) for x in filter_shape
        ]

        if input is None:
            input = self.input
        if filters is None:
            filters = self.filters

        # PYTENSOR IMPLEMENTATION
        # we create a symbolic function so that verify_grad can work
        def sym_CorrMM(input, filters):
            # define pytensor graph and function
            input.name = "input"
            filters.name = "filters"
            rval = corr.CorrMM(border_mode, subsample, filter_dilation)(input, filters)
            rval.name = "corr_output"
            return rval

        output = sym_CorrMM(input, filters)
        output.name = f"CorrMM()({input.name},{filters.name})"
        pytensor_corr = pytensor.function([input, filters], output, mode=self.mode)

        # initialize input and compute result
        image_data = np.random.random(N_image_shape).astype(self.dtype)
        filter_data = np.random.random(N_filter_shape).astype(self.dtype)
        if non_contiguous:
            # transpose-copy-transpose leaves the data logically unchanged
            # but in non-C-contiguous memory layout
            image_data = np.transpose(image_data, axes=(0, 1, 3, 2))
            image_data = image_data.copy()
            image_data = np.transpose(image_data, axes=(0, 1, 3, 2))
            filter_data = np.transpose(filter_data, axes=(0, 1, 3, 2))
            filter_data = filter_data.copy()
            filter_data = np.transpose(filter_data, axes=(0, 1, 3, 2))
            assert not image_data.flags["CONTIGUOUS"]
            assert not filter_data.flags["CONTIGUOUS"]

        pytensor_output = pytensor_corr(image_data, filter_data)

        # REFERENCE IMPLEMENTATION
        # Testing correlation, not convolution. Reverse filters.
        filter_data_corr = np.array(filter_data[:, :, ::-1, ::-1], copy=True, order="C")
        orig_image_data = image_data
        img_shape2d = np.array(N_image_shape[-2:])
        fil_shape2d = np.array(N_filter_shape[-2:])
        dil_shape2d = np.array(filter_dilation)
        # effective filter extent after dilation
        dil_fil_shape2d = (fil_shape2d - 1) * dil_shape2d + 1
        subsample2d = np.array(subsample)
        if border_mode == "full":
            padHW = dil_fil_shape2d - 1
        elif border_mode == "valid":
            padHW = np.array([0, 0])
        elif border_mode == "half":
            padHW = np.floor(dil_fil_shape2d / 2).astype("int32")
        elif isinstance(border_mode, tuple):
            padHW = np.array(border_mode)
        elif isinstance(border_mode, int):
            padHW = np.array([border_mode, border_mode])
        else:
            raise NotImplementedError(f"Unsupported border_mode {border_mode}")
        out_shape2d = (
            np.floor((img_shape2d + 2 * (padHW) - dil_fil_shape2d) / subsample2d) + 1
        )
        # avoid numpy deprecation
        out_shape2d = out_shape2d.astype("int32")
        out_shape = (N_image_shape[0], N_filter_shape[0]) + tuple(out_shape2d)
        ref_output = np.zeros(out_shape)

        # loop over output feature maps
        ref_output.fill(0)
        # zero-pad the image according to padHW before correlating
        image_data2 = np.zeros(
            (
                N_image_shape[0],
                N_image_shape[1],
                N_image_shape[2] + 2 * padHW[0],
                N_image_shape[3] + 2 * padHW[1],
            )
        )
        image_data2[
            :,
            :,
            padHW[0] : padHW[0] + N_image_shape[2],
            padHW[1] : padHW[1] + N_image_shape[3],
        ] = image_data
        image_data = image_data2
        N_image_shape = image_data.shape
        for bb in range(N_image_shape[0]):
            for nn in range(N_filter_shape[0]):
                for im0 in range(N_image_shape[1]):
                    filter2d = filter_data_corr[nn, im0, :, :]
                    image2d = image_data[bb, im0, :, :]
                    for row in range(ref_output.shape[2]):
                        irow = row * subsample[0]  # image row
                        for col in range(ref_output.shape[3]):
                            icol = col * subsample[1]  # image col
                            # dilated window, filter flipped back to
                            # correlation orientation
                            ref_output[bb, nn, row, col] += (
                                image2d[
                                    irow : irow
                                    + dil_fil_shape2d[0] : filter_dilation[0],
                                    icol : icol
                                    + dil_fil_shape2d[1] : filter_dilation[1],
                                ]
                                * filter2d[::-1, ::-1]
                            ).sum()

        utt.assert_allclose(ref_output, pytensor_output)

        # TEST GRADIENT
        if verify_grad:
            utt.verify_grad(sym_CorrMM, [orig_image_data, filter_data], mode=self.mode)

    @pytest.mark.slow
    def test_basic(self):
        # Tests that basic correlations work for odd and even
        # dimensions of image and filter shapes, as well as rectangular
        # images and filters.
        border_modes = ["valid", "full", "half", (1, 1), (2, 1), (1, 2), (3, 3), 1]
        img_shapes = [
            (2, 2, 3, 3),
            (3, 2, 8, 8),
            (3, 2, 7, 5),
            (3, 2, 7, 5),
            (3, 2, 8, 8),
            (3, 2, 7, 5),
        ]
        fil_shapes = [
            (2, 2, 2, 2),
            (4, 2, 5, 5),
            (5, 2, 2, 3),
            (5, 2, 3, 2),
            (4, 2, 5, 5),
            (5, 2, 2, 3),
        ]
        for border_mode in border_modes:
            for img, fil in zip(img_shapes, fil_shapes):
                self.validate(img, fil, border_mode, verify_grad=False)
        # Very slow on with 'full' or 'half'
        self.validate((1, 10, 213, 129), (46, 10, 212, 1), "valid", verify_grad=False)

    def test_img_kernel_same_shape(self):
        # Image and kernel with identical spatial extent, all border modes.
        self.validate((3, 2, 3, 3), (4, 2, 3, 3), "full")
        self.validate((3, 2, 3, 3), (4, 2, 3, 3), "valid")
        self.validate((3, 2, 3, 3), (4, 2, 3, 3), "half")
        self.validate((3, 2, 3, 3), (4, 2, 3, 3), (1, 1))
        self.validate((3, 2, 3, 3), (4, 2, 3, 3), 1)

    @pytest.mark.slow
    def test_subsample(self):
        # Tests correlation where subsampling != (1,1)
        self.validate((3, 2, 7, 5), (5, 2, 2, 3), "valid", subsample=(2, 2))
        self.validate((3, 2, 7, 5), (5, 2, 2, 3), "valid", subsample=(2, 1))
        self.validate((1, 1, 6, 6), (1, 1, 3, 3), "valid", subsample=(3, 3))

        self.validate((3, 2, 7, 5), (5, 2, 2, 3), "full", subsample=(2, 2))
        self.validate((3, 2, 7, 5), (5, 2, 2, 3), "full", subsample=(2, 1))
        self.validate((1, 1, 6, 6), (1, 1, 3, 3), "full", subsample=(3, 3))

        self.validate((3, 2, 7, 5), (5, 2, 2, 3), "half", subsample=(2, 2))
        self.validate((3, 2, 7, 5), (5, 2, 2, 3), "half", subsample=(2, 1))
        self.validate((1, 1, 6, 6), (1, 1, 3, 3), "half", subsample=(3, 3))

        self.validate((3, 2, 7, 5), (5, 2, 2, 3), (1, 1), subsample=(2, 2))
        self.validate((3, 2, 7, 5), (5, 2, 2, 3), (2, 1), subsample=(2, 1))
        self.validate((1, 1, 6, 6), (1, 1, 3, 3), (1, 2), subsample=(3, 3))

        self.validate((1, 1, 6, 6), (1, 1, 3, 3), 1, subsample=(3, 3))

    def test_filter_dilation(self):
        # Tests correlation where filter dilation != (1,1)
        self.validate((3, 2, 7, 5), (5, 2, 2, 3), "valid", filter_dilation=(2, 2))
        self.validate((3, 2, 14, 10), (5, 2, 2, 3), "valid", filter_dilation=(3, 1))
        self.validate((1, 1, 14, 14), (1, 1, 3, 3), "valid", filter_dilation=(2, 3))

        self.validate((3, 2, 7, 5), (5, 2, 2, 3), "full", filter_dilation=(2, 2))
        self.validate((3, 2, 7, 5), (5, 2, 2, 3), "full", filter_dilation=(3, 1))
        self.validate((1, 1, 6, 6), (1, 1, 3, 3), "full", filter_dilation=(2, 3))

        self.validate((3, 2, 7, 5), (5, 2, 2, 3), "half", filter_dilation=(2, 2))
        self.validate((3, 2, 7, 5), (5, 2, 2, 3), "half", filter_dilation=(3, 1))
        self.validate((1, 1, 6, 6), (1, 1, 3, 3), "half", filter_dilation=(2, 3))

        self.validate((3, 2, 7, 5), (5, 2, 2, 3), (1, 1), filter_dilation=(2, 2))
        self.validate((3, 2, 7, 5), (5, 2, 2, 3), (2, 1), filter_dilation=(2, 1))
        self.validate((1, 1, 6, 6), (1, 1, 3, 3), (1, 2), filter_dilation=(1, 2))

        self.validate(
            (1, 1, 6, 6), (1, 1, 3, 3), 1, subsample=(3, 3), filter_dilation=(2, 2)
        )

    @pytest.mark.slow
    def test_shape_Constant_tensor(self):
        # Tests correlation where the {image,filter}_shape is a Constant tensor.
        as_t = at.as_tensor_variable
        border_modes = ["valid", "full", "half", (1, 1), (2, 1), (1, 2), (3, 3), 1]

        for border_mode in border_modes:
            self.validate(
                (as_t(3), as_t(2), as_t(7), as_t(5)), (5, 2, 2, 3), border_mode
            )
            self.validate(as_t([3, 2, 7, 5]), (5, 2, 2, 3), border_mode)
            self.validate(as_t((3, 2, 7, 5)), (5, 2, 2, 3), border_mode)
            self.validate((3, 2, 7, 5), (as_t(5), as_t(2), as_t(2), as_t(3)), "valid")
            self.validate((3, 2, 7, 5), as_t([5, 2, 2, 3]), border_mode)
            self.validate(as_t([3, 2, 7, 5]), as_t([5, 2, 2, 3]), border_mode)

    def test_invalid_filter_shape(self):
        # Tests scenario where filter_shape[1] != input_shape[1]
        with pytest.raises(ValueError):
            self.validate((3, 2, 8, 8), (4, 3, 5, 5), "valid")

    def test_full_mode(self):
        # Tests basic correlation in full mode and case where filter
        # is larger than the input image.
        self.validate((3, 2, 5, 5), (4, 2, 8, 8), "full")

        def f():
            self.validate((3, 2, 5, 5), (4, 2, 8, 8), "valid")

        with pytest.raises(Exception):
            f()

    def test_wrong_input(self):
        # Make sure errors are raised when image and kernel are not 4D tensors
        with pytest.raises(Exception):
            self.validate((3, 2, 8, 8), (4, 2, 5, 5), "valid", input=dmatrix())
        with pytest.raises(Exception):
            self.validate((3, 2, 8, 8), (4, 2, 5, 5), "valid", filters=dvector())
        with pytest.raises(Exception):
            self.validate((3, 2, 8, 8), (4, 2, 5, 5), "valid", input=dtensor3())

    @pytest.mark.skipif(not pytensor.config.cxx, reason="Need cxx for this test")
    def test_dtype_upcast(self):
        # Checks dtype upcast for CorrMM methods.
        rng = np.random.default_rng(280284)

        def rand(shape, dtype="float64"):
            r = np.asarray(rng.random(shape), dtype=dtype)
            return r * 2 - 1

        ops = [corr.CorrMM, corr.CorrMM_gradWeights, corr.CorrMM_gradInputs]
        a_shapes = [[4, 5, 6, 3], [1, 5, 6, 3], [1, 5, 6, 3]]
        b_shapes = [[7, 5, 3, 2], [1, 5, 3, 1], [7, 1, 3, 1]]
        dtypes = ["float32", "float64"]

        for op, a_shape, b_shape in zip(ops, a_shapes, b_shapes):
            for a_dtype in dtypes:
                for b_dtype in dtypes:
                    # output dtype must follow the scalar upcast rules
                    c_dtype = pytensor.scalar.upcast(a_dtype, b_dtype)
                    a_tens = tensor4(dtype=a_dtype)
                    b_tens = tensor4(dtype=b_dtype)
                    a_tens_val = rand(a_shape, dtype=a_dtype)
                    b_tens_val = rand(b_shape, dtype=b_dtype)

                    c_tens = op()(a_tens, b_tens)
                    f = pytensor.function([a_tens, b_tens], c_tens, mode=self.mode)
                    assert f(a_tens_val, b_tens_val).dtype == c_dtype

    @pytest.mark.slow
    @pytest.mark.skipif(
        pytensor.config.cxx == "",
        reason="SciPy and cxx needed",
    )
    def test_infer_shape_forward(self):
        rng = np.random.default_rng(280284)

        def rand(*shape):
            r = np.asarray(rng.random(shape), dtype="float64")
            return r * 2 - 1

        corrMM = corr.CorrMM

        adtens = dtensor4()
        bdtens = dtensor4()
        aivec_vals = [
            [4, 5, 6, 3],
            [6, 2, 8, 3],
            [3, 6, 7, 5],
            [3, 6, 7, 5],
            [5, 2, 4, 3],
        ]
        bivec_vals = [
            [7, 5, 3, 2],
            [4, 2, 5, 3],
            [5, 6, 3, 2],
            [5, 6, 2, 3],
            [6, 2, 4, 3],
        ]
        modes = ["valid", "full", "half", (1, 1), (2, 1), (1, 2), 1]
        subsamples = [(1, 1), (2, 1), (1, 2)]

        for aivec_val, bivec_val in zip(aivec_vals, bivec_vals):
            adtens_val = rand(*aivec_val)
            bdtens_val = rand(*bivec_val)
            for mode in modes:
                for subsample in subsamples:
                    # CorrMM
                    cdtens = corrMM(border_mode=mode, subsample=subsample)(
                        adtens, bdtens
                    )
                    self._compile_and_check(
                        [adtens, bdtens],
                        [cdtens],
                        [adtens_val, bdtens_val],
                        corrMM,
                        warn=False,
                    )

    @pytest.mark.slow
    @pytest.mark.skipif(
        pytensor.config.mode == "FAST_COMPILE" or pytensor.config.cxx == "",
        reason="SciPy and cxx needed",
    )
    def test_infer_shape_gradW(self):
        rng = np.random.default_rng(280284)

        def rand(*shape):
            r = np.asarray(rng.random(shape), dtype="float64")
            return r * 2 - 1

        corrMM = corr.CorrMM
        gradW = corr.CorrMM_gradWeights

        adtens = dtensor4()
        bdtens = dtensor4()
        aivec_vals = [
            [1, 5, 6, 3],
            [8, 2, 7, 3],
            [1, 6, 9, 4],
            [9, 6, 8, 5],
            [9, 1, 6, 8],
        ]
        bivec_vals = [
            [7, 5, 3, 1],
            [4, 2, 5, 3],
            [12, 6, 3, 2],
            [5, 6, 1, 3],
            [11, 1, 3, 3],
        ]
        modes = ["valid", "full", "half", (1, 1), (2, 1), (1, 2), 1]
        subsamples = [(1, 1), (2, 1), (1, 2)]

        for aivec_val, bivec_val in zip(aivec_vals, bivec_vals):
            adtens_val = rand(*aivec_val)
            bdtens_val = rand(*bivec_val)
            for mode in modes:
                for subsample in subsamples:
                    # CorrMM: compute a concrete forward output first, then
                    # infer the weight-gradient shape from it
                    cdtens = corrMM(border_mode=mode, subsample=subsample)(
                        adtens, bdtens
                    )
                    f = pytensor.function([adtens, bdtens], cdtens)
                    cdtens_val = f(adtens_val, bdtens_val)
                    # CorrMM_gradWeights
                    shape = (
                        pytensor.shared(bivec_val[2]),
                        pytensor.shared(bivec_val[3]),
                    )
                    bdtens_g = gradW(border_mode=mode, subsample=subsample)(
                        adtens, cdtens, shape=shape
                    )
                    self._compile_and_check(
                        [adtens, cdtens],
                        [bdtens_g],
                        [adtens_val, cdtens_val],
                        gradW,
                        warn=False,
                    )

    @pytest.mark.slow
    @pytest.mark.skipif(
        pytensor.config.mode == "FAST_COMPILE" or not pytensor.config.cxx,
        reason="Need cxx for this test",
    )
    def test_infer_shape_gradI(self):
        rng = np.random.default_rng(280284)

        def rand(*shape):
            r = np.asarray(rng.random(shape), dtype="float64")
            return r * 2 - 1

        corrMM = corr.CorrMM
        gradI = corr.CorrMM_gradInputs

        adtens = dtensor4()
        bdtens = dtensor4()
        aivec_vals = [
            [1, 5, 6, 3],
            [8, 2, 7, 3],
            [1, 6, 9, 4],
            [9, 6, 8, 5],
            [9, 1, 6, 8],
        ]
        bivec_vals = [
            [7, 5, 3, 1],
            [4, 2, 5, 3],
            [12, 6, 3, 2],
            [5, 6, 1, 3],
            [7, 1, 3, 4],
        ]
        modes = ["valid", "full", "half", (1, 1), (2, 1), (1, 2), 1]
        subsamples = [(1, 1), (2, 1), (1, 2)]

        for aivec_val, bivec_val in zip(aivec_vals, bivec_vals):
            adtens_val = rand(*aivec_val)
            bdtens_val = rand(*bivec_val)
            for mode in modes:
                for subsample in subsamples:
                    # CorrMM: compute a concrete forward output first, then
                    # infer the input-gradient shape from it
                    cdtens = corrMM(border_mode=mode, subsample=subsample)(
                        adtens, bdtens
                    )
                    f = pytensor.function([adtens, bdtens], cdtens)
                    cdtens_val = f(adtens_val, bdtens_val)
                    # CorrMM_gradInputs
                    shape = (
                        pytensor.shared(aivec_val[2]),
                        pytensor.shared(aivec_val[3]),
                    )
                    adtens_g = gradI(border_mode=mode, subsample=subsample)(
                        bdtens, cdtens, shape=shape
                    )
                    self._compile_and_check(
                        [bdtens, cdtens],
                        [adtens_g],
                        [bdtens_val, cdtens_val],
                        gradI,
                        warn=False,
                    )

    def test_non_contiguous(self):
        # Correlation must give the same result on non-C-contiguous inputs.
        self.validate((2, 2, 3, 3), (2, 2, 2, 2), "valid", non_contiguous=True)
        self.validate((3, 2, 8, 8), (4, 2, 5, 5), "valid", non_contiguous=True)
        self.validate((3, 2, 7, 5), (5, 2, 2, 3), "valid", non_contiguous=True)
        self.validate((3, 2, 7, 5), (5, 2, 3, 2), "valid", non_contiguous=True)
        self.validate((3, 2, 8, 8), (4, 2, 5, 5), "full", non_contiguous=True)
        self.validate((3, 2, 7, 5), (5, 2, 2, 3), "full", non_contiguous=True)
        self.validate((3, 2, 8, 8), (4, 2, 5, 5), "half", non_contiguous=True)
        self.validate((3, 2, 7, 5), (5, 2, 2, 3), "half", non_contiguous=True)
        self.validate((3, 2, 8, 8), (4, 2, 5, 5), (1, 1), non_contiguous=True)
        self.validate((3, 2, 7, 5), (5, 2, 2, 3), (1, 2), non_contiguous=True)
        self.validate((3, 2, 7, 5), (5, 2, 2, 3), (2, 1), non_contiguous=True)
        self.validate((3, 2, 7, 5), (5, 2, 2, 3), 2, non_contiguous=True)
class TestGroupCorr2d(TestGroupedConvNoOptim):
    """Grouped-convolution tests specialized to the CorrMM family of Ops."""

    mode = pytensor.compile.get_mode("FAST_RUN").excluding("gpuarray")
    conv_op = corr.CorrMM
    conv_gradw_op = corr.CorrMM_gradWeights
    conv_gradi_op = corr.CorrMM_gradInputs

    def test_graph(self):
        """Grouped conv must equal per-group convs concatenated on axis 1."""
        # define common values first
        groups = 3
        rng = np.random.default_rng(280284)
        bottom = rng.random((3, 6, 5, 5)).astype(pytensor.config.floatX)
        kern = rng.random((9, 2, 3, 3)).astype(pytensor.config.floatX)
        bottom_sym = tensor4("bottom")
        kern_sym = tensor4("kern")

        # grouped convolution graph
        # NOTE(review): `self.conv` is presumably provided by
        # TestGroupedConvNoOptim — confirm against the base class.
        conv_group = self.conv(num_groups=groups)(bottom_sym, kern_sym)
        gconv_func = pytensor.function(
            [bottom_sym, kern_sym], conv_group, mode=self.mode
        )

        # Graph for the normal hard way: slice channels/kernels per group,
        # convolve each group separately, then concatenate the outputs.
        kern_offset = kern_sym.shape[0] // groups
        bottom_offset = bottom_sym.shape[1] // groups
        split_conv_output = [
            self.conv()(
                bottom_sym[:, i * bottom_offset : (i + 1) * bottom_offset, :, :],
                kern_sym[i * kern_offset : (i + 1) * kern_offset, :, :, :],
            )
            for i in range(groups)
        ]
        concatenated_output = at.concatenate(split_conv_output, axis=1)
        conv_func = pytensor.function(
            [bottom_sym, kern_sym], concatenated_output, mode=self.mode
        )

        # calculate outputs for each graph
        gconv_output = gconv_func(bottom, kern)
        conv_output = conv_func(bottom, kern)

        # compare values
        utt.assert_allclose(gconv_output, conv_output)
class TestUnsharedCorr2d(TestUnsharedConv):
    """Runs the unshared-convolution test suite with the CorrMM Ops."""

    if pytensor.config.mode == "FAST_COMPILE":
        mode = pytensor.compile.get_mode("FAST_RUN").excluding("gpuarray")
    else:
        mode = None
    conv2d_op = corr.CorrMM
    conv2d_gradw_op = corr.CorrMM_gradWeights
    conv2d_gradi_op = corr.CorrMM_gradInputs
class TestAsymmetricCorr(TestAsymmetricPadding):
    """Runs the asymmetric-padding test suite with the CorrMM Ops."""

    if pytensor.config.mode == "FAST_COMPILE":
        mode = pytensor.compile.get_mode("FAST_RUN").excluding("gpuarray")
    else:
        mode = None
    conv2d_op = corr.CorrMM
    conv2d_gradw_op = corr.CorrMM_gradWeights
    conv2d_gradi_op = corr.CorrMM_gradInputs
class TestCausalCorr(TestCausalConv):
    """Runs the causal-convolution test suite under a FAST_RUN mode.

    NOTE(review): unlike the sibling subclasses, no conv2d_op /
    gradient-op attributes are overridden here — presumably the
    TestCausalConv defaults are intended; confirm.
    """

    if pytensor.config.mode == "FAST_COMPILE":
        mode = pytensor.compile.get_mode("FAST_RUN").excluding("gpuarray")
    else:
        mode = None
import numpy as np
import pytest
import pytensor
import pytensor.tensor as at
from pytensor.tensor.nnet import corr3d
from pytensor.tensor.type import dmatrix, dtensor3, dtensor4, dtensor5, tensor5, vector
from tests import unittest_tools as utt
from tests.tensor.nnet.test_abstract_conv import TestGroupedConv3dNoOptim
@pytest.mark.skipif(
    pytensor.config.cxx == "",
    reason="SciPy and cxx needed",
)
class TestCorr3D(utt.InferShapeTester):
    """Tests for the CPU 3D correlation Ops (``Corr3dMM`` and its gradients).

    Every test ultimately goes through :meth:`validate`, which compares the
    compiled Op output against a naive pure-NumPy reference correlation.
    """

    # Under FAST_COMPILE, switch to FAST_RUN so the Corr3dMM Op is used.
    if pytensor.config.mode == "FAST_COMPILE":
        mode = pytensor.compile.get_mode("FAST_RUN")
    else:
        mode = None
    dtype = pytensor.config.floatX
    def setup_method(self):
        # Default symbolic 5D inputs; individual tests may override them.
        self.input = tensor5("input", dtype=self.dtype)
        self.input.name = "default_V"
        self.filters = tensor5("filters", dtype=self.dtype)
        self.filters.name = "default_filters"
        # These tests can run even when pytensor.config.blas__ldflags is empty.
        super().setup_method()
    def validate(
        self,
        image_shape,
        filter_shape,
        border_mode="valid",
        subsample=(1, 1, 1),
        input=None,
        filters=None,
        verify_grad=True,
        non_contiguous=False,
        filter_dilation=(1, 1, 1),
    ):
        """
        Compare ``Corr3dMM`` output (and optionally its gradient) against a
        naive NumPy reference correlation.

        :param image_shape: The constant shape info passed to corr3dMM.
        :param filter_shape: The constant shape info passed to corr3dMM.
        """
        if not pytensor.config.cxx:
            pytest.skip("Need cxx for this test")
        # Resolve possibly-symbolic shape entries to plain Python ints.
        N_image_shape = [
            at.get_scalar_constant_value(at.as_tensor_variable(x)) for x in image_shape
        ]
        N_filter_shape = [
            at.get_scalar_constant_value(at.as_tensor_variable(x)) for x in filter_shape
        ]
        if input is None:
            input = self.input
        if filters is None:
            filters = self.filters
        # PYTENSOR IMPLEMENTATION
        # we create a symbolic function so that verify_grad can work
        def sym_Corr3dMM(input, filters):
            # define pytensor graph and function
            input.name = "input"
            filters.name = "filters"
            rval = corr3d.Corr3dMM(border_mode, subsample, filter_dilation)(
                input, filters
            )
            rval.name = "corr_output"
            return rval
        output = sym_Corr3dMM(input, filters)
        output.name = f"Corr3dMM()({input.name},{filters.name})"
        pytensor_corr = pytensor.function([input, filters], output, mode=self.mode)
        # initialize input and compute result
        rng = np.random.default_rng(28483)
        image_data = rng.random(N_image_shape).astype(self.dtype)
        filter_data = rng.random(N_filter_shape).astype(self.dtype)
        image_data /= 10
        filter_data /= 10
        if non_contiguous:
            # Transpose, copy, and transpose back to obtain non-contiguous
            # arrays with the original logical layout.
            image_data = np.transpose(image_data, axes=(0, 1, 4, 3, 2))
            image_data = image_data.copy()
            image_data = np.transpose(image_data, axes=(0, 1, 4, 3, 2))
            filter_data = np.transpose(filter_data, axes=(0, 1, 4, 3, 2))
            filter_data = filter_data.copy()
            filter_data = np.transpose(filter_data, axes=(0, 1, 4, 3, 2))
            assert not image_data.flags["CONTIGUOUS"]
            assert not filter_data.flags["CONTIGUOUS"]
        pytensor_output = pytensor_corr(image_data, filter_data)
        # REFERENCE IMPLEMENTATION
        # Testing correlation, not convolution. Reverse filters.
        filter_data_corr = np.array(
            filter_data[:, :, ::-1, ::-1, ::-1], copy=True, order="C"
        )
        orig_image_data = image_data
        img_shape3d = np.array(N_image_shape[-3:])
        fil_shape3d = np.array(N_filter_shape[-3:])
        dil_shape3d = np.array(filter_dilation)
        # Effective (dilated) filter extent along each spatial axis.
        dil_fil_shape3d = (fil_shape3d - 1) * dil_shape3d + 1
        subsample3d = np.array(subsample)
        # Per-axis padding implied by the border mode.
        if border_mode == "full":
            padHWD = dil_fil_shape3d - 1
        elif border_mode == "valid":
            padHWD = np.array([0, 0, 0])
        elif border_mode == "half":
            padHWD = np.floor(dil_fil_shape3d / 2).astype("int32")
        elif isinstance(border_mode, tuple):
            padHWD = np.array(border_mode)
        elif isinstance(border_mode, int):
            padHWD = np.array([border_mode, border_mode, border_mode])
        else:
            raise NotImplementedError(f"Unsupported border_mode {border_mode}")
        out_shape3d = (
            np.floor((img_shape3d + 2 * (padHWD) - dil_fil_shape3d) / subsample3d) + 1
        )
        # avoid numpy deprecation
        out_shape3d = out_shape3d.astype("int32")
        out_shape = (N_image_shape[0], N_filter_shape[0]) + tuple(out_shape3d)
        ref_output = np.zeros(out_shape)
        # loop over output feature maps
        ref_output.fill(0)
        # Zero-pad the image according to the border mode before correlating.
        image_data2 = np.zeros(
            (
                N_image_shape[0],
                N_image_shape[1],
                N_image_shape[2] + 2 * padHWD[0],
                N_image_shape[3] + 2 * padHWD[1],
                N_image_shape[4] + 2 * padHWD[2],
            )
        )
        image_data2[
            :,
            :,
            padHWD[0] : padHWD[0] + N_image_shape[2],
            padHWD[1] : padHWD[1] + N_image_shape[3],
            padHWD[2] : padHWD[2] + N_image_shape[4],
        ] = image_data
        image_data = image_data2
        N_image_shape = image_data.shape
        for bb in range(N_image_shape[0]):
            for nn in range(N_filter_shape[0]):
                for im0 in range(N_image_shape[1]):
                    filter3d = filter_data_corr[nn, im0, :, :, :]
                    image3d = image_data[bb, im0, :, :, :]
                    for row in range(ref_output.shape[2]):
                        irow = row * subsample[0]  # image row
                        for col in range(ref_output.shape[3]):
                            icol = col * subsample[1]  # image col
                            for slc in range(ref_output.shape[4]):
                                islc = slc * subsample[2]  # image slice
                                # Strided slicing implements filter dilation;
                                # flipping filter3d back turns the flipped
                                # kernel into a correlation.
                                ref_output[bb, nn, row, col, slc] += (
                                    image3d[
                                        irow : irow
                                        + dil_fil_shape3d[0] : filter_dilation[0],
                                        icol : icol
                                        + dil_fil_shape3d[1] : filter_dilation[1],
                                        islc : islc
                                        + dil_fil_shape3d[2] : filter_dilation[2],
                                    ]
                                    * filter3d[::-1, ::-1, ::-1]
                                ).sum()
        utt.assert_allclose(pytensor_output, ref_output)
        # TEST GRADIENT
        if verify_grad:
            utt.verify_grad(
                sym_Corr3dMM, [orig_image_data, filter_data], mode=self.mode
            )
    @pytest.mark.slow
    def test_basic(self):
        # Tests that basic correlations work for odd and even
        # dimensions of image and filter shapes, as well as rectangular
        # images and filters.
        border_modes = [
            "valid",
            "full",
            "half",
            (1, 1, 1),
            (2, 1, 1),
            (1, 2, 1),
            (1, 1, 2),
            (3, 3, 3),
            1,
        ]
        img_shapes = [
            (2, 2, 3, 3, 3),
            (3, 2, 8, 8, 8),
            (3, 2, 7, 5, 5),
            (3, 2, 7, 5, 5),
            (1, 2, 8, 8, 8),
            (1, 2, 7, 5, 5),
        ]
        fil_shapes = [
            (2, 2, 2, 2, 2),
            (1, 2, 5, 5, 5),
            (2, 2, 2, 3, 2),
            (2, 2, 3, 2, 2),
            (1, 2, 5, 5, 5),
            (1, 2, 2, 3, 3),
        ]
        for border_mode in border_modes:
            for img, fil in zip(img_shapes, fil_shapes):
                self.validate(img, fil, border_mode, verify_grad=False)
        # Very slow on with 'full' or 'half'
        self.validate((1, 2, 53, 29, 11), (13, 2, 12, 1, 1), "valid", verify_grad=False)
    def test_img_kernel_same_shape(self):
        """Correlation where the kernel spans the whole image, for every mode."""
        self.validate((3, 2, 3, 3, 3), (1, 2, 3, 3, 3), "full")
        self.validate((3, 2, 3, 3, 3), (1, 2, 3, 3, 3), "valid")
        self.validate((3, 2, 3, 3, 3), (1, 2, 3, 3, 3), "half")
        self.validate((3, 2, 3, 3, 3), (1, 2, 3, 3, 3), (1, 1, 1))
        self.validate((3, 2, 3, 3, 3), (1, 2, 3, 3, 3), 1)
    @pytest.mark.slow
    def test_subsample(self):
        # Tests correlation where subsampling != (1,1,1)
        self.validate((3, 2, 7, 5, 5), (2, 2, 2, 3, 3), "valid", subsample=(2, 2, 2))
        self.validate((3, 2, 7, 5, 5), (2, 2, 2, 3, 3), "valid", subsample=(2, 1, 1))
        self.validate((1, 1, 6, 6, 6), (1, 1, 3, 3, 3), "valid", subsample=(3, 3, 3))
        self.validate((3, 2, 7, 5, 5), (2, 2, 2, 3, 3), "full", subsample=(2, 2, 2))
        self.validate((3, 2, 7, 5, 5), (2, 2, 2, 3, 3), "full", subsample=(2, 1, 1))
        self.validate((1, 1, 6, 6, 6), (1, 1, 3, 3, 3), "full", subsample=(3, 3, 3))
        self.validate((3, 2, 7, 5, 5), (2, 2, 2, 3, 3), "half", subsample=(2, 2, 2))
        self.validate((3, 2, 7, 5, 5), (2, 2, 2, 3, 3), "half", subsample=(2, 1, 1))
        self.validate((1, 1, 6, 6, 6), (1, 1, 3, 3, 3), "half", subsample=(3, 3, 3))
        self.validate((3, 2, 7, 5, 5), (2, 2, 2, 3, 3), (1, 1, 1), subsample=(2, 2, 2))
        self.validate((3, 2, 7, 5, 5), (2, 2, 2, 3, 3), (2, 1, 1), subsample=(2, 1, 1))
        self.validate((1, 1, 6, 6, 6), (1, 1, 3, 3, 3), (1, 2, 2), subsample=(3, 3, 3))
        self.validate((1, 1, 6, 6, 6), (1, 1, 3, 3, 3), 1, subsample=(3, 3, 3))
    # Tests correlation where filter dilation != (1,1,1)
    @pytest.mark.parametrize(
        "image_shape, filter_shape, border_mode, filter_dilation",
        [
            ((3, 2, 7, 5, 5), (2, 2, 2, 3, 3), "valid", (2, 2, 2)),
            ((3, 2, 14, 10, 10), (2, 2, 2, 3, 3), "valid", (3, 1, 1)),
            ((1, 1, 14, 14, 14), (1, 1, 3, 3, 3), "valid", (2, 3, 3)),
            ((3, 2, 7, 5, 5), (2, 2, 2, 3, 3), "full", (2, 2, 2)),
            ((3, 2, 7, 5, 5), (2, 2, 2, 3, 3), "full", (3, 1, 1)),
            ((1, 1, 6, 6, 6), (1, 1, 3, 3, 3), "full", (2, 3, 3)),
            ((3, 2, 7, 5, 5), (2, 2, 2, 3, 3), "half", (2, 2, 2)),
            ((3, 2, 7, 5, 5), (2, 2, 2, 3, 3), "half", (3, 1, 1)),
            ((1, 1, 6, 6, 6), (1, 1, 3, 3, 3), "half", (2, 3, 3)),
            ((3, 2, 7, 5, 5), (2, 2, 2, 3, 3), (1, 1, 1), (2, 2, 2)),
            ((3, 2, 7, 5, 5), (2, 2, 2, 3, 3), (2, 1, 1), (2, 1, 1)),
            ((1, 1, 6, 6, 6), (1, 1, 3, 3, 3), (1, 2, 1), (1, 2, 1)),
            ((1, 1, 6, 6, 6), (1, 1, 3, 3, 3), (1, 1, 2), (1, 1, 2)),
        ],
    )
    def test_filter_dilation(
        self, image_shape, filter_shape, border_mode, filter_dilation
    ):
        self.validate(
            image_shape, filter_shape, border_mode, filter_dilation=filter_dilation
        )
    def test_filter_dilation_subsample(self):
        """Dilation and subsampling combined in a single correlation."""
        self.validate(
            (1, 1, 6, 6, 6),
            (1, 1, 3, 3, 3),
            1,
            subsample=(3, 3, 3),
            filter_dilation=(2, 2, 2),
        )
    @pytest.mark.parametrize(
        "border_mode",
        [
            "valid",
            "full",
            "half",
            (1, 1, 1),
            (2, 1, 1),
            (1, 2, 1),
            (1, 1, 2),
            (3, 3, 3),
            1,
        ],
    )
    def test_shape_Constant_tensor(self, border_mode):
        # Tests correlation where the {image,filter}_shape is a Constant tensor
        as_t = at.as_tensor_variable
        self.validate(
            (as_t(3), as_t(2), as_t(7), as_t(5), as_t(5)), (5, 2, 2, 3, 3), border_mode
        )
        self.validate(as_t([3, 2, 7, 5, 5]), (5, 2, 2, 3, 3), border_mode)
        self.validate(as_t((3, 2, 7, 5, 5)), (5, 2, 2, 3, 3), border_mode)
        self.validate(
            (3, 2, 7, 5, 5), (as_t(5), as_t(2), as_t(2), as_t(3), as_t(3)), "valid"
        )
        self.validate((3, 2, 7, 5, 5), as_t([5, 2, 2, 3, 3]), border_mode)
        self.validate(as_t([3, 2, 7, 5, 5]), as_t([5, 2, 2, 3, 3]), border_mode)
    def test_invalid_filter_shape(self):
        # Tests scenario where filter_shape[1] != input_shape[1]
        with pytest.raises(ValueError):
            self.validate((3, 2, 8, 8, 8), (4, 3, 5, 5, 8), "valid")
    def test_full_mode(self):
        # Tests basic correlation in full mode and case where filter
        # is larger than the input image.
        self.validate((3, 1, 4, 4, 4), (2, 1, 5, 5, 5), "full")
        def f():
            # A too-large filter must fail in "valid" mode.
            self.validate((3, 2, 5, 5, 5), (4, 2, 8, 8, 8), "valid")
        with pytest.raises(Exception):
            f()
    def test_wrong_input(self):
        # Make sure errors are raised when image and kernel are not 5D tensors
        with pytest.raises(Exception):
            self.validate((3, 2, 8, 8, 8), (4, 2, 5, 5, 5), "valid", input=dmatrix())
        with pytest.raises(Exception):
            self.validate((3, 2, 8, 8, 8), (4, 2, 5, 5, 5), "valid", input=vector())
        with pytest.raises(Exception):
            self.validate((3, 2, 8, 8, 8), (4, 2, 5, 5, 5), "valid", input=dtensor3())
        with pytest.raises(Exception):
            self.validate((3, 2, 8, 8, 8), (4, 2, 5, 5, 5), "valid", input=dtensor4())
    @pytest.mark.skipif(not pytensor.config.cxx, reason="Need cxx for this test")
    def test_dtype_upcast(self):
        # Checks dtype upcast for Corr3dMM methods.
        rng = np.random.default_rng(28483)
        def rand(shape, dtype="float64"):
            # Uniform values in [-1, 1).
            r = np.asarray(rng.random(shape), dtype=dtype)
            return r * 2 - 1
        ops = [corr3d.Corr3dMM, corr3d.Corr3dMMGradWeights, corr3d.Corr3dMMGradInputs]
        a_shapes = [[4, 5, 6, 3, 3], [1, 5, 6, 3, 3], [1, 5, 6, 3, 3]]
        b_shapes = [[7, 5, 3, 2, 2], [1, 5, 3, 1, 1], [7, 1, 3, 1, 1]]
        dtypes = ["float32", "float64"]
        for op, a_shape, b_shape in zip(ops, a_shapes, b_shapes):
            for a_dtype in dtypes:
                for b_dtype in dtypes:
                    c_dtype = pytensor.scalar.upcast(a_dtype, b_dtype)
                    a_tens = tensor5(dtype=a_dtype)
                    b_tens = tensor5(dtype=b_dtype)
                    a_tens_val = rand(a_shape, dtype=a_dtype)
                    b_tens_val = rand(b_shape, dtype=b_dtype)
                    c_tens = op()(a_tens, b_tens)
                    f = pytensor.function([a_tens, b_tens], c_tens, mode=self.mode)
                    assert f(a_tens_val, b_tens_val).dtype == c_dtype
    @pytest.mark.slow
    @pytest.mark.skipif(
        pytensor.config.mode == "FAST_COMPILE" or not pytensor.config.cxx,
        reason="Need cxx for this test",
    )
    def test_infer_shape_forward(self):
        """Shape inference for the forward Corr3dMM Op."""
        rng = np.random.default_rng(28483)
        def rand(*shape):
            r = np.asarray(rng.random(shape), dtype="float64")
            return r * 2 - 1
        corr3dMM = corr3d.Corr3dMM
        adtens = dtensor5()
        bdtens = dtensor5()
        aivec_vals = [
            [4, 5, 6, 3, 3],
            [6, 2, 8, 3, 3],
            [3, 6, 7, 5, 5],
            [3, 6, 7, 5, 5],
            [5, 2, 4, 3, 3],
        ]
        bivec_vals = [
            [7, 5, 3, 2, 2],
            [4, 2, 5, 3, 3],
            [5, 6, 3, 2, 2],
            [5, 6, 2, 3, 3],
            [6, 2, 4, 3, 3],
        ]
        modes = ["valid", "full", "half", (1, 1, 1), (2, 1, 1), (1, 2, 1), (1, 1, 2), 1]
        subsamples = [(1, 1, 1), (2, 1, 1), (1, 2, 1), (1, 1, 2)]
        for aivec_val, bivec_val in zip(aivec_vals, bivec_vals):
            adtens_val = rand(*aivec_val)
            bdtens_val = rand(*bivec_val)
            for mode in modes:
                for subsample in subsamples:
                    # Corr3dMM
                    cdtens = corr3dMM(border_mode=mode, subsample=subsample)(
                        adtens, bdtens
                    )
                    self._compile_and_check(
                        [adtens, bdtens],
                        [cdtens],
                        [adtens_val, bdtens_val],
                        corr3dMM,
                        warn=False,
                    )
    @pytest.mark.slow
    @pytest.mark.skipif(
        pytensor.config.mode == "FAST_COMPILE" or not pytensor.config.cxx,
        reason="Need cxx for this test",
    )
    def test_infer_shape_gradW(self):
        """Shape inference for the weight-gradient Op (Corr3dMMGradWeights)."""
        rng = np.random.default_rng(28483)
        def rand(*shape):
            r = np.asarray(rng.random(shape), dtype="float64")
            return r * 2 - 1
        corr3dMM = corr3d.Corr3dMM
        gradW = corr3d.Corr3dMMGradWeights
        adtens = dtensor5()
        bdtens = dtensor5()
        aivec_vals = [
            [1, 5, 6, 3, 3],
            [8, 2, 7, 3, 3],
            [1, 6, 9, 4, 4],
            [9, 6, 8, 5, 5],
            [9, 1, 6, 8, 8],
        ]
        bivec_vals = [
            [7, 5, 3, 1, 1],
            [4, 2, 5, 3, 3],
            [12, 6, 3, 2, 2],
            [5, 6, 1, 3, 3],
            [11, 1, 3, 3, 3],
        ]
        modes = ["valid", "full", "half", (1, 1, 1), (2, 1, 1), (1, 2, 1), (1, 1, 2), 1]
        subsamples = [(1, 1, 1), (2, 1, 1), (1, 2, 1), (1, 1, 2)]
        for aivec_val, bivec_val in zip(aivec_vals, bivec_vals):
            adtens_val = rand(*aivec_val)
            bdtens_val = rand(*bivec_val)
            for mode in modes:
                for subsample in subsamples:
                    # Corr3dMM
                    cdtens = corr3dMM(border_mode=mode, subsample=subsample)(
                        adtens, bdtens
                    )
                    f = pytensor.function([adtens, bdtens], cdtens)
                    cdtens_val = f(adtens_val, bdtens_val)
                    # Corr3dMM_gradWeights
                    shape = (
                        pytensor.shared(bivec_val[2]),
                        pytensor.shared(bivec_val[3]),
                        pytensor.shared(bivec_val[4]),
                    )
                    bdtens_g = gradW(border_mode=mode, subsample=subsample)(
                        adtens, cdtens, shape=shape
                    )
                    self._compile_and_check(
                        [adtens, cdtens],
                        [bdtens_g],
                        [adtens_val, cdtens_val],
                        gradW,
                        warn=False,
                    )
    @pytest.mark.slow
    @pytest.mark.skipif(
        pytensor.config.mode == "FAST_COMPILE" or not pytensor.config.cxx,
        reason="Need cxx for this test",
    )
    def test_infer_shape_gradI(self):
        """Shape inference for the input-gradient Op (Corr3dMMGradInputs)."""
        rng = np.random.default_rng(28483)
        def rand(*shape):
            r = np.asarray(rng.random(shape), dtype="float64")
            return r * 2 - 1
        corr3dMM = corr3d.Corr3dMM
        gradI = corr3d.Corr3dMMGradInputs
        adtens = dtensor5()
        bdtens = dtensor5()
        aivec_vals = [
            [1, 5, 6, 3, 3],
            [8, 2, 7, 3, 3],
            [1, 6, 9, 4, 4],
            [9, 6, 8, 5, 5],
            [9, 1, 6, 8, 8],
        ]
        bivec_vals = [
            [7, 5, 3, 1, 1],
            [4, 2, 5, 3, 3],
            [12, 6, 3, 2, 2],
            [5, 6, 1, 3, 3],
            [7, 1, 3, 4, 4],
        ]
        modes = ["valid", "full", "half", (1, 1, 1), (2, 1, 1), (1, 2, 1), (1, 1, 2), 1]
        subsamples = [(1, 1, 1), (2, 1, 1), (1, 2, 1), (1, 1, 2)]
        for aivec_val, bivec_val in zip(aivec_vals, bivec_vals):
            adtens_val = rand(*aivec_val)
            bdtens_val = rand(*bivec_val)
            for mode in modes:
                for subsample in subsamples:
                    # Corr3dMM
                    cdtens = corr3dMM(border_mode=mode, subsample=subsample)(
                        adtens, bdtens
                    )
                    f = pytensor.function([adtens, bdtens], cdtens)
                    cdtens_val = f(adtens_val, bdtens_val)
                    # Corr3dMM_gradInputs
                    shape = (
                        pytensor.shared(aivec_val[2]),
                        pytensor.shared(aivec_val[3]),
                        pytensor.shared(aivec_val[4]),
                    )
                    adtens_g = gradI(border_mode=mode, subsample=subsample)(
                        bdtens, cdtens, shape=shape
                    )
                    self._compile_and_check(
                        [bdtens, cdtens],
                        [adtens_g],
                        [bdtens_val, cdtens_val],
                        gradI,
                        warn=False,
                    )
    def test_non_contiguous(self):
        """Correlation must give the same result on non-contiguous inputs."""
        self.validate((2, 2, 3, 3, 3), (2, 2, 2, 2, 2), "valid", non_contiguous=True)
        self.validate((3, 2, 8, 8, 8), (2, 2, 5, 5, 5), "valid", non_contiguous=True)
        self.validate((3, 2, 7, 5, 5), (3, 2, 2, 3, 3), "valid", non_contiguous=True)
        self.validate((3, 2, 7, 5, 5), (3, 2, 3, 2, 2), "valid", non_contiguous=True)
        self.validate((3, 1, 8, 8, 8), (2, 1, 5, 5, 5), "full", non_contiguous=True)
        self.validate((3, 1, 8, 8, 8), (2, 1, 5, 5, 5), "half", non_contiguous=True)
        self.validate((3, 1, 8, 8, 8), (2, 1, 5, 5, 5), (1, 1, 1), non_contiguous=True)
        self.validate((3, 1, 7, 5, 5), (2, 1, 2, 3, 3), (1, 1, 2), non_contiguous=True)
        self.validate((3, 1, 7, 5, 5), (2, 1, 2, 3, 3), (1, 2, 1), non_contiguous=True)
        self.validate((3, 1, 7, 5, 5), (2, 1, 2, 3, 3), (2, 1, 1), non_contiguous=True)
class TestGroupCorr3d(TestGroupedConv3dNoOptim):
    """Run the grouped 3D convolution tests against the Corr3dMM ops."""

    # Always compile with FAST_RUN so the Corr3dMM ops are present.
    mode = pytensor.compile.get_mode("FAST_RUN")
    # The CPU correlation implementation and its two gradient ops.
    conv_op = corr3d.Corr3dMM
    conv_gradw_op = corr3d.Corr3dMMGradWeights
    conv_gradi_op = corr3d.Corr3dMMGradInputs
    # Correlation flips the filters relative to convolution.
    flip_filter = True
    is_dnn = False
import numpy as np
import pytest
import pytensor
import pytensor.tensor as at
from pytensor.tensor.nnet.ctc import (
ConnectionistTemporalClassification,
ctc,
ctc_available,
)
from tests import unittest_tools as utt
def setup_torch_case():
    """Fixture for the warp-ctc Torch tutorial example.

    Activation layout, from slowest to fastest changing dimension, is
    (time, batchSize, inputLayerSize).  Expected costs and gradients come
    from the tutorial at:
    https://github.com/baidu-research/warp-ctc/blob/master/torch_binding/TUTORIAL.md

    Returns ``[activations, labels, activation_times, expected_costs,
    expected_gradients]``.
    """
    silence = [0, 0, 0, 0, 0]
    activations = np.asarray(
        [
            [silence, [1, 2, 3, 4, 5], [-5, -4, -3, -2, -1]],
            [silence, [6, 7, 8, 9, 10], [-10, -9, -8, -7, -6]],
            [silence, [11, 12, 13, 14, 15], [-15, -14, -13, -12, -11]],
        ],
        dtype=np.float32,
    )
    # Duration of each sequence
    activation_times = np.asarray([1, 3, 3], dtype=np.int32)
    # Labels for each sequence
    labels = np.asarray([[1, -1], [3, 3], [2, 3]], dtype=np.int32)
    expected_costs = np.asarray(
        [1.609437943, 7.355742931, 4.938849926], dtype=np.float32
    )
    # Gradient entries that recur in the table below, named for readability.
    a, b, c = 0.01165623125, 0.03168492019, 0.08612854034
    d, e = -0.02115798369, 0.636408627
    grads = [
        [
            [0.2, -0.8, 0.2, 0.2, 0.2],
            [a, b, c, -0.7658783197, e],
            [d, b, -0.8810571432, 0.2341216654, e],
        ],
        [
            silence,
            [-0.9883437753, b, c, 0.2341216654, e],
            [d, b, -0.1891518533, -0.4577836394, e],
        ],
        [
            silence,
            [a, b, c, -0.7658783197, e],
            [d, b, c, -0.7330639958, e],
        ],
    ]
    expected_gradients = np.asarray(grads, dtype=np.float32)
    return [activations, labels, activation_times, expected_costs, expected_gradients]
def setup_ctc_case():
    """Small two-sequence CTC fixture with precomputed costs and gradients.

    Returns ``[activations, labels, activation_times, expected_costs,
    expected_gradients]``; the expected values were obtained from an
    external C implementation of warp-ctc.
    """
    activations = np.asarray(
        [
            [[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]],
            [[0.6, 0.1, 0.1, 0.1, 0.1], [0.1, 0.1, 0.5, 0.2, 0.1]],
        ],
        dtype=np.float32,
    )
    activation_times = np.asarray([2, 2], dtype=np.int32)
    labels = np.asarray([[1, 2], [1, 2]], dtype=np.int32)
    expected_costs = np.asarray([2.962858438, 3.053659201], dtype=np.float32)
    # Gradient entries that recur in the table below, named for readability.
    p, q, r = 0.177031219, 0.291875124, 0.1786672771
    grads = [
        [
            [p, -0.7081246376, p, p, p],
            [p, -0.8229685426, q, p, p],
        ],
        [
            [q, p, -0.8229685426, p, p],
            [r, r, -0.7334594727, 0.1974578798, r],
        ],
    ]
    expected_gradients = np.asarray(grads, dtype=np.float32)
    return [activations, labels, activation_times, expected_costs, expected_gradients]
def setup_grad_case():
    """Inputs only (no expected values) for the gradient-verification test.

    Returns ``[activations, labels, activation_times]`` — the same inputs as
    :func:`setup_ctc_case`.
    """
    step0 = [[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]]
    step1 = [[0.6, 0.1, 0.1, 0.1, 0.1], [0.1, 0.1, 0.5, 0.2, 0.1]]
    activations = np.asarray([step0, step1], dtype=np.float32)
    activation_times = np.asarray([2] * 2, dtype=np.int32)
    labels = np.asarray([[1, 2]] * 2, dtype=np.int32)
    return [activations, labels, activation_times]
@pytest.mark.skipif(
    not ctc_available(), reason="Optional library warp-ctc not available"
)
@pytest.mark.skipif(
    pytensor.config.mode == "FAST_COMPILE" or pytensor.config.cxx == "",
    reason="We need a c compiler",
)
class TestCTC:
    """
    Test Baidu CTC wrapper implementation.
    Expected values for costs and gradients are obtained through an external
    C implementation, that uses the library directly.
    """
    def run_ctc(
        self, activations, labels, input_length, expected_costs, expected_grads
    ):
        """Compile a CTC cost/gradient function and check it against the
        expected values, then verify the grad-disabling rewrite."""
        # Create symbolic variables
        t_activations = pytensor.shared(activations, name="activations")
        t_activation_times = pytensor.shared(input_length, name="activation_times")
        t_labels = pytensor.shared(labels, name="labels")
        t_cost = ctc(t_activations, t_labels, t_activation_times)
        # Symbolic gradient of CTC cost
        t_grad = at.grad(at.mean(t_cost), t_activations)
        # Compile symbolic functions
        train = pytensor.function([], [t_cost, t_grad])
        cost, grad = train()
        # mean() scales the gradient by 1/batch_size, hence the rescaling.
        utt.assert_allclose(expected_grads / cost.shape[0], grad)
        utt.assert_allclose(expected_costs, cost)
        self.check_grads_disabled(t_activations, t_labels, t_activation_times)
    def check_grads_disabled(self, activations, labels, input_length):
        """
        Check if optimization to disable gradients is working
        """
        ctc_cost = ctc(activations, labels, input_length)
        ctc_function = pytensor.function([], [ctc_cost])
        # When only the cost is requested, the Op must not compute gradients.
        for node in ctc_function.maker.fgraph.apply_nodes:
            if isinstance(node.op, ConnectionistTemporalClassification):
                assert node.op.compute_grad is False
    def test_torch_case(self):
        """Check costs/gradients against the warp-ctc Torch tutorial values."""
        (
            activations,
            labels,
            input_length,
            expected_costs,
            expected_grads,
        ) = setup_torch_case()
        self.run_ctc(activations, labels, input_length, expected_costs, expected_grads)
    def test_ctc(self):
        """Check costs/gradients on the small two-sequence fixture."""
        (
            activations,
            labels,
            input_length,
            expected_costs,
            expected_grads,
        ) = setup_ctc_case()
        self.run_ctc(activations, labels, input_length, expected_costs, expected_grads)
    def test_verify_grad(self):
        """Numerically verify the CTC gradient w.r.t. the activations."""
        def ctc_op_functor(labels, in_lengths):
            def wrapper(acts):
                # Create auxiliary symbolic variables
                t_activation_times = pytensor.shared(
                    in_lengths, name="activation_times"
                )
                t_labels = pytensor.shared(labels, name="labels")
                return ctc(acts, t_labels, t_activation_times)
            return wrapper
        activations, labels, activation_times = setup_grad_case()
        ctc_op = ctc_op_functor(labels, activation_times)
        utt.verify_grad(ctc_op, [activations])
import numpy as np
import pytest
import pytensor
import pytensor.tensor as at
from pytensor import function, shared
from pytensor.configdefaults import config
from pytensor.tensor import nnet
from pytensor.tensor.nnet.neighbours import Images2Neibs, images2neibs, neibs2images
from pytensor.tensor.type import dtensor4, ftensor4, ivector, matrix, tensor4
from tests import unittest_tools
# Default compilation mode with GPU rewrites excluded; every test in this
# module compiles with it so the CPU Images2Neibs op is exercised.
mode_without_gpu = pytensor.compile.mode.get_default_mode().excluding("gpu")
class TestImages2Neibs(unittest_tools.InferShapeTester):
    """Tests for the ``Images2Neibs`` op (``images2neibs``/``neibs2images``)."""

    mode = mode_without_gpu
    op = Images2Neibs
    # Every test is repeated for each of these dtypes.
    dtypes = ["int64", "float32", "float64"]
    def test_neibs(self):
        """Round-trip check: images2neibs followed by neibs2images must
        reproduce the input, for several shapes, both border modes, and
        every supported dtype."""
        for shape, pshape in [
            ((10, 7, 18, 18), (2, 2)),
            ((10, 7, 6, 18), (3, 2)),
            ((5, 7, 66, 66), (33, 33)),
            ((5, 7, 68, 66), (34, 33)),
        ]:
            for border in ["valid", "ignore_borders"]:
                for dtype in self.dtypes:
                    images = shared(
                        np.arange(np.prod(shape), dtype=dtype).reshape(shape)
                    )
                    neib_shape = at.as_tensor_variable(pshape)
                    f = function(
                        [],
                        images2neibs(images, neib_shape, mode=border),
                        mode=self.mode,
                    )
                    # print images.get_value(borrow=True)
                    neibs = f()
                    # print neibs
                    g = function(
                        [],
                        neibs2images(neibs, neib_shape, images.shape),
                        mode=self.mode,
                    )
                    # The Images2Neibs op must actually appear in the graph.
                    assert any(
                        isinstance(node.op, self.op)
                        for node in f.maker.fgraph.toposort()
                    )
                    # print g()
                    assert np.allclose(images.get_value(borrow=True), g())
    def test_neibs_manual(self):
        """Check images2neibs output against a hand-computed table for 2x2
        patches of a (2, 3, 4, 4) input, plus the neibs2images round trip."""
        shape = (2, 3, 4, 4)
        for dtype in self.dtypes:
            images = shared(np.arange(np.prod(shape), dtype=dtype).reshape(shape))
            neib_shape = at.as_tensor_variable((2, 2))
            for border in ["valid", "ignore_borders"]:
                f = function(
                    [], images2neibs(images, neib_shape, mode=border), mode=self.mode
                )
                assert any(
                    isinstance(node.op, self.op) for node in f.maker.fgraph.toposort()
                )
                # print images.get_value(borrow=True)
                neibs = f()
                # print neibs
                assert np.allclose(
                    neibs,
                    [
                        [0, 1, 4, 5],
                        [2, 3, 6, 7],
                        [8, 9, 12, 13],
                        [10, 11, 14, 15],
                        [16, 17, 20, 21],
                        [18, 19, 22, 23],
                        [24, 25, 28, 29],
                        [26, 27, 30, 31],
                        [32, 33, 36, 37],
                        [34, 35, 38, 39],
                        [40, 41, 44, 45],
                        [42, 43, 46, 47],
                        [48, 49, 52, 53],
                        [50, 51, 54, 55],
                        [56, 57, 60, 61],
                        [58, 59, 62, 63],
                        [64, 65, 68, 69],
                        [66, 67, 70, 71],
                        [72, 73, 76, 77],
                        [74, 75, 78, 79],
                        [80, 81, 84, 85],
                        [82, 83, 86, 87],
                        [88, 89, 92, 93],
                        [90, 91, 94, 95],
                    ],
                )
                # Inverting the op must reproduce the original images.
                g = function(
                    [], neibs2images(neibs, neib_shape, images.shape), mode=self.mode
                )
                assert np.allclose(images.get_value(borrow=True), g())
    def test_neibs_manual_step(self):
        """3x3 patches with a (2, 2) step, checked against a hand-computed
        table for a (2, 3, 5, 5) input."""
        shape = (2, 3, 5, 5)
        for dtype in self.dtypes:
            images = shared(
                np.asarray(np.arange(np.prod(shape)).reshape(shape), dtype=dtype)
            )
            neib_shape = at.as_tensor_variable((3, 3))
            neib_step = at.as_tensor_variable((2, 2))
            for border in ["valid", "ignore_borders"]:
                f = function(
                    [],
                    images2neibs(images, neib_shape, neib_step, mode=border),
                    mode=self.mode,
                )
                neibs = f()
                assert self.op in [type(node.op) for node in f.maker.fgraph.toposort()]
                assert np.allclose(
                    neibs,
                    [
                        [0, 1, 2, 5, 6, 7, 10, 11, 12],
                        [2, 3, 4, 7, 8, 9, 12, 13, 14],
                        [10, 11, 12, 15, 16, 17, 20, 21, 22],
                        [12, 13, 14, 17, 18, 19, 22, 23, 24],
                        [25, 26, 27, 30, 31, 32, 35, 36, 37],
                        [27, 28, 29, 32, 33, 34, 37, 38, 39],
                        [35, 36, 37, 40, 41, 42, 45, 46, 47],
                        [37, 38, 39, 42, 43, 44, 47, 48, 49],
                        [50, 51, 52, 55, 56, 57, 60, 61, 62],
                        [52, 53, 54, 57, 58, 59, 62, 63, 64],
                        [60, 61, 62, 65, 66, 67, 70, 71, 72],
                        [62, 63, 64, 67, 68, 69, 72, 73, 74],
                        [75, 76, 77, 80, 81, 82, 85, 86, 87],
                        [77, 78, 79, 82, 83, 84, 87, 88, 89],
                        [85, 86, 87, 90, 91, 92, 95, 96, 97],
                        [87, 88, 89, 92, 93, 94, 97, 98, 99],
                        [100, 101, 102, 105, 106, 107, 110, 111, 112],
                        [102, 103, 104, 107, 108, 109, 112, 113, 114],
                        [110, 111, 112, 115, 116, 117, 120, 121, 122],
                        [112, 113, 114, 117, 118, 119, 122, 123, 124],
                        [125, 126, 127, 130, 131, 132, 135, 136, 137],
                        [127, 128, 129, 132, 133, 134, 137, 138, 139],
                        [135, 136, 137, 140, 141, 142, 145, 146, 147],
                        [137, 138, 139, 142, 143, 144, 147, 148, 149],
                    ],
                )
                # neibs2images do not seam to support step != neib_shape
                # g = function([], neibs2images(neibs, neib_shape, images.shape),
                #              mode=self.mode)
                # print g()
                # assert numpy.allclose(images.get_value(borrow=True), g())
@config.change_flags(compute_test_value="off")
def test_neibs_bad_shape(self):
shape = (2, 3, 10, 10)
for dtype in self.dtypes:
images = shared(np.arange(np.prod(shape), dtype=dtype).reshape(shape))
for neib_shape in [(3, 2), (2, 3)]:
neib_shape = at.as_tensor_variable(neib_shape)
f = function([], images2neibs(images, neib_shape), mode=self.mode)
with pytest.raises(TypeError):
f()
# Test that ignore border work in that case.
f = function(
[],
images2neibs(images, neib_shape, mode="ignore_borders"),
mode=self.mode,
)
assert self.op in [type(node.op) for node in f.maker.fgraph.toposort()]
f()
    def test_neibs_wrap_centered_step_manual(self):
        """``wrap_centered`` mode checked against hand-computed tables for
        several patch shapes and steps on 5x5 images; each subsequent image
        in the batch is the previous one shifted by 25, hence the
        ``expected + 25 * i`` comparison."""
        expected1 = [
            [24, 20, 21, 4, 0, 1, 9, 5, 6],
            [21, 22, 23, 1, 2, 3, 6, 7, 8],
            [23, 24, 20, 3, 4, 0, 8, 9, 5],
            [9, 5, 6, 14, 10, 11, 19, 15, 16],
            [6, 7, 8, 11, 12, 13, 16, 17, 18],
            [8, 9, 5, 13, 14, 10, 18, 19, 15],
            [19, 15, 16, 24, 20, 21, 4, 0, 1],
            [16, 17, 18, 21, 22, 23, 1, 2, 3],
            [18, 19, 15, 23, 24, 20, 3, 4, 0],
        ]
        expected2 = [
            [24, 20, 21, 4, 0, 1, 9, 5, 6],
            [22, 23, 24, 2, 3, 4, 7, 8, 9],
            [14, 10, 11, 19, 15, 16, 24, 20, 21],
            [12, 13, 14, 17, 18, 19, 22, 23, 24],
        ]
        expected3 = [
            [19, 15, 16, 24, 20, 21, 4, 0, 1, 9, 5, 6, 14, 10, 11],
            [17, 18, 19, 22, 23, 24, 2, 3, 4, 7, 8, 9, 12, 13, 14],
            [9, 5, 6, 14, 10, 11, 19, 15, 16, 24, 20, 21, 4, 0, 1],
            [7, 8, 9, 12, 13, 14, 17, 18, 19, 22, 23, 24, 2, 3, 4],
        ]
        expected4 = [
            [23, 24, 20, 21, 22, 3, 4, 0, 1, 2, 8, 9, 5, 6, 7],
            [21, 22, 23, 24, 20, 1, 2, 3, 4, 0, 6, 7, 8, 9, 5],
            [13, 14, 10, 11, 12, 18, 19, 15, 16, 17, 23, 24, 20, 21, 22],
            [11, 12, 13, 14, 10, 16, 17, 18, 19, 15, 21, 22, 23, 24, 20],
        ]
        expected5 = [
            [24, 20, 21, 4, 0, 1, 9, 5, 6],
            [22, 23, 24, 2, 3, 4, 7, 8, 9],
            [9, 5, 6, 14, 10, 11, 19, 15, 16],
            [7, 8, 9, 12, 13, 14, 17, 18, 19],
            [19, 15, 16, 24, 20, 21, 4, 0, 1],
            [17, 18, 19, 22, 23, 24, 2, 3, 4],
        ]
        expected6 = [
            [24, 20, 21, 4, 0, 1, 9, 5, 6],
            [21, 22, 23, 1, 2, 3, 6, 7, 8],
            [23, 24, 20, 3, 4, 0, 8, 9, 5],
            [14, 10, 11, 19, 15, 16, 24, 20, 21],
            [11, 12, 13, 16, 17, 18, 21, 22, 23],
            [13, 14, 10, 18, 19, 15, 23, 24, 20],
        ]
        # TODO test discontinuous image
        for shp_idx, (shape, neib_shape, neib_step, expected) in enumerate(
            [
                [(7, 8, 5, 5), (3, 3), (2, 2), expected1],
                [(7, 8, 5, 5), (3, 3), (3, 3), expected2],
                [(7, 8, 5, 5), (5, 3), (3, 3), expected3],
                [(7, 8, 5, 5), (3, 5), (3, 3), expected4],
                [(80, 90, 5, 5), (3, 3), (2, 3), expected5],
                [(1025, 9, 5, 5), (3, 3), (3, 2), expected6],
                [(1, 1, 5, 1035), (3, 3), (3, 3), None],
                [(1, 1, 1045, 5), (3, 3), (3, 3), None],
            ]
        ):
            for dtype in self.dtypes:
                images = shared(
                    np.asarray(np.arange(np.prod(shape)).reshape(shape), dtype=dtype)
                )
                neib_shape = at.as_tensor_variable(neib_shape)
                neib_step = at.as_tensor_variable(neib_step)
                expected = np.asarray(expected)
                f = function(
                    [],
                    images2neibs(images, neib_shape, neib_step, mode="wrap_centered"),
                    mode=self.mode,
                )
                neibs = f()
                # `expected is None` cases only check that compilation/running
                # succeeds; np.asarray(None) has size 1, so they skip this.
                if expected.size > 1:
                    for i in range(shape[0] * shape[1]):
                        assert np.allclose(
                            neibs[
                                i * expected.shape[0] : (i + 1) * expected.shape[0], :
                            ],
                            expected + 25 * i,
                        ), "wrap_centered"
                assert self.op in [type(node.op) for node in f.maker.fgraph.toposort()]
                # g = function([], neibs2images(neibs, neib_shape, images.shape), mode=self.mode)
                # TODO: why this is commented?
                # assert numpy.allclose(images.get_value(borrow=True), g())
    @pytest.mark.slow
    def test_neibs_half_step_by_valid(self):
        """``half`` mode must equal ``valid`` mode applied to an input
        zero-padded by half the patch size on each border."""
        neib_shapes = ((3, 3), (3, 5), (5, 3))
        for shp_idx, (shape, neib_step) in enumerate(
            [
                [(7, 8, 5, 5), (1, 1)],
                [(7, 8, 5, 5), (2, 2)],
                [(7, 8, 5, 5), (4, 4)],
                [(7, 8, 5, 5), (1, 4)],
                [(7, 8, 5, 5), (4, 1)],
                [(80, 90, 5, 5), (1, 2)],
                [(1025, 9, 5, 5), (2, 1)],
                [(1, 1, 5, 1037), (2, 4)],
                [(1, 1, 1045, 5), (4, 2)],
            ]
        ):
            for neib_shape in neib_shapes:
                for dtype in self.dtypes:
                    x = pytensor.shared(np.random.standard_normal(shape).astype(dtype))
                    # Pad by floor(patch/2) on each side, as "half" mode does.
                    extra = (neib_shape[0] // 2, neib_shape[1] // 2)
                    padded_shape = (
                        x.shape[0],
                        x.shape[1],
                        x.shape[2] + 2 * extra[0],
                        x.shape[3] + 2 * extra[1],
                    )
                    padded_x = at.zeros(padded_shape)
                    padded_x = at.set_subtensor(
                        padded_x[:, :, extra[0] : -extra[0], extra[1] : -extra[1]], x
                    )
                    x_using_valid = images2neibs(
                        padded_x, neib_shape, neib_step, mode="valid"
                    )
                    x_using_half = images2neibs(x, neib_shape, neib_step, mode="half")
                    f_valid = pytensor.function([], x_using_valid, mode="FAST_RUN")
                    f_half = pytensor.function([], x_using_half, mode=self.mode)
                    unittest_tools.assert_allclose(f_valid(), f_half())
    @pytest.mark.slow
    def test_neibs_full_step_by_valid(self):
        """``full`` mode must equal ``valid`` mode applied to an input
        zero-padded by (patch - 1) on each border."""
        for shp_idx, (shape, neib_step, neib_shapes) in enumerate(
            [
                [(7, 8, 5, 5), (1, 1), ((3, 3), (3, 5), (5, 3))],
                [(7, 8, 5, 5), (2, 2), ((3, 3), (3, 5), (5, 3))],
                [(7, 8, 6, 6), (3, 3), ((2, 2), (2, 5), (5, 2))],
                [(7, 8, 6, 6), (1, 3), ((2, 2), (2, 5), (5, 2))],
                [(7, 8, 6, 6), (3, 1), ((2, 2), (2, 5), (5, 2))],
                [(80, 90, 5, 5), (1, 2), ((3, 3), (3, 5), (5, 3))],
                [(1025, 9, 5, 5), (2, 1), ((3, 3), (3, 5), (5, 3))],
                [(1, 1, 11, 1037), (2, 3), ((3, 3), (5, 3))],
                [(1, 1, 1043, 11), (3, 2), ((3, 3), (3, 5))],
            ]
        ):
            for neib_shape in neib_shapes:
                for dtype in self.dtypes:
                    x = pytensor.shared(np.random.standard_normal(shape).astype(dtype))
                    # Pad by (patch - 1) on each side, as "full" mode does.
                    extra = (neib_shape[0] - 1, neib_shape[1] - 1)
                    padded_shape = (
                        x.shape[0],
                        x.shape[1],
                        x.shape[2] + 2 * extra[0],
                        x.shape[3] + 2 * extra[1],
                    )
                    padded_x = at.zeros(padded_shape)
                    padded_x = at.set_subtensor(
                        padded_x[:, :, extra[0] : -extra[0], extra[1] : -extra[1]], x
                    )
                    x_using_valid = images2neibs(
                        padded_x, neib_shape, neib_step, mode="valid"
                    )
                    x_using_full = images2neibs(x, neib_shape, neib_step, mode="full")
                    f_valid = pytensor.function([], x_using_valid, mode="FAST_RUN")
                    f_full = pytensor.function([], x_using_full, mode=self.mode)
                    unittest_tools.assert_allclose(f_valid(), f_full())
    @config.change_flags(compute_test_value="off")
    def test_neibs_bad_shape_wrap_centered(self):
        """``wrap_centered`` must raise TypeError when the patch does not
        evenly tile the image, or when the image is smaller than the patch;
        an exactly-matching shape must still work."""
        shape = (2, 3, 10, 10)
        for dtype in self.dtypes:
            images = shared(np.arange(np.prod(shape), dtype=dtype).reshape(shape))
            for neib_shape in [(3, 2), (2, 3)]:
                neib_shape = at.as_tensor_variable(neib_shape)
                f = function(
                    [],
                    images2neibs(images, neib_shape, mode="wrap_centered"),
                    mode=self.mode,
                )
                with pytest.raises(TypeError):
                    f()
            # Image smaller than the 3x3 patch along one axis must also fail.
            for shape in [(2, 3, 2, 3), (2, 3, 3, 2)]:
                images = shared(np.arange(np.prod(shape)).reshape(shape))
                neib_shape = at.as_tensor_variable((3, 3))
                f = function(
                    [],
                    images2neibs(images, neib_shape, mode="wrap_centered"),
                    mode=self.mode,
                )
                with pytest.raises(TypeError):
                    f()
            # Test a valid shapes
            shape = (2, 3, 3, 3)
            images = shared(np.arange(np.prod(shape)).reshape(shape))
            neib_shape = at.as_tensor_variable((3, 3))
            f = function(
                [],
                images2neibs(images, neib_shape, mode="wrap_centered"),
                mode=self.mode,
            )
            f()
def test_grad_wrap_centered(self):
# It is not implemented for now. So test that we raise an error.
shape = (2, 3, 6, 6)
images_val = np.random.random(shape).astype("float32")
def fn(images):
return images2neibs(images, (3, 3), mode="wrap_centered")
with pytest.raises(TypeError):
unittest_tools.verify_grad(fn, [images_val], mode=self.mode)
def test_grad_half(self):
# It is not implemented for now. So test that we raise an error.
shape = (2, 3, 6, 6)
rng = np.random.default_rng(28483)
images_val = rng.random(shape).astype("float32")
def fn(images):
return images2neibs(images, (3, 3), mode="half")
with pytest.raises(TypeError):
unittest_tools.verify_grad(fn, [images_val], mode=self.mode)
def test_grad_full(self):
# It is not implemented for now. So test that we raise an error.
shape = (2, 3, 6, 6)
rng = np.random.default_rng(28483)
images_val = rng.random(shape).astype("float32")
def fn(images):
return images2neibs(images, (3, 3), mode="full")
with pytest.raises(TypeError):
unittest_tools.verify_grad(fn, [images_val], mode=self.mode)
def test_grad_valid(self):
shape = (2, 3, 6, 6)
rng = np.random.default_rng(28483)
images_val = rng.random(shape).astype("float32")
def fn(images):
return images2neibs(images, (2, 2))
unittest_tools.verify_grad(fn, [images_val], mode=self.mode, eps=0.1)
def fn(images):
return images2neibs(images, (3, 2), (1, 2))
unittest_tools.verify_grad(fn, [images_val], mode=self.mode, eps=0.1)
def fn(images):
return images2neibs(images, (1, 2), (5, 2))
unittest_tools.verify_grad(fn, [images_val], mode=self.mode, eps=0.1)
def test_grad_ignore_border(self):
shape = (2, 3, 5, 5)
rng = np.random.default_rng(28483)
images_val = rng.random(shape).astype("float32")
def fn(images):
return images2neibs(images, (2, 2), mode="ignore_borders")
unittest_tools.verify_grad(fn, [images_val], mode=self.mode, eps=0.1)
def test_neibs2images_grad(self):
# say we had images of size (2, 3, 10, 10)
# then we extracted 2x2 neighbors on this, we get (2 * 3 * 5 * 5, 4)
rng = np.random.default_rng(28483)
neibs_val = rng.random((150, 4))
def fn(neibs):
return neibs2images(neibs, (2, 2), (2, 3, 10, 10))
unittest_tools.verify_grad(fn, [neibs_val], mode=self.mode, eps=0.1)
def test_neibs_valid_with_inconsistent_borders(self):
shape = (2, 3, 5, 5)
images = dtensor4()
images_val = np.arange(np.prod(shape), dtype="float32").reshape(shape)
f = pytensor.function(
[images],
at.sqr(images2neibs(images, (2, 2), mode="valid")),
mode=self.mode,
)
with pytest.raises(TypeError):
f(images_val)
def test_neibs_half_with_inconsistent_borders(self):
shape = (2, 3, 5, 5)
images = dtensor4()
images_val = np.arange(np.prod(shape), dtype="float32").reshape(shape)
f = pytensor.function(
[images], at.sqr(images2neibs(images, (2, 2), mode="half")), mode=self.mode
)
with pytest.raises(TypeError):
f(images_val)
def test_neibs_full_with_inconsistent_borders(self):
shape = (2, 3, 5, 5)
images = dtensor4()
images_val = np.arange(np.prod(shape), dtype="float32").reshape(shape)
f = pytensor.function(
[images], at.sqr(images2neibs(images, (2, 2), mode="full")), mode=self.mode
)
with pytest.raises(TypeError):
f(images_val)
def test_can_not_infer_nb_dim(self):
# Was reported in gh-5613. Test that we do not crash
# or that we crash in a few other case found while
# investigating that case
img = tensor4("img")
patches = nnet.neighbours.images2neibs(img, [16, 16])
extractPatches = pytensor.function([img], patches, mode=self.mode)
patsRecovery = matrix("patsRecovery")
original_size = ivector("original_size")
for mode in ["valid", "ignore_borders"]:
out = neibs2images(patsRecovery, (16, 16), original_size, mode=mode)
f = pytensor.function([patsRecovery, original_size], out, mode=self.mode)
im_val = np.ones((1, 3, 320, 320), dtype=np.float32)
neibs = extractPatches(im_val)
# TODO FIXME: Make this a real test and `assert` something
f(neibs, im_val.shape)
# Wrong number of dimensions
with pytest.raises(ValueError):
f(neibs, (1, 1, 3, 320, 320))
# End up with a step of 0
# This can lead to division by zero in DebugMode
with pytest.raises((ValueError, ZeroDivisionError)):
f(neibs, (3, 320, 320, 1))
def speed_neibs(self):
shape = (100, 40, 18, 18)
images = shared(np.arange(np.prod(shape), dtype="float32").reshape(shape))
neib_shape = at.as_tensor_variable((3, 3))
f = function([], images2neibs(images, neib_shape), mode=self.mode)
for i in range(1000):
f()
def speed_neibs_wrap_centered(self):
shape = (100, 40, 18, 18)
images = shared(np.arange(np.prod(shape), dtype="float32").reshape(shape))
neib_shape = at.as_tensor_variable((3, 3))
f = function(
[], images2neibs(images, neib_shape, mode="wrap_centered"), mode=self.mode
)
for i in range(1000):
f()
def speed_neibs_half(self):
shape = (100, 40, 18, 18)
images = shared(np.arange(np.prod(shape), dtype="float32").reshape(shape))
neib_shape = at.as_tensor_variable((3, 3))
f = function([], images2neibs(images, neib_shape, mode="half"), mode=self.mode)
for i in range(1000):
f()
def speed_neibs_full(self):
shape = (100, 40, 18, 18)
images = shared(np.arange(np.prod(shape), dtype="float32").reshape(shape))
neib_shape = at.as_tensor_variable((3, 3))
f = function([], images2neibs(images, neib_shape, mode="full"), mode=self.mode)
for i in range(1000):
f()
def test_infer_shape(self):
shape = (100, 40, 6, 3)
images = np.ones(shape).astype("float32")
x = ftensor4()
self._compile_and_check(
[x],
[images2neibs(x, neib_shape=(2, 1), mode="valid")],
[images],
Images2Neibs,
)
self._compile_and_check(
[x],
[images2neibs(x, neib_shape=(2, 3), mode="valid")],
[images],
Images2Neibs,
)
shape = (100, 40, 5, 4)
images = np.ones(shape).astype("float32")
x = ftensor4()
self._compile_and_check(
[x],
[images2neibs(x, neib_shape=(2, 1), mode="ignore_borders")],
[images],
Images2Neibs,
)
shape = (100, 40, 5, 3)
images = np.ones(shape).astype("float32")
x = ftensor4()
self._compile_and_check(
[x],
[images2neibs(x, neib_shape=(2, 3), mode="ignore_borders")],
[images],
Images2Neibs,
)
shape = (100, 40, 6, 7)
images = np.ones(shape).astype("float32")
x = ftensor4()
self._compile_and_check(
[x],
[images2neibs(x, neib_shape=(2, 2), mode="ignore_borders")],
[images],
Images2Neibs,
)
shape = (100, 40, 5, 10)
images = np.ones(shape).astype("float32")
x = ftensor4()
self._compile_and_check(
[x],
[images2neibs(x, neib_shape=(3, 3), mode="wrap_centered")],
[images],
Images2Neibs,
)
shape = (100, 40, 6, 4)
images = np.ones(shape).astype("float32")
x = ftensor4()
self._compile_and_check(
[x],
[images2neibs(x, neib_shape=(2, 1), mode="half")],
[images],
Images2Neibs,
)
self._compile_and_check(
[x],
[images2neibs(x, neib_shape=(2, 3), mode="half")],
[images],
Images2Neibs,
)
shape = (100, 40, 6, 5)
images = np.ones(shape).astype("float32")
x = ftensor4()
self._compile_and_check(
[x],
[images2neibs(x, neib_shape=(2, 1), mode="full")],
[images],
Images2Neibs,
)
self._compile_and_check(
[x],
[images2neibs(x, neib_shape=(2, 3), mode="full")],
[images],
Images2Neibs,
)
import pytensor
from pytensor.graph.rewriting.basic import check_stack_trace
from pytensor.tensor.nnet.blocksparse import (
sparse_block_dot,
sparse_block_gemv,
sparse_block_gemv_inplace,
sparse_block_outer,
sparse_block_outer_inplace,
)
from pytensor.tensor.type import fmatrix, ftensor3, ftensor4, lmatrix
from tests.unittest_tools import assertFailure_fast
def test_blocksparse_inplace_gemv_opt():
    """The block-sparse gemv should be rewritten to its inplace variant
    in every mode except FAST_COMPILE, and keep its stack trace."""
    bias = fmatrix()
    weights = ftensor4()
    hidden = ftensor3()
    in_idx = lmatrix()
    out_idx = lmatrix()

    out = sparse_block_dot(weights, hidden, in_idx, bias, out_idx)
    f = pytensor.function([weights, hidden, in_idx, bias, out_idx], out)

    last_op = f.maker.fgraph.toposort()[-1].op
    if pytensor.config.mode == "FAST_COMPILE":
        # FAST_COMPILE does not apply inplace rewrites.
        assert not last_op.inplace
        assert check_stack_trace(f, ops_to_check=[sparse_block_gemv])
    else:
        assert last_op.inplace
        assert check_stack_trace(f, ops_to_check=[sparse_block_gemv_inplace])
# NOTE(review): `assertFailure_fast` (from tests.unittest_tools) presumably
# adjusts how this test is run/reported outside FAST_COMPILE — confirm its
# behavior in tests/unittest_tools.py.
if pytensor.config.mode != "FAST_COMPILE":
    test_blocksparse_inplace_gemv_opt = assertFailure_fast(
        test_blocksparse_inplace_gemv_opt
    )
def test_blocksparse_inplace_outer_opt():
    """The block-sparse outer product (from the gradient w.r.t. the weights)
    should be rewritten to its inplace variant outside FAST_COMPILE."""
    bias = fmatrix()
    weights = ftensor4()
    hidden = ftensor3()
    in_idx = lmatrix()
    out_idx = lmatrix()

    out = sparse_block_dot(weights, hidden, in_idx, bias, out_idx)
    f = pytensor.function(
        [weights, hidden, in_idx, bias, out_idx],
        [out, pytensor.gradient.grad(out.sum(), wrt=weights)],
    )

    last_op = f.maker.fgraph.toposort()[-1].op
    if pytensor.config.mode == "FAST_COMPILE":
        # FAST_COMPILE does not apply inplace rewrites.
        assert not last_op.inplace
        assert check_stack_trace(f, ops_to_check=sparse_block_outer)
    else:
        assert last_op.inplace
        assert check_stack_trace(f, ops_to_check=sparse_block_outer_inplace)
import numpy as np
import pytest
import pytensor
from pytensor.compile.mode import get_default_mode, get_mode
from pytensor.configdefaults import config
from pytensor.graph.rewriting.basic import check_stack_trace
from pytensor.scalar.basic import Composite
from pytensor.tensor.elemwise import Elemwise
from pytensor.tensor.inplace import sigmoid_inplace
from pytensor.tensor.math import clip, sigmoid
from pytensor.tensor.nnet.sigm import (
hard_sigmoid,
ultra_fast_scalar_sigmoid,
ultra_fast_sigmoid,
ultra_fast_sigmoid_inplace,
)
from pytensor.tensor.type import matrix
from tests.tensor.utils import (
_good_broadcast_unary_normal_no_complex,
check_floatX,
copymod,
makeBroadcastTester,
upcast_int8_nfunc,
)
# Broadcast tests comparing `ultra_fast_sigmoid` against the exact logistic
# function; the op is an approximation, hence the loose tolerance below.
TestUltraFastSigmoidBroadcast = makeBroadcastTester(
    op=ultra_fast_sigmoid,
    expected=upcast_int8_nfunc(
        lambda inputs: check_floatX(inputs, 1 / (1 + np.exp(-inputs)))
    ),
    good=copymod(
        _good_broadcast_unary_normal_no_complex, without=["uint16"]
    ), # numpy function overflows with uint16.
    # grad=_grad_broadcast_unary_normal,
    name="UltraFastSigmoidTester",
    # This is an approximation of the sigmoid; that is why we raise eps.
    eps=5e-2,
)
# Broadcast tests comparing `hard_sigmoid` against the exact logistic
# function; the op is an approximation, hence the loose tolerance below.
TestHardSigmoidBroadcast = makeBroadcastTester(
    op=hard_sigmoid,
    expected=upcast_int8_nfunc(
        lambda inputs: check_floatX(inputs, 1 / (1 + np.exp(-inputs)))
    ),
    good=copymod(
        _good_broadcast_unary_normal_no_complex, without=["uint16"]
    ), # numpy function overflows with uint16.
    # grad=_grad_broadcast_unary_normal,
    name="HardSigmoidTester",
    # This is an approximation of the sigmoid; that is why we raise eps.
    eps=1e-1,
)
class TestSpecialSigmoidOpts:
    """Tests for the rewrites that substitute cheaper sigmoid approximations."""

    def get_mode(self, excluding=None):
        """Return the compilation mode for these tests.

        Parameters
        ----------
        excluding : list of str, optional
            Rewrite names to exclude from the returned mode.

        Returns
        -------
        The default mode — replaced by 'FAST_RUN' when `config.mode` is
        'FAST_COMPILE' — without the rewrites listed in `excluding`.
        """
        if config.mode == "FAST_COMPILE":
            base = get_mode("FAST_RUN")
        else:
            base = get_default_mode()
        return base.excluding(*excluding) if excluding else base

    def test_local_ultra_fast_sigmoid(self):
        x = matrix("x")
        s = sigmoid(x)

        # Without the rewrite, the exact sigmoid remains in the graph.
        f = pytensor.function(
            [x], s, mode=self.get_mode("local_ultra_fast_sigmoid")
        )
        assert check_stack_trace(f, ops_to_check=sigmoid)
        nodes = f.maker.fgraph.toposort()
        assert len(nodes) == 1
        assert nodes[0].op == sigmoid

        # With the rewrite enabled, the approximation is substituted.
        mode = self.get_mode().including("local_ultra_fast_sigmoid")
        f = pytensor.function([x], s, mode=mode)
        assert check_stack_trace(f, ops_to_check=ultra_fast_sigmoid)
        nodes = f.maker.fgraph.toposort()
        assert nodes[0].op == ultra_fast_sigmoid
        assert len(nodes) == 1

        # The inplace variant is rewritten as well.
        f = pytensor.function(
            [x], sigmoid_inplace(x), mode=mode, accept_inplace=True
        )
        assert check_stack_trace(f, ops_to_check=ultra_fast_sigmoid_inplace)
        nodes = f.maker.fgraph.toposort()
        assert nodes[0].op == ultra_fast_sigmoid_inplace
        assert len(nodes) == 1

    @pytest.mark.skipif(config.cxx == "", reason="Needs a C compiler.")
    def test_composite_c_code(self):
        """Make sure this `Op`'s `c_code` works within a `Composite`."""
        x = matrix("x")
        mode = get_mode("FAST_RUN").including("local_ultra_fast_sigmoid")
        f = pytensor.function([x], sigmoid(x) + sigmoid(x + 1), mode=mode)

        nodes = f.maker.fgraph.toposort()
        assert isinstance(nodes[0].op, Elemwise)
        assert isinstance(nodes[0].op.scalar_op, Composite)
        inner_ops = {n.op for n in nodes[0].op.scalar_op.fgraph.toposort()}
        assert ultra_fast_scalar_sigmoid in inner_ops
        assert len(nodes) == 1

    def test_local_hard_sigmoid(self):
        x = matrix("x")
        s = sigmoid(x)

        # Without the rewrite, the exact sigmoid survives.
        f = pytensor.function([x], s, mode=self.get_mode("local_hard_sigmoid"))
        assert check_stack_trace(f, ops_to_check=sigmoid)
        nodes = f.maker.fgraph.toposort()
        assert nodes[0].op == sigmoid
        assert len(nodes) == 1

        # With the rewrite enabled, no sigmoid node remains and the function
        # still evaluates over a wide input range.
        mode = self.get_mode().including("local_hard_sigmoid")
        f = pytensor.function([x], s, mode=mode)
        assert not any(node.op == sigmoid for node in f.maker.fgraph.toposort())
        f([[-50, -10, -4, -1, 0, 1, 4, 10, 50]])

        # With fusion and inplace disabled, the graph uses `clip` and keeps
        # its stack trace.
        mode2 = mode.excluding("fusion").excluding("inplace")
        f2 = pytensor.function([x], s, mode=mode2)
        assert check_stack_trace(f2, ops_to_check=clip)
import copy
import numpy as np
from pytensor.compile.function import function
from pytensor.compile.io import Out
from pytensor.tensor.math import dot
from pytensor.tensor.nnet import crossentropy_softmax_argmax_1hot_with_bias
from pytensor.tensor.type import dmatrix, dvector, ivector, matrix
def test_bug_2009_07_17_borrowed_output():
    """Regression test for an output that was borrowed by mistake.

    With ``borrow=False``, mutating an input after a call must not alter an
    already-returned output, and repeated calls must return fresh arrays.
    """
    a = dmatrix()
    b = dmatrix()
    g = function([a, b], Out(dot(a, b), borrow=False))

    x = np.zeros((1, 2))
    y = np.ones((2, 5))
    z = g(x, y)
    x.fill(1)
    # `z` was computed from the all-zero `x`, so it must still be zero even
    # though `x` changed afterwards.
    assert np.linalg.norm(z) == 0

    # The code above was supposed to fail when it was written (or, more
    # accurately, on the next revision, i.e. when it was merged with the
    # rest of the code, i.e. on revision cac9c9e9f08e).
    # However, for some reason, it does not fail anymore at this revision,
    # so the check below exhibits the same issue through
    # `crossentropy_softmax_argmax_1hot_with_bias`. If that op turns out to
    # be the cause, this part may belong in its own test file.
    activations = dmatrix()
    biases = dvector()
    targets = ivector()
    nll_softmax_argmax = crossentropy_softmax_argmax_1hot_with_bias(
        activations, biases, targets
    )
    g = function(
        [activations, biases, targets],
        Out(nll_softmax_argmax[1], borrow=False),
    )

    act_val = np.zeros((1, 5))
    bias_val = np.ones(5)
    tgt_val = np.zeros(1, dtype=np.int32)

    z = g(act_val, bias_val, tgt_val)
    z_backup = copy.copy(z)
    id_z = id(z)

    act_val[0, 0] = 1
    id_other = id(g(act_val, bias_val, tgt_val))

    # Calling the function again must return a pointer to a new array...
    assert id_z != id_other
    # ...and, just to be 100% sure, the first output must be unaltered.
    assert (z == z_backup).all()
def test_deepcopied_type_filter():
    """A deep-copied variable's type must still filter values cleanly.

    Regression test: this was failing as of commit
    731e2d2fa68487733320d341d08b454a50c90d12.
    """
    var = copy.deepcopy(matrix())
    var.type.filter(np.ones((2, 2), dtype=var.dtype), strict=True)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论