提交 b3ce3640 authored 作者: Maxim Kochurov's avatar Maxim Kochurov 提交者: Brandon T. Willard

Remove tests.gpuarray

上级 c803c67e
#section kernels
#kernel eye : *, size, size, size :
#include <cluda.h>
/* The "eye" name above is used to generate the supporting objects.  The
   only one you probably need to care about is the kernel object, which
   will be named 'k_' + <the name above> (k_eye in this case).  This name
   also has to match the kernel function name below. */
KERNEL void eye(GLOBAL_MEM DTYPE_OUTPUT_0 *a, ga_size a_off, ga_size n, ga_size m) {
  /* Apply the byte offset of the output buffer before indexing it. */
  a = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((GLOBAL_MEM char *)a) + a_off);
  /* An n x m matrix has only min(n, m) diagonal entries. */
  ga_size nb = n < m ? n : m;
  /* Strided loop over LID_0/LDIM_0: each work item writes a subset of
     the diagonal of the row-major n x m output. */
  for (ga_size i = LID_0; i < nb; i += LDIM_0) {
    a[i*m + i] = 1;
  }
}
#section support_code_struct
/* Fill *z with an n x m identity-like matrix computed on the GPU by the
 * "eye" kernel declared in the kernels section.
 *
 * n, m:   0-d host arrays holding the requested row/column counts.
 * z:      output GPU array; any previous value is released and replaced.
 * params: provides the output typecode and the GPU context.
 *
 * Returns 0 on success, -1 (with a Python exception set) on failure.
 */
int APPLY_SPECIFIC(tstgpueye)(PyArrayObject *n, PyArrayObject *m,
                              PyGpuArrayObject **z, PARAMS_TYPE* params) {
  size_t dims[2] = {0, 0};
  size_t ls, gs;
  int err;

  dims[0] = ((DTYPE_INPUT_0 *)PyArray_DATA(n))[0];
  dims[1] = ((DTYPE_INPUT_1 *)PyArray_DATA(m))[0];

  /* Start from a zeroed buffer so the kernel only needs to write the
     diagonal. */
  Py_XDECREF(*z);
  *z = pygpu_zeros(2, dims,
                   params->typecode,
                   GA_C_ORDER,
                   params->context, Py_None);
  if (*z == NULL)
    return -1;

  ls = 1;
  gs = 256;
  /* The eye_call name comes from the kernel declaration above. */
  err = eye_call(1, &gs, &ls, 0, (*z)->ga.data, (*z)->ga.offset, dims[0], dims[1]);
  if (err != GA_NO_ERROR) {
    /* Message fix: "n%lu" was missing the '=' used for m. */
    PyErr_Format(PyExc_RuntimeError,
                 "gpuarray error: kEye: %s. n=%lu, m=%lu.",
                 GpuKernel_error(&k_eye, err),
                 (unsigned long)dims[0], (unsigned long)dims[1]);
    return -1;
  }
  return 0;
}
#!/usr/bin/env python
# Without args, this script executes all its tests like `pytest -vs`
# python check_dnn_conv.py
# If there is only one arg `infos`, this script prints some infos about
# supported algorithms and data type configurations for current GPU and cuDNN version.
# python check_dnn_conv.py infos
# If there is only one arg `list`, this script prints all test cases without running them.
# python check_dnn_conv.py list
# Else, any arg will be directly passed to pytest.
# python check_dnn_conv.py -xvs # verbose mode, capture output, exit at first error.
import math
import sys
from itertools import chain, product

import numpy as np
import pytest

import aesara
import tests.unittest_tools as utt
from aesara.configdefaults import SUPPORTED_DNN_CONV_ALGO_RUNTIME
from aesara.gpuarray import cudnn_defs
from aesara.gpuarray.dnn import GpuDnnConv, GpuDnnConvGradI, GpuDnnConvGradW
from aesara.gpuarray.dnn import _dnn_conv as dnn_conv
from aesara.gpuarray.dnn import _dnn_gradinput as dnn_gradinput
from aesara.gpuarray.dnn import _dnn_gradweight as dnn_gradweight
from aesara.gpuarray.dnn import version
from aesara.tensor.nnet.abstract_conv import assert_conv_shape, get_conv_output_shape
from aesara.tensor.nnet.corr import CorrMM, CorrMM_gradInputs, CorrMM_gradWeights
from aesara.tensor.nnet.corr3d import Corr3dMM, Corr3dMMGradInputs, Corr3dMMGradWeights
from aesara.tensor.type import TensorType
from tests.gpuarray.config import mode_with_gpu, ref_cast
def check_dtype_config_support(dtype, precision):
    """Return whether the current GPU supports this (dtype, precision) pair.

    A tiny 2D forward convolution with algo "small"
    (CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) is compiled and
    executed; per the cuDNN documentation that algorithm should support all
    configurations on v5.1, v6 and v7, so only a hardware mismatch
    (CUDNN_STATUS_ARCH_MISMATCH) is expected to make it fail.
    """
    image = aesara.shared(np.zeros((1, 1, 2, 2), dtype=dtype))
    kernel = aesara.shared(np.zeros((1, 1, 2, 2), dtype=dtype))
    graph = dnn_conv(image, kernel, precision=precision, algo="small")
    run = aesara.function([], graph, mode=mode_with_gpu)
    try:
        run()
    except RuntimeError as exc:
        assert "CUDNN_STATUS_ARCH_MISMATCH" in str(exc)
        return False
    return True
cudnn = cudnn_defs.get_definitions(version(raises=False))
class ConvCase:
    """
    Helper class to describe a special test case quickly.

    Only 2D and 3D convolution cases are handled.
    """

    # Case kinds: forward pass, gradient wrt inputs, gradient wrt weights.
    FWD, GRADINPUT, GRADWEIGHT = 0, 1, 2

    def __init__(
        self,
        type,
        inputs_shape,
        filters_shape,
        algo=None,
        dtype=None,
        precision=None,
        subsample=None,
        dilation=None,
        border_mode="valid",
        conv_mode="conv",
        alpha=1,
        beta=0,
        should_fail=False,
    ):
        assert type in (ConvCase.FWD, ConvCase.GRADINPUT, ConvCase.GRADWEIGHT)
        assert len(inputs_shape) == len(filters_shape) in (4, 5)
        ndim = len(inputs_shape) - 2
        # Fall back to the configured float type / unit steps when unspecified.
        dtype = aesara.config.floatX if dtype is None else dtype
        precision = aesara.config.floatX if precision is None else precision
        subsample = (1,) * ndim if subsample is None else subsample
        dilation = (1,) * ndim if dilation is None else dilation
        assert dtype in ("float16", "float32", "float64")
        assert precision in ("float16", "float32", "float64")
        assert len(subsample) == len(dilation) == ndim
        assert border_mode in ("valid", "full", "half") or (
            isinstance(border_mode, (list, tuple)) and len(border_mode) == ndim
        )
        assert conv_mode in ("conv", "cross")
        assert alpha != 0
        self.type = type
        self.ndim = ndim
        self.algo = algo
        self.inputs_shape = inputs_shape
        self.filters_shape = filters_shape
        self.dtype = dtype
        self.precision = precision
        self.subsample = subsample
        self.dilation = dilation
        self.border_mode = border_mode
        self.conv_mode = conv_mode
        self.alpha = alpha
        self.beta = beta
        self.should_fail = bool(should_fail)

    def is_fwd(self):
        """True for a forward-convolution case."""
        return self.type == ConvCase.FWD

    def is_bwd_filter(self):
        """True for a gradient-wrt-weights case."""
        return self.type == ConvCase.GRADWEIGHT

    def is_bwd_data(self):
        """True for a gradient-wrt-inputs case."""
        return self.type == ConvCase.GRADINPUT

    def get_case(self):
        """Return ``(algo, dtype, precision, parameters)`` for the test runners."""
        parameters = (
            self.inputs_shape,
            self.filters_shape,
            self.subsample,
            self.dilation,
            self.border_mode,
            self.conv_mode,
            self.alpha,
            self.beta,
        )
        return (self.algo, self.dtype, self.precision, parameters)

    @staticmethod
    def fwd(*args, **kwargs):
        """Build a forward-convolution case."""
        return ConvCase(ConvCase.FWD, *args, **kwargs)

    @staticmethod
    def bwd_filter(*args, **kwargs):
        """Build a gradient-wrt-weights case."""
        return ConvCase(ConvCase.GRADWEIGHT, *args, **kwargs)

    @staticmethod
    def bwd_data(*args, **kwargs):
        """Build a gradient-wrt-inputs case."""
        return ConvCase(ConvCase.GRADINPUT, *args, **kwargs)
class ConvCaseGenerator:
    """
    Main class used to generate test cases.

    This handles only 2D and 3D cases.
    """

    def _as_tuple_of_tuples(self, iterable):
        # Freeze a sequence of sequences into a tuple of tuples.
        return tuple(tuple(sequence) for sequence in iterable)

    def __init__(
        self,
        ndim,
        alpha=2,
        beta=-3,
        batch_size=2,
        input_channels=3,
        inputs_sizes=None,
        output_channels=2,
        filters_sizes=None,
        subsamples=None,
        dilations=None,
        borders=None,
        with_border_valid=True,
        with_border_half=True,
        with_border_full=True,
    ):
        """
        ndim: number of spatial dimensions (2 or 3).
        alpha, beta: scalar blend coefficients for the generated cases.
        inputs_sizes, filters_sizes, subsamples, dilations, borders:
            optional sequences of ndim-length sequences; defaults are
            generated when omitted (dilations default depends on the
            detected cuDNN version).
        with_border_*: whether to include the named border modes.
        """
        self.ndim = int(ndim)
        self.alpha = float(alpha)
        self.beta = float(beta)
        self.batch_size = int(batch_size)
        self.input_channels = int(input_channels)
        self.output_channels = int(output_channels)
        assert self.ndim in (2, 3)
        assert self.alpha != 0
        assert self.batch_size > 0
        assert self.input_channels > 0
        assert self.output_channels > 0
        # NB: it is quite arbitrary to choose default values for inputs sizes and filters sizes.
        # Here, we just put some values that may generate errors in some cases, but that should be OK for other cases.
        # For instance, input size 300 is > 256, that is a limit for certain algorithms (cf. documentation).
        # Filter size 40 is > 32 and > 16, that are limits for certain algorithms (cf. documentation).
        # We should either manually specify sizes, or give an appropriate filter to this generator
        # before testing values (see `self.get_cases()`).
        if inputs_sizes is None:
            inputs_sizes = ((5,) * self.ndim, (300, 5) + (2,) * (self.ndim - 2))
        if filters_sizes is None:
            filters_sizes = ((4,) * self.ndim, (40, 4) + (2,) * (self.ndim - 2))
        if borders is None:
            borders = ((1,) * self.ndim, tuple(range(1, self.ndim + 1)))
        if subsamples is None:
            subsamples = ((1,) * self.ndim, tuple(range(1, self.ndim + 1)))
        if dilations is None:
            # Dilated cases are only generated from cuDNN v6 on.
            dilations = ((1,) * self.ndim,)
            if cudnn.version >= 6:
                dilations += (tuple(range(1, self.ndim + 1)),)
        for sequence_list in (
            inputs_sizes,
            filters_sizes,
            borders,
            subsamples,
            dilations,
        ):
            assert isinstance(sequence_list, (tuple, list)) and all(
                isinstance(sequence, (tuple, list)) and len(sequence) == self.ndim
                for sequence in sequence_list
            ), (self.ndim, sequence_list)
        self.auto_borders = tuple()
        if with_border_valid:
            self.auto_borders += ("valid",)
        if with_border_half:
            self.auto_borders += ("half",)
        if with_border_full:
            self.auto_borders += ("full",)
        self.inputs_sizes = self._as_tuple_of_tuples(inputs_sizes)
        self.filters_sizes = self._as_tuple_of_tuples(filters_sizes)
        self.borders = self._as_tuple_of_tuples(borders)
        self.subsamples = self._as_tuple_of_tuples(subsamples)
        self.dilations = self._as_tuple_of_tuples(dilations)

    @staticmethod
    def get_if_valid_conv_output_shape(case_tuple):
        # Filter function to keep only cases that produce valid convolution output shapes.
        out_shp = get_conv_output_shape(
            case_tuple[0],  # input shape
            case_tuple[1],  # filter shape
            case_tuple[4],  # border mode
            case_tuple[2],  # subsample
            case_tuple[3],  # dilation
        )
        try:
            return assert_conv_shape(out_shp)
        except ValueError:
            return False

    def get_cases(self, filter=None):
        # Generate an iterator of tuples with format:
        # (input shape, filter shape, subsample, dilation, border mode, convolution mode, alpha, beta)
        # filter may be a callable that gets one tuple (with format specified above) and returns
        # a boolean, so that tuple is kept only if filter(tuple) is True.
        #
        # BUG FIX: the `filter` parameter shadows the builtin of the same
        # name, so the builtin must be fetched explicitly; the previous
        # `return filter(local_filter, ...)` called the parameter itself
        # (TypeError when None, wrong invocation when callable).
        import builtins

        all_batch_sizes = (self.batch_size,)
        all_input_channels = (self.input_channels,)
        all_input_sizes = self.inputs_sizes
        all_output_channels = (self.output_channels,)
        all_filter_sizes = self.filters_sizes
        all_subsamples = self.subsamples
        all_dilations = self.dilations
        all_border_modes = self.auto_borders + self.borders
        all_conv_modes = ("conv", "cross")
        all_alphas = (self.alpha,)
        all_betas = (0,) if self.beta == 0 else (0, self.beta)
        all_input_shapes = (
            (bs, ic) + ins
            for bs in all_batch_sizes
            for ic in all_input_channels
            for ins in all_input_sizes
        )
        all_filter_shapes = (
            (oc, ic) + fis
            for oc in all_output_channels
            for ic in all_input_channels
            for fis in all_filter_sizes
        )
        if callable(filter):

            def local_filter(case_tuple):
                return ConvCaseGenerator.get_if_valid_conv_output_shape(
                    case_tuple
                ) and filter(case_tuple)

        else:
            local_filter = ConvCaseGenerator.get_if_valid_conv_output_shape
        return builtins.filter(
            local_filter,
            product(
                all_input_shapes,
                all_filter_shapes,
                all_subsamples,
                all_dilations,
                all_border_modes,
                all_conv_modes,
                all_alphas,
                all_betas,
            ),
        )
class ConvCaseGeneratorChain:
    """
    Helper class concatenating the cases of several ConvCaseGenerator objects.
    """

    def __init__(self, *conv_case_generators):
        assert all(isinstance(g, ConvCaseGenerator) for g in conv_case_generators)
        self.generators = conv_case_generators

    def get_cases(self, filter=None):
        """Chain the case iterators of every wrapped generator, in order."""
        case_iterators = [g.get_cases(filter) for g in self.generators]
        return chain(*case_iterators)
class CuDNNV51ConvCaseGenerator:
    """
    Helper class to generate specific test cases for every algorithm supported by cuDNN V5.1.
    Same class exists for cuDNN V6.0 (see below).
    This should help avoid test cases that are intended to fail according to cuDNN documentation.
    """

    NONE = "none"
    FFT = "fft"
    FFT_TILING = "fft_tiling"
    WINOGRAD = "winograd"
    WINOGRAD_NON_FUSED = "winograd_non_fused"

    # Protected interface.

    def _dilations(self, ndim):
        # Only unit dilations are generated at this cuDNN level.
        return [(1,) * ndim]

    def _any_case_generator(self, ndim):
        # Fallback generator: default sizes, undilated cases only.
        return ConvCaseGenerator(ndim=ndim, dilations=self._dilations(ndim))

    def _fwd_fft(self, ndim):
        return ConvCaseGenerator(
            ndim=ndim,
            inputs_sizes=[(10,) * ndim, (240, 5) + (2,) * (ndim - 2)],
            filters_sizes=[tuple(range(9, 9 - ndim, -1))],
            subsamples=[(1,) * ndim],
            dilations=self._dilations(ndim),
        )

    def _fwd_fft_tiling(self, ndim, dtype, precision):
        if ndim == 2:
            kernel_sizes = [(32, 5)]
        elif ndim == 3:
            kernel_sizes = [(16, 5, 5)]
        return ConvCaseGenerator(
            ndim=ndim,
            filters_sizes=kernel_sizes,
            subsamples=[(1,) * ndim],
            dilations=self._dilations(ndim),
        )

    def _fwd_winograd(self, ndim):
        return ConvCaseGenerator(
            ndim=ndim,
            filters_sizes=[(3,) * ndim],
            subsamples=[(1,) * ndim],
            dilations=self._dilations(ndim),
        )

    def _fwd_winograd_non_fused(self, ndim, dtype, precision):
        kernel_sizes = [(3,) * ndim]
        if not (dtype == precision == "float16"):
            kernel_sizes.append((5,) * ndim)
        return ConvCaseGenerator(
            ndim=ndim,
            filters_sizes=kernel_sizes,
            subsamples=[(1,) * ndim],
            dilations=self._dilations(ndim),
        )

    def _gw_fft(self, ndim):
        return self._fwd_fft(ndim)

    def _gw_winograd_non_fused(self, ndim, dtype, precision):
        return self._fwd_winograd_non_fused(ndim, dtype, precision)

    def _gi_fft(self, ndim):
        return self._fwd_fft(ndim)

    def _gi_fft_tiling(self, ndim, dtype, precision):
        return self._fwd_fft_tiling(ndim, dtype, precision)

    def _gi_winograd(self, ndim):
        return self._fwd_winograd(ndim)

    def _gi_winograd_non_fused(self, ndim, dtype, precision):
        return self._fwd_winograd_non_fused(ndim, dtype, precision)

    def _fwd_runtime(self, ndim, dtype, precision):
        return self._any_case_generator(ndim)

    def _gw_runtime(self, ndim, dtype, precision):
        return self._fwd_runtime(ndim, dtype, precision)

    def _gi_runtime(self, ndim, dtype, precision):
        return self._fwd_runtime(ndim, dtype, precision)

    # Public interface.

    def fwd(self, algo, ndim, dtype, precision):
        """Return the case generator for this forward algorithm."""
        dispatch = {
            self.FFT: lambda: self._fwd_fft(ndim),
            self.FFT_TILING: lambda: self._fwd_fft_tiling(ndim, dtype, precision),
            self.WINOGRAD: lambda: self._fwd_winograd(ndim),
            self.WINOGRAD_NON_FUSED: lambda: self._fwd_winograd_non_fused(
                ndim, dtype, precision
            ),
        }
        if algo in dispatch:
            return dispatch[algo]()
        if algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME:
            return self._fwd_runtime(ndim, dtype, precision)
        return self._any_case_generator(ndim)

    def gw(self, algo, ndim, dtype, precision):
        """Return the case generator for this gradient-wrt-weights algorithm."""
        dispatch = {
            self.FFT: lambda: self._gw_fft(ndim),
            self.WINOGRAD_NON_FUSED: lambda: self._gw_winograd_non_fused(
                ndim, dtype, precision
            ),
        }
        if algo in dispatch:
            return dispatch[algo]()
        if algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME:
            return self._gw_runtime(ndim, dtype, precision)
        return self._any_case_generator(ndim)

    def gi(self, algo, ndim, dtype, precision):
        """Return the case generator for this gradient-wrt-inputs algorithm."""
        dispatch = {
            self.FFT: lambda: self._gi_fft(ndim),
            self.FFT_TILING: lambda: self._gi_fft_tiling(ndim, dtype, precision),
            self.WINOGRAD: lambda: self._gi_winograd(ndim),
            self.WINOGRAD_NON_FUSED: lambda: self._gi_winograd_non_fused(
                ndim, dtype, precision
            ),
        }
        if algo in dispatch:
            return dispatch[algo]()
        if algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME:
            return self._gi_runtime(ndim, dtype, precision)
        return self._any_case_generator(ndim)
class CuDNNV6ConvCaseGenerator(CuDNNV51ConvCaseGenerator):
    """Case generator for cuDNN V6.0; extends the V5.1 one with the "none" algo
    and V6-specific fft_tiling / true-half restrictions."""

    def _fwd_none(self, ndim):
        # All dilations allowed.
        return ConvCaseGenerator(ndim=ndim)

    def _fwd_fft_tiling(self, ndim, dtype, precision):
        if ndim == 2:
            # wDesc's filter height must be greater than convDesc's zero-padding height
            # wDesc's filter width must be greater than convDesc's zero-padding width
            shared = dict(
                ndim=ndim,
                dilations=self._dilations(ndim),
                subsamples=[(1, 1)],
            )
            parts = []
            if (dtype, precision) != ("float64", "float64"):
                # Filter sizes with every dimension != 1 is not supported for DOUBLE_CONFIG.
                parts.append(
                    ConvCaseGenerator(
                        filters_sizes=[(32, 5), (10, 10)],
                        borders=[(1, 1), (6, 4)],
                        **shared,
                    )
                )
            parts.append(
                ConvCaseGenerator(
                    filters_sizes=[(256, 1), (5, 1)],
                    borders=[(1, 0), (2, 0)],
                    **shared,
                )
            )
            return ConvCaseGeneratorChain(*parts)
        if ndim == 3:
            return super()._fwd_fft_tiling(ndim, dtype, precision)

    def _gw_none(self, ndim):
        return self._fwd_none(ndim)

    def _gw_fft_tiling(self, ndim):
        return ConvCaseGenerator(
            ndim=ndim,
            inputs_sizes=[(247, 1), (20, 1)],
            filters_sizes=[(3, 1), (10, 1)],
            subsamples=[(1,) * ndim],
            borders=[(1, 0), (2, 0)],
            dilations=self._dilations(ndim),
        )

    def _gi_none(self, ndim):
        return self._fwd_none(ndim)

    def _true_half_2d(self, ndim, dtype, precision):
        # In the 2D true-half configuration, only undilated cases are
        # generated; returns None otherwise so callers can fall back.
        if ndim == 2 and dtype == precision == "float16":
            return ConvCaseGenerator(ndim=ndim, dilations=self._dilations(ndim))
        return None

    def _fwd_runtime(self, ndim, dtype, precision):
        special = self._true_half_2d(ndim, dtype, precision)
        return special if special is not None else super()._fwd_runtime(ndim, dtype, precision)

    def _gw_runtime(self, ndim, dtype, precision):
        special = self._true_half_2d(ndim, dtype, precision)
        return special if special is not None else super()._gw_runtime(ndim, dtype, precision)

    def _gi_runtime(self, ndim, dtype, precision):
        special = self._true_half_2d(ndim, dtype, precision)
        return special if special is not None else super()._gi_runtime(ndim, dtype, precision)

    def fwd(self, algo, ndim, dtype, precision):
        """Return the case generator for this forward algorithm."""
        if algo == self.NONE:
            return self._fwd_none(ndim)
        return super().fwd(algo, ndim, dtype, precision)

    def gw(self, algo, ndim, dtype, precision):
        """Return the case generator for this gradient-wrt-weights algorithm."""
        if algo == self.NONE:
            return self._gw_none(ndim)
        if algo == self.FFT_TILING:
            return self._gw_fft_tiling(ndim)
        return super().gw(algo, ndim, dtype, precision)

    def gi(self, algo, ndim, dtype, precision):
        """Return the case generator for this gradient-wrt-inputs algorithm."""
        if algo == self.NONE:
            return self._gi_none(ndim)
        return super().gi(algo, ndim, dtype, precision)
# Pick the case generator matching the cuDNN version detected above.
cudnn_conv_case_generator = (
    CuDNNV51ConvCaseGenerator() if cudnn.version < 6 else CuDNNV6ConvCaseGenerator()
)
class BaseTestDnnConv:
    """
    Base class for exhaustive cuDNN convolution tests. Use its subclasses
    (which define the abstract attributes below) to run actual tests.
    """

    # Abstract attributes (to be defined in subclasses).
    ndim = 2
    fwd_algorithms = None
    bwd_filter_algorithms = None
    bwd_data_algorithms = None
    cpu_conv_class = None
    cpu_gradinput_class = None
    cpu_gradweight_class = None
    special_cases = []  # List of special ConvCases.
    # Tuples with format: (n_times, (inputs_shape, filters_shape)).
    runtime_shapes = []

    # Utility methods.

    def _next_ten_exponent(self, val):
        # Return exponent for the next ten power that follows val.
        # val should be a positive integer.
        # Examples:
        # for 0 to 9, returns 1 (=> 10**1 == 10)
        # for 10 to 99, returns 2 (=> 10**2 == 100)
        ten_exponent = 1
        while val // 10 > 0:
            ten_exponent += 1
            val //= 10
        return ten_exponent

    def scale_numpy_arrays_inplace(self, A, B, alpha):
        """Divide A and B in place by a common factor derived from alpha and
        from their magnitudes, so comparisons use reasonably small values."""
        scale_factor = 1
        # Scale down simultaneously A and B if alpha is not 1.
        if alpha != 1:
            scale_factor *= alpha
        # Normalize A and B simultaneously so that any values in these tensors are in interval [0, 1)
        max_a = math.floor(abs(A.max()))
        max_b = math.floor(abs(B.max()))
        if max_a or max_b:
            m_a = self._next_ten_exponent(max_a)
            m_b = self._next_ten_exponent(max_b)
            max_m = max(m_a, m_b)
            scale_factor *= 10**max_m
        if scale_factor != 1:
            A /= scale_factor
            B /= scale_factor

    def get_atol_rtol(self, algo, dtype, precision):
        """Return (atol, rtol) overrides for utt.assert_allclose, or (None, None)
        to use the default tolerances."""
        if dtype == "float16":
            # Raise tolerance for float16
            return (5e-2, 5e-2)
        if algo == "winograd_non_fused" and dtype == precision == "float32":
            # Raise tolerance for winograd_non_fused in FLOAT_CONFIG.
            return (1e-4, 1e-4)
        return None, None

    def __init__(self):
        # Keep only the dtype/precision configurations the local GPU accepts.
        self.dtype_configs = cudnn.get_supported_dtype_configs(
            check_dtype_config_support
        )

    def array_like_conv_output(
        self, inputs_shape, filters_shape, border_mode, subsample, dilation, dtype
    ):
        # Return a random array with inferred convolution output shape.
        out_shp = get_conv_output_shape(
            inputs_shape, filters_shape, border_mode, subsample, dilation
        )
        out_shp = assert_conv_shape(out_shp)
        return np.random.random(out_shp).astype(dtype)

    def run_conv_fwd(self, algo, dtype, precision, parameters):
        """Run one forward-convolution case on GPU and compare against the
        CPU reference implementation."""
        (
            inputs_shape,
            filters_shape,
            subsample,
            dilation,
            border_mode,
            conv_mode,
            alpha,
            beta,
        ) = parameters
        inputs_val = np.random.random(inputs_shape).astype(dtype)
        filters_val = np.random.random(filters_shape).astype(dtype)
        # Scale down the input values to prevent very large absolute errors
        # due to float rounding
        inputs_val /= 10
        filters_val /= 10
        inputs = aesara.shared(inputs_val)
        filters = aesara.shared(filters_val)
        if beta == 0:
            out = None
        else:
            out = self.array_like_conv_output(
                inputs_shape, filters_shape, border_mode, subsample, dilation, dtype
            )
            out /= 10
        # Compile an Aesara function for the cuDNN implementation
        conv = dnn_conv(
            img=inputs,
            kerns=filters,
            alpha=alpha,
            beta=beta,
            out=out,
            border_mode=border_mode,
            subsample=subsample,
            dilation=dilation,
            conv_mode=conv_mode,
            algo=algo,
            precision=precision,
        )
        f = aesara.function([], conv, mode=mode_with_gpu)
        # If conv_mode is 'conv' the reference implementation should use
        # filters flipped according to the width, height and time axis
        if conv_mode == "conv":
            if inputs.ndim == 5:
                flipped_filters = filters[:, :, ::-1, ::-1, ::-1]
            else:
                flipped_filters = filters[:, :, ::-1, ::-1]
        else:
            flipped_filters = filters
        # Compile an Aesara function for the reference implementation
        conv_ref = self.cpu_conv_class(
            border_mode=border_mode, subsample=subsample, filter_dilation=dilation
        )(ref_cast(inputs), flipped_filters)
        f_ref = aesara.function([], conv_ref, mode="FAST_RUN")
        # Compare the results of the two implementations
        res_ref = f_ref()
        res = np.asarray(f())
        if algo in cudnn.deterministic_fwd_algorithms:
            # Deterministic algorithms must return the same output twice.
            utt.assert_allclose(res, np.asarray(f()))
        atol, rtol = self.get_atol_rtol(algo, dtype, precision)
        if beta == 0:
            cpu_res = alpha * res_ref
        else:
            cpu_res = alpha * res_ref + beta * out
        self.scale_numpy_arrays_inplace(cpu_res, res, alpha)
        utt.assert_allclose(cpu_res, res, rtol=rtol, atol=atol)

    def run_conv_gradinput(self, algo, dtype, precision, parameters):
        """Run one gradient-wrt-inputs case on GPU and compare against the
        CPU reference implementation."""
        (
            inputs_shape,
            filters_shape,
            subsample,
            dilation,
            border_mode,
            conv_mode,
            alpha,
            beta,
        ) = parameters
        if beta == 0:
            inputs_val = None
        else:
            inputs_val = np.random.random(inputs_shape).astype(dtype)
            inputs_val /= 10
        filters_val = np.random.random(filters_shape).astype(dtype)
        topgrad_val = self.array_like_conv_output(
            inputs_shape, filters_shape, border_mode, subsample, dilation, dtype
        )
        # Scale down the input values to prevent absolute errors in utt.assert_allclose.
        filters_val /= 10
        topgrad_val /= 10
        filters = aesara.shared(filters_val)
        topgrad = aesara.shared(topgrad_val)
        # Compile an Aesara function for the cuDNN implementation
        grad_i = dnn_gradinput(
            filters,
            topgrad,
            inputs_shape,
            alpha=alpha,
            beta=beta,
            out=inputs_val,
            border_mode=border_mode,
            subsample=subsample,
            dilation=dilation,
            conv_mode=conv_mode,
            algo=algo,
            precision=precision,
        )
        f = aesara.function([], grad_i, mode=mode_with_gpu)
        # If conv_mode is 'conv' the reference implementation should use
        # filters flipped according to the width, height and time axis
        if conv_mode == "conv":
            if filters.ndim == 5:
                flipped_filters = filters[:, :, ::-1, ::-1, ::-1]
            else:
                flipped_filters = filters[:, :, ::-1, ::-1]
        else:
            flipped_filters = filters
        # Compile an Aesara function for the reference implementation
        grad_i_ref = self.cpu_gradinput_class(
            border_mode=border_mode, subsample=subsample, filter_dilation=dilation
        )(ref_cast(flipped_filters), ref_cast(topgrad), inputs_shape[2:])
        f_ref = aesara.function([], grad_i_ref, mode="FAST_RUN")
        # Compare the results of the two implementations
        res_ref = f_ref()
        res = np.asarray(f())
        if algo in cudnn.deterministic_bwd_data_algorithms:
            # Deterministic algorithms must return the same output twice.
            utt.assert_allclose(res, np.asarray(f()))
        atol, rtol = self.get_atol_rtol(algo, dtype, precision)
        if beta == 0:
            cpu_res = alpha * res_ref
        else:
            cpu_res = alpha * res_ref + beta * inputs_val
        self.scale_numpy_arrays_inplace(cpu_res, res, alpha)
        utt.assert_allclose(cpu_res, res, rtol=rtol, atol=atol)

    def run_conv_gradweight(self, algo, dtype, precision, parameters):
        """Run one gradient-wrt-weights case on GPU and compare against the
        CPU reference implementation."""
        (
            inputs_shape,
            filters_shape,
            subsample,
            dilation,
            border_mode,
            conv_mode,
            alpha,
            beta,
        ) = parameters
        inputs_val = np.random.random(inputs_shape).astype(dtype)
        if beta == 0:
            filters_val = None
        else:
            filters_val = np.random.random(filters_shape).astype(dtype)
            filters_val /= 10
        topgrad_val = self.array_like_conv_output(
            inputs_shape, filters_shape, border_mode, subsample, dilation, dtype
        )
        # Scale down the input values to prevent absolute errors in utt.assert_allclose.
        inputs_val /= 10
        topgrad_val /= 10
        inputs = aesara.shared(inputs_val)
        topgrad = aesara.shared(topgrad_val)
        # Compile an Aesara function for the cuDNN implementation
        grad_w = dnn_gradweight(
            inputs,
            topgrad,
            filters_shape,
            alpha=alpha,
            beta=beta,
            out=filters_val,
            border_mode=border_mode,
            subsample=subsample,
            dilation=dilation,
            conv_mode=conv_mode,
            algo=algo,
            precision=precision,
        )
        f = aesara.function([], grad_w, mode=mode_with_gpu)
        # Compile an Aesara function for the reference implementation
        grad_w_ref = self.cpu_gradweight_class(
            border_mode=border_mode, subsample=subsample, filter_dilation=dilation
        )(ref_cast(inputs), ref_cast(topgrad), filters_shape[2:])
        # In 'conv' mode the reference gradient must be flipped back.
        if conv_mode == "conv":
            if inputs.ndim == 5:
                grad_w_ref = grad_w_ref[:, :, ::-1, ::-1, ::-1]
            else:
                grad_w_ref = grad_w_ref[:, :, ::-1, ::-1]
        f_ref = aesara.function([], grad_w_ref, mode="FAST_RUN")
        # Compare the results of the two implementations
        res_ref = f_ref()
        res = np.asarray(f())
        if algo in cudnn.deterministic_bwd_filter_algorithms:
            # Deterministic algorithms must return the same output twice.
            utt.assert_allclose(res, np.asarray(f()))
        atol, rtol = self.get_atol_rtol(algo, dtype, precision)
        if beta == 0:
            cpu_res = alpha * res_ref
        else:
            cpu_res = alpha * res_ref + beta * filters_val
        self.scale_numpy_arrays_inplace(cpu_res, res, alpha)
        utt.assert_allclose(cpu_res, res, rtol=rtol, atol=atol)

    def should_fail(self, function, *args):
        """Assert that function(*args) raises an exception."""
        try:
            print("(should fail)", file=sys.stderr, end=" ")
            function(*args)
        except Exception:
            pass
        else:
            # BUG FIX: this previously referenced the *builtin* `callable`
            # instead of the `function` argument, so the failing case was
            # always reported as "callable".
            raise AssertionError("Should fail", function.__name__, *args)

    def should_fail_fwd(self, *args):
        self.should_fail(self.run_conv_fwd, *args)

    def should_fail_gradinput(self, *args):
        self.should_fail(self.run_conv_gradinput, *args)

    def should_fail_gradweight(self, *args):
        self.should_fail(self.run_conv_gradweight, *args)

    def get_expected_tcount(self):
        """Utility function to get expected test count without actually running pytest.

        NOTE(review): this iterates the test_* methods as if they were
        generators; as plain methods returning None this raises TypeError —
        confirm intended usage before relying on it.
        """
        return (
            sum(1 for t in self.test_fwd())
            + sum(1 for t in self.test_gradweight())
            + sum(1 for t in self.test_gradinput())
            + sum(1 for t in self.test_fwd_runtime_algorithms())
            + sum(1 for t in self.test_gradweight_runtime_algorithms())
            + sum(1 for t in self.test_gradinput_runtime_algorithms())
        )

    # Iterable test methods.

    def test_fwd(self):
        """Exhaustively test forward convolution for every supported algo/config."""
        for dtype, precision in self.dtype_configs:
            algos = [
                algo
                for algo in self.fwd_algorithms
                if cudnn.fwd_algo_supports_dtype_config(
                    algo, dtype, precision, self.ndim
                )
            ]
            for algo in algos:
                for parameters in cudnn_conv_case_generator.fwd(
                    algo, self.ndim, dtype, precision
                ).get_cases():
                    self.run_conv_fwd(algo, dtype, precision, parameters)
            if algos:
                # Some algorithms support current data type configuration for current ndim.
                # So, an algorithm could be chosen at runtime.
                for algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME:
                    for parameters in cudnn_conv_case_generator.fwd(
                        algo, self.ndim, dtype, precision
                    ).get_cases():
                        self.run_conv_fwd(algo, dtype, precision, parameters)
        for dnn_case in self.special_cases:
            if dnn_case.is_fwd():
                # BUG FIX: get_case() returns an (algo, dtype, precision,
                # parameters) tuple which must be unpacked into the call.
                if dnn_case.should_fail:
                    self.should_fail_fwd(*dnn_case.get_case())
                else:
                    self.run_conv_fwd(*dnn_case.get_case())

    def test_gradinput(self):
        """Exhaustively test the inputs gradient for every supported algo/config."""
        for dtype, precision in self.dtype_configs:
            algos = [
                algo
                for algo in self.bwd_data_algorithms
                if cudnn.bwd_data_algo_supports_dtype_config(
                    algo, dtype, precision, self.ndim
                )
            ]
            for algo in algos:
                for parameters in cudnn_conv_case_generator.gi(
                    algo, self.ndim, dtype, precision
                ).get_cases():
                    self.run_conv_gradinput(algo, dtype, precision, parameters)
            if algos:
                # Some algorithms support current data type configuration for current ndim.
                # So, an algorithm could be chosen at runtime.
                for algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME:
                    for parameters in cudnn_conv_case_generator.gi(
                        algo, self.ndim, dtype, precision
                    ).get_cases():
                        self.run_conv_gradinput(algo, dtype, precision, parameters)
        for dnn_case in self.special_cases:
            if dnn_case.is_bwd_data():
                # BUG FIX: unpack the case tuple (see test_fwd).
                if dnn_case.should_fail:
                    self.should_fail_gradinput(*dnn_case.get_case())
                else:
                    self.run_conv_gradinput(*dnn_case.get_case())

    def test_gradweight(self):
        """Exhaustively test the weights gradient for every supported algo/config."""
        for dtype, precision in self.dtype_configs:
            algos = [
                algo
                for algo in self.bwd_filter_algorithms
                if cudnn.bwd_filter_algo_supports_dtype_config(
                    algo, dtype, precision, self.ndim
                )
            ]
            for algo in algos:
                for parameters in cudnn_conv_case_generator.gw(
                    algo, self.ndim, dtype, precision
                ).get_cases():
                    self.run_conv_gradweight(algo, dtype, precision, parameters)
            if algos:
                # Some algorithms support current data type configuration for current ndim.
                # So, an algorithm could be chosen at runtime.
                for algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME:
                    for parameters in cudnn_conv_case_generator.gw(
                        algo, self.ndim, dtype, precision
                    ).get_cases():
                        self.run_conv_gradweight(algo, dtype, precision, parameters)
        for dnn_case in self.special_cases:
            if dnn_case.is_bwd_filter():
                # BUG FIX: unpack the case tuple (see test_fwd).
                if dnn_case.should_fail:
                    self.should_fail_gradweight(*dnn_case.get_case())
                else:
                    self.run_conv_gradweight(*dnn_case.get_case())

    # The 3 following tests are intended to be run with aesara flag `cmodule__debug=True`.
    # The output message should then be analyzed to check if runtime algorithms are
    # reused, reloaded from cache or updated, depending on what we expect from
    # dnn_fwd/dnn_gi/dnn_gw current codes. I currently don't know a better way
    # to efficiently test implemented cuDNN convolution caches.

    def test_fwd_runtime_algorithms(self):
        """Check runtime-chosen forward algorithms across changing shapes."""
        dtype = "float32"
        unit_shape = (1,) * self.ndim
        _broadcastable = [False] * (2 + self.ndim)

        def run_fwd_runtime_algorithm(algo):
            inputs = TensorType(dtype, _broadcastable)()
            filters = TensorType(dtype, _broadcastable)()
            # Scale down the input values to prevent very large absolute errors
            # due to float rounding
            lower_inputs = inputs / 10
            lower_filters = filters / 10
            conv = dnn_conv(
                img=lower_inputs,
                kerns=lower_filters,
                algo=algo,
                precision=dtype,
                subsample=unit_shape,
                dilation=unit_shape,
            )
            f = aesara.function([inputs, filters], conv, mode=mode_with_gpu)
            if self.ndim == 3:
                flipped_filters = lower_filters[:, :, ::-1, ::-1, ::-1]
            else:
                flipped_filters = lower_filters[:, :, ::-1, ::-1]
            conv_ref = self.cpu_conv_class(subsample=unit_shape)(
                ref_cast(lower_inputs), flipped_filters
            )
            f_ref = aesara.function([inputs, filters], conv_ref, mode="FAST_RUN")
            runtime_shapes = self.runtime_shapes
            if algo in ("time_once", "guess_once"):
                # "*_once" algorithms select an algorithm a single time:
                # one shape, run several times, is enough.
                runtime_shapes = [list(runtime_shapes[0])]
                runtime_shapes[0][0] = 5
            for ntimes, (inputs_shape, filters_shape) in runtime_shapes:
                print("Shapes:", inputs_shape, filters_shape)
                for i in range(ntimes):
                    inputs_val = np.random.random(inputs_shape).astype(dtype)
                    filters_val = np.random.random(filters_shape).astype(dtype)
                    gpu_res = np.asarray(f(inputs_val, filters_val))
                    cpu_res = f_ref(inputs_val, filters_val)
                    self.scale_numpy_arrays_inplace(cpu_res, gpu_res, 1)
                    utt.assert_allclose(cpu_res, gpu_res)

        for algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME:
            run_fwd_runtime_algorithm(algo)

    def test_gradinput_runtime_algorithms(self):
        """Check runtime-chosen bwd-data algorithms across changing shapes."""
        dtype = "float32"
        unit_shape = (1,) * self.ndim
        _broadcastable = [False] * (2 + self.ndim)

        def run_gradinput_runtime_algorithm(algo):
            # CONSISTENCY FIX: use change_flags (like the grad-weight test
            # below) instead of permanently mutating the global config.
            with aesara.config.change_flags(dnn__conv__algo_bwd_data=algo):
                inputs = TensorType(dtype, _broadcastable)()
                filters = TensorType(dtype, _broadcastable)()
                conv = dnn_conv(
                    img=inputs,
                    kerns=filters,
                    algo=algo,
                    precision=dtype,
                    subsample=unit_shape,
                    dilation=unit_shape,
                )
                grad_i = aesara.gradient.grad(conv.sum(), [inputs])
                f = aesara.function([inputs, filters], grad_i, mode=mode_with_gpu)
                # Exactly one bwd-data node, and no fwd / bwd-filter nodes.
                assert 1 == len(
                    [
                        node
                        for node in f.maker.fgraph.apply_nodes
                        if isinstance(node.op, GpuDnnConvGradI)
                    ]
                )
                assert not any(
                    isinstance(node.op, GpuDnnConv)
                    for node in f.maker.fgraph.apply_nodes
                )
                assert not any(
                    isinstance(node.op, GpuDnnConvGradW)
                    for node in f.maker.fgraph.apply_nodes
                )
                if self.ndim == 3:
                    flipped_filters = filters[:, :, ::-1, ::-1, ::-1]
                else:
                    flipped_filters = filters[:, :, ::-1, ::-1]
                conv_ref = self.cpu_conv_class(subsample=unit_shape)(
                    ref_cast(inputs), flipped_filters
                )
                grad_i_ref = aesara.gradient.grad(conv_ref.sum(), [inputs])
                f_ref = aesara.function([inputs, filters], grad_i_ref, mode="FAST_RUN")
                runtime_shapes = self.runtime_shapes
                if algo in ("time_once", "guess_once"):
                    # "*_once" algorithms: one shape, run several times.
                    runtime_shapes = [list(runtime_shapes[0])]
                    runtime_shapes[0][0] = 5
                for ntimes, (inputs_shape, filters_shape) in runtime_shapes:
                    print("Shapes:", inputs_shape, filters_shape)
                    for i in range(ntimes):
                        inputs_val = np.random.random(inputs_shape).astype(dtype)
                        filters_val = np.random.random(filters_shape).astype(dtype)
                        gpu_res = f(inputs_val, filters_val)
                        cpu_res = f_ref(inputs_val, filters_val)
                        utt.assert_allclose(cpu_res, np.asarray(gpu_res))

        for algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME:
            run_gradinput_runtime_algorithm(algo)

    def test_gradweight_runtime_algorithms(self):
        """Check runtime-chosen bwd-filter algorithms across changing shapes."""
        dtype = "float32"
        unit_shape = (1,) * self.ndim
        _broadcastable = [False] * (2 + self.ndim)

        def run_gradweight_runtime_algorithm(algo):
            with aesara.config.change_flags(dnn__conv__algo_bwd_filter=algo):
                inputs = TensorType(dtype, _broadcastable)()
                filters = TensorType(dtype, _broadcastable)()
                conv = dnn_conv(
                    img=inputs,
                    kerns=filters,
                    algo=algo,
                    precision=dtype,
                    subsample=unit_shape,
                    dilation=unit_shape,
                )
                grad_w = aesara.gradient.grad(conv.sum(), [filters])
                f = aesara.function([inputs, filters], grad_w, mode=mode_with_gpu)
                # Exactly one bwd-filter node, and no fwd / bwd-data nodes.
                assert 1 == len(
                    [
                        node
                        for node in f.maker.fgraph.apply_nodes
                        if isinstance(node.op, GpuDnnConvGradW)
                    ]
                )
                assert not any(
                    isinstance(node.op, GpuDnnConv)
                    for node in f.maker.fgraph.apply_nodes
                )
                assert not any(
                    isinstance(node.op, GpuDnnConvGradI)
                    for node in f.maker.fgraph.apply_nodes
                )
                if self.ndim == 3:
                    flipped_filters = filters[:, :, ::-1, ::-1, ::-1]
                else:
                    flipped_filters = filters[:, :, ::-1, ::-1]
                conv_ref = self.cpu_conv_class(subsample=unit_shape)(
                    ref_cast(inputs), flipped_filters
                )
                grad_w_ref = aesara.gradient.grad(conv_ref.sum(), [filters])
                f_ref = aesara.function([inputs, filters], grad_w_ref, mode="FAST_RUN")
                runtime_shapes = self.runtime_shapes
                if algo in ("time_once", "guess_once"):
                    # "*_once" algorithms: one shape, run several times.
                    runtime_shapes = [list(runtime_shapes[0])]
                    runtime_shapes[0][0] = 5
                for ntimes, (inputs_shape, filters_shape) in runtime_shapes:
                    print("Shapes:", inputs_shape, filters_shape)
                    for i in range(ntimes):
                        inputs_val = np.random.random(inputs_shape).astype(dtype)
                        filters_val = np.random.random(filters_shape).astype(dtype)
                        gpu_res = f(inputs_val, filters_val)
                        cpu_res = f_ref(inputs_val, filters_val)
                        utt.assert_allclose(cpu_res, np.asarray(gpu_res))

        for algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME:
            run_gradweight_runtime_algorithm(algo)
class TestDnnConv2D(BaseTestDnnConv):
    # Concrete 2-D instantiation of the shared cuDNN convolution test suite.
    ndim = 2
    # Every cuDNN algorithm alias for fwd / bwd-filter / bwd-data in 2-D.
    fwd_algorithms = cudnn.cudnnConvolutionFwdAlgo_t.get_aliases()
    bwd_filter_algorithms = cudnn.cudnnConvolutionBwdFilterAlgo_t.get_aliases()
    bwd_data_algorithms = cudnn.cudnnConvolutionBwdDataAlgo_t.get_aliases()
    # CPU (corrMM) reference ops used to validate GPU results.
    cpu_conv_class = CorrMM
    cpu_gradinput_class = CorrMM_gradInputs
    cpu_gradweight_class = CorrMM_gradWeights
    # Hand-picked shape/algo combinations that exercised known cuDNN issues.
    special_cases = [
        ConvCase.bwd_filter(
            algo="deterministic",
            dtype="float32",
            precision="float32",
            inputs_shape=(1, 1, 541211, 10),
            filters_shape=(50, 1, 3, 10),
            border_mode=(1, 0),
            should_fail=(cudnn.version <= 6),
        ),
        ConvCase.fwd(
            algo="small",
            dtype="float32",
            precision="float32",
            inputs_shape=(65536, 2, 2, 2),
            filters_shape=(1, 2, 2, 2),
        ),
        # NB: Due to current workaround (see dnn_fwd.c), this test won't fail for cuDNN < v6100.
        ConvCase.fwd(
            algo="small",
            dtype="float32",
            precision="float32",
            inputs_shape=(65537, 2, 2, 2),
            filters_shape=(1, 2, 2, 2),
        ),
    ]
    # (ntimes, [inputs_shape, filters_shape]) pairs fed to the runtime-algo
    # tests; repeated entries are expected to hit the algorithm cache.
    runtime_shapes = [
        (3, [(2, 3, 10, 9), (5, 3, 7, 7)]),
        (1, [(1, 1, 100, 200), (1, 1, 50, 200)]),
        (1, [(4, 2, 20, 20), (2, 2, 20, 19)]),
        (3, [(2, 3, 10, 9), (5, 3, 7, 7)]), # cache should be used
        (1, [(2, 2, 50, 50), (5, 2, 25, 31)]),
        (1, [(1, 1, 100, 200), (1, 1, 50, 200)]), # cache should be used
        (1, [(4, 2, 20, 20), (2, 2, 20, 19)]), # cache should be used
        (1, [(1, 2, 3, 4), (6, 2, 2, 1)]),
    ]
class TestDnnConv3D(BaseTestDnnConv):
    # Concrete 3-D instantiation of the shared cuDNN convolution test suite.
    ndim = 3
    # 3-D-capable algorithm subsets exposed by the cudnn definitions module.
    fwd_algorithms = cudnn.conv3d_fwd_algorithms
    bwd_filter_algorithms = cudnn.conv3d_bwd_filter_algorithms
    bwd_data_algorithms = cudnn.conv3d_bwd_data_algorithms
    # CPU (corr3dMM) reference ops used to validate GPU results.
    cpu_conv_class = Corr3dMM
    cpu_gradinput_class = Corr3dMMGradInputs
    cpu_gradweight_class = Corr3dMMGradWeights
    # Shape/algo combinations that exercised known cuDNN issues.
    special_cases = [
        ConvCase.fwd(
            algo="small",
            dtype="float32",
            precision="float32",
            inputs_shape=(65536, 2, 2, 2, 2),
            filters_shape=(1, 2, 2, 2, 2),
        ),
        # NB: Due to current workaround (see dnn_fwd.c), this test won't fail for cuDNN < v6100.
        ConvCase.fwd(
            algo="small",
            dtype="float32",
            precision="float32",
            inputs_shape=(65537, 2, 2, 2, 2),
            filters_shape=(1, 2, 2, 2, 2),
        ),
    ]
    # (ntimes, [inputs_shape, filters_shape]) pairs fed to the runtime-algo
    # tests; repeated entries are expected to hit the algorithm cache.
    runtime_shapes = [
        (3, [(2, 3, 5, 10, 9), (5, 3, 4, 7, 7)]),
        (1, [(1, 1, 5, 100, 200), (1, 1, 4, 50, 200)]),
        (1, [(4, 2, 20, 20, 20), (2, 2, 20, 19, 18)]),
        (3, [(2, 3, 5, 10, 9), (5, 3, 4, 7, 7)]), # cache should be used
        (1, [(2, 2, 50, 50, 5), (5, 2, 25, 31, 4)]),
        (1, [(1, 1, 5, 100, 200), (1, 1, 4, 50, 200)]), # cache should be used
        (1, [(4, 2, 20, 20, 20), (2, 2, 20, 19, 18)]), # cache should be used
        (1, [(1, 2, 3, 4, 5), (6, 2, 3, 2, 1)]),
    ]
def test_true_half_config_support():
    """Assert-free probe: skip unless the GPU supports cuDNN's
    TRUE_HALF_CONFIG (float16 data with float16 precision).

    For cuDNN V5.1 and V6.0, TRUE_HALF_CONFIG is only supported on
    architectures with true fp16 support (compute capability 5.3 and 6.0).
    """
    supported = check_dtype_config_support("float16", "float16")
    if not supported:
        pytest.skip("FWD: TRUE_HALF_CONFIG not supported on this GPU.")
class CheckDnn:
    """Helpers for the script entry points: pretty-print the supported
    dtype configurations, the per-direction algorithms, and the list of
    generated test cases.
    """

    @staticmethod
    def dtype_config_to_str(dtype_config):
        """Map a ``(dtype, precision)`` pair to its cuDNN config name."""
        dtype, precision = dtype_config
        known = {
            ("float16", "float16"): "TRUE_HALF_CONFIG",
            ("float16", "float32"): "PSEUDO_HALF_CONFIG",
            ("float32", "float32"): "FLOAT_CONFIG",
            ("float64", "float64"): "DOUBLE_CONFIG",
        }
        label = known.get((dtype, precision))
        if label is None:
            raise ValueError("unknown data type configuration", dtype_config)
        return label

    @staticmethod
    def print_infos(count_tests=True):
        """Print supported configurations and algorithms for the current
        GPU/cuDNN, and optionally the number of generated test cases."""
        conv2d = TestDnnConv2D()
        conv3d = TestDnnConv3D()
        print()
        print(
            "Available data type configurations:",
            ", ".join(
                CheckDnn.dtype_config_to_str(d)
                for d in cudnn.get_supported_dtype_configs(check_dtype_config_support)
            ),
        )
        print()
        for header, suite in (("2D algorithms:", conv2d), ("3D algorithms:", conv3d)):
            print(header)
            print("FWD :", ", ".join(suite.fwd_algorithms))
            print("BWD FILTER :", ", ".join(suite.bwd_filter_algorithms))
            print("BWD DATA :", ", ".join(suite.bwd_data_algorithms))
            print()
        if count_tests:
            n2d = conv2d.get_expected_tcount()
            n3d = conv3d.get_expected_tcount()
            print(n2d, "conv2D test cases.")
            print(n3d, "conv3D test cases.")
            print("1 supplementary test.")
            print(n2d + n3d + 1, "total conv tests.")
            print()

    @staticmethod
    def print_tests():
        """List every generated test case without running any of them."""
        for suite in (TestDnnConv2D(), TestDnnConv3D()):
            case_sources = (
                suite.test_fwd,
                suite.test_gradinput,
                suite.test_gradweight,
                suite.test_fwd_runtime_algorithms,
                suite.test_gradinput_runtime_algorithms,
                suite.test_gradweight_runtime_algorithms,
            )
            for source in case_sources:
                for tcase in source():
                    print(tcase[0].__name__, *tcase[1:])
        print(test_true_half_config_support.__name__)
import pytest
import aesara.gpuarray
import aesara.tensor
# Skip this whole test module when pygpu is not importable at all.
if aesara.gpuarray.pygpu is None:
    pytest.skip("pygpu not installed", allow_module_level=True)
init_error = None
# Try to initialize a CUDA context unless a device was already forced;
# remember the failure so the skip message below shows the real cause.
if not aesara.gpuarray.pygpu_activated and not aesara.config.force_device:
    try:
        aesara.gpuarray.init_dev("cuda")
    except Exception as e:
        init_error = e
if not aesara.gpuarray.pygpu_activated:
    if init_error:
        pytest.skip(str(init_error), allow_module_level=True)
    else:
        pytest.skip("pygpu disabled", allow_module_level=True)
# Context name shared by the gpuarray tests (None here).
test_ctx_name = None
# Build GPU / non-GPU compilation modes; when the configured mode is
# FAST_COMPILE, FAST_RUN-based modes are used instead.
if aesara.config.mode == "FAST_COMPILE":
    mode_with_gpu = (
        aesara.compile.mode.get_mode("FAST_RUN").including("gpuarray").excluding("gpu")
    )
    mode_without_gpu = aesara.compile.mode.get_mode("FAST_RUN").excluding("gpuarray")
else:
    mode_with_gpu = (
        aesara.compile.mode.get_default_mode().including("gpuarray").excluding("gpu")
    )
    mode_without_gpu = aesara.compile.mode.get_default_mode().excluding("gpuarray")
# Skip the Python-implementation consistency check on the CPU mode.
mode_without_gpu.check_py_code = False
def ref_cast(x):
    """Return *x* cast to float32 when it is a float16 tensor.

    Reference (CPU) graphs are compared in float32, so float16 inputs are
    upcast; every other dtype passes through unchanged.
    """
    if x.type.dtype == "float16":
        return aesara.tensor.cast(x, "float32")
    return x
import numpy as np
import aesara
from aesara.tensor.math import dot, sigmoid, tanh
class Model:
    """Container tracking layers, their parameters, and extra updates."""

    def __init__(self, name=""):
        self.name = name
        self.layers = []
        self.params = []
        self.other_updates = {}

    def add_layer(self, layer):
        """Register *layer*: collect its params and, when present, its
        ``other_updates`` (an iterable of (variable, update) pairs)."""
        self.layers.append(layer)
        self.params.extend(layer.params)
        if hasattr(layer, "other_updates"):
            for pair in layer.other_updates:
                self.other_updates[pair[0]] = pair[1]

    def get_params(self):
        return self.params
def uniform(stdev, size):
    """Sample a zero-mean uniform array with standard deviation *stdev*
    and shape *size*, cast to aesara's floatX.

    A uniform on [-b, b] has stdev b/sqrt(3), hence the sqrt(3) bound.
    """
    bound = stdev * np.sqrt(3)
    samples = np.random.uniform(low=-bound, high=bound, size=size)
    return samples.astype(aesara.config.floatX)
def linear_transform_weights(input_dim, output_dim, param_list=None, name=""):
    """Create an aesara shared weight matrix of shape (input_dim, output_dim),
    He-style initialized, and append it to *param_list*.

    Raises
    ------
    AssertionError
        If *param_list* is None (the weight must be registered somewhere).
    """
    # Fail fast: validate before allocating the shared variable (the
    # original asserted only after the allocation side effect).
    assert param_list is not None
    weight_initialization = uniform(np.sqrt(2.0 / input_dim), (input_dim, output_dim))
    W = aesara.shared(weight_initialization, name=name)
    param_list.append(W)
    return W
def bias_weights(length, param_list=None, name=""):
    """Create a zero-initialized aesara shared bias of shape *length*,
    registering it in *param_list* when one is given."""
    zeros = np.zeros(length).astype(aesara.config.floatX)
    bias = aesara.shared(zeros, name=name)
    if param_list is not None:
        param_list.append(bias)
    return bias
class Layer:
    """Base class for all layers: a name plus a list of parameters."""

    def __init__(self, name=""):
        self.name = name
        self.params = []

    def get_params(self):
        """Return the layer's registered shared parameters."""
        return self.params
class GRU(Layer):
    # Gated Recurrent Unit layer: builds gate weights as shared variables
    # and scans a step function over the input sequence.
    def __init__(self, input_dim, output_dim, input_layer, s0=None, name=""):
        """Create all gate weights/biases and build the scan graph.

        *input_layer* must expose ``output()`` returning the sequence to
        scan over; *s0* is the initial hidden state passed to ``scan``.
        """
        self.name = name
        self.input_dim = input_dim
        self.hidden_dim = output_dim
        self.output_dim = output_dim
        self.input_layer = input_layer
        self.X = input_layer.output()
        self.s0 = s0
        self.params = []
        """Layers weights"""
        """self.params is passed so that any parameters could be appended to it"""
        # Input-to-hidden weights/biases for the reset (r), update (i)
        # and candidate (h) gates.
        self.W_r = linear_transform_weights(
            input_dim, output_dim, param_list=self.params, name=name + ".W_r"
        )
        self.b_wr = bias_weights(
            (output_dim,), param_list=self.params, name=name + ".b_wr"
        )
        self.W_i = linear_transform_weights(
            input_dim, output_dim, param_list=self.params, name=name + ".W_i"
        )
        self.b_wi = bias_weights(
            (output_dim,), param_list=self.params, name=name + ".b_wi"
        )
        self.W_h = linear_transform_weights(
            input_dim, output_dim, param_list=self.params, name=name + ".W_h"
        )
        self.b_wh = bias_weights(
            (output_dim,), param_list=self.params, name=name + ".b_wh"
        )
        # Hidden-to-hidden (recurrent) weights/biases.
        self.R_r = linear_transform_weights(
            output_dim, output_dim, param_list=self.params, name=name + ".R_r"
        )
        self.b_rr = bias_weights(
            (output_dim,), param_list=self.params, name=name + ".b_rr"
        )
        self.R_i = linear_transform_weights(
            output_dim, output_dim, param_list=self.params, name=name + ".R_i"
        )
        # NOTE(review): recurrent bias for the update gate is named "b_ru"
        # (not "b_ri"); kept as-is since the step function uses it.
        self.b_ru = bias_weights(
            (output_dim,), param_list=self.params, name=name + ".b_ru"
        )
        self.R_h = linear_transform_weights(
            output_dim, output_dim, param_list=self.params, name=name + ".R_h"
        )
        self.b_rh = bias_weights(
            (output_dim,), param_list=self.params, name=name + ".b_rh"
        )
        """step through processed input to create output"""
        def step(inp, s_prev):
            # Update gate (i_t) and reset gate (r_t).
            i_t = sigmoid(
                dot(inp, self.W_i) + dot(s_prev, self.R_i) + self.b_wi + self.b_ru
            )
            r_t = sigmoid(
                dot(inp, self.W_r) + dot(s_prev, self.R_r) + self.b_wr + self.b_rr
            )
            # Candidate state: reset gate scales the recurrent contribution.
            h_hat_t = tanh(
                dot(inp, self.W_h)
                + (r_t * (dot(s_prev, self.R_h) + self.b_rh))
                + self.b_wh
            )
            # Convex combination of candidate and previous state.
            s_curr = ((1.0 - i_t) * h_hat_t) + (i_t * s_prev)
            return s_curr
        outputs_info = self.s0
        states, updates = aesara.scan(
            fn=step, sequences=[self.X], outputs_info=outputs_info
        )
        self.Y = states
    def output(self):
        # Sequence of hidden states produced by the scan.
        return self.Y
class LSTM(Layer):
    # Long Short-Term Memory layer: builds gate weights as shared variables
    # and scans a step function over the input sequence.
    def __init__(self, input_dim, output_dim, input_layer, s0=None, c0=None, name=""):
        """Create all gate weights/biases and build the scan graph.

        *input_layer* must expose ``output()``; *s0* and *c0* are the
        initial hidden and cell states passed to ``scan``.
        """
        self.name = name
        self.input_dim = input_dim
        self.hidden_dim = output_dim
        self.output_dim = output_dim
        self.input_layer = input_layer
        self.X = input_layer.output()
        self.s0 = s0
        self.c0 = c0
        self.params = []
        """Layers weights"""
        """self.params is passed so that any parameters could be appended to it"""
        # Input-to-hidden weights/biases: input (i), forget (f),
        # cell candidate (c) and output (o) gates.
        self.W_i = linear_transform_weights(
            input_dim, output_dim, param_list=self.params, name=name + ".W_i"
        )
        self.b_wi = bias_weights(
            (output_dim,), param_list=self.params, name=name + ".b_wi"
        )
        self.W_f = linear_transform_weights(
            input_dim, output_dim, param_list=self.params, name=name + ".W_f"
        )
        self.b_wf = bias_weights(
            (output_dim,), param_list=self.params, name=name + ".b_wf"
        )
        self.W_c = linear_transform_weights(
            input_dim, output_dim, param_list=self.params, name=name + ".W_c"
        )
        self.b_wc = bias_weights(
            (output_dim,), param_list=self.params, name=name + ".b_wc"
        )
        self.W_o = linear_transform_weights(
            input_dim, output_dim, param_list=self.params, name=name + ".W_o"
        )
        self.b_wo = bias_weights(
            (output_dim,), param_list=self.params, name=name + ".b_wo"
        )
        # Hidden-to-hidden (recurrent) weights/biases for the same gates.
        self.R_i = linear_transform_weights(
            output_dim, output_dim, param_list=self.params, name=name + ".R_i"
        )
        self.b_ri = bias_weights(
            (output_dim,), param_list=self.params, name=name + ".b_ri"
        )
        self.R_f = linear_transform_weights(
            output_dim, output_dim, param_list=self.params, name=name + ".R_f"
        )
        self.b_rf = bias_weights(
            (output_dim,), param_list=self.params, name=name + ".b_rf"
        )
        self.R_c = linear_transform_weights(
            output_dim, output_dim, param_list=self.params, name=name + ".R_c"
        )
        self.b_rc = bias_weights(
            (output_dim,), param_list=self.params, name=name + ".b_rc"
        )
        self.R_o = linear_transform_weights(
            output_dim, output_dim, param_list=self.params, name=name + ".R_o"
        )
        self.b_ro = bias_weights(
            (output_dim,), param_list=self.params, name=name + ".b_ro"
        )
        """step through processed input to create output"""
        def step(x_t, h_tm1, c_tm1):
            # Gate activations from input x_t and previous hidden state.
            i_t = sigmoid(
                dot(x_t, self.W_i) + dot(h_tm1, self.R_i) + self.b_wi + self.b_ri
            )
            f_t = sigmoid(
                dot(x_t, self.W_f) + dot(h_tm1, self.R_f) + self.b_wf + self.b_rf
            )
            o_t = sigmoid(
                dot(x_t, self.W_o) + dot(h_tm1, self.R_o) + self.b_ro + self.b_wo
            )
            c_hat_t = tanh(
                dot(x_t, self.W_c) + dot(h_tm1, self.R_c) + self.b_wc + self.b_rc
            )
            # Cell state: forget old content, add gated candidate.
            c_t = f_t * c_tm1 + i_t * c_hat_t
            h_t = o_t * tanh(c_t)
            return h_t, c_t
        outputs_info = [self.s0, self.c0]
        states, updates = aesara.scan(
            fn=step, sequences=[self.X], outputs_info=outputs_info
        )
        # scan returns (hidden states, cell states) per the two outputs_info.
        self.Y = states[0]
        self.C = states[1]
    def output(self):
        # Sequence of hidden states produced by the scan.
        return self.Y
class FC(Layer):
    """Fully connected layer: output() = dot(X, W) + b."""

    def __init__(self, input_dim, output_dim, input_layer, name=""):
        self.name = name
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.input_layer = input_layer
        self.params = []
        self.X = self.input_layer.output()
        # Weight matrix and bias are registered into self.params.
        self.W = linear_transform_weights(
            input_dim, output_dim, param_list=self.params, name=name + ".W"
        )
        self.b = bias_weights((output_dim,), param_list=self.params, name=name + ".b")

    def output(self):
        """Affine transform of the wrapped layer's output."""
        return dot(self.X, self.W) + self.b
class WrapperLayer(Layer):
    """Adapts a raw symbolic variable to the layer interface (no params)."""

    def __init__(self, X, name=""):
        self.name = name
        self.X = X
        self.params = []

    def output(self):
        """Return the wrapped variable unchanged."""
        return self.X
# This script allows to run one specific cuDNN convolution test case.
# This script should not be imported, but only used as a program.
# python run_dnn_conv.py --help # Print help.
# python run_dnn_conv.py {fwd|bwd-filter|bwd-data} {2d|3d} -a <algo> -i <inputShape> -f <filterShape> ...
import argparse
import sys
import aesara
from aesara.configdefaults import SUPPORTED_DNN_CONV_ALGO_RUNTIME
from aesara.gpuarray.cudnn_defs import (
DOUBLE,
DOUBLE_CONFIG,
FLOAT,
FLOAT_CONFIG,
HALF,
PSEUDO_HALF_CONFIG,
TRUE_HALF_CONFIG,
)
from aesara.tensor.nnet.abstract_conv import get_conv_output_shape
from tests.gpuarray.check_dnn_conv import CheckDnn, TestDnnConv2D, TestDnnConv3D, cudnn
# Guard: this file is a command-line program only; importing it would
# execute the argument parsing and test run below as a side effect.
if __name__ != "__main__":
    raise ImportError("This script must not be imported.")
class TupleAction(argparse.Action):
    """argparse action turning "1,2,3" into the int tuple (1, 2, 3)."""

    def __call__(self, parser, namespace, values, option_string=None):
        parsed = tuple(int(piece) for piece in values.split(","))
        setattr(namespace, self.dest, parsed)
class BorderAction(TupleAction):
    """Border-mode extractor: keeps the named modes verbatim, otherwise
    falls back to TupleAction's comma-separated-int parsing."""

    def __call__(self, parser, namespace, values, option_string=None):
        if values in ("valid", "full", "half"):
            setattr(namespace, self.dest, values)
        else:
            super().__call__(parser, namespace, values, option_string)
# ---- Command-line interface for the one-off convolution runner. ----
args = sys.argv[1:]
computations = FWD, BWD_FILTER, BWD_DATA = ("fwd", "gradweight", "gradinput")
# Union of every cuDNN algorithm alias (fwd + both bwd directions) plus
# the runtime algorithm selectors, deduplicated and sorted.
algorithms = (
    tuple(
        sorted(
            list(
                set(
                    cudnn.cudnnConvolutionFwdAlgo_t.get_aliases()
                    + cudnn.cudnnConvolutionBwdFilterAlgo_t.get_aliases()
                    + cudnn.cudnnConvolutionBwdDataAlgo_t.get_aliases()
                )
            )
        )
    )
    + SUPPORTED_DNN_CONV_ALGO_RUNTIME
)
types = (HALF, FLOAT, DOUBLE)
# Named (dtype, precision) configurations selectable via -D.
data_type_configurations = dict(
    TRUE_HALF_CONFIG=TRUE_HALF_CONFIG,
    PSEUDO_HALF_CONFIG=PSEUDO_HALF_CONFIG,
    FLOAT_CONFIG=FLOAT_CONFIG,
    DOUBLE_CONFIG=DOUBLE_CONFIG,
)
parser = argparse.ArgumentParser()
parser.add_argument("computation", choices=computations, help="Computation to run.")
parser.add_argument(
    "-a",
    "--algo",
    choices=algorithms,
    required=True,
    help="Algorithm to use for computation.",
)
parser.add_argument(
    "-i",
    "--input-shape",
    action=TupleAction,
    required=True,
    help="Input shape. Comma-separated list of integers (no spaces).",
)
parser.add_argument(
    "-f",
    "--filter-shape",
    action=TupleAction,
    required=True,
    help="Filter shape. Comma-separated list of integers (no spaces).",
)
parser.add_argument(
    "-D",
    "--dtype-config",
    choices=list(sorted(data_type_configurations.keys())),
    default=None,
    help="Data type configuration for (data type; precision). Default (aesara floatX; aesara floatX). "
    "To specify data type configuration, you can either use this option or set data type and "
    'precision separately with "-t" and "-p" options.',
)
parser.add_argument(
    "-t",
    "--dtype",
    choices=types,
    default=None,
    help="Data type (default aesara floatX).",
)
parser.add_argument(
    "-p",
    "--precision",
    choices=types,
    default=None,
    help="Precision (default aesara floatX).",
)
parser.add_argument(
    "-s",
    "--subsample",
    action=TupleAction,
    help="Subsample. Comma-separated list of integers (no spaces). "
    "Default: 1 per dimension.",
)
parser.add_argument(
    "-d",
    "--dilation",
    action=TupleAction,
    help="Dilation. Comma-separated list of integers (no spaces). "
    "Default: 1 per dimension.",
)
parser.add_argument(
    "-b",
    "--border-mode",
    default="valid",
    action=BorderAction,
    help='Border mode. "valid" (default), "full", "half" '
    "or a comma-separated list of integers (no spaces).",
)
parser.add_argument(
    "-c",
    "--conv-mode",
    choices=("conv", "cross"),
    default="conv",
    help="Conv mode (default: conv).",
)
parser.add_argument(
    "-A",
    "--alpha",
    type=float,
    default=1,
    help="alpha (floating), must not be zero. Default 1.",
)
parser.add_argument(
    "-B", "--beta", type=float, default=0, help="beta (floating). Default 0."
)
parser.add_argument(
    "-I",
    "--print-infos",
    action="store_true",
    default=False,
    help="Print some infos before testing.",
)
args = parser.parse_args(args)
test = args.computation
# ---- Validate shapes and derive the convolution dimensionality. ----
if len(args.input_shape) != len(args.filter_shape):
    raise ValueError("Expected same length for input shape and filter shape")
if len(args.input_shape) not in (4, 5):
    raise ValueError("Expected length 4 or 5 for input shape")
# 4-length shapes -> 2-D conv, 5-length shapes -> 3-D conv.
ndim = len(args.input_shape) - 2
if ndim == 2:
    tests = TestDnnConv2D()
elif ndim == 3:
    tests = TestDnnConv3D()
if args.subsample is None:
    args.subsample = (1,) * ndim
if args.dilation is None:
    args.dilation = (1,) * ndim
if not (ndim == len(args.subsample) == len(args.dilation)):
    raise ValueError(f"Expected parameters sized for {int(ndim)} dimensions.")
if isinstance(args.border_mode, tuple) and ndim != len(args.border_mode):
    raise ValueError(f"Expected borders sized for {int(ndim)} dimensions.")
if args.alpha == 0:
    raise ValueError("Nothing could be computed if alpha is 0.")
# ---- Resolve dtype/precision: either -D or (-t, -p), never both. ----
if args.dtype_config is None:
    if args.dtype is None:
        args.dtype = aesara.config.floatX
    if args.precision is None:
        args.precision = aesara.config.floatX
else:
    if args.dtype is not None or args.precision is not None:
        raise ValueError(
            "You must specify either -D <data-type-configuration> "
            "or (-t <data-type> -p <precision>), not both."
        )
    args.dtype, args.precision = data_type_configurations[args.dtype_config]
if (args.dtype, args.precision) not in cudnn.get_supported_dtype_configs():
    raise ValueError(
        f"Unsupported data type configuration {args.dtype} {args.precision}."
    )
# Warn (without aborting) when a specific algorithm does not normally
# support the requested dtype configuration; runtime selectors are exempt.
if args.algo not in SUPPORTED_DNN_CONV_ALGO_RUNTIME:
    check_config = False
    if test == FWD:
        check_config = cudnn.fwd_algo_supports_dtype_config(
            args.algo, args.dtype, args.precision, ndim
        )
    elif test == BWD_FILTER:
        check_config = cudnn.bwd_filter_algo_supports_dtype_config(
            args.algo, args.dtype, args.precision, ndim
        )
    elif test == BWD_DATA:
        check_config = cudnn.bwd_data_algo_supports_dtype_config(
            args.algo, args.dtype, args.precision, ndim
        )
    if not check_config:
        print(
            "Warning: %s computation does not normally support configuration (%s, %s) for algo %s."
            % (test, args.dtype, args.precision, args.algo),
            file=sys.stderr,
        )
algo = args.algo
dtype = args.dtype
precision = args.precision
# Positional parameter tuple expected by the run_conv_* test helpers.
parameters = (
    args.input_shape,
    args.filter_shape,
    args.subsample,
    args.dilation,
    args.border_mode,
    args.conv_mode,
    args.alpha,
    args.beta,
)
if args.print_infos:
    CheckDnn.print_infos(count_tests=False)
print("======================")
print("Running", test, algo, dtype, precision, *parameters)
# Dispatch to the requested computation and report its output shape.
if test == FWD:
    tests.run_conv_fwd(algo, dtype, precision, parameters)
    expected_output_shape = get_conv_output_shape(
        args.input_shape,
        args.filter_shape,
        args.border_mode,
        args.subsample,
        args.dilation,
    )
elif test == BWD_FILTER:
    tests.run_conv_gradweight(algo, dtype, precision, parameters)
    expected_output_shape = args.filter_shape
elif test == BWD_DATA:
    tests.run_conv_gradinput(algo, dtype, precision, parameters)
    expected_output_shape = args.input_shape
print("Computed shape:", expected_output_shape)
print("... OK")
import numpy as np
import pytest
pygpu = pytest.importorskip("pygpu")
gpuarray = pygpu.gpuarray
from aesara.gpuarray.blas import (
GpuCorr3dMM,
GpuCorr3dMM_gradInputs,
GpuCorr3dMM_gradWeights,
GpuCorrMM,
GpuCorrMM_gradInputs,
GpuCorrMM_gradWeights,
)
from aesara.gpuarray.dnn import (
GpuDnnConv,
GpuDnnConvGradI,
GpuDnnConvGradW,
dnn_available,
)
from aesara.gpuarray.type import GpuArrayType, get_context, gpuarray_shared_constructor
from tests.gpuarray.config import mode_with_gpu, test_ctx_name
from tests.tensor.nnet.test_abstract_conv import (
BaseTestConv2d,
BaseTestConv3d,
TestConv2dTranspose,
TestConvTypes,
)
# Convenience GPU type: float32 4-D tensor with no broadcastable dims.
gpu_ftensor4 = GpuArrayType(dtype="float32", broadcastable=(False,) * 4)
class TestDnnConv2d(BaseTestConv2d):
    """Generic 2-D abstract-conv test suite executed through the cuDNN ops."""

    @classmethod
    def setup_class(cls):
        super().setup_class()
        cls.shared = staticmethod(gpuarray_shared_constructor)
        # provide_shape is not used by the cuDNN implementation
        cls.provide_shape = [False]

    # BUGFIX: the skip condition was `dnn_available(test_ctx_name)`, which
    # skipped exactly when cuDNN *was* available; `dnn_available.msg` is the
    # message explaining why cuDNN is absent. Skip when NOT available.
    @pytest.mark.skipif(not dnn_available(test_ctx_name), reason=dnn_available.msg)
    def run_test_case(self, i, f, s, b, flip, provide_shape, fd=(1, 1)):
        """Run fwd, gradweight and gradinput for one parameter set, checking
        that the expected GpuDnn op appears in each compiled graph."""
        mode = mode_with_gpu
        if fd != (1, 1):
            pytest.skip("Doesn't have CUDNN implementation")
        o = self.get_output_shape(i, f, s, b, fd)
        self.run_fwd(
            inputs_shape=i,
            filters_shape=f,
            subsample=s,
            verify_grad=True,
            mode=mode,
            provide_shape=provide_shape,
            border_mode=b,
            filter_flip=flip,
            target_op=GpuDnnConv,
        )
        self.run_gradweight(
            inputs_shape=i,
            filters_shape=f,
            output_shape=o,
            subsample=s,
            verify_grad=True,
            mode=mode,
            provide_shape=provide_shape,
            border_mode=b,
            filter_flip=flip,
            target_op=GpuDnnConvGradW,
        )
        self.run_gradinput(
            inputs_shape=i,
            filters_shape=f,
            output_shape=o,
            subsample=s,
            verify_grad=True,
            mode=mode,
            provide_shape=provide_shape,
            border_mode=b,
            filter_flip=flip,
            target_op=GpuDnnConvGradI,
        )

    @pytest.mark.skipif(not dnn_available(test_ctx_name), reason=dnn_available.msg)
    def run_test_case_gi(
        self, i, f, o, s, b, flip, provide_shape, fd=(1, 1), expect_error=False
    ):
        """Run gradinput only; with `expect_error` the call must raise
        instead of verifying gradients."""
        if fd != (1, 1):
            pytest.skip("Doesn't have CUDNN implementation")
        mode = mode_with_gpu
        if not expect_error:
            self.run_gradinput(
                inputs_shape=i,
                filters_shape=f,
                output_shape=o,
                subsample=s,
                verify_grad=True,
                mode=mode,
                provide_shape=provide_shape,
                border_mode=b,
                filter_flip=flip,
                target_op=GpuDnnConvGradI,
                filter_dilation=fd,
            )
        else:
            with pytest.raises((RuntimeError, ValueError)):
                self.run_gradinput(
                    inputs_shape=i,
                    filters_shape=f,
                    output_shape=o,
                    subsample=s,
                    verify_grad=False,
                    mode=mode,
                    provide_shape=provide_shape,
                    border_mode=b,
                    filter_flip=flip,
                    target_op=GpuDnnConvGradI,
                    ref=None,
                    filter_dilation=fd,
                )
class TestDnnConv3d(BaseTestConv3d):
    """Generic 3-D abstract-conv test suite executed through the cuDNN ops."""

    @classmethod
    def setup_class(cls):
        super().setup_class()
        cls.shared = staticmethod(gpuarray_shared_constructor)
        # provide_shape is not used by the cuDNN implementation
        cls.provide_shape = [False]

    # BUGFIX: the skip condition was `dnn_available(test_ctx_name)`, which
    # skipped exactly when cuDNN *was* available; `dnn_available.msg` is the
    # message explaining why cuDNN is absent. Skip when NOT available.
    @pytest.mark.skipif(not dnn_available(test_ctx_name), reason=dnn_available.msg)
    def run_test_case(self, i, f, s, b, flip, provide_shape, fd=(1, 1, 1)):
        """Run fwd, gradweight and gradinput for one parameter set, checking
        that the expected GpuDnn op appears in each compiled graph."""
        mode = mode_with_gpu
        if fd != (1, 1, 1):
            pytest.skip("Doesn't have CUDNN implementation")
        o = self.get_output_shape(i, f, s, b, fd)
        self.run_fwd(
            inputs_shape=i,
            filters_shape=f,
            subsample=s,
            verify_grad=True,
            mode=mode,
            provide_shape=provide_shape,
            border_mode=b,
            filter_flip=flip,
            target_op=GpuDnnConv,
        )
        self.run_gradweight(
            inputs_shape=i,
            filters_shape=f,
            output_shape=o,
            subsample=s,
            verify_grad=True,
            mode=mode,
            provide_shape=provide_shape,
            border_mode=b,
            filter_flip=flip,
            target_op=GpuDnnConvGradW,
        )
        self.run_gradinput(
            inputs_shape=i,
            filters_shape=f,
            output_shape=o,
            subsample=s,
            verify_grad=True,
            mode=mode,
            provide_shape=provide_shape,
            border_mode=b,
            filter_flip=flip,
            target_op=GpuDnnConvGradI,
        )

    @pytest.mark.skipif(not dnn_available(test_ctx_name), reason=dnn_available.msg)
    def run_test_case_gi(
        self, i, f, o, s, b, flip, provide_shape, fd=(1, 1, 1), expect_error=False
    ):
        """Run gradinput only; with `expect_error` the call must raise
        instead of verifying gradients."""
        if fd != (1, 1, 1):
            pytest.skip("Doesn't have CUDNN implementation")
        mode = mode_with_gpu
        if not expect_error:
            self.run_gradinput(
                inputs_shape=i,
                filters_shape=f,
                output_shape=o,
                subsample=s,
                verify_grad=True,
                mode=mode,
                provide_shape=provide_shape,
                border_mode=b,
                filter_flip=flip,
                target_op=GpuDnnConvGradI,
                filter_dilation=fd,
            )
        else:
            with pytest.raises((RuntimeError, ValueError)):
                self.run_gradinput(
                    inputs_shape=i,
                    filters_shape=f,
                    output_shape=o,
                    subsample=s,
                    verify_grad=False,
                    mode=mode,
                    provide_shape=provide_shape,
                    border_mode=b,
                    filter_flip=flip,
                    target_op=GpuDnnConvGradI,
                    ref=None,
                    filter_dilation=fd,
                )
class TestCorrMMConv2d(BaseTestConv2d):
    # Runs the generic 2-D conv suite through the GpuCorrMM ops; cuDNN is
    # explicitly excluded from the compilation mode.
    @classmethod
    def setup_class(cls):
        super().setup_class()
        cls.shared = staticmethod(gpuarray_shared_constructor)
        cls.mode = mode_with_gpu.excluding("cudnn")
    def run_test_case(self, i, f, s, b, flip, provide_shape, fd=(1, 1)):
        # Check fwd, gradweight and gradinput in turn; each compiled graph
        # must contain the expected GpuCorrMM op(s).
        mode = self.mode
        o = self.get_output_shape(i, f, s, b, fd)
        self.run_fwd(
            inputs_shape=i,
            filters_shape=f,
            subsample=s,
            verify_grad=True,
            mode=mode,
            provide_shape=provide_shape,
            border_mode=b,
            filter_flip=flip,
            target_op=(GpuCorrMM, GpuCorrMM_gradWeights, GpuCorrMM_gradInputs),
            filter_dilation=fd,
        )
        self.run_gradweight(
            inputs_shape=i,
            filters_shape=f,
            output_shape=o,
            subsample=s,
            verify_grad=True,
            mode=mode,
            provide_shape=provide_shape,
            border_mode=b,
            filter_flip=flip,
            target_op=GpuCorrMM_gradWeights,
            filter_dilation=fd,
        )
        self.run_gradinput(
            inputs_shape=i,
            filters_shape=f,
            output_shape=o,
            subsample=s,
            verify_grad=True,
            mode=mode,
            provide_shape=provide_shape,
            border_mode=b,
            filter_flip=flip,
            target_op=GpuCorrMM_gradInputs,
            filter_dilation=fd,
        )
    def run_test_case_gi(
        self, i, f, o, s, b, flip, provide_shape, fd=(1, 1), expect_error=False
    ):
        # gradinput either succeeds with gradient verification or, when the
        # case is expected to be inconsistent, must raise ValueError.
        mode = self.mode
        if not expect_error:
            self.run_gradinput(
                inputs_shape=i,
                filters_shape=f,
                output_shape=o,
                subsample=s,
                verify_grad=True,
                mode=mode,
                provide_shape=provide_shape,
                border_mode=b,
                filter_flip=flip,
                target_op=GpuCorrMM_gradInputs,
                filter_dilation=fd,
            )
        else:
            with pytest.raises(ValueError):
                self.run_gradinput(
                    inputs_shape=i,
                    filters_shape=f,
                    output_shape=o,
                    subsample=s,
                    verify_grad=False,
                    mode=mode,
                    provide_shape=provide_shape,
                    border_mode=b,
                    filter_flip=flip,
                    target_op=GpuCorrMM_gradInputs,
                    ref=None,
                    filter_dilation=fd,
                )
class TestCorrMMConv3d(BaseTestConv3d):
    # Runs the generic 3-D conv suite through the GpuCorr3dMM ops; cuDNN is
    # explicitly excluded from the compilation mode.
    @classmethod
    def setup_class(cls):
        super().setup_class()
        cls.shared = staticmethod(gpuarray_shared_constructor)
        cls.mode = mode_with_gpu.excluding("cudnn")
    def run_test_case(self, i, f, s, b, flip, provide_shape, fd=(1, 1, 1)):
        # Check fwd, gradweight and gradinput in turn; each compiled graph
        # must contain the expected GpuCorr3dMM op(s).
        mode = self.mode
        o = self.get_output_shape(i, f, s, b, fd)
        self.run_fwd(
            inputs_shape=i,
            filters_shape=f,
            subsample=s,
            verify_grad=True,
            mode=mode,
            provide_shape=provide_shape,
            border_mode=b,
            filter_flip=flip,
            target_op=(GpuCorr3dMM, GpuCorr3dMM_gradWeights, GpuCorr3dMM_gradInputs),
            filter_dilation=fd,
        )
        self.run_gradweight(
            inputs_shape=i,
            filters_shape=f,
            output_shape=o,
            subsample=s,
            verify_grad=True,
            mode=mode,
            provide_shape=provide_shape,
            border_mode=b,
            filter_flip=flip,
            target_op=GpuCorr3dMM_gradWeights,
            filter_dilation=fd,
        )
        self.run_gradinput(
            inputs_shape=i,
            filters_shape=f,
            output_shape=o,
            subsample=s,
            verify_grad=True,
            mode=mode,
            provide_shape=provide_shape,
            border_mode=b,
            filter_flip=flip,
            target_op=GpuCorr3dMM_gradInputs,
            filter_dilation=fd,
        )
    def run_test_case_gi(
        self, i, f, o, s, b, flip, provide_shape, fd=(1, 1, 1), expect_error=False
    ):
        # gradinput either succeeds with gradient verification or, when the
        # case is expected to be inconsistent, must raise ValueError.
        mode = self.mode
        if not expect_error:
            self.run_gradinput(
                inputs_shape=i,
                filters_shape=f,
                output_shape=o,
                subsample=s,
                verify_grad=True,
                mode=mode,
                provide_shape=provide_shape,
                border_mode=b,
                filter_flip=flip,
                target_op=GpuCorr3dMM_gradInputs,
                filter_dilation=fd,
            )
        else:
            with pytest.raises(ValueError):
                self.run_gradinput(
                    inputs_shape=i,
                    filters_shape=f,
                    output_shape=o,
                    subsample=s,
                    verify_grad=False,
                    mode=mode,
                    provide_shape=provide_shape,
                    border_mode=b,
                    filter_flip=flip,
                    target_op=GpuCorr3dMM_gradInputs,
                    ref=None,
                    filter_dilation=fd,
                )
class TestDnnConvTypes(TestConvTypes):
    # Re-runs the base dtype/gradient-type checks with GPU variables.
    def setup_method(self):
        # Replace the base class's CPU tensors with GPU float32 4-D ones.
        self.input = gpu_ftensor4()
        self.filters = gpu_ftensor4()
        self.topgrad = gpu_ftensor4()
        self.constant_tensor = gpuarray.array(
            np.zeros((3, 5, 7, 11), dtype="float32"), context=get_context(test_ctx_name)
        )
        super().setup_method()
# Intentionally shadows the imported TestConv2dTranspose so the same test
# class is collected again, but compiled with the GPU mode.
class TestConv2dTranspose(TestConv2dTranspose):
    mode = mode_with_gpu
import numpy as np
import pytest
import aesara
import aesara.tensor as at
from aesara.gpuarray.basic_ops import (
GpuAlloc,
GpuAllocEmpty,
GpuContiguous,
GpuEye,
GpuFromHost,
GpuJoin,
GpuReshape,
GpuSplit,
GpuToGpu,
GpuTri,
HostFromGpu,
gpu_contiguous,
gpu_join,
host_from_gpu,
)
from aesara.gpuarray.elemwise import GpuDimShuffle, GpuElemwise
from aesara.gpuarray.subtensor import GpuSubtensor
from aesara.gpuarray.type import GpuArrayType, get_context, gpuarray_shared_constructor
from aesara.tensor.basic import Alloc, MakeVector, Split, alloc
from aesara.tensor.shape import Shape, Shape_i
from aesara.tensor.type import TensorType, fmatrix, iscalar, lscalar, matrix
# Don't import test classes otherwise they get tested as part of the file
from tests import unittest_tools as utt
from tests.gpuarray.config import mode_with_gpu, mode_without_gpu, test_ctx_name
from tests.tensor.test_basic import (
TestAlloc,
TestComparison,
TestJoinAndSplit,
TestReshape,
)
from tests.tensor.utils import random, safe_make_node
# Skip the whole module when pygpu is missing.
pygpu = pytest.importorskip("pygpu")
gpuarray = pygpu.gpuarray
# Shared RNG seeded via the unittest tools for reproducible test data.
rng = np.random.default_rng(seed=utt.fetch_seed())
def inplace_func(
    inputs,
    outputs,
    mode=None,
    allow_input_downcast=False,
    on_unused_input="raise",
    name=None,
):
    """Compile an aesara function that accepts in-place graphs, using the
    GPU mode by default when *mode* is not supplied."""
    return aesara.function(
        inputs,
        outputs,
        mode=mode_with_gpu if mode is None else mode,
        allow_input_downcast=allow_input_downcast,
        accept_inplace=True,
        on_unused_input=on_unused_input,
        name=name,
    )
def fake_shared(value, name=None, strict=False, allow_downcast=None, **kwargs):
    """Build a shared variable for *value*, trying the GPU constructor
    first, then the generic tensor and scalar constructors.

    Raises
    ------
    TypeError
        If no constructor accepts *value*. (The original silently returned
        ``None`` in that case, deferring the failure to a confusing
        AttributeError at the call site.)
    """
    from aesara.tensor.sharedvar import scalar_constructor, tensor_constructor

    for constructor in (
        gpuarray_shared_constructor,
        tensor_constructor,
        scalar_constructor,
    ):
        try:
            return constructor(
                value, name=name, strict=strict, allow_downcast=allow_downcast, **kwargs
            )
        except TypeError:
            continue
    raise TypeError(f"No shared constructor accepts value of type {type(value)}")
def rand_gpuarray(*shape, **kwargs):
    """Return a pygpu array of the given *shape* with uniform values in
    [-1, 1).

    Recognized keyword options: ``dtype`` (default: aesara floatX) and
    ``cls``; any other keyword raises TypeError.
    """
    r = rng.random(shape) * 2 - 1
    dtype = kwargs.pop("dtype", aesara.config.floatX)
    cls = kwargs.pop("cls", None)
    if kwargs:
        # BUGFIX: the original raised TypeError("Unexpected argument %s", key)
        # with a comma, so the message was never %-formatted.
        raise TypeError(f"Unexpected argument {next(iter(kwargs))}")
    return gpuarray.array(r, dtype=dtype, cls=cls, context=get_context(test_ctx_name))
def makeTester(
    name,
    op,
    gpu_op,
    cases,
    checks=None,
    mode_gpu=mode_with_gpu,
    mode_nogpu=mode_without_gpu,
    skip=False,
    eps=1e-10,
):
    """Build a test-case class comparing a CPU `op` against its GPU `gpu_op`.

    For every entry in `cases` (test name -> list of inputs) the generated
    class compiles the same graph in both `mode_nogpu` and `mode_gpu`,
    checks that the GPU function actually contains `gpu_op`, and compares
    outputs (or raised exception types).  `checks` maps descriptions to
    predicates run on (inputs, outputs).  A truthy `skip` is used as a
    pytest skip reason.  NOTE(review): `eps` appears unused here —
    comparison relies on `TensorType.values_eq_approx`.
    """
    if checks is None:
        checks = {}
    # Rebind under new names so the class body below can expose them as
    # class attributes without shadowing the parameters.
    _op = op
    _gpu_op = gpu_op
    _cases = cases
    _skip = skip
    _checks = checks
    class Checker(utt.OptimizationTestMixin):
        op = staticmethod(_op)
        gpu_op = staticmethod(_gpu_op)
        cases = _cases
        skip = _skip
        checks = _checks
        def setup_method(self):
            # Resolve the (renamed) class through its module path; fails
            # loudly if the generated class was not exported properly.
            eval(self.__class__.__module__ + "." + self.__class__.__name__)
        def test_all(self):
            # NOTE: `skip` and `cases` here are makeTester's closure
            # variables, not the class attributes of the same name.
            if skip:
                pytest.skip(skip)
            for testname, inputs in cases.items():
                # Promote bare Python floats to floatX arrays in place.
                for _ in range(len(inputs)):
                    if type(inputs[_]) is float:
                        inputs[_] = np.asarray(inputs[_], dtype=aesara.config.floatX)
                self.run_case(testname, inputs)
        def run_case(self, testname, inputs):
            # Separate shared variables per mode so in-place GPU ops
            # cannot corrupt the reference computation's inputs.
            inputs_ref = [aesara.shared(inp) for inp in inputs]
            inputs_tst = [aesara.shared(inp) for inp in inputs]
            try:
                node_ref = safe_make_node(self.op, *inputs_ref)
                node_tst = safe_make_node(self.op, *inputs_tst)
            except Exception as exc:
                err_msg = (
                    "Test %s::%s: Error occurred while making " "a node with inputs %s"
                ) % (self.gpu_op, testname, inputs)
                exc.args += (err_msg,)
                raise
            try:
                f_ref = inplace_func([], node_ref.outputs, mode=mode_nogpu)
                f_tst = inplace_func([], node_tst.outputs, mode=mode_gpu)
            except Exception as exc:
                err_msg = (
                    "Test %s::%s: Error occurred while trying to " "make a Function"
                ) % (self.gpu_op, testname)
                exc.args += (err_msg,)
                raise
            # The GPU-compiled graph must contain exactly one gpu_op node.
            self.assertFunctionContains1(f_tst, self.gpu_op)
            ref_e = None
            try:
                expecteds = f_ref()
            except Exception as exc:
                ref_e = exc
            try:
                variables = f_tst()
            except Exception as exc:
                if ref_e is None:
                    err_msg = (
                        "Test %s::%s: exception when calling the " "Function"
                    ) % (self.gpu_op, testname)
                    exc.args += (err_msg,)
                    raise
                else:
                    # if we raised an exception of the same type we're good.
                    if isinstance(exc, type(ref_e)):
                        return
                    else:
                        err_msg = (
                            "Test %s::%s: exception raised during test "
                            "call was not the same as the reference "
                            "call (got: %s, expected %s)"
                            % (self.gpu_op, testname, type(exc), type(ref_e))
                        )
                        exc.args += (err_msg,)
                        raise
            # Element-wise comparison of every output against the CPU
            # reference: dtype, shape and approximate values must match.
            for i, (variable, expected) in enumerate(zip(variables, expecteds)):
                condition = (
                    variable.dtype != expected.dtype
                    or variable.shape != expected.shape
                    or not TensorType.values_eq_approx(variable, expected)
                )
                assert not condition, (
                    "Test %s::%s: Output %s gave the wrong "
                    "value. With inputs %s, expected %s "
                    "(dtype %s), got %s (dtype %s)."
                    % (
                        self.op,
                        testname,
                        i,
                        inputs,
                        expected,
                        expected.dtype,
                        variable,
                        variable.dtype,
                    )
                )
            # User-supplied invariants from the `checks` mapping.
            for description, check in self.checks.items():
                assert check(inputs, variables), (
                    "Test %s::%s: Failed check: %s " "(inputs were %s, outputs were %s)"
                ) % (self.op, testname, description, inputs, variables)
    Checker.__name__ = name
    if hasattr(Checker, "__qualname__"):
        Checker.__qualname__ = name
    return Checker
def test_transfer_cpu_gpu():
    """Round-trip a matrix: host -> GPU, then GPU -> host."""
    host_in = fmatrix("a")
    dev_in = GpuArrayType(dtype="float32", broadcastable=(False, False))("g")
    av = np.asarray(rng.random((5, 4)), dtype="float32")
    gv = gpuarray.array(av, context=get_context(test_ctx_name))
    to_gpu = aesara.function([host_in], GpuFromHost(test_ctx_name)(host_in))
    assert GpuArrayType.values_eq(to_gpu(av), gv)
    to_host = aesara.function([dev_in], host_from_gpu(dev_in))
    assert np.all(to_host(gv) == av)
def test_transfer_gpu_gpu():
    """A GpuToGpu transfer must survive compilation and preserve values."""
    g = GpuArrayType(
        dtype="float32", broadcastable=(False, False), context_name=test_ctx_name
    )()
    host_val = np.asarray(rng.random((5, 4)), dtype="float32")
    dev_val = gpuarray.array(host_val, context=get_context(test_ctx_name))
    # Disable the rewrites that would cut the transfer out of the graph.
    mode = mode_with_gpu.excluding(
        "cut_gpua_host_transfers", "local_cut_gpua_host_gpua"
    )
    f = aesara.function([g], GpuToGpu(test_ctx_name)(g), mode=mode)
    nodes = f.maker.fgraph.toposort()
    assert len(nodes) == 1
    assert isinstance(nodes[0].op, GpuToGpu)
    assert GpuArrayType.values_eq(f(dev_val), dev_val)
def test_transfer_strided():
    """Transfers must handle non-contiguous (strided) arrays.

    This is a smoke test only; libgpuarray carries the comprehensive
    correctness suite for strided transfers.
    """
    a = fmatrix("a")
    g = GpuArrayType(dtype="float32", broadcastable=(False, False))("g")
    av = np.asarray(rng.random((5, 8)), dtype="float32")
    gv = gpuarray.array(av, context=get_context(test_ctx_name))
    # Slice every other column so both copies become strided.
    av = av[:, ::2]
    gv = gv[:, ::2]
    uploaded = aesara.function([a], GpuFromHost(test_ctx_name)(a))(av)
    assert GpuArrayType.values_eq(uploaded, gv)
    downloaded = aesara.function([g], host_from_gpu(g))(gv)
    assert np.all(downloaded == av)
def gpu_alloc_expected(x, *shp):
    """Reference for GpuAlloc: a GPU array of shape `shp` filled with `x`."""
    out = gpuarray.empty(shp, dtype=x.dtype, context=get_context(test_ctx_name))
    out[:] = x
    return out
# Checker comparing CPU `alloc` (+1, so the rewrite can lift it to the
# GPU) against GpuAlloc for scalar/vector/matrix fills of various ranks.
TestGpuAlloc = makeTester(
    name="GpuAllocTester",
    # The +1 is there to allow the lift to the GPU.
    op=lambda *args: alloc(*args) + 1,
    gpu_op=GpuAlloc(test_ctx_name),
    cases=dict(
        correct01=(random(), np.int32(7)),
        # just gives a DeepCopyOp with possibly wrong results on the CPU
        # correct01_bcast=(random(1), np.int32(7)),
        correct02=(random(), np.int32(4), np.int32(7)),
        correct12=(random(7), np.int32(4), np.int32(7)),
        correct13=(random(7), np.int32(2), np.int32(4), np.int32(7)),
        correct23=(random(4, 7), np.int32(2), np.int32(4), np.int32(7)),
        bad_shape12=(random(7), np.int32(7), np.int32(5)),
    ),
)
class TestGPUAlloc(TestAlloc):
    """Run the generic TestAlloc suite in GPU mode with GPU shared vars."""
    dtype = "float32"
    mode = mode_with_gpu
    shared = staticmethod(gpuarray_shared_constructor)
    allocs = [GpuAlloc(test_ctx_name), GpuAlloc(test_ctx_name), Alloc()]
def test_alloc_empty():
    """GpuAllocEmpty yields the requested shape/dtype; duplicates merge."""
    for dt in ["float32", "int8"]:
        f = aesara.function([], GpuAllocEmpty(dt, context_name=test_ctx_name)(2, 3))
        assert len(f.maker.fgraph.apply_nodes) == 1
        result = f()
        assert result.shape == (2, 3)
        assert result.dtype == dt
    # Two identical GpuAllocEmpty applications must be merged into one.
    f = aesara.function(
        [],
        [
            GpuAllocEmpty("uint64", test_ctx_name)(3, 2),
            GpuAllocEmpty("uint64", test_ctx_name)(3, 2),
        ],
    )
    outputs = f()
    for out in outputs:
        assert out.shape == (3, 2)
        assert out.dtype == "uint64"
    empty_nodes = [
        node
        for node in f.maker.fgraph.apply_nodes
        if isinstance(node.op, GpuAllocEmpty)
    ]
    assert len(empty_nodes) == 1
def test_shape():
    """`x.shape` on a GPU variable lowers to Shape_i ops (or one Shape)."""
    x = GpuArrayType(dtype="float32", broadcastable=[False, False, False])()
    v = gpuarray.zeros((3, 4, 5), dtype="float32", context=get_context(test_ctx_name))
    f = aesara.function([x], x.shape)
    nodes = f.maker.fgraph.toposort()
    assert np.all(f(v) == (3, 4, 5))
    if aesara.config.mode != "FAST_COMPILE":
        assert len(nodes) == 4
        for node in nodes[:3]:
            assert isinstance(node.op, Shape_i)
        assert isinstance(nodes[3].op, MakeVector)
    # Without the shape-to-shape_i rewrite, a single Shape node remains.
    mode = mode_with_gpu.excluding("local_shape_to_shape_i")
    f = aesara.function([x], x.shape, mode=mode)
    nodes = f.maker.fgraph.toposort()
    assert np.all(f(v) == (3, 4, 5))
    assert len(nodes) == 1
    assert isinstance(nodes[0].op, Shape)
def test_gpu_contiguous():
    """GpuContiguous is inserted and its outputs are C-contiguous."""
    a = fmatrix("a")
    i = iscalar("i")
    # BUG FIX: np.random.random takes a single shape tuple; passing the
    # dimensions as two positional arguments raises a TypeError.
    a_val = np.asarray(np.random.random((4, 5)), dtype="float32")
    # The reshape is needed otherwise we make the subtensor on the CPU
    # to transfer less data.
    f = aesara.function(
        [a, i], gpu_contiguous(a.reshape((5, 4))[::i]), mode=mode_with_gpu
    )
    topo = f.maker.fgraph.toposort()
    assert any(isinstance(node.op, GpuSubtensor) for node in topo)
    assert any(isinstance(node.op, GpuContiguous) for node in topo)
    assert f(a_val, 1).flags.c_contiguous
    assert f(a_val, 2).flags.c_contiguous
    assert f(a_val, 2).flags.c_contiguous
class TestGPUReshape(TestReshape):
    """Run the generic reshape tests against GpuReshape."""
    def setup_method(self):
        self.shared = gpuarray_shared_constructor
        self.op = GpuReshape
        self.mode = mode_with_gpu
        # Ops that may legitimately appear in the compiled graph without
        # failing the suite's topology checks.
        self.ignore_topo = (
            HostFromGpu,
            GpuFromHost,
            aesara.compile.DeepCopyOp,
            GpuDimShuffle,
            GpuElemwise,
            Shape_i,
            MakeVector,
        )
        assert self.op == GpuReshape
class TestGPUComparison(TestComparison):
    """Run the generic comparison-op tests in GPU mode."""
    def setup_method(self):
        self.mode = mode_with_gpu
        self.shared = gpuarray_shared_constructor
        self.dtypes = ["float64", "float32"]
class TestGPUJoinAndSplit(TestJoinAndSplit):
    """Run the generic join/split tests with the GPU ops."""
    def setup_method(self):
        self.mode = mode_with_gpu.excluding("constant_folding")
        self.join_op = GpuJoin()
        self.split_op_class = GpuSplit
        # Use join instead of MakeVector since there is no MakeVector on GPU
        self.make_vector_op = GpuJoin()
        # this is to avoid errors with limited devices
        self.floatX = "float32"
        self.hide_error = aesara.config.mode not in ["DebugMode", "DEBUG_MODE"]
        def shared(x, **kwargs):
            # Pin shared variables to the test GPU context.
            return gpuarray_shared_constructor(x, target=test_ctx_name, **kwargs)
        self.shared = shared
    def test_gpusplit_opt(self):
        # Test that we move the node to the GPU
        # Also test float16 computation at the same time.
        rng = np.random.default_rng(seed=utt.fetch_seed())
        m = self.shared(rng.random((4, 6)).astype("float16"))
        o = Split(2)(m, 0, [2, 2])
        assert o[0].dtype == "float16"
        f = aesara.function([], o, mode=self.mode)
        assert any(
            [
                isinstance(node.op, self.split_op_class)
                for node in f.maker.fgraph.toposort()
            ]
        )
        o1, o2 = f()
        assert np.allclose(o1, m.get_value(borrow=True)[:2])
        assert np.allclose(o2, m.get_value(borrow=True)[2:])
def test_gpujoin_gpualloc():
    """Join of zeros_like/ones_like must lift Alloc and Join to the GPU."""
    a = fmatrix("a")
    b = fmatrix("b")
    # BUG FIX: np.random.random takes a single shape tuple; passing the
    # dimensions as two positional arguments raises a TypeError.
    a_val = np.asarray(np.random.random((4, 5)), dtype="float32")
    b_val = np.asarray(np.random.random((3, 5)), dtype="float32")
    f = aesara.function(
        [a, b],
        at.join(0, at.zeros_like(a), at.ones_like(b)) + 4,
        mode=mode_without_gpu,
    )
    f_gpu = aesara.function(
        [a, b], at.join(0, at.zeros_like(a), at.ones_like(b)), mode=mode_with_gpu
    )
    f_gpu2 = aesara.function(
        [a, b], at.join(0, at.zeros_like(a), at.ones_like(b)) + 4, mode=mode_with_gpu
    )
    # CPU graph keeps the CPU alloc/join ops...
    assert sum([node.op == at.alloc for node in f.maker.fgraph.toposort()]) == 2
    assert sum([node.op == at.join_ for node in f.maker.fgraph.toposort()]) == 1
    # ...while both GPU graphs must use the GPU equivalents.
    assert (
        sum([isinstance(node.op, GpuAlloc) for node in f_gpu.maker.fgraph.toposort()])
        == 2
    )
    assert sum([node.op == gpu_join for node in f_gpu.maker.fgraph.toposort()]) == 1
    assert (
        sum([isinstance(node.op, GpuAlloc) for node in f_gpu2.maker.fgraph.toposort()])
        == 2
    )
    assert sum([node.op == gpu_join for node in f_gpu2.maker.fgraph.toposort()]) == 1
    assert np.allclose(f(a_val, b_val), f_gpu2(a_val, b_val))
def test_gpueye():
    """GpuEye must match np.eye over several dtypes, shapes and offsets."""

    def check(dtype, N, M_=None, k=0):
        # Aesara does not accept None as a tensor input, and DebugMode
        # rejects None as well, so substitute a concrete value for M.
        M = N if M_ is None else M_
        n_sym = iscalar()
        m_sym = iscalar()
        k_sym = iscalar()
        one = np.array(1).astype(dtype)
        # The +1/-1 round trip forces the GPU result to be computed.
        out = at.eye(n_sym, m_sym, k_sym, dtype=dtype) + one
        fn = aesara.function([n_sym, m_sym, k_sym], out, mode=mode_with_gpu)
        got = np.asarray(fn(N, M, k)) - one
        assert np.allclose(got, np.eye(N, M_, k, dtype=dtype))
        assert got.dtype == np.dtype(dtype)
        assert any(isinstance(node.op, GpuEye) for node in fn.maker.fgraph.toposort())

    for dtype in ["float32", "int32", "float16"]:
        check(dtype, 3)
        # M != N with k = 0, then every N/M combination with k != 0,
        # including offsets reaching past the matrix itself.
        for N, M, k in [
            (3, 5, 0),
            (5, 3, 0),
            (3, 3, 1),
            (3, 3, -1),
            (3, 5, 1),
            (3, 5, -1),
            (5, 3, 1),
            (5, 3, -1),
            (5, 3, 3),
            (3, 5, 3),
            (5, 3, -3),
            (3, 5, -3),
            (5, 3, 6),
            (3, 5, -6),
        ]:
            check(dtype, N, M, k)
def test_hostfromgpu_shape_i():
    """The shape computation must be lifted over host<->GPU transfers."""
    m = mode_with_gpu.including(
        "local_dot_to_dot22", "local_dot22_to_dot22scalar", "specialize"
    )
    a = fmatrix("a")
    ca = aesara.gpuarray.type.GpuArrayType("float32", (False, False))()
    # BUG FIX: np.random.random takes a single shape tuple; passing the
    # dimensions as two positional arguments raises a TypeError.
    av = np.asarray(np.random.random((5, 4)), dtype="float32")
    cv = gpuarray.asarray(
        np.random.random((5, 4)), dtype="float32", context=get_context(test_ctx_name)
    )
    f = aesara.function([a], GpuFromHost(test_ctx_name)(a), mode=m)
    assert any(isinstance(x.op, GpuFromHost) for x in f.maker.fgraph.toposort())
    # shape-of-transfer is rewritten into Shape_i ops on the host input.
    f = aesara.function([a], GpuFromHost(test_ctx_name)(a).shape, mode=m)
    topo = f.maker.fgraph.toposort()
    assert isinstance(topo[0].op, Shape_i)
    assert isinstance(topo[1].op, Shape_i)
    assert isinstance(topo[2].op, MakeVector)
    assert tuple(f(av)) == (5, 4)
    f = aesara.function([ca], host_from_gpu(ca), mode=m)
    assert host_from_gpu in [x.op for x in f.maker.fgraph.toposort()]
    f = aesara.function([ca], host_from_gpu(ca).shape, mode=m)
    topo = f.maker.fgraph.toposort()
    assert isinstance(topo[0].op, Shape_i)
    assert isinstance(topo[1].op, Shape_i)
    assert isinstance(topo[2].op, MakeVector)
    assert tuple(f(cv)) == (5, 4)
def test_Gpujoin_inplace():
    """GpuJoin should return a view of the sole non-empty input.

    When all inputs but one are empty, the join works in place and the
    output aliases the non-empty element.
    """
    s = lscalar()
    data = np.array([3, 4, 5], dtype=aesara.config.floatX)
    x = gpuarray_shared_constructor(data, borrow=True)
    empty = at.zeros((s,))
    joined = GpuJoin(view=0)(0, x, empty)
    f = aesara.function([s], aesara.Out(joined, borrow=True))
    # DebugMode copies outputs, so the aliasing check only holds elsewhere.
    if not isinstance(mode_with_gpu, aesara.compile.debugmode.DebugMode):
        assert f(0) is x.get_value(borrow=True, return_internal_type=True)
    assert np.allclose(f(0), [3, 4, 5])
def test_gpu_tril_triu():
    """GPU tril/triu must match NumPy over several shapes and diagonals."""

    def check_l(m, k=0):
        m_symb = matrix(dtype=m.dtype)
        k_symb = iscalar()
        f = aesara.function(
            [m_symb, k_symb], at.tril(m_symb, k_symb), mode=mode_with_gpu
        )
        result = f(m, k)
        assert np.allclose(result, np.tril(m, k))
        # BUG FIX: compare against the input's dtype instead of relying on
        # the `dtype` loop variable leaking in from the caller's scope.
        assert result.dtype == np.dtype(m.dtype)
        assert any(isinstance(node.op, GpuTri) for node in f.maker.fgraph.toposort())

    def check_u(m, k=0):
        m_symb = matrix(dtype=m.dtype)
        k_symb = iscalar()
        f = aesara.function(
            [m_symb, k_symb], at.triu(m_symb, k_symb), mode=mode_with_gpu
        )
        result = f(m, k)
        assert np.allclose(result, np.triu(m, k))
        # BUG FIX: see check_l.
        assert result.dtype == np.dtype(m.dtype)
        assert any(isinstance(node.op, GpuTri) for node in f.maker.fgraph.toposort())

    test_rng = np.random.default_rng(seed=utt.fetch_seed())
    for dtype in ["float64", "float32", "float16"]:
        # (5000, 5000) is the deliberately big case.
        for shape in [(5000, 5000), (10, 10), (10, 5)]:
            m = np.asarray(test_rng.random(shape) * 2 - 1, dtype=dtype)
            for k in (0, 1, -1):
                check_l(m, k)
            for k in (0, 1, -1):
                check_u(m, k)
def test_gputri():
    """GpuTri must agree with np.tri over dtypes, shapes and offsets."""

    def check(dtype, N, M_=None, k=0):
        # Aesara does not accept None as a tensor input, and DebugMode
        # rejects None as well, so substitute a concrete value for M.
        M = N if M_ is None else M_
        n_sym = iscalar()
        m_sym = iscalar()
        k_sym = iscalar()
        one = np.array(1).astype(dtype)
        # The +1/-1 round trip forces the GPU result to be computed.
        out = at.tri(n_sym, m_sym, k_sym, dtype=dtype) + one
        fn = aesara.function([n_sym, m_sym, k_sym], out, mode=mode_with_gpu)
        got = np.asarray(fn(N, M, k)) - one
        assert np.allclose(got, np.tri(N, M_, k, dtype=dtype))
        assert got.dtype == np.dtype(dtype)
        assert any(isinstance(node.op, GpuTri) for node in fn.maker.fgraph.toposort())

    for dtype in ("float64", "float32", "int32", "float16"):
        # try a big one
        check(dtype, 1000, 1000, 0)
        check(dtype, 1000, 1000, -400)
        check(dtype, 1000, 1000, 400)
        check(dtype, 5)
        # M != N with k = 0, then every N/M combination with k != 0,
        # including offsets reaching past the matrix itself.
        for N, M, k in [
            (3, 5, 0),
            (5, 3, 0),
            (3, 3, 1),
            (3, 3, -1),
            (3, 5, 1),
            (3, 5, -1),
            (5, 3, 1),
            (5, 3, -1),
            (5, 3, 3),
            (3, 5, 3),
            (5, 3, -3),
            (3, 5, -3),
            (5, 3, 6),
            (3, 5, -6),
        ]:
            check(dtype, N, M, k)
import itertools
import numpy as np
import aesara
from aesara.configdefaults import config
from aesara.gpuarray import gpuarray_shared_constructor
from aesara.gpuarray.blas import (
GpuGemm,
GpuGer,
gpu_dot22,
gpugemm_inplace,
gpugemm_no_inplace,
gpugemmbatch_inplace,
gpugemv_inplace,
gpugemv_no_inplace,
gpuger_inplace,
gpuger_no_inplace,
)
from aesara.tensor.blas import (
BatchedDot,
_dot22,
batched_dot,
gemm_inplace,
gemv,
gemv_inplace,
)
from aesara.tensor.math import dot
from aesara.tensor.type import matrix, tensor, tensor3, vector
from tests import unittest_tools as utt
from tests.gpuarray.config import mode_with_gpu, test_ctx_name
from tests.gpuarray.test_basic_ops import makeTester, rand
from tests.tensor.test_blas import BaseGemv, TestGer
# Checker comparing CPU gemv_inplace against gpugemv_inplace.  Each case
# is [y, alpha, A, x, beta] computing y <- alpha*dot(A, x) + beta*y.
TestGpuGemv = makeTester(
    "GpuGemvTester",
    op=gemv_inplace,
    gpu_op=gpugemv_inplace,
    # It doesn't support float16
    cases=dict(
        dot_vv=[rand(1), 1.0, rand(1, 2), rand(2), 0.0],
        dot_vm=[rand(3), 1.0, rand(3, 2), rand(2), 0.0],
        float32=[
            rand(3).astype("float32"),
            np.float32(1),
            rand(3, 2).astype("float32"),
            rand(2).astype("float32"),
            np.float32(0),
        ],
        float64=[
            rand(3).astype("float64"),
            np.float64(1),
            rand(3, 2).astype("float64"),
            rand(2).astype("float64"),
            np.float64(0),
        ],
        # test_02=[rand(0), 1, rand(0, 2), rand(2), 0],
        # test_30=[rand(3), 1, rand(3, 0), rand(0), 0],
        # test_00=[rand(0), 1, rand(0, 0), rand(0), 0],
        test_stride=[rand(3)[::-1], 1.0, rand(3, 2)[::-1], rand(2)[::-1], 0.0],
    ),
)
def test_float16():
    """float16 storage with float32 scalars through gemv, gemm and dot22."""
    # gemv (rewritten into a GpuGemm call)
    y_v, alpha_v, A_v, x_v, beta_v = (
        rand(3).astype("float16"),
        np.asarray(1, dtype=np.float32),
        rand(3, 3).astype("float16"),
        rand(3).astype("float16"),
        np.asarray(0.5, dtype=np.float32),
    )
    shared_args = [
        gpuarray_shared_constructor(v, target=test_ctx_name)
        for v in (y_v, alpha_v, A_v, x_v, beta_v)
    ]
    f = aesara.function([], gemv(*shared_args), mode=mode_with_gpu)
    utt.assert_allclose(np.asarray(f()), alpha_v * np.dot(A_v, x_v) + beta_v * y_v)
    assert any(isinstance(n.op, GpuGemm) for n in f.maker.fgraph.toposort())
    # gemm
    z_v, alpha_v, A_v, B_v, beta_v = (
        rand(3, 3).astype("float16"),
        np.asarray(1, dtype=np.float32),
        rand(3, 3).astype("float16"),
        rand(3, 3).astype("float16"),
        np.asarray(0.5, dtype=np.float32),
    )
    shared_args = [
        gpuarray_shared_constructor(v, target=test_ctx_name)
        for v in (z_v, alpha_v, A_v, B_v, beta_v)
    ]
    f = aesara.function([], gpugemm_no_inplace(*shared_args))
    utt.assert_allclose(np.asarray(f()), alpha_v * np.dot(A_v, B_v) + beta_v * z_v)
    # dot22
    lhs_v = rand(3, 3).astype("float16")
    rhs_v = rand(3, 3).astype("float16")
    f = aesara.function(
        [],
        gpu_dot22(
            gpuarray_shared_constructor(lhs_v), gpuarray_shared_constructor(rhs_v)
        ),
    )
    utt.assert_allclose(np.asarray(f()), np.dot(lhs_v, rhs_v))
class TestGpuSgemv(BaseGemv, utt.OptimizationTestMixin):
    """Run the generic gemv tests with the single-precision GPU ops."""
    mode = mode_with_gpu
    dtype = "float32"
    gemv = gpugemv_no_inplace
    gemv_inplace = gpugemv_inplace
    @staticmethod
    def shared(val):
        # Fall back to a CPU shared variable for values the GPU
        # constructor rejects with a TypeError.
        try:
            return gpuarray_shared_constructor(val)
        except TypeError:
            return aesara.shared(val)
# Checker comparing CPU gemm_inplace against gpugemm_inplace.  Each case
# is [z, alpha, x, y, beta] computing z <- alpha*dot(x, y) + beta*z.
TestGpuGemm = makeTester(
    "GpuGemmTester",
    op=gemm_inplace,
    gpu_op=gpugemm_inplace,
    # float16 tested in test_float16
    cases=dict(
        test1=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), 0.0],
        test2=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), 1.0],
        test3=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), -1.0],
        test4=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), 0.0],
        test5=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), 0.6],
        test6=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), -1.0],
        test7=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), 0.0],
        test8=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), 1.1],
        float32=[
            rand(3, 4).astype("float32"),
            np.float32(-1.0),
            rand(3, 5).astype("float32"),
            rand(5, 4).astype("float32"),
            np.float32(-1.1),
        ],
        float64=[
            rand(3, 4).astype("float64"),
            np.float64(-1.0),
            rand(3, 5).astype("float64"),
            rand(5, 4).astype("float64"),
            np.float64(-1.1),
        ],
        # test10=[rand(0, 4), -1.0, rand(0, 5), rand(5, 4), 0.0],
        # test11=[rand(3, 0), -1.0, rand(3, 5), rand(5, 0), 1.1],
        # test12=[rand(3, 4), -1.0, rand(3, 0), rand(0, 4), -1.1],
        # test13=[rand(0, 0), -1.0, rand(0, 0), rand(0, 0), -1.1],
    ),
)
# Batched-GEMM cases: keys encode (batch, m, k, n); each value is
# [z, alpha, x, y, beta] for z <- alpha * batched_dot(x, y) + beta * z.
gemm_batched_tests = {
    "test_b%im%ik%in%i"
    % (b, m, k, n): [rand(b, m, n), rand(), rand(b, m, k), rand(b, k, n), rand()]
    for b, m, k, n in itertools.combinations([2, 3, 5, 7, 11, 13], 4)
}
# Explicit fixed-dtype cases in addition to the floatX cases above.
gemm_batched_tests["float16"] = [
    rand(3, 4, 7).astype("float16"),
    rand().astype("float16"),
    rand(3, 4, 4).astype("float16"),
    rand(3, 4, 7).astype("float16"),
    rand().astype("float16"),
]
gemm_batched_tests["float32"] = [
    rand(3, 4, 7).astype("float32"),
    rand().astype("float32"),
    rand(3, 4, 4).astype("float32"),
    rand(3, 4, 7).astype("float32"),
    rand().astype("float32"),
]
gemm_batched_tests["float64"] = [
    rand(3, 4, 7).astype("float64"),
    rand().astype("float64"),
    rand(3, 4, 4).astype("float64"),
    rand(3, 4, 7).astype("float64"),
    rand().astype("float64"),
]
# Checker comparing the CPU batched-dot expression against
# gpugemmbatch_inplace on the cases built above.
TestGpuGemmBatch = makeTester(
    "GpuGemmBatchTester",
    op=lambda z, alpha, x, y, beta: alpha * BatchedDot()(x, y) + beta * z,
    gpu_op=gpugemmbatch_inplace,
    cases=gemm_batched_tests,
)
class TestGpuGemmBatchStrided:
    """Regression test for batched gemm on strided inputs."""
    def test_basic(self):
        # Reported in https://github.com/Theano/Theano/issues/5730
        x = tensor3()
        y = tensor3()
        z = batched_dot(x, y[:, 0, :, np.newaxis])
        f = aesara.function([x, y], z, mode=mode_with_gpu)
        x_num = np.arange(32 * 19 * 600, dtype=config.floatX).reshape((32, 19, 600))
        y_num = np.arange(7 * 32 * 600, dtype=config.floatX).reshape((32, 7, 600))
        f(x_num, y_num)
        # The lifted batched gemm must run in place.
        assert f.maker.fgraph.toposort()[-2].op.inplace
class TestGpuSger(TestGer):
    """Run the generic ger (rank-1 update) tests with the GPU ops."""
    def setup_method(self):
        self.mode = mode_with_gpu
        dtype = self.dtype = "float32"  # optimization isn't dtype-dependent
        self.A = tensor(dtype=dtype, broadcastable=(False, False))
        self.a = tensor(dtype=dtype, broadcastable=())
        self.x = tensor(dtype=dtype, broadcastable=(False,))
        self.y = tensor(dtype=dtype, broadcastable=(False,))
        self.ger_destructive = gpuger_inplace
        # data on the gpu make the op always inplace
        self.ger = gpuger_inplace
        self.gemm = gpugemm_inplace
        super().setup_method()
class TestGpuSgerNoTransfer(TestGpuSger):
    """Same as TestGpuSger but with data stored on the GPU (no transfer)."""
    shared = staticmethod(gpuarray_shared_constructor)
class TestGpuGer_OpContract(utt.OpContractTestMixin):
    """Check that the GPU ger ops honour the generic Op contract."""
    def setup_method(self):
        self.ops = [gpuger_no_inplace, gpuger_inplace]
    def clone(self, op):
        # Build an equivalent op instance, preserving in-placeness.
        return GpuGer(inplace=op.inplace)
# Checker comparing CPU _dot22 against gpu_dot22 on matrix products,
# including degenerate size-1 dimensions.
TestGpuDot22 = makeTester(
    "GpuDot22Tester",
    op=_dot22,
    gpu_op=gpu_dot22,
    cases=dict(
        test1=[rand(3, 4), rand(4, 5)],
        test2=[rand(1, 4), rand(4, 5)],
        test3=[rand(3, 1), rand(1, 5)],
        test4=[rand(3, 4), rand(4, 1)],
        # test5=[rand(0, 4), rand(4, 5)],
        # test6=[rand(3, 0), rand(0, 5)],
        # test7=[rand(3, 4), rand(4, 0)],
        # test8=[rand(0, 4), rand(4, 0)],
        # test9=[rand(0, 0), rand(0, 0)],
    ),
)
def test_gemv_zeros():
    """gemv with a zero-length inner dimension must yield a zero vector."""
    W = matrix()
    v = vector()
    f = aesara.function([W, v], W.dot(v), mode=mode_with_gpu)
    # Degenerate operands: a (1000, 0) matrix and an empty (0,) vector.
    dim = 1000
    empty_mat = np.zeros((dim, 0), dtype=aesara.config.floatX)
    empty_vec = np.zeros((0,), dtype=aesara.config.floatX)
    assert np.allclose(f(empty_mat, empty_vec), np.zeros((dim,)))
def test_gemv_dot_strides():
    """Dot with a reversed (negative-stride) operand; see Theano#6142."""
    xv = rand(5)
    yv = rand(5, 1)
    x_sh = gpuarray_shared_constructor(xv)
    y_sh = gpuarray_shared_constructor(yv, broadcastable=(False, True))
    fn = aesara.function([], dot(x_sh, y_sh[::-1]), mode=mode_with_gpu)
    utt.assert_allclose(fn(), np.dot(xv, yv[::-1]))
import numpy as np
import pytest
import aesara
import tests.unittest_tools as utt
from aesara.gpuarray.blocksparse import (
GpuSparseBlockGemv,
GpuSparseBlockOuter,
gpu_sparse_block_gemv,
gpu_sparse_block_outer,
)
from aesara.gpuarray.type import gpuarray_shared_constructor
from aesara.tensor.type import fmatrix, ftensor3, lmatrix
from tests.gpuarray.config import mode_with_gpu, test_ctx_name
from tests.tensor.nnet.test_blocksparse import TestBlockSparseGemvAndOuter
class TestBlockSparseGemvAndOuterGPUarray(TestBlockSparseGemvAndOuter):
    """Run the generic block-sparse gemv/outer tests with the GPU ops."""
    def setup_method(self):
        self.mode = mode_with_gpu.excluding("constant_folding")
        self.gemv_op = gpu_sparse_block_gemv
        self.outer_op = gpu_sparse_block_outer
        self.gemv_class = GpuSparseBlockGemv
        self.outer_class = GpuSparseBlockOuter
        super().setup_method()
    @pytest.mark.skip(
        reason="""
    This test is temporarily disabled since we disabled the output_merge
    and alpha_merge optimizations for blocksparse due to brokenness.
    Re-enable when those are re-added.
    """
    )
    def test_blocksparse_grad_merge(self):
        # Check that the learning-rate update is merged into
        # GpuSparseBlockOuter and that results match the unmerged graph.
        b = fmatrix()
        h = ftensor3()
        iIdx = lmatrix()
        oIdx = lmatrix()
        W_val, h_val, iIdx_val, b_val, oIdx_val = self.gemv_data()
        W = gpuarray_shared_constructor(W_val, context=test_ctx_name)
        o = gpu_sparse_block_gemv(b.take(oIdx, axis=0), W, h, iIdx, oIdx)
        gW = aesara.grad(o.sum(), W)
        lr = np.asarray(0.05, dtype="float32")
        upd = W - lr * gW
        f1 = aesara.function([h, iIdx, b, oIdx], updates=[(W, upd)], mode=mode_with_gpu)
        # Make sure the lr update was merged.
        assert isinstance(f1.maker.fgraph.outputs[0].owner.op, GpuSparseBlockOuter)
        # Exclude the merge optimizations.
        mode = mode_with_gpu.excluding("local_merge_blocksparse_alpha")
        mode = mode.excluding("local_merge_blocksparse_output")
        f2 = aesara.function([h, iIdx, b, oIdx], updates=[(W, upd)], mode=mode)
        # Make sure the lr update is not merged.
        assert not isinstance(f2.maker.fgraph.outputs[0].owner.op, GpuSparseBlockOuter)
        f2(h_val, iIdx_val, b_val, oIdx_val)
        W_ref = W.get_value()
        # reset the var
        W.set_value(W_val)
        f1(h_val, iIdx_val, b_val, oIdx_val)
        W_opt = W.get_value()
        utt.assert_allclose(W_ref, W_opt)
import numpy as np
import pytest
import aesara
from aesara import config
from aesara import tensor as at
from aesara.gpuarray.basic_ops import CGpuKernelBase
from aesara.gpuarray.type import GpuArrayType, get_context, gpu_context_type
from aesara.gradient import grad_undefined
from aesara.graph.basic import Apply
from aesara.link.c.params_type import ParamsType
from aesara.scalar import int32 as int_t
class GpuEye(CGpuKernelBase):
    """Eye for GPU.
    This is an implementation to test that `CGpuKernelBase` works and also
    to use as an example in the docs. It is not used for user graphs.
    """
    __props__ = ("dtype", "context_name")
    # Values handed to the C kernel: the gpuarray typecode of the output
    # dtype, and the GPU context to run on.
    params_type = ParamsType(typecode=int_t, context=gpu_context_type)
    def __init__(self, dtype=None, context_name=None):
        if dtype is None:
            dtype = config.floatX
        self.dtype = dtype
        self.context_name = context_name
        # The kernel source (tstgpueye) is in c_code/tstgpueye.c.
        super().__init__(["c_code/tstgpueye.c"], "APPLY_SPECIFIC(tstgpueye)")
    def get_params(self, node):
        # Resolve the params lazily so importing this module does not
        # require pygpu; skip at run time if it is missing.
        pygpu_gpuarray = pytest.importorskip("pygpu.gpuarray")
        return self.params_type.get_params(
            typecode=pygpu_gpuarray.dtype_to_typecode(self.dtype),
            context=get_context(self.context_name),
        )
    def c_headers(self, **kwargs):
        return ["<gpuarray/types.h>", "<gpuarray/kernel.h>"]
    def make_node(self, n, m):
        # Both dimension arguments must be scalar tensors.
        n = at.as_tensor_variable(n)
        m = at.as_tensor_variable(m)
        assert n.ndim == 0
        assert m.ndim == 0
        otype = GpuArrayType(
            dtype=self.dtype,
            broadcastable=(False, False),
            context_name=self.context_name,
        )
        return Apply(self, [n, m], [otype()])
    def infer_shape(self, fgraph, node, in_shapes):
        # Output shape is exactly (n, m).
        out_shape = [node.inputs[0], node.inputs[1]]
        return [out_shape]
    def grad(self, inp, grads):
        # Not differentiable w.r.t. the integer shape inputs.
        return [grad_undefined(self, i, inp[i]) for i in range(2)]
def test_cgpukernelbase():
    """Compile and run the example GpuEye op built on CGpuKernelBase."""
    # Import inside the function to prevent the back-end from being
    # initialized when reloading the GpuEye object from cache.
    from .config import mode_with_gpu, test_ctx_name

    eye_op = GpuEye(dtype="int32", context_name=test_ctx_name)
    fn = aesara.function([], eye_op(4, 5), mode=mode_with_gpu)
    result = fn()
    assert result.dtype == "int32"
    assert np.array_equal(np.asarray(result), np.eye(4, 5, dtype="int32"))
import numpy as np
import pytest
import aesara
import aesara.gpuarray
from aesara.gpuarray.ctc import GpuConnectionistTemporalClassification, gpu_ctc
from aesara.gradient import grad
from aesara.tensor.math import mean
from aesara.tensor.nnet.ctc import (
ConnectionistTemporalClassification,
ctc,
ctc_available,
)
from tests import unittest_tools as utt
from tests.gpuarray.config import mode_with_gpu, mode_without_gpu
from tests.tensor.nnet.test_ctc import setup_ctc_case, setup_grad_case, setup_torch_case
@pytest.mark.skipif(
    not ctc_available(), reason="Optional library warp-ctc not available"
)
class TestCTC:
    """End-to-end checks of the GPU CTC op against the CPU implementation."""
    def check_ctc(
        self, activations, labels, input_length, expected_costs, expected_grads
    ):
        """Run the full battery of CTC checks for one test case."""
        # Create symbolic variables
        t_activations = aesara.shared(activations, name="activations")
        t_activation_times = aesara.shared(input_length, name="activation_times")
        t_labels = aesara.shared(labels, name="labels")
        inputs = [t_activations, t_labels, t_activation_times]
        # Execute several tests for each test case
        self.check_expected_values(
            t_activations, t_labels, t_activation_times, expected_costs, expected_grads
        )
        self.compare_gpu_and_cpu_values(*inputs)
        self.check_grads_disabled(*inputs)
        self.run_gpu_optimization_with_grad(*inputs)
        self.run_gpu_optimization_no_grad(*inputs)
    def setup_cpu_op(
        self,
        activations,
        labels,
        input_length,
        compute_grad=True,
        mode=mode_without_gpu,
    ):
        """Compile a CPU CTC function (cost, and optionally its gradient)."""
        cpu_ctc_cost = ctc(activations, labels, input_length)
        outputs = [cpu_ctc_cost]
        if compute_grad:
            # Symbolic gradient of CTC cost
            cpu_ctc_grad = grad(mean(cpu_ctc_cost), activations)
            outputs += [cpu_ctc_grad]
        return aesara.function([], outputs, mode=mode)
    def setup_gpu_op(self, activations, labels, input_length, compute_grad=True):
        """Compile a GPU CTC function (cost, and optionally its gradient)."""
        gpu_ctc_cost = gpu_ctc(activations, labels, input_length)
        outputs = [gpu_ctc_cost]
        if compute_grad:
            # Symbolic gradient of CTC cost
            gpu_ctc_grad = grad(mean(gpu_ctc_cost), activations)
            outputs += [gpu_ctc_grad]
        return aesara.function([], outputs, mode=mode_with_gpu)
    def check_expected_values(
        self, activations, labels, input_length, expected_costs, expected_grads
    ):
        """Compare GPU costs/gradients against precomputed expected values."""
        gpu_train = self.setup_gpu_op(activations, labels, input_length)
        gpu_cost, gpu_grad = gpu_train()
        # Transfer costs from GPU memory to host
        cost_from_gpu = np.asarray(gpu_cost)
        # Transfer gradients from GPU memory to host
        grad_from_gpu = np.asarray(gpu_grad)
        # Check that results are in conformance with expected values
        utt.assert_allclose(expected_grads / cost_from_gpu.shape[0], grad_from_gpu)
        utt.assert_allclose(expected_costs, cost_from_gpu)
    def compare_gpu_and_cpu_values(self, activations, labels, input_length):
        """Check the GPU op agrees with the CPU op on the same inputs."""
        cpu_train = self.setup_cpu_op(activations, labels, input_length)
        cpu_cost, cpu_grad = cpu_train()
        gpu_train = self.setup_gpu_op(activations, labels, input_length)
        gpu_cost, gpu_grad = gpu_train()
        # Transfer costs from GPU memory to host
        cost_from_gpu = np.asarray(gpu_cost)
        # Transfer gradients from GPU memory to host
        grad_from_gpu = np.asarray(gpu_grad)
        # Check that results are in conformance with expected values
        utt.assert_allclose(cpu_grad, grad_from_gpu)
        utt.assert_allclose(cpu_cost, cost_from_gpu)
    def check_grads_disabled(self, activations, labels, input_length):
        """
        Check if optimization to disable gradients is working
        """
        gpu_ctc_cost = gpu_ctc(activations, labels, input_length)
        gpu_ctc_function = aesara.function([], [gpu_ctc_cost])
        for node in gpu_ctc_function.maker.fgraph.apply_nodes:
            if isinstance(node.op, GpuConnectionistTemporalClassification):
                assert node.op.compute_grad is False
    def run_gpu_optimization_with_grad(self, activations, labels, input_length):
        """A CPU graph (with grad) must be lifted entirely to the GPU."""
        # Compile CPU function with optimization
        cpu_lifted_train = self.setup_cpu_op(
            activations, labels, input_length, mode=mode_with_gpu
        )
        # Check whether Op is lifted to the GPU
        assert self.has_only_gpu_op(cpu_lifted_train)
    def run_gpu_optimization_no_grad(self, activations, labels, input_length):
        """A CPU graph (no grad) must be lifted and give the same costs."""
        cpu_train = self.setup_cpu_op(
            activations, labels, input_length, compute_grad=False
        )
        cpu_cost = cpu_train()
        # Compile CPU function with optimization
        cpu_lifted_test = self.setup_cpu_op(
            activations, labels, input_length, compute_grad=False, mode=mode_with_gpu
        )
        # Check whether Op is lifted to the GPU
        assert self.has_only_gpu_op(cpu_lifted_test)
        gpu_cost = cpu_lifted_test()
        # Transfer costs from GPU memory to host
        cost_from_gpu = np.asarray(gpu_cost)
        # Compare values from CPU and GPU Ops
        utt.assert_allclose(cpu_cost, cost_from_gpu)
    def has_only_gpu_op(self, function):
        """True iff the graph has the GPU CTC op and no CPU CTC op."""
        has_cpu_instance = False
        has_gpu_instance = False
        for node in function.maker.fgraph.apply_nodes:
            if isinstance(node.op, ConnectionistTemporalClassification):
                has_cpu_instance = True
            if isinstance(node.op, GpuConnectionistTemporalClassification):
                has_gpu_instance = True
        return has_gpu_instance and (not has_cpu_instance)
    # Test obtained from Torch tutorial at:
    # https://github.com/baidu-research/warp-ctc/blob/master/torch_binding/TUTORIAL.md
    def test_torch_case(self):
        (
            activations,
            labels,
            activation_times,
            expected_costs,
            expected_grads,
        ) = setup_torch_case()
        self.check_ctc(
            activations, labels, activation_times, expected_costs, expected_grads
        )
    def test_ctc(self):
        (
            activations,
            labels,
            input_length,
            expected_costs,
            expected_grads,
        ) = setup_ctc_case()
        self.check_ctc(
            activations, labels, input_length, expected_costs, expected_grads
        )
    def test_verify_grad(self):
        def ctc_op_functor(labels, in_lengths):
            def wrapper(acts):
                # Create auxiliary symbolic variables
                t_activation_times = aesara.shared(in_lengths, name="activation_times")
                t_labels = aesara.shared(labels, name="labels")
                return gpu_ctc(acts, t_labels, t_activation_times)
            return wrapper
        activations, labels, activation_times = setup_grad_case()
        ctc_op = ctc_op_functor(labels, activation_times)
        utt.verify_grad(ctc_op, [activations], mode=mode_with_gpu)
# This source diff could not be displayed because it is too large. You can view the blob instead.
import numpy as np
import pytest
import aesara
import aesara.scalar as aes
import aesara.tensor as at
pygpu = pytest.importorskip("pygpu")
gpuarray = pygpu.ndgpuarray
from copy import copy
from aesara.compile.debugmode import DebugMode
from aesara.compile.mode import Mode
from aesara.gpuarray.dnn import GpuDnnReduction
from aesara.gpuarray.elemwise import (
GpuCAReduceCPY,
GpuCAReduceCuda,
GpuDimShuffle,
GpuElemwise,
GpuErfcinv,
GpuErfinv,
)
from aesara.gpuarray.type import GpuArrayType, get_context, gpuarray_shared_constructor
from aesara.link.basic import PerformLinker
from aesara.link.c.basic import CLinker
from aesara.tensor.math import erfcinv, erfinv, mul, tanh
from aesara.tensor.type import bvector, float_dtypes, fmatrix, fvector, vector
from tests.gpuarray.config import mode_with_gpu, mode_without_gpu, test_ctx_name
from tests.gpuarray.test_basic_ops import rand_gpuarray
from tests.tensor import test_elemwise
from tests.unittest_tools import assert_allclose
# This is actually a test for GpuElemwise
class TestGpuBroadcast(test_elemwise.TestBroadcast):
    """Reuse the CPU broadcast test suite, swapping in the GPU Op and type."""
    cop = GpuElemwise
    ctype = GpuArrayType
    # The order is important
    linkers = [PerformLinker, CLinker]
    def rand_cval(self, shp):
        # Random test data is created directly as a GPU ndarray.
        return rand_gpuarray(*shp, cls=gpuarray)
def test_elemwise_pow():
    # Test that GpuElemwise(pow) can compile with any combination of integer
    # or float input dtype.
    all_dtypes = [
        "uint8",
        "uint16",
        "uint32",
        "uint64",
        "int8",
        "int16",
        "int32",
        "int64",
        "float16",
        "float32",
        "float64",
    ]
    for base_dtype in all_dtypes:
        for exp_dtype in all_dtypes:
            # Compile a gpu function for this dtype combination.
            base_data = np.random.randint(0, 5, size=10).astype(base_dtype)
            exp_data = np.random.randint(0, 3, size=10).astype(exp_dtype)
            sym_base = vector(dtype=base_dtype)
            shared_exp = gpuarray_shared_constructor(exp_data)
            assert shared_exp.dtype == exp_dtype
            result = sym_base**shared_exp
            fn = aesara.function([sym_base], result, mode=mode_with_gpu)
            # We don't transfer to the GPU when the output dtype is int*
            gpu_node_count = sum(
                isinstance(node.op, GpuElemwise)
                for node in fn.maker.fgraph.apply_nodes
            )
            assert gpu_node_count == (result.dtype in float_dtypes)
            # Call the function to make sure the compiled output is valid.
            assert_allclose(fn(base_data), base_data**exp_data)
class TestMathErrorFunctions:
    """Check the GPU elemwise erfinv/erfcinv Ops against SciPy references.

    ``setup_class`` precomputes, for every tested dtype, one large input array
    together with the expected SciPy outputs, and derives DebugMode-safe
    variants of the global compilation modes (the expected outputs contain
    infinities, so the isfinite check must be disabled under DebugMode).

    The erfinv and erfcinv tests previously duplicated ~40 lines of identical
    logic; it now lives in ``_check_inverse_erf``.
    """

    dtypes = ["float64", "float32", "float16"]
    default_arrays = {}
    expected_erfinv_outputs = {}
    expected_erfcinv_outputs = {}

    @classmethod
    def setup_class(cls):
        scipy_special = pytest.importorskip("scipy.special")
        # NB: erfinv is defined in ]-1;1[, and erfcinv is defined in ]0;2[,
        # so we just take some values in an interval that covers both domains
        # (this will also allow to test some values outside the domains).
        # We take [-5;5[ by default and we concatenate it 1000 times
        # to have the GPU ops run on large data.
        default_array = [x / 10.0 for x in range(-50, 50)] * 1000
        for dtype in cls.dtypes:
            numpy_array = np.asarray(default_array, dtype=dtype)
            cls.default_arrays[dtype] = numpy_array
            cls.expected_erfinv_outputs[dtype] = scipy_special.erfinv(numpy_array)
            cls.expected_erfcinv_outputs[dtype] = scipy_special.erfcinv(numpy_array)
        # Since there are infinite values, we need to disable that check
        # in DebugMode if needed
        if isinstance(mode_with_gpu, DebugMode):
            cls.mode_with_gpu = copy(mode_with_gpu)
            cls.mode_with_gpu.check_isfinite = False
        else:
            cls.mode_with_gpu = mode_with_gpu
        if isinstance(mode_without_gpu, DebugMode):
            cls.mode_without_gpu = copy(mode_without_gpu)
            cls.mode_without_gpu.check_isfinite = False
        else:
            cls.mode_without_gpu = mode_without_gpu

    def check_gpu_scalar_op(self, aesara_function, scalar_optype):
        """Return True iff the graph contains a GpuElemwise wrapping
        ``scalar_optype``; on failure, print the graph for debugging."""
        for node in aesara_function.maker.fgraph.apply_nodes:
            if isinstance(node.op, GpuElemwise) and isinstance(
                node.op.scalar_op, scalar_optype
            ):
                return True
        aesara.printing.debugprint(aesara_function)
        return False

    def _check_inverse_erf(self, graph_fn, fn_name, gpu_scalar_op, expected_outputs):
        # Shared body of test_elemwise_erfinv/test_elemwise_erfcinv: compile
        # the graph for host and GPU, check that the GPU graph uses the
        # expected scalar Op, then compare both results against SciPy.
        for dtype in self.dtypes:
            vec = vector(dtype=dtype)
            output = graph_fn(vec)
            f_host = aesara.function(
                [vec],
                output,
                name="HOST/" + fn_name + "/" + dtype,
                mode=self.mode_without_gpu,
            )
            f_gpu = aesara.function(
                [vec],
                output,
                name="GPU/" + fn_name + "/" + dtype,
                mode=self.mode_with_gpu,
            )
            # The host graph must not contain any GPU elemwise node.
            assert (
                len(
                    [
                        n
                        for n in f_host.maker.fgraph.apply_nodes
                        if isinstance(n.op, GpuElemwise)
                    ]
                )
                == 0
            )
            # The specialized scalar Op is only generated on non-OpenCL devices.
            if not aesara.config.device.startswith("opencl"):
                assert self.check_gpu_scalar_op(
                    f_gpu, gpu_scalar_op
                ), f'Function graph does not contains scalar op "{gpu_scalar_op.__name__}".'
            vector_val = self.default_arrays[dtype]
            # Warm-up calls, then compare host/GPU outputs and the reference.
            f_host(vector_val)
            f_gpu(vector_val)
            out_host = f_host(vector_val)
            out_gpu = f_gpu(vector_val)
            assert_allclose(out_host, out_gpu)
            assert_allclose(expected_outputs[dtype], out_gpu)

    def test_elemwise_erfinv(self):
        self._check_inverse_erf(
            erfinv, "erfinv", GpuErfinv, self.expected_erfinv_outputs
        )

    def test_elemwise_erfcinv(self):
        self._check_inverse_erf(
            erfcinv, "erfcinv", GpuErfcinv, self.expected_erfcinv_outputs
        )
class TestFloat16:
    """Smoke tests for float16 support in GPU elemwise graphs and casts."""

    def test_composite_elemwise_float16(self):
        # Elemwise composites mixing float16 with other dtypes must compile.
        bytes_in = bvector()
        halfs = vector(dtype="float16")
        floats = fvector()
        hidden = tanh(halfs + at.cast(floats, "float16"))
        expr = (
            hidden
            - hidden**2
            + at.cast(halfs, "int16")
            + at.cast(halfs, "float32")
            + at.cast(bytes_in, "float16")
            - at.constant(np.float16(1.0))
        )
        aesara.function([bytes_in, halfs, floats], expr, mode=mode_with_gpu)
        # A switch over a three-way float16 multiply must also compile.
        cond = vector(dtype="uint8")
        h1 = vector(dtype="float16")
        h2 = vector(dtype="float16")
        h3 = vector(dtype="float16")
        h4 = vector(dtype="float16")
        selected = at.switch(cond, mul(h1, h2, h3), h4)
        aesara.function([cond, h1, h2, h3, h4], selected, mode=mode_with_gpu)

    def test_cast_float16(self):
        # Exercise casts to/from float16 inside one compiled function.
        half_in = vector(dtype="float16")
        float_in = fvector()
        int_in = bvector()
        fn = aesara.function(
            [half_in, float_in, int_in],
            [
                half_in.astype("float32"),
                float_in.astype("float16"),
                float_in.astype("float64"),
                half_in.astype("int8"),
                float_in.astype("int8"),
                int_in.astype("float16"),
                int_in.astype("float32"),
            ],
            mode=mode_with_gpu,
        )
        half_data = (np.random.rand(4) * 10).astype("float16")
        float_data = (np.random.rand(5) * 10).astype("float32")
        int_data = (np.random.rand(6) * 10).astype("int8")
        results = fn(half_data, float_data, int_data)
        # Map each output back to the host array matching its input's dtype.
        by_dtype = {"float16": half_data, "float32": float_data}
        for pos, out in enumerate(fn.outputs):
            target_dtype = out.variable.dtype
            assert results[pos].dtype == target_dtype
            source = by_dtype.get(out.variable.owner.inputs[0].dtype, int_data)
            assert_allclose(source.astype(target_dtype), results[pos])
class TestGpuDimShuffle(test_elemwise.TestDimShuffle):
    """Reuse the CPU DimShuffle test suite with the GPU Op."""
    op = GpuDimShuffle
class TestGpuCAReduceCPY(test_elemwise.TestCAReduce):
    """Run the generic CAReduce test suite against the GpuCAReduceCPY Op."""
    dtypes = ["float32"]
    bin_dtypes = ["uint8", "int8"]
    op = GpuCAReduceCPY
    reds = [aes.add, aes.mul]
    # Optional scalar Op applied to inputs before reduction (none here).
    pre_scalar_op = None
    mode = mode_with_gpu
    def test_perform(self):
        """Run each dtype/reduction combination with the Python linker."""
        for dtype in self.dtypes + self.bin_dtypes:
            for op in self.reds:
                self.with_mode(
                    Mode(linker="py", optimizer=mode_with_gpu.optimizer),
                    op,
                    dtype=dtype,
                    pre_scalar_op=self.pre_scalar_op,
                )
    def test_perform_nan(self):
        """Same as test_perform, but with NaN inputs (float dtypes only)."""
        for dtype in self.dtypes:
            if not dtype.startswith("float"):
                continue
            for op in self.reds:
                self.with_mode(
                    Mode(linker="py", optimizer=mode_with_gpu.optimizer),
                    op,
                    dtype=dtype,
                    test_nan=True,
                    pre_scalar_op=self.pre_scalar_op,
                )
    def test_c(self):
        """Run each dtype/reduction combination with the C linker."""
        for dtype in self.dtypes + self.bin_dtypes:
            for op in self.reds:
                self.with_mode(
                    Mode(linker="c", optimizer=mode_with_gpu.optimizer),
                    op,
                    dtype=dtype,
                    pre_scalar_op=self.pre_scalar_op,
                )
    def test_c_nan(self):
        """Same as test_c, but with NaN inputs (float dtypes only)."""
        for dtype in self.dtypes:
            if not dtype.startswith("float"):
                continue
            for op in self.reds:
                self.with_mode(
                    Mode(linker="c", optimizer=mode_with_gpu.optimizer),
                    op,
                    dtype=dtype,
                    test_nan=True,
                    pre_scalar_op=self.pre_scalar_op,
                )
    def test_infer_shape(self):
        """Check shape inference for every tested dtype."""
        for dtype in self.dtypes:
            super().test_infer_shape(dtype)
class TestGpuCAReduceCuda(TestGpuCAReduceCPY):
    """CUDA-specific CAReduce tests over an extensive list of shape/axis cases."""
    dtypes = ["float32", "int64"]
    bin_dtypes = ["uint8", "int8"]
    # Each entry is (input_shape, axis_spec); axis_spec None reduces all axes.
    # Trailing comments like "# 10"/"# 0111" name the reduction mask pattern.
    cases = [
        ((5, 6), None),
        ((5, 6), (0, 1)),
        ((5, 6), (0,)),
        ((5, 6), (1,)),
        ((5, 6), (-1,)),
        ((5, 6), (-2,)),
        # ((5, 6), ()), #reduce on no axis(copy) isn't implemented
        # ((2, 3, 4, 5), (0, 1, 3)), mask 1101 isn't implemented
        # ((2, 3, 4, 5), (-2, -3)), mask 0110 isn't implemented
        ((5, 0), None),
        ((5, 0), (0,)),
        ((5, 0), (1,)),
        # ((5, 0), ()), reduce on no axis isn't implemented
        # ((), None), reduce on no axis isn't implemented
        # ((), ()) reduce on no axis isn't implemented
        # Test all GPU cases implemented
        ((1, 0), (1,)),
        ((0, 1), (1,)),
        ((0, 0), (1,)),
        ((0, 0, 0), (1, 2)),
        ((0, 0, 0, 0), (1, 2, 3)),
        ((2, 1), (1,)),
        ((1, 2), (1,)),
        ((100, 3, 1300), [1]),
        ((0,), [0]),
        ((5,), [0]),
        ((0, 0), [0, 1]),
        ((1, 0), [0, 1]),
        ((5, 4), [0, 1]),
        ((33, 31), [0, 1]),
        ((5, 4), [1]),
        ((5, 4), [0]),  # need something bigger then 32 for some opt test.
        ((5, 4, 3), [0]),
        ((5, 4, 3), [1]),
        ((5, 4, 3), [0, 1]),
        ((5, 4, 3), [2]),
        ((5, 4, 3), [1, 2]),
        ((5, 4, 3), [0, 1, 2]),
        ((0, 0, 0, 0), [0, 1, 2, 3]),
        ((5, 4, 3, 20), [2, 3]),
        ((5, 4, 3, 2), [0, 1, 2, 3]),
        ((5, 4, 3, 2), [0, 2, 3]),
        ((5, 4, 3, 2), [1, 2, 3]),
        # test shape bigger then 4096 on each dimension to make sure that we work correctly when we don't have enough thread/block in each dimensions
        ((4100, 3), [0]),
        ((3, 4101), [0]),  # 10
        ((1024, 33), [0]),
        ((33, 1024), [0]),  # 10
        ((1025, 33), [0]),
        ((33, 1025), [0]),  # 10
        ((4100, 3), [1]),
        ((3, 4101), [1]),  # 01
        ((1024, 33), [1]),
        ((33, 1024), [1]),  # 01
        ((1025, 33), [1]),
        ((33, 1025), [1]),  # 01
        ((4100, 3), [0, 1]),
        ((3, 4101), [0, 1]),  # 11
        ((1024, 33), [0, 1]),
        ((33, 1024), [0, 1]),  # 01
        ((1025, 33), [0, 1]),
        ((33, 1025), [0, 1]),  # 01
        ((4100, 4, 3), [0]),
        ((5, 4100, 3), [0]),
        ((5, 4, 4100), [0]),
        ((3, 65536, 1), [0]),  # 100
        ((4100, 4, 3), [1]),
        ((5, 4100, 3), [1]),
        ((5, 4, 4100), [1]),  # 010
        ((4100, 4, 3), [2]),
        ((5, 4100, 3), [2]),
        ((5, 4, 4100), [2]),  # 001
        ((4100, 4, 3), [0, 1]),
        ((5, 4100, 3), [0, 1]),
        ((5, 4, 4100), [0, 1]),  # 110
        ((4100, 4, 3), [1, 2]),
        ((5, 4100, 3), [1, 2]),
        ((5, 4, 4100), [1, 2]),  # 011
        ((4100, 4, 3), [0, 2]),
        ((5, 4100, 3), [0, 2]),
        ((5, 4, 4100), [0, 2]),  # 101
        ((4100, 4, 3), [0, 1, 2]),
        ((5, 4100, 3), [0, 1, 2]),
        ((5, 4, 4100), [0, 1, 2]),  # 111
        ((65, 4, 3), [0, 1, 2]),
        ((5, 65, 3), [0, 1, 2]),
        ((5, 4, 65), [0, 1, 2]),  # 111
        # reduce over 2d
        ((4100, 4, 3, 2), [2, 3]),
        ((4, 4100, 3, 2), [2, 3]),
        ((4, 3, 4100, 2), [2, 3]),
        ((4, 3, 2, 4100), [2, 3]),  # 0011
        ((4100, 4, 3, 2), [1, 3]),
        ((4, 4100, 3, 2), [1, 3]),
        ((4, 3, 4100, 2), [1, 3]),
        ((4, 3, 2, 4100), [1, 3]),  # 0101
        # ((4100, 4, 3, 2), [1, 2]), ((4, 4100, 3, 2), [1, 2]), ((4, 3, 4100, 2), [1, 2]), ((4, 3, 2, 4100), [1, 2]), # 0110 by reshape
        # ((4100,4,3,2),[0,3]),((4,4100,3,2),[0,3]),((4,3,4100,2),[0,3]),((4,3,2,4100),[0,3]), # 1001 by reshape
        # ((4100,4,3,2),[0,2]),((4,4100,3,2),[0,2]),((4,3,4100,2),[0,2]),((4,3,2,4100),[0,2]), # 1010 not implemented
        # ((4100, 4, 3, 2), [0, 1]), ((4, 4100, 3, 2), [0, 1]), ((4, 3, 4100, 2), [0, 1]), ((4, 3, 2, 4100), [0, 1]), # 1100 by reshape
        # reduce over 3d
        # 3d not tested: 1101, 1110, 1111
        # ((4100,4,3,2),[0,1,3]),((4,4100,3,2),[0,1,3]),((4,3,4100,2),[0,1,3]),((4,3,2,4100),[0,1,3]), # 1101 by reshape
        # ((4100, 4, 3, 2), [0, 1, 2]), ((4, 4100, 3, 2), [0, 1, 2]), ((4, 3, 4100, 2), [0, 1, 2]), ((4, 3, 2, 4100), [0, 1, 2]), # 1110 by reshape
        ((4100, 4, 3, 2), [0, 2, 3]),
        ((4, 4100, 3, 2), [0, 2, 3]),
        ((4, 3, 4100, 2), [0, 2, 3]),  # ((4,3,2,4100),[0,2,3]), # 1011
        ((4100, 4, 3, 2), [1, 2, 3]),
        ((4, 4100, 3, 2), [1, 2, 3]),
        ((4, 3, 4100, 2), [1, 2, 3]),
        ((4, 3, 2, 4100), [1, 2, 3]),  # 0111
        ((65, 4, 3, 2), [1, 2, 3]),
        ((4, 65, 3, 2), [1, 2, 3]),
        ((4, 3, 65, 2), [1, 2, 3]),
        ((4, 3, 2, 65), [1, 2, 3]),  # 0111
        # reduce over 4d
        ((4100, 2, 3, 4), [0, 1, 2, 3]),
        ((2, 4100, 3, 4), [0, 1, 2, 3]),
        ((2, 3, 4100, 4), [0, 1, 2, 3]),
        ((2, 3, 4, 4100), [0, 1, 2, 3]),
        ((128, 1, 3, 3), [0, 1, 2, 3]),  # 1111
        # test pattern implemented by reshape
        # Skip them as this test the op directly, not the optimization with reshape
        # ((4100,4,3,2),[0]),((4,4100,3,2),[0]),((4,3,4100,2),[0]),((4,3,2,4100),[0]),#1000
        # ((4100,4,3,2),[1]),((4,4100,3,2),[1]),((4,3,4100,2),[1]),((4,3,2,4100),[1]),#0100
        # ((4100,4,3,2),[2]),((4,4100,3,2),[2]),((4,3,4100,2),[2]),((4,3,2,4100),[2]),#0010
        # ((4100,4,3,2),[3]),((4,4100,3,2),[3]),((4,3,4100,2),[3]),((4,3,2,4100),[3]),#0001
        # ((1100,2,3,4,5),[0,1,2,3,4]),((2,1100,3,4,5),[0,1,2,3,4]),((2,3,1100,4,5),[0,1,2,3,4]),((2,3,4,1100,5),[0,1,2,3,4]),((2,3,4,5,1100),[0,1,2,3,4]),#11111
        # ((5,4,3,10,11),[1,2]),
    ]
    op = GpuCAReduceCuda
    reds = [aes.add, aes.mul, aes.scalar_maximum, aes.scalar_minimum]
    pre_scalar_op = None
    # The "py" linker variants inherited from the parent class are disabled:
    # this Op is CUDA-only and has no Python implementation to test.
    def test_perform_noopt(self):
        return
    def test_perform(self):
        return
    def test_perform_nan(self):
        return
    def setup_method(self):
        # These tests only make sense on a CUDA device.
        super().setup_method()
        if get_context(test_ctx_name).kind != b"cuda":
            pytest.skip("Cuda specific tests")
class TestGpuReduceDtype(test_elemwise.TestReduceDtype):
    """Run the CPU reduction-dtype test suite against the GPU reduction Ops."""
    mode = mode_with_gpu.excluding("local_cut_useless_reduce")
    # GpuDnnReduction doesn't cover all cases, but should cover some
    op = (GpuCAReduceCuda, GpuDnnReduction)
    # Currently we don't support reduction on 0 axis
    # NOTE(review): axis 1 appears twice below; the second entry may have been
    # meant as -1 — confirm before relying on this list's coverage.
    axes = [None, 0, 1, 1, [0], [1], [0, 1]]
    # We don't support complex dtype
    dtypes = [
        "int8",
        "int16",
        "int32",
        "int64",
        "uint8",
        "uint16",
        "uint32",
        "uint64",
        "float32",
        "float64",
    ]
    def setup_method(self):
        # These tests only make sense on a CUDA device.
        if get_context(test_ctx_name).kind != b"cuda":
            pytest.skip("Cuda specific tests")
def speed_reduce10():
    """Rough benchmark helper: column sums of a 1000x1000 float32 matrix,
    both directly and through a transpose."""
    values = np.random.rand(1000, 1000).astype("float32")
    m = fmatrix()
    fn = aesara.function([m], [m.sum(axis=0), m.T.sum(axis=0)], mode=mode_with_gpu)
    fn(values)
from functools import partial
from itertools import product
import numpy as np
import pytest
import aesara
import aesara.tensor.math as tm
from aesara.gpuarray.extra_ops import GpuCumOp
from aesara.gpuarray.type import get_context
from aesara.tensor.extra_ops import CumOp
from aesara.tensor.type import fmatrix, ftensor3, ftensor4, fvector, tensor3
from tests import unittest_tools as utt
from tests.gpuarray.config import mode_with_gpu, test_ctx_name
from tests.tensor.test_extra_ops import TestCumOp
class TestGpuCumOp(TestCumOp):
    """Run the CumOp test suite through GpuCumOp on a CUDA device.

    The GPU kernel is limited by the device's block/grid sizes, so several
    tests size their inputs from ``max_threads_dim0``/``max_grid_size1`` to
    exercise the multi-block and recursive code paths.
    """

    # NOTE: the decorators below were previously spelled
    # ``@pytest.mark.parametrized``, which is not a pytest marker; pytest
    # would then fail each test with "fixture 'mode' not found".
    # The correct marker name is ``parametrize``.

    mode = mode_with_gpu

    def setup_method(self):
        super().setup_method()
        test_ctx = get_context(test_ctx_name)
        if test_ctx.kind != b"cuda":
            pytest.skip("Cuda specific tests")
        # Device limits used to build inputs that span multiple blocks/grids.
        self.max_threads_dim0 = test_ctx.maxlsize0
        self.max_grid_size1 = test_ctx.maxgsize2
        self.op_class = CumOp
        # The CPU implementation is not so accurate, which throws out DebugMode.
        # Since propagating .tag.values_eq_approx to the output of every
        # GpuFromHost seems overkill, we just relax the rtol for these tests
        self.old_rtol = tm.float32_rtol
        tm.float32_rtol *= 2

    def teardown_method(self):
        super().teardown_method()
        # Restore rtol
        tm.float32_rtol = self.old_rtol

    @pytest.mark.skipif(
        aesara.config.floatX != "float32",
        reason=f"Gpucumop not implemented for dtype {aesara.config.floatX}",
    )
    @pytest.mark.parametrize("mode", ["mul", "add"])
    def test_infer_shape(self, mode):
        """Check shape inference of GpuCumOp over every axis of a 3d input."""
        op_class = partial(self.op_class, mode=mode)
        x = tensor3("x")
        a = np.random.random((3, 5, 2)).astype(aesara.config.floatX)
        for axis in range(-len(a.shape), len(a.shape)):
            self._compile_and_check([x], [op_class(axis=axis)(x)], [a], GpuCumOp)

    @pytest.mark.parametrize("mode", ["mul", "add"])
    def test_Strides1D(self, mode):
        """Check 1d cumsum/cumprod over strided (sliced) inputs."""
        op_class = partial(self.op_class, mode=mode)
        np_func = dict(add=np.cumsum, mul=np.cumprod)[mode]
        x = fvector("x")
        for axis in (0, None, -1):
            a = np.random.random((42,)).astype("float32")
            cumop_function = aesara.function(
                [x], op_class(axis=axis)(x), mode=self.mode
            )
            slicings = [
                slice(None, None, None),  # Normal strides
                slice(None, None, 2),  # Stepped strides
                slice(None, None, -1),  # Negative strides
            ]
            # Cartesian product of all slicings to test.
            for slicing in product(slicings, repeat=x.ndim):
                f = aesara.function(
                    [x], op_class(axis=axis)(x[slicing]), mode=self.mode
                )
                assert [
                    n for n in f.maker.fgraph.toposort() if isinstance(n.op, GpuCumOp)
                ]
                utt.assert_allclose(np_func(a[slicing], axis=axis), f(a))
                utt.assert_allclose(
                    np_func(a[slicing], axis=axis), cumop_function(a[slicing])
                )

    @pytest.mark.parametrize("mode", ["mul", "add"])
    def test_Strides2D(self, mode):
        """Check 2d cumsum/cumprod over strided (sliced) inputs."""
        np_func = dict(add=np.cumsum, mul=np.cumprod)[mode]
        op_class = partial(self.op_class, mode=mode)
        x = fmatrix("x")
        for axis in (0, 1, None, -1, -2):
            a = np.random.random((42, 30)).astype("float32")
            cumop_function = aesara.function(
                [x], op_class(axis=axis)(x), mode=self.mode
            )
            slicings = [
                slice(None, None, None),  # Normal strides
                slice(None, None, 2),  # Stepped strides
                slice(None, None, -1),  # Negative strides
            ]
            # Cartesian product of all slicings to test.
            for slicing in product(slicings, repeat=x.ndim):
                f = aesara.function(
                    [x], op_class(axis=axis)(x[slicing]), mode=self.mode
                )
                assert [
                    n for n in f.maker.fgraph.toposort() if isinstance(n.op, GpuCumOp)
                ]
                utt.assert_allclose(np_func(a[slicing], axis=axis), f(a))
                utt.assert_allclose(
                    np_func(a[slicing], axis=axis), cumop_function(a[slicing])
                )

    @pytest.mark.parametrize("mode", ["mul", "add"])
    def test_Strides3D(self, mode):
        """Check 3d cumsum/cumprod over strided (sliced) inputs."""
        np_func = dict(add=np.cumsum, mul=np.cumprod)[mode]
        op_class = partial(self.op_class, mode=mode)
        x = ftensor3("x")
        for axis in (0, 1, 2, None, -1, -2, -3):
            a = np.random.random((42, 30, 25)).astype("float32")
            cumop_function = aesara.function(
                [x], op_class(axis=axis)(x), mode=self.mode
            )
            slicings = [
                slice(None, None, None),  # Normal strides
                slice(None, None, 2),  # Stepped strides
                slice(None, None, -1),  # Negative strides
            ]
            # Cartesian product of all slicings to test.
            for slicing in product(slicings, repeat=x.ndim):
                f = aesara.function(
                    [x], op_class(axis=axis)(x[slicing]), mode=self.mode
                )
                assert [
                    n for n in f.maker.fgraph.toposort() if isinstance(n.op, GpuCumOp)
                ]
                utt.assert_allclose(np_func(a[slicing], axis=axis), f(a))
                utt.assert_allclose(
                    np_func(a[slicing], axis=axis), cumop_function(a[slicing])
                )

    @pytest.mark.parametrize("mode", ["mul", "add"])
    def test_GpuCumOp1D(self, mode):
        """Check 1d results against NumPy, including multi-block and
        recursive kernel paths."""
        np_func = dict(add=np.cumsum, mul=np.cumprod)[mode]
        op_class = partial(self.op_class, mode=mode)
        block_max_size = self.max_threads_dim0 * 2
        x = fvector("x")
        f = aesara.function([x], op_class(axis=0)(x), mode=self.mode)
        assert [n for n in f.maker.fgraph.toposort() if isinstance(n.op, GpuCumOp)]
        # Extensive testing for the first 1025 sizes
        a = np.random.random(1025).astype("float32")
        for i in range(a.shape[0]):
            utt.assert_allclose(np_func(a[:i]), f(a[:i]))
        # Use multiple GPU threadblocks
        a = np.random.random((block_max_size + 2,)).astype("float32")
        utt.assert_allclose(np_func(a), f(a))
        # Use recursive cumop
        a = np.ones((block_max_size * (block_max_size + 1) + 2,), dtype="float32")
        utt.assert_allclose(np_func(a), f(a))

    @pytest.mark.parametrize("mode", ["mul", "add"])
    def test_GpuCumOp2D(self, mode):
        """Check 2d results against NumPy over every axis, including
        multi-block, multi-grid and recursive kernel paths."""
        np_func = dict(add=np.cumsum, mul=np.cumprod)[mode]
        op_class = partial(self.op_class, mode=mode)
        block_max_size = self.max_threads_dim0 * 2
        x = fmatrix("x")
        for shape_axis, axis in zip([0, 1, 0, 1, 0], [0, 1, None, -1, -2]):
            f = aesara.function([x], op_class(axis=axis)(x), mode=self.mode)
            assert [n for n in f.maker.fgraph.toposort() if isinstance(n.op, GpuCumOp)]
            # Extensive testing for the first 1025 sizes
            a_shape = [5, 5]
            a_shape[shape_axis] = 1025
            a = np.random.random(a_shape).astype("float32")
            slices = [slice(None), slice(None)]
            for i in range(a.shape[shape_axis]):
                slices[shape_axis] = slice(i)
                fa = f(a[slices])
                npa = np_func(a[slices], axis=axis)
                utt.assert_allclose(npa, fa)
            # Use multiple GPU threadblocks
            a_shape = [5, 5]
            a_shape[shape_axis] = block_max_size + 2
            a = np.random.random(a_shape).astype("float32")
            utt.assert_allclose(np_func(a, axis=axis), f(a))
            # Use multiple GPU gridblocks
            a_shape = [4, 4]
            a_shape[1 - shape_axis] = self.max_grid_size1 + 1
            a = np.random.random(a_shape).astype("float32")
            utt.assert_allclose(np_func(a, axis=axis), f(a), rtol=5e-5)
            # Use recursive cumop
            a_shape = [3, 3]
            a_shape[shape_axis] = block_max_size * (block_max_size + 1) + 2
            a = np.random.random(a_shape).astype("float32")
            a = np.sign(a - 0.5).astype("float32")  # Avoid floating point error
            utt.assert_allclose(np_func(a, axis=axis), f(a))

    @pytest.mark.parametrize("mode", ["mul", "add"])
    def test_GpuCumOp3D(self, mode):
        """Check 3d results against NumPy over every axis, including
        multi-block, multi-grid and recursive kernel paths."""
        np_func = dict(add=np.cumsum, mul=np.cumprod)[mode]
        op_class = partial(self.op_class, mode=mode)
        block_max_size = self.max_threads_dim0 * 2
        x = ftensor3("x")
        for shape_axis, axis in zip([0, 1, 2, 0, 2, 1, 0], [0, 1, 2, None, -1, -2, -3]):
            f = aesara.function([x], op_class(axis=axis)(x), mode=self.mode)
            assert [n for n in f.maker.fgraph.toposort() if isinstance(n.op, GpuCumOp)]
            # Extensive testing for the first 1025 sizes
            a_shape = [5, 5, 5]
            a_shape[shape_axis] = 1025
            a = np.random.rand(*a_shape).astype("float32")
            slices = [slice(None), slice(None), slice(None)]
            for i in range(a.shape[shape_axis]):
                slices[shape_axis] = slice(i)
                fa = f(a[slices])
                npa = np_func(a[slices], axis=axis)
                utt.assert_allclose(npa, fa)
            # Use multiple GPU threadblocks (along accumulation axis)
            a_shape = [2, 2, 2]
            a_shape[shape_axis] = block_max_size + 2
            a = np.random.random(a_shape).astype("float32")
            utt.assert_allclose(np_func(a, axis=axis), f(a))
            # Use multiple GPU gridblocks (not along accumulation axis)
            a_shape = [5, 5, 5]
            a_shape[(shape_axis + 1) % 3] = self.max_grid_size1 + 1
            a = np.random.random(a_shape).astype("float32")
            if axis is None:
                # Avoid floating point error
                a = np.sign(a - 0.5).astype("float32")
            utt.assert_allclose(np_func(a, axis=axis), f(a))
            a_shape = [5, 5, 5]
            a_shape[(shape_axis + 2) % 3] = self.max_grid_size1 + 1
            a = np.random.random(a_shape).astype("float32")
            if axis is None:
                # Avoid floating point error
                a = np.sign(a - 0.5).astype("float32")
            utt.assert_allclose(np_func(a, axis=axis), f(a))
            # Use recursive cumop (along accumulation axis)
            a_shape = [3, 3, 3]
            a_shape[shape_axis] = block_max_size * (block_max_size + 1) + 2
            a = np.random.random(a_shape).astype("float32")
            a = np.sign(a - 0.5).astype("float32")  # Avoid floating point error
            utt.assert_allclose(np_func(a, axis=axis), f(a))

    @pytest.mark.parametrize("mode", ["mul", "add"])
    def test_GpuCumOp4D(self, mode):
        """4d inputs are not supported on the GPU; the graph must keep CumOp."""
        op_class = partial(self.op_class, mode=mode)
        # Should not use the GPU version.
        x = ftensor4("x")
        f = aesara.function([x], op_class(axis=1)(x), mode=self.mode)
        assert [n for n in f.maker.fgraph.toposort() if isinstance(n.op, CumOp)]
import numpy as np
import pytest
import aesara
import aesara.gpuarray.fft
from aesara.gpuarray.fft import pycuda_available, pygpu_available, skcuda_available
from aesara.tensor.type import matrix
from tests import unittest_tools as utt
from tests.gpuarray.config import mode_with_gpu
# Skip tests if pygpu is not available.
# The cuFFT wrappers need pygpu, scikit-cuda and pycuda at import time.
if not pygpu_available: # noqa
    pytest.skip("Optional package pygpu not available", allow_module_level=True)
if not skcuda_available: # noqa
    pytest.skip("Optional package scikit-cuda not available", allow_module_level=True)
if not pycuda_available: # noqa
    pytest.skip("Optional package pycuda not available", allow_module_level=True)
# Transform sizes
# (edge length of every FFT test input in this module)
N = 32
class TestFFT:
    """Compare the GPU cuFFT Ops (curfft/cuirfft) against numpy.fft.

    GPU rFFT outputs store the complex result in a trailing axis of size 2
    (real part at index 0, imaginary part at index 1).
    """
    def test_1Dfft(self):
        """Forward/inverse 1d rFFT round trip plus gradient verification."""
        inputs_val = np.random.random((1, N)).astype("float32")
        x = matrix("x", dtype="float32")
        rfft = aesara.gpuarray.fft.curfft(x)
        f_rfft = aesara.function([x], rfft, mode=mode_with_gpu)
        res_rfft = f_rfft(inputs_val)
        # Rebuild the complex result from the trailing real/imag axis.
        res_rfft_comp = np.asarray(res_rfft[:, :, 0]) + 1j * np.asarray(
            res_rfft[:, :, 1]
        )
        rfft_ref = np.fft.rfft(inputs_val, axis=1)
        utt.assert_allclose(rfft_ref, res_rfft_comp)
        m = rfft.type()
        irfft = aesara.gpuarray.fft.cuirfft(m)
        f_irfft = aesara.function([m], irfft, mode=mode_with_gpu)
        res_irfft = f_irfft(res_rfft)
        utt.assert_allclose(inputs_val, np.asarray(res_irfft))
        # The numerical gradient of the FFT is sensitive, must set large
        # enough epsilon to get good accuracy.
        eps = 1e-1
        def f_rfft(inp):
            return aesara.gpuarray.fft.curfft(inp)
        inputs_val = np.random.random((1, N)).astype("float32")
        utt.verify_grad(f_rfft, [inputs_val], eps=eps, mode=mode_with_gpu)
        def f_irfft(inp):
            return aesara.gpuarray.fft.cuirfft(inp)
        inputs_val = np.random.random((1, N // 2 + 1, 2)).astype("float32")
        utt.verify_grad(f_irfft, [inputs_val], eps=eps, mode=mode_with_gpu)
    def test_rfft(self):
        """2d forward rFFT against numpy.fft.rfftn."""
        inputs_val = np.random.random((1, N, N)).astype("float32")
        inputs = aesara.shared(inputs_val)
        rfft = aesara.gpuarray.fft.curfft(inputs)
        f_rfft = aesara.function([], rfft, mode=mode_with_gpu)
        res_rfft = f_rfft()
        res_rfft_comp = np.asarray(res_rfft[:, :, :, 0]) + 1j * np.asarray(
            res_rfft[:, :, :, 1]
        )
        rfft_ref = np.fft.rfftn(inputs_val, axes=(1, 2))
        utt.assert_allclose(rfft_ref, res_rfft_comp, atol=1e-4, rtol=1e-4)
    def test_irfft(self):
        """2d inverse rFFT: round trip and comparison with numpy.fft.irfftn."""
        inputs_val = np.random.random((1, N, N)).astype("float32")
        inputs = aesara.shared(inputs_val)
        fft = aesara.gpuarray.fft.curfft(inputs)
        f_fft = aesara.function([], fft, mode=mode_with_gpu)
        res_fft = f_fft()
        m = fft.type()
        ifft = aesara.gpuarray.fft.cuirfft(m)
        f_ifft = aesara.function([m], ifft, mode=mode_with_gpu)
        res_ifft = f_ifft(res_fft)
        utt.assert_allclose(inputs_val, np.asarray(res_ifft))
        inputs_val = np.random.random((1, N, N, 2)).astype("float32")
        inputs = aesara.shared(inputs_val)
        irfft = aesara.gpuarray.fft.cuirfft(inputs)
        f_irfft = aesara.function([], irfft, mode=mode_with_gpu)
        res_irfft = f_irfft()
        inputs_ref = inputs_val[..., 0] + inputs_val[..., 1] * 1j
        irfft_ref = np.fft.irfftn(inputs_ref, axes=(1, 2))
        utt.assert_allclose(irfft_ref, res_irfft, atol=1e-4, rtol=1e-4)
    def test_type(self):
        """Non-float32 inputs must be rejected."""
        inputs_val = np.random.random((1, N)).astype("float64")
        inputs = aesara.shared(inputs_val)
        with pytest.raises(AssertionError):
            aesara.gpuarray.fft.curfft(inputs)
        with pytest.raises(AssertionError):
            aesara.gpuarray.fft.cuirfft(inputs)
    def test_norm(self):
        """Check the "ortho" and "no_norm" normalization modes against the
        (default-normalized) NumPy references."""
        inputs_val = np.random.random((1, N, N)).astype("float32")
        inputs = aesara.shared(inputs_val)
        # Unitary normalization
        rfft = aesara.gpuarray.fft.curfft(inputs, norm="ortho")
        f_rfft = aesara.function([], rfft, mode=mode_with_gpu)
        res_rfft = f_rfft()
        res_rfft_comp = np.asarray(res_rfft[:, :, :, 0]) + 1j * np.asarray(
            res_rfft[:, :, :, 1]
        )
        rfft_ref = np.fft.rfftn(inputs_val, axes=(1, 2))
        utt.assert_allclose(rfft_ref / N, res_rfft_comp, atol=1e-4, rtol=1e-4)
        # No normalization
        # (rfft_ref from above is reused: an unnormalized forward transform
        # matches NumPy's default directly)
        rfft = aesara.gpuarray.fft.curfft(inputs, norm="no_norm")
        f_rfft = aesara.function([], rfft, mode=mode_with_gpu)
        res_rfft = f_rfft()
        res_rfft_comp = np.asarray(res_rfft[:, :, :, 0]) + 1j * np.asarray(
            res_rfft[:, :, :, 1]
        )
        utt.assert_allclose(rfft_ref, res_rfft_comp, atol=1e-4, rtol=1e-4)
        # Inverse FFT inputs
        inputs_val = np.random.random((1, N, N // 2 + 1, 2)).astype("float32")
        inputs = aesara.shared(inputs_val)
        inputs_ref = inputs_val[:, :, :, 0] + 1j * inputs_val[:, :, :, 1]
        # Unitary normalization inverse FFT
        irfft = aesara.gpuarray.fft.cuirfft(inputs, norm="ortho")
        f_irfft = aesara.function([], irfft, mode=mode_with_gpu)
        res_irfft = f_irfft()
        irfft_ref = np.fft.irfftn(inputs_ref, axes=(1, 2))
        utt.assert_allclose(irfft_ref * N, res_irfft, atol=1e-4, rtol=1e-4)
        # No normalization inverse FFT
        irfft = aesara.gpuarray.fft.cuirfft(inputs, norm="no_norm")
        f_irfft = aesara.function([], irfft, mode=mode_with_gpu)
        res_irfft = f_irfft()
        utt.assert_allclose(irfft_ref * N**2, res_irfft, atol=1e-4, rtol=1e-4)
    def test_grad(self):
        """Numerically verify gradients of the 2d transforms in several
        normalization modes."""
        # The numerical gradient of the FFT is sensitive, must set large
        # enough epsilon to get good accuracy.
        eps = 1e-1
        def f_rfft(inp):
            return aesara.gpuarray.fft.curfft(inp)
        inputs_val = np.random.random((1, N, N)).astype("float32")
        utt.verify_grad(f_rfft, [inputs_val], eps=eps, mode=mode_with_gpu)
        def f_irfft(inp):
            return aesara.gpuarray.fft.cuirfft(inp)
        inputs_val = np.random.random((1, N, N // 2 + 1, 2)).astype("float32")
        utt.verify_grad(f_irfft, [inputs_val], eps=eps, mode=mode_with_gpu)
        def f_rfft(inp):
            return aesara.gpuarray.fft.curfft(inp, norm="ortho")
        inputs_val = np.random.random((1, N, N)).astype("float32")
        utt.verify_grad(f_rfft, [inputs_val], eps=eps, mode=mode_with_gpu)
        def f_irfft(inp):
            return aesara.gpuarray.fft.cuirfft(inp, norm="no_norm")
        inputs_val = np.random.random((1, N, N // 2 + 1, 2)).astype("float32")
        utt.verify_grad(f_irfft, [inputs_val], eps=eps, mode=mode_with_gpu)
    def test_odd(self):
        """Repeat round-trip, normalization and gradient checks with an odd
        transform size (requires is_odd=True on the inverse)."""
        M = N - 1
        inputs_val = np.random.random((1, M, M)).astype("float32")
        inputs = aesara.shared(inputs_val)
        rfft = aesara.gpuarray.fft.curfft(inputs)
        f_rfft = aesara.function([], rfft, mode=mode_with_gpu)
        res_rfft = f_rfft()
        res_rfft_comp = np.asarray(res_rfft[:, :, :, 0]) + 1j * np.asarray(
            res_rfft[:, :, :, 1]
        )
        rfft_ref = np.fft.rfftn(inputs_val, s=(M, M), axes=(1, 2))
        utt.assert_allclose(rfft_ref, res_rfft_comp, atol=1e-4, rtol=1e-4)
        m = rfft.type()
        ifft = aesara.gpuarray.fft.cuirfft(m, is_odd=True)
        f_ifft = aesara.function([m], ifft, mode=mode_with_gpu)
        res_ifft = f_ifft(res_rfft)
        utt.assert_allclose(inputs_val, np.asarray(res_ifft))
        inputs_val = np.random.random((1, M, M // 2 + 1, 2)).astype("float32")
        inputs = aesara.shared(inputs_val)
        irfft = aesara.gpuarray.fft.cuirfft(inputs, norm="ortho", is_odd=True)
        f_irfft = aesara.function([], irfft, mode=mode_with_gpu)
        res_irfft = f_irfft()
        inputs_ref = inputs_val[:, :, :, 0] + 1j * inputs_val[:, :, :, 1]
        irfft_ref = np.fft.irfftn(inputs_ref, s=(M, M), axes=(1, 2)) * M
        utt.assert_allclose(irfft_ref, res_irfft, atol=1e-4, rtol=1e-4)
        # The numerical gradient of the FFT is sensitive, must set large
        # enough epsilon to get good accuracy.
        eps = 1e-1
        def f_rfft(inp):
            return aesara.gpuarray.fft.curfft(inp)
        inputs_val = np.random.random((1, M, M)).astype("float32")
        utt.verify_grad(f_rfft, [inputs_val], eps=eps, mode=mode_with_gpu)
        def f_irfft(inp):
            return aesara.gpuarray.fft.cuirfft(inp, is_odd=True)
        inputs_val = np.random.random((1, M, M // 2 + 1, 2)).astype("float32")
        utt.verify_grad(f_irfft, [inputs_val], eps=eps, mode=mode_with_gpu)
        def f_rfft(inp):
            return aesara.gpuarray.fft.curfft(inp, norm="ortho")
        inputs_val = np.random.random((1, M, M)).astype("float32")
        utt.verify_grad(f_rfft, [inputs_val], eps=eps, mode=mode_with_gpu)
        def f_irfft(inp):
            return aesara.gpuarray.fft.cuirfft(inp, norm="no_norm", is_odd=True)
        inputs_val = np.random.random((1, M, M // 2 + 1, 2)).astype("float32")
        utt.verify_grad(f_irfft, [inputs_val], eps=eps, mode=mode_with_gpu)
    def test_params(self):
        """Invalid norm/is_odd arguments must raise ValueError."""
        inputs_val = np.random.random((1, N)).astype("float32")
        inputs = aesara.shared(inputs_val)
        with pytest.raises(ValueError):
            aesara.gpuarray.fft.curfft(inputs, norm=123)
        inputs_val = np.random.random((1, N // 2 + 1, 2)).astype("float32")
        inputs = aesara.shared(inputs_val)
        with pytest.raises(ValueError):
            aesara.gpuarray.fft.cuirfft(inputs, norm=123)
        with pytest.raises(ValueError):
            aesara.gpuarray.fft.cuirfft(inputs, is_odd=123)
import numpy as np
import aesara
from aesara.configdefaults import config
from aesara.gpuarray.blas import GpuCorrMM, GpuCorrMM_gradInputs, GpuCorrMM_gradWeights
from aesara.gpuarray.type import gpuarray_shared_constructor
from aesara.tensor.nnet.corr import CorrMM, CorrMM_gradInputs, CorrMM_gradWeights
from tests import unittest_tools as utt
from tests.gpuarray.config import mode_with_gpu, mode_without_gpu, ref_cast
from tests.tensor.nnet.test_abstract_conv import (
TestAsymmetricPadding,
TestCausalConv,
TestGroupedConvNoOptim,
TestUnsharedConv,
)
class TestCorrMM:
    """Compare the GPU correlation ops (`GpuCorrMM` and its gradient ops)
    against their CPU reference implementations (`CorrMM` family)."""

    def run_conv_valid(
        self,
        inputs_shape,
        filters_shape,
        border_mode="valid",
        filter_dilation=(1, 1),
        subsample=(1, 1),
        unshared=False,
        verify_grad=False,
    ):
        """Run one forward correlation on CPU and GPU and compare outputs.

        Shapes are supplied channels-last and permuted here to the
        channels-first (batch, channels, rows, cols) layout the ops expect.
        With ``verify_grad=True`` the GPU op's gradient is also checked
        numerically.
        """
        inputs_shape = [inputs_shape[i] for i in (0, 3, 1, 2)]
        if unshared:
            # Unshared filters carry two extra output-position dimensions.
            filters_shape = [filters_shape[i] for i in (0, 1, 2, 5, 3, 4)]
        else:
            filters_shape = [filters_shape[i] for i in (0, 3, 1, 2)]
        inputs_val = np.random.random(inputs_shape).astype(config.floatX)
        filters_val = np.random.random(filters_shape).astype(config.floatX)
        inputs = gpuarray_shared_constructor(inputs_val)
        filters = gpuarray_shared_constructor(filters_val)
        # CPU reference graph.
        conv_ref = CorrMM(
            border_mode=border_mode,
            filter_dilation=filter_dilation,
            subsample=subsample,
            unshared=unshared,
        )(ref_cast(inputs), ref_cast(filters))
        f_ref = aesara.function([], conv_ref, mode=mode_without_gpu)
        # GPU graph with identical op parameters.
        conv = GpuCorrMM(
            border_mode=border_mode,
            filter_dilation=filter_dilation,
            subsample=subsample,
            unshared=unshared,
        )(inputs, filters)
        f = aesara.function([], conv, mode=mode_with_gpu)
        res_ref = f_ref()
        res = f()
        utt.assert_allclose(res_ref, res)
        if verify_grad:
            utt.verify_grad(
                GpuCorrMM(
                    border_mode=border_mode,
                    filter_dilation=filter_dilation,
                    subsample=subsample,
                    unshared=unshared,
                ),
                [inputs_val, filters_val],
                mode=mode_with_gpu,
            )

    def test_valid(self):
        """Forward correlation with several subsampling (stride) settings."""
        self.run_conv_valid(inputs_shape=(16, 20, 12, 1), filters_shape=(10, 6, 12, 1))
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 1), filters_shape=(10, 6, 12, 1), subsample=(2, 2)
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 1), filters_shape=(10, 6, 12, 1), subsample=(3, 3)
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 1), filters_shape=(10, 6, 12, 1), subsample=(3, 2)
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 1), filters_shape=(10, 6, 12, 1), subsample=(1, 2)
        )

    def test_border_mode(self):
        """Forward correlation with named and explicit padding modes."""
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 1),
            filters_shape=(10, 6, 12, 1),
            border_mode="valid",
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 1),
            filters_shape=(10, 6, 12, 1),
            border_mode="half",
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 1),
            filters_shape=(10, 6, 12, 1),
            border_mode="full",
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 1),
            filters_shape=(10, 6, 12, 1),
            border_mode=(0, 0),
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 1),
            filters_shape=(10, 6, 12, 1),
            border_mode=(1, 2),
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 1),
            filters_shape=(10, 6, 12, 1),
            border_mode=(3, 2),
        )

    def test_filter_dilation(self):
        """Forward correlation with dilated filters across border modes."""
        inputs_shape = [16, 20, 12, 1]
        filters_shape = [10, 6, 5, 1]
        for filter_dilation in [(2, 1), (1, 2)]:
            for border_mode in ["valid", "half", "full"]:
                self.run_conv_valid(
                    inputs_shape=inputs_shape,
                    filters_shape=filters_shape,
                    filter_dilation=filter_dilation,
                    border_mode=border_mode,
                )

    def test_verify_gradients(self):
        # use a small example to check the gradients
        inputs_shape = [2, 7, 9, 1]
        filters_shape = [1, 3, 3, 1]
        for filter_dilation in [(2, 1), (1, 2)]:
            for border_mode in ["valid", "half", "full", (2, 1)]:
                self.run_conv_valid(
                    inputs_shape=inputs_shape,
                    filters_shape=filters_shape,
                    filter_dilation=filter_dilation,
                    border_mode=border_mode,
                    verify_grad=True,
                )

    def test_unshared(self):
        """Forward correlation with unshared (per-position) filters."""
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 1),
            filters_shape=(10, 15, 1, 6, 12, 1),
            unshared=True,
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 1),
            filters_shape=(10, 8, 1, 6, 12, 1),
            subsample=(2, 2),
            unshared=True,
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 1),
            filters_shape=(10, 5, 1, 6, 12, 1),
            subsample=(3, 3),
            unshared=True,
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 1),
            filters_shape=(10, 5, 1, 6, 12, 1),
            subsample=(3, 2),
            unshared=True,
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 1),
            filters_shape=(10, 15, 1, 6, 12, 1),
            subsample=(1, 2),
            unshared=True,
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 1),
            filters_shape=(10, 15, 1, 6, 12, 1),
            border_mode="valid",
            unshared=True,
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 1),
            filters_shape=(10, 21, 13, 6, 12, 1),
            border_mode="half",
            unshared=True,
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 1),
            filters_shape=(10, 25, 23, 6, 12, 1),
            border_mode="full",
            unshared=True,
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 1),
            filters_shape=(10, 15, 1, 6, 12, 1),
            border_mode=(0, 0),
            unshared=True,
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 1),
            filters_shape=(10, 17, 5, 6, 12, 1),
            border_mode=(1, 2),
            unshared=True,
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 1),
            filters_shape=(10, 21, 5, 6, 12, 1),
            border_mode=(3, 2),
            unshared=True,
        )

    def run_gradweight(self, inputs_shape, filters_shape, dCdH_shape, subsample=(1, 1)):
        """Compare the GPU filter-gradient op against the CPU reference.

        When subsampling, the target filter shape must be passed
        explicitly because it cannot be inferred from the output gradient.
        """
        inputs_shape = [inputs_shape[i] for i in (0, 3, 1, 2)]
        filters_shape = [filters_shape[i] for i in (0, 3, 1, 2)]
        dCdH_shape = [dCdH_shape[i] for i in (0, 3, 1, 2)]
        inputs_val = np.random.random(inputs_shape).astype(config.floatX)
        dCdH_val = np.random.random(dCdH_shape).astype(config.floatX)
        inputs = gpuarray_shared_constructor(inputs_val)
        dCdH = gpuarray_shared_constructor(dCdH_val)
        shape = gpuarray_shared_constructor(np.array(filters_shape[2:]))
        if subsample == (1, 1):
            conv_ref = CorrMM_gradWeights(subsample=subsample)(
                ref_cast(inputs), ref_cast(dCdH)
            )
            conv_gemm = GpuCorrMM_gradWeights(subsample=subsample)(inputs, dCdH)
        else:
            conv_ref = CorrMM_gradWeights(subsample=subsample)(
                ref_cast(inputs), ref_cast(dCdH), shape=shape
            )
            conv_gemm = GpuCorrMM_gradWeights(subsample=subsample)(
                inputs, dCdH, shape=shape
            )
        f_ref = aesara.function([], conv_ref, mode=mode_without_gpu)
        f = aesara.function([], conv_gemm, mode=mode_with_gpu)
        res_ref = f_ref()
        res = f()
        utt.assert_allclose(res_ref, res)

    def test_gradweight(self):
        """Filter gradient for several subsampling settings."""
        self.run_gradweight(
            inputs_shape=(16, 10, 12, 1),
            filters_shape=(10, 6, 12, 1),
            dCdH_shape=(16, 5, 1, 10),
            subsample=(1, 1),
        )
        self.run_gradweight(
            inputs_shape=(16, 20, 10, 1),
            filters_shape=(10, 6, 4, 1),
            dCdH_shape=(16, 8, 4, 10),
            subsample=(2, 2),
        )
        self.run_gradweight(
            inputs_shape=(16, 20, 10, 1),
            filters_shape=(10, 6, 3, 1),
            dCdH_shape=(16, 5, 3, 10),
            subsample=(3, 3),
        )
        self.run_gradweight(
            inputs_shape=(16, 20, 12, 1),
            filters_shape=(10, 6, 12, 1),
            dCdH_shape=(16, 8, 1, 10),
            subsample=(2, 1),
        )

    def run_gradinput(self, inputs_shape, filters_shape, subsample=(1, 1)):
        """Compare the GPU input-gradient op against the CPU reference.

        ``inputs`` here plays the role of the output gradient (topgrad);
        the original input shape is reconstructed from it and the
        subsampling factors.
        """
        inputs_shape = [inputs_shape[i] for i in (0, 3, 1, 2)]
        filters_shape = [filters_shape[i] for i in (0, 3, 1, 2)]
        inputs_val = np.random.random(inputs_shape).astype(config.floatX)
        filters_val = np.random.random(filters_shape).astype(config.floatX)
        inputs = gpuarray_shared_constructor(inputs_val)
        filters = gpuarray_shared_constructor(filters_val)
        # Spatial size of the (reconstructed) bottom input.
        bottom_height = (inputs_shape[2] - 1) * subsample[0] + filters_shape[2]
        bottom_width = (inputs_shape[3] - 1) * subsample[1] + filters_shape[3]
        bottom_shape = gpuarray_shared_constructor(
            np.array([bottom_height, bottom_width])
        )
        if subsample == (1, 1):
            conv_ref = CorrMM_gradInputs(subsample=subsample)(
                kern=ref_cast(filters), topgrad=ref_cast(inputs)
            )
            conv_gemm = GpuCorrMM_gradInputs(subsample=subsample)(
                kern=filters, topgrad=inputs
            )
        else:
            conv_ref = CorrMM_gradInputs(subsample=subsample)(
                kern=ref_cast(filters), topgrad=ref_cast(inputs), shape=bottom_shape
            )
            conv_gemm = GpuCorrMM_gradInputs(subsample=subsample)(
                kern=filters, topgrad=inputs, shape=bottom_shape
            )
        f_ref = aesara.function([], conv_ref, mode=mode_without_gpu)
        f = aesara.function([], conv_gemm, mode=mode_with_gpu)
        res_ref = f_ref()
        res = f()
        utt.assert_allclose(res_ref, res)

    def test_gradinput(self):
        """Input gradient for several subsampling settings."""
        self.run_gradinput(inputs_shape=(16, 15, 12, 10), filters_shape=(10, 6, 12, 1))
        self.run_gradinput(
            inputs_shape=(16, 15, 12, 10),
            filters_shape=(10, 6, 12, 1),
            subsample=(2, 2),
        )
        self.run_gradinput(
            inputs_shape=(16, 15, 12, 10),
            filters_shape=(10, 6, 12, 1),
            subsample=(3, 3),
        )
        self.run_gradinput(
            inputs_shape=(16, 15, 12, 10),
            filters_shape=(10, 6, 12, 1),
            subsample=(3, 1),
        )

    def test_large_input(self):
        # This tests the number-of-threads computation
        # by making (channels * height) > (max_threads_dim ** 2).
        # (See also issue #5165.)
        self.run_conv_valid(
            inputs_shape=(1, 1024, 3, 1024),
            filters_shape=(1, 1, 1, 1024),
            verify_grad=False,
        )
        self.run_gradinput(inputs_shape=(1, 1024, 3, 1), filters_shape=(1, 1, 1, 1024))
class TestGroupGpuCorr2d(TestGroupedConvNoOptim):
    """Grouped-convolution tests re-run with the GPU corrMM ops (cuDNN excluded)."""

    mode = mode_with_gpu.excluding("cudnn")
    conv_op = GpuCorrMM
    conv_gradw_op = GpuCorrMM_gradWeights
    conv_gradi_op = GpuCorrMM_gradInputs
    flip_filter = True
    is_dnn = False
class TestUnsharedGpuCorr2d(TestUnsharedConv):
    """Unshared-convolution tests re-run with the GPU corrMM ops."""

    mode = mode_with_gpu
    conv2d_op = GpuCorrMM
    conv2d_gradw_op = GpuCorrMM_gradWeights
    conv2d_gradi_op = GpuCorrMM_gradInputs
class TestAsymmetricGpu(TestAsymmetricPadding):
    """Asymmetric-padding tests re-run with the GPU corrMM ops."""

    mode = mode_with_gpu
    conv2d_op = GpuCorrMM
    conv2d_gradw_op = GpuCorrMM_gradWeights
    conv2d_gradi_op = GpuCorrMM_gradInputs
class TestCausalGpuCorr(TestCausalConv):
    """Causal-convolution tests re-run in the GPU mode."""

    mode = mode_with_gpu
import numpy as np
import aesara
from aesara.configdefaults import config
from aesara.gpuarray.blas import (
GpuCorr3dMM,
GpuCorr3dMM_gradInputs,
GpuCorr3dMM_gradWeights,
)
from aesara.gpuarray.type import gpuarray_shared_constructor
from aesara.tensor.nnet.corr3d import Corr3dMM, Corr3dMMGradInputs, Corr3dMMGradWeights
from tests import unittest_tools as utt
from tests.gpuarray.config import mode_with_gpu, mode_without_gpu, ref_cast
from tests.tensor.nnet.test_abstract_conv import TestGroupedConv3dNoOptim
class TestCorr3dMM:
    """Compare the GPU 3D correlation ops (`GpuCorr3dMM` and its gradient
    ops) against their CPU references (`Corr3dMM` family)."""

    def run_conv_valid(
        self,
        inputs_shape,
        filters_shape,
        border_mode="valid",
        filter_dilation=(1, 1, 1),
        subsample=(1, 1, 1),
        verify_grad=False,
    ):
        """Run one forward 3D correlation on CPU and GPU and compare.

        Shapes are supplied channels-last and permuted here to the
        channels-first (batch, channels, d0, d1, d2) layout the ops expect.
        """
        inputs_shape = [inputs_shape[i] for i in (0, 4, 1, 2, 3)]
        filters_shape = [filters_shape[i] for i in (0, 4, 1, 2, 3)]
        inputs_val = np.random.random(inputs_shape).astype(config.floatX)
        filters_val = np.random.random(filters_shape).astype(config.floatX)
        inputs = gpuarray_shared_constructor(inputs_val)
        filters = gpuarray_shared_constructor(filters_val)
        # CPU reference graph.
        conv_ref = Corr3dMM(
            border_mode=border_mode,
            filter_dilation=filter_dilation,
            subsample=subsample,
        )(ref_cast(inputs), ref_cast(filters))
        f_ref = aesara.function([], conv_ref, mode=mode_without_gpu)
        # GPU graph with identical op parameters.
        conv = GpuCorr3dMM(
            border_mode=border_mode,
            filter_dilation=filter_dilation,
            subsample=subsample,
        )(inputs, filters)
        f = aesara.function([], conv, mode=mode_with_gpu)
        res_ref = f_ref()
        res = f()
        utt.assert_allclose(res_ref, res)
        if verify_grad:
            utt.verify_grad(
                GpuCorr3dMM(
                    border_mode=border_mode,
                    filter_dilation=filter_dilation,
                    subsample=subsample,
                ),
                [inputs_val, filters_val],
                mode=mode_with_gpu,
            )

    def test_valid(self):
        """Forward 3D correlation with several subsampling settings."""
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 16, 1), filters_shape=(10, 6, 12, 4, 1)
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 15, 1),
            filters_shape=(10, 6, 12, 4, 1),
            subsample=(2, 2, 2),
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 15, 1),
            filters_shape=(10, 6, 12, 4, 1),
            subsample=(2, 2, 2),
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 15, 1),
            filters_shape=(10, 6, 12, 4, 1),
            subsample=(3, 3, 3),
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 15, 1),
            filters_shape=(10, 6, 12, 4, 1),
            subsample=(3, 3, 3),
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 15, 1),
            filters_shape=(10, 6, 12, 4, 1),
            subsample=(3, 2, 1),
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 15, 1),
            filters_shape=(10, 6, 12, 4, 1),
            subsample=(1, 2, 3),
        )

    def test_border_mode(self):
        """Forward 3D correlation with named and explicit padding modes."""
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 15, 1),
            filters_shape=(10, 6, 12, 4, 1),
            border_mode="valid",
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 15, 1),
            filters_shape=(10, 6, 12, 4, 1),
            border_mode="half",
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 15, 1),
            filters_shape=(10, 6, 12, 4, 1),
            border_mode="full",
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 15, 1),
            filters_shape=(10, 6, 12, 4, 1),
            border_mode=(0, 0, 0),
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 15, 1),
            filters_shape=(10, 6, 12, 4, 1),
            border_mode=(1, 2, 3),
        )
        self.run_conv_valid(
            inputs_shape=(16, 20, 12, 15, 1),
            filters_shape=(10, 6, 12, 4, 1),
            border_mode=(3, 2, 1),
        )

    def test_filter_dilation(self):
        """Forward 3D correlation with dilated filters across border modes."""
        inputs_shape = [16, 20, 12, 15, 1]
        filters_shape = [10, 6, 5, 4, 1]
        for filter_dilation in [(2, 1, 1), (1, 2, 1), (1, 1, 2)]:
            for border_mode in ["valid", "half", "full"]:
                self.run_conv_valid(
                    inputs_shape=inputs_shape,
                    filters_shape=filters_shape,
                    filter_dilation=filter_dilation,
                    border_mode=border_mode,
                )

    def test_verify_gradients(self):
        # use a small example to check the gradients
        inputs_shape = [2, 7, 9, 6, 1]
        filters_shape = [1, 3, 3, 2, 1]
        for filter_dilation in [(2, 1, 1), (1, 2, 1), (1, 1, 2)]:
            for border_mode in ["valid", "half", "full", (2, 1, 3)]:
                self.run_conv_valid(
                    inputs_shape=inputs_shape,
                    filters_shape=filters_shape,
                    filter_dilation=filter_dilation,
                    border_mode=border_mode,
                    verify_grad=True,
                )

    def run_gradweight(
        self, inputs_shape, filters_shape, dCdH_shape, subsample=(1, 1, 1)
    ):
        """Compare the GPU 3D filter-gradient op against the CPU reference.

        When subsampling, the target filter shape is passed explicitly
        because it cannot be inferred from the output gradient.
        """
        inputs_shape = [inputs_shape[i] for i in (0, 4, 1, 2, 3)]
        filters_shape = [filters_shape[i] for i in (0, 4, 1, 2, 3)]
        dCdH_shape = [dCdH_shape[i] for i in (0, 4, 1, 2, 3)]
        inputs_val = np.random.random(inputs_shape).astype(config.floatX)
        dCdH_val = np.random.random(dCdH_shape).astype(config.floatX)
        inputs = gpuarray_shared_constructor(inputs_val)
        dCdH = gpuarray_shared_constructor(dCdH_val)
        shape = gpuarray_shared_constructor(np.array(filters_shape[2:]))
        if subsample == (1, 1, 1):
            conv_ref = Corr3dMMGradWeights(subsample=subsample)(
                ref_cast(inputs), ref_cast(dCdH)
            )
            conv_gemm = GpuCorr3dMM_gradWeights(subsample=subsample)(inputs, dCdH)
        else:
            conv_ref = Corr3dMMGradWeights(subsample=subsample)(
                ref_cast(inputs), ref_cast(dCdH), shape=shape
            )
            conv_gemm = GpuCorr3dMM_gradWeights(subsample=subsample)(
                inputs, dCdH, shape=shape
            )
        f_ref = aesara.function([], conv_ref, mode=mode_without_gpu)
        f = aesara.function([], conv_gemm, mode=mode_with_gpu)
        res_ref = f_ref()
        res = f()
        utt.assert_allclose(res_ref, res)

    def test_gradweight(self):
        """3D filter gradient for several subsampling settings."""
        self.run_gradweight(
            inputs_shape=(16, 10, 12, 16, 1),
            filters_shape=(10, 6, 12, 4, 1),
            dCdH_shape=(16, 5, 1, 13, 10),
            subsample=(1, 1, 1),
        )
        self.run_gradweight(
            inputs_shape=(16, 20, 10, 16, 1),
            filters_shape=(10, 6, 4, 4, 1),
            dCdH_shape=(16, 8, 4, 7, 10),
            subsample=(2, 2, 2),
        )
        self.run_gradweight(
            inputs_shape=(16, 20, 10, 16, 1),
            filters_shape=(10, 6, 3, 4, 1),
            dCdH_shape=(16, 5, 3, 5, 10),
            subsample=(3, 3, 3),
        )
        self.run_gradweight(
            inputs_shape=(16, 20, 12, 16, 1),
            filters_shape=(10, 6, 12, 4, 1),
            dCdH_shape=(16, 8, 1, 5, 10),
            subsample=(2, 1, 3),
        )

    def run_gradinput(self, inputs_shape, filters_shape, subsample=(1, 1, 1)):
        """Compare the GPU 3D input-gradient op against the CPU reference.

        ``inputs`` plays the role of the output gradient (topgrad); the
        original input shape is reconstructed from it and the subsampling.
        """
        inputs_shape = [inputs_shape[i] for i in (0, 4, 1, 2, 3)]
        filters_shape = [filters_shape[i] for i in (0, 4, 1, 2, 3)]
        inputs_val = np.random.random(inputs_shape).astype(config.floatX)
        filters_val = np.random.random(filters_shape).astype(config.floatX)
        inputs = gpuarray_shared_constructor(inputs_val)
        filters = gpuarray_shared_constructor(filters_val)
        # Spatial size of the (reconstructed) bottom input.
        bottom_height = (inputs_shape[2] - 1) * subsample[0] + filters_shape[2]
        bottom_width = (inputs_shape[3] - 1) * subsample[1] + filters_shape[3]
        bottom_depth = (inputs_shape[4] - 1) * subsample[2] + filters_shape[4]
        bottom_shape = gpuarray_shared_constructor(
            np.array([bottom_height, bottom_width, bottom_depth])
        )
        if subsample == (1, 1, 1):
            conv_ref = Corr3dMMGradInputs(subsample=subsample)(
                kern=ref_cast(filters), topgrad=ref_cast(inputs)
            )
            conv_gemm = GpuCorr3dMM_gradInputs(subsample=subsample)(
                kern=filters, topgrad=inputs
            )
        else:
            conv_ref = Corr3dMMGradInputs(subsample=subsample)(
                kern=ref_cast(filters), topgrad=ref_cast(inputs), shape=bottom_shape
            )
            conv_gemm = GpuCorr3dMM_gradInputs(subsample=subsample)(
                kern=filters, topgrad=inputs, shape=bottom_shape
            )
        f_ref = aesara.function([], conv_ref, mode=mode_without_gpu)
        f = aesara.function([], conv_gemm, mode=mode_with_gpu)
        res_ref = f_ref()
        res = f()
        utt.assert_allclose(res_ref, res)

    def test_gradinput(self):
        """3D input gradient for several subsampling settings."""
        self.run_gradinput(
            inputs_shape=(16, 15, 12, 12, 10), filters_shape=(10, 6, 12, 4, 1)
        )
        self.run_gradinput(
            inputs_shape=(16, 15, 12, 12, 10),
            filters_shape=(10, 6, 12, 4, 1),
            subsample=(2, 2, 2),
        )
        self.run_gradinput(
            inputs_shape=(16, 15, 12, 12, 10),
            filters_shape=(10, 6, 12, 4, 1),
            subsample=(3, 3, 3),
        )
        self.run_gradinput(
            inputs_shape=(16, 15, 12, 12, 10),
            filters_shape=(10, 6, 12, 4, 1),
            subsample=(3, 1, 2),
        )

    def test_large_input(self):
        # This tests the number-of-threads computation
        # by making (channels * height) > (max_threads_dim ** 2).
        # (See also issue #5165.)
        self.run_conv_valid(
            inputs_shape=(1, 1024, 3, 3, 1024),
            filters_shape=(1, 1, 1, 1, 1024),
            verify_grad=False,
        )
        self.run_gradinput(
            inputs_shape=(1, 1024, 3, 3, 1), filters_shape=(1, 1, 1, 1, 1024)
        )
class TestGroupGpuCorr3d(TestGroupedConv3dNoOptim):
    """Grouped 3D-convolution tests re-run with the GPU corrMM ops (cuDNN excluded)."""

    mode = mode_with_gpu.excluding("cudnn")
    conv_op = GpuCorr3dMM
    conv_gradw_op = GpuCorr3dMM_gradWeights
    conv_gradi_op = GpuCorr3dMM_gradInputs
import numpy as np
import pytest
from numpy.linalg.linalg import LinAlgError
import aesara
from aesara.configdefaults import config
from aesara.gpuarray import gpuarray_shared_constructor
from aesara.gpuarray.linalg import (
GpuCholesky,
GpuCublasTriangularSolve,
GpuCusolverSolve,
GpuMagmaCholesky,
GpuMagmaEigh,
GpuMagmaMatrixInverse,
GpuMagmaQR,
GpuMagmaSVD,
cusolver_available,
gpu_cholesky,
gpu_matrix_inverse,
gpu_qr,
gpu_solve,
gpu_solve_lower_triangular,
gpu_svd,
)
from aesara.tensor.nlinalg import SVD, MatrixInverse, QRFull, eigh, matrix_inverse, qr
from aesara.tensor.slinalg import Cholesky, cholesky
from aesara.tensor.type import fmatrix, matrix, tensor3, vector
from tests import unittest_tools as utt
from tests.gpuarray.config import mode_with_gpu, mode_without_gpu
from tests.gpuarray.test_basic_ops import random
@pytest.mark.skipif(
    not cusolver_available,
    reason="Optional package scikits.cuda.cusolver not available",
)
class TestCusolver:
    """Tests for the cusolver-backed GPU linear solver (`gpu_solve`)."""

    def run_gpu_solve(self, A_val, x_val, A_struct=None):
        """Solve ``A x = b`` and ``A.T x = b`` on the GPU; recover ``x_val``.

        The right-hand sides are built from the known solution so the
        solver output can be compared against it directly.
        """
        b_val = np.dot(A_val, x_val)
        b_val_trans = np.dot(A_val.T, x_val)
        A = matrix("A", dtype="float32")
        b = matrix("b", dtype="float32")
        b_trans = matrix("b", dtype="float32")
        if A_struct is None:
            solver = gpu_solve(A, b)
            solver_trans = gpu_solve(A, b_trans, trans="T")
        else:
            solver = gpu_solve(A, b, A_struct)
            solver_trans = gpu_solve(A, b_trans, A_struct, trans="T")
        fn = aesara.function(
            [A, b, b_trans], [solver, solver_trans], mode=mode_with_gpu
        )
        res = fn(A_val, b_val, b_val_trans)
        x_res = np.array(res[0])
        x_res_trans = np.array(res[1])
        utt.assert_allclose(x_val, x_res)
        utt.assert_allclose(x_val, x_res_trans)

    def test_diag_solve(self):
        """Solve with a diagonal system matrix."""
        np.random.seed(1)
        A_val = np.asarray([[2, 0, 0], [0, 1, 0], [0, 0, 1]], dtype="float32")
        x_val = np.random.uniform(-0.4, 0.4, (A_val.shape[1], 1)).astype("float32")
        self.run_gpu_solve(A_val, x_val)

    def test_bshape_solve(self):
        # Test when shape of b (k, m) is such as m > k
        np.random.seed(1)
        A_val = np.asarray([[2, 0, 0], [0, 1, 0], [0, 0, 1]], dtype="float32")
        x_val = np.random.uniform(
            -0.4, 0.4, (A_val.shape[1], A_val.shape[1] + 1)
        ).astype("float32")
        self.run_gpu_solve(A_val, x_val)

    def test_sym_solve(self):
        """Solve with a symmetric system matrix (A_struct='symmetric')."""
        np.random.seed(1)
        A_val = np.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
        A_sym = np.dot(A_val, A_val.T)
        x_val = np.random.uniform(-0.4, 0.4, (A_val.shape[1], 1)).astype("float32")
        self.run_gpu_solve(A_sym, x_val, "symmetric")

    def test_orth_solve(self):
        """Solve with an orthogonal system matrix (left singular vectors)."""
        np.random.seed(1)
        A_val = np.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
        A_orth = np.linalg.svd(A_val)[0]
        x_val = np.random.uniform(-0.4, 0.4, (A_orth.shape[1], 1)).astype("float32")
        self.run_gpu_solve(A_orth, x_val)

    def test_uni_rand_solve(self):
        """Solve with a uniformly random matrix and multiple right-hand sides."""
        np.random.seed(1)
        A_val = np.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
        x_val = np.random.uniform(-0.4, 0.4, (A_val.shape[1], 4)).astype("float32")
        self.run_gpu_solve(A_val, x_val)

    def test_linalgerrsym_solve(self):
        """A singular symmetric matrix must raise ``LinAlgError``."""
        np.random.seed(1)
        A_val = np.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
        x_val = np.random.uniform(-0.4, 0.4, (A_val.shape[1], 4)).astype("float32")
        A_val = np.dot(A_val.T, A_val)
        # make A singular
        A_val[:, 2] = A_val[:, 1] + A_val[:, 3]
        A = matrix("A", dtype="float32")
        b = matrix("b", dtype="float32")
        solver = gpu_solve(A, b, "symmetric")
        fn = aesara.function([A, b], [solver], mode=mode_with_gpu)
        with pytest.raises(LinAlgError):
            fn(A_val, x_val)

    def test_linalgerr_solve(self):
        """A singular general matrix must raise ``LinAlgError``."""
        np.random.seed(1)
        A_val = np.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
        x_val = np.random.uniform(-0.4, 0.4, (A_val.shape[1], 4)).astype("float32")
        # make A singular
        A_val[:, 2] = 0
        A = matrix("A", dtype="float32")
        b = matrix("b", dtype="float32")
        solver = gpu_solve(A, b, trans="T")
        fn = aesara.function([A, b], [solver], mode=mode_with_gpu)
        with pytest.raises(LinAlgError):
            fn(A_val, x_val)

    def verify_solve_grad(self, m, n, A_structure, lower, rng):
        """Numerically verify the gradient of the appropriate solve op.

        ``n is None`` means ``b`` is a vector; otherwise ``b`` is (m, n).
        """
        # ensure diagonal elements of A relatively large to avoid numerical
        # precision issues
        A_val = (rng.normal(size=(m, m)) * 0.5 + np.eye(m)).astype(config.floatX)
        if A_structure == "lower_triangular":
            A_val = np.tril(A_val)
        elif A_structure == "upper_triangular":
            A_val = np.triu(A_val)
        if n is None:
            b_val = rng.normal(size=m).astype(config.floatX)
        else:
            b_val = rng.normal(size=(m, n)).astype(config.floatX)
        eps = None
        if config.floatX == "float64":
            eps = 2e-8
        if A_structure in ("lower_triangular", "upper_triangular"):
            solve_op = GpuCublasTriangularSolve(lower=lower)
        else:
            solve_op = GpuCusolverSolve(A_structure="general")
        utt.verify_grad(solve_op, [A_val, b_val], 3, rng, eps=eps)

    def test_solve_grad(self):
        """Gradient checks for general and triangular solve structures."""
        rng = np.random.default_rng(utt.fetch_seed())
        structures = ["general", "lower_triangular", "upper_triangular"]
        for A_structure in structures:
            lower = A_structure == "lower_triangular"
            # self.verify_solve_grad(5, None, A_structure, lower, rng)
            self.verify_solve_grad(6, 1, A_structure, lower, rng)
            self.verify_solve_grad(4, 3, A_structure, lower, rng)
        # lower should have no effect for A_structure == 'general' so also
        # check lower=True case
        self.verify_solve_grad(4, 3, "general", lower=True, rng=rng)
@pytest.mark.skipif(
    not cusolver_available,
    reason="Optional package scikits.cuda.cusolver not available",
)
class TestGpuCholesky:
    """Tests for the cusolver-backed `GpuCholesky` op on float32 input."""

    def get_gpu_cholesky_func(self, lower=True, inplace=False):
        # Helper function to compile function from GPU Cholesky op.
        A = matrix("A", dtype="float32")
        cholesky_op = GpuCholesky(lower=lower, inplace=inplace)
        chol_A = cholesky_op(A)
        return aesara.function([A], chol_A, accept_inplace=inplace, mode=mode_with_gpu)

    def compare_gpu_cholesky_to_np(self, A_val, lower=True, inplace=False):
        # Helper function to compare op output to np.cholesky output.
        chol_A_val = np.linalg.cholesky(A_val)
        if not lower:
            # NumPy always returns the lower factor; transpose for upper.
            chol_A_val = chol_A_val.T
        fn = self.get_gpu_cholesky_func(lower, inplace)
        res = fn(A_val)
        chol_A_res = np.array(res)
        utt.assert_allclose(chol_A_res, chol_A_val)

    def test_gpu_cholesky_opt(self):
        """`cholesky` on float32 should be lifted to `GpuCholesky`."""
        A = matrix("A", dtype="float32")
        fn = aesara.function([A], cholesky(A), mode=mode_with_gpu)
        assert any(
            [isinstance(node.op, GpuCholesky) for node in fn.maker.fgraph.toposort()]
        )

    def test_invalid_input_fail_non_square(self):
        # Invalid Cholesky input test with non-square matrix as input.
        A_val = np.random.normal(size=(3, 2)).astype("float32")
        fn = self.get_gpu_cholesky_func(True, False)
        with pytest.raises(ValueError):
            fn(A_val)

    def test_invalid_input_fail_vector(self):
        # Invalid Cholesky input test with vector as input.
        def invalid_input_func():
            A = vector("A", dtype="float32")
            GpuCholesky(lower=True, inplace=False)(A)

        with pytest.raises(AssertionError):
            invalid_input_func()

    def test_invalid_input_fail_tensor3(self):
        # Invalid Cholesky input test with 3D tensor as input.
        def invalid_input_func():
            A = tensor3("A", dtype="float32")
            GpuCholesky(lower=True, inplace=False)(A)

        with pytest.raises(AssertionError):
            invalid_input_func()

    @utt.assertFailure_fast
    def test_diag_chol(self):
        # Diagonal matrix input Cholesky test.
        for lower in [True, False]:
            for inplace in [True, False]:
                # make sure all diagonal elements are positive so positive-definite
                A_val = np.diag(np.random.uniform(size=5).astype("float32") + 1)
                self.compare_gpu_cholesky_to_np(A_val, lower=lower, inplace=inplace)

    @utt.assertFailure_fast
    def test_dense_chol_lower(self):
        # Dense matrix input lower-triangular Cholesky test.
        for lower in [True, False]:
            for inplace in [True, False]:
                M_val = np.random.normal(size=(3, 3)).astype("float32")
                # A = M.dot(M) will be positive definite for all non-singular M
                A_val = M_val.dot(M_val.T)
                self.compare_gpu_cholesky_to_np(A_val, lower=lower, inplace=inplace)

    def test_invalid_input_fail_non_symmetric(self):
        # Invalid Cholesky input test with non-symmetric input.
        # (Non-symmetric real input must also be non-positive definite).
        A_val = None
        while True:
            A_val = np.random.normal(size=(3, 3)).astype("float32")
            if not np.allclose(A_val, A_val.T):
                break
        fn = self.get_gpu_cholesky_func(True, False)
        with pytest.raises(LinAlgError):
            fn(A_val)

    def test_invalid_input_fail_negative_definite(self):
        # Invalid Cholesky input test with negative-definite input.
        M_val = np.random.normal(size=(3, 3)).astype("float32")
        # A = -M.dot(M) will be negative definite for all non-singular M
        A_val = -M_val.dot(M_val.T)
        fn = self.get_gpu_cholesky_func(True, False)
        with pytest.raises(LinAlgError):
            fn(A_val)
@pytest.mark.skipif(
    not cusolver_available,
    reason="Optional package scikits.cuda.cusolver not available",
)
class TestGpuCholesky64:
    """Same coverage as `TestGpuCholesky` but with float64 input."""

    def get_gpu_cholesky_func(self, lower=True, inplace=False):
        # Helper function to compile function from GPU Cholesky op.
        A = matrix("A", dtype="float64")
        cholesky_op = GpuCholesky(lower=lower, inplace=inplace)
        chol_A = cholesky_op(A)
        return aesara.function([A], chol_A, accept_inplace=inplace, mode=mode_with_gpu)

    def compare_gpu_cholesky_to_np(self, A_val, lower=True, inplace=False):
        # Helper function to compare op output to np.cholesky output.
        chol_A_val = np.linalg.cholesky(A_val)
        if not lower:
            # NumPy always returns the lower factor; transpose for upper.
            chol_A_val = chol_A_val.T
        fn = self.get_gpu_cholesky_func(lower, inplace)
        res = fn(A_val)
        chol_A_res = np.array(res)
        utt.assert_allclose(chol_A_res, chol_A_val)

    def test_gpu_cholesky_opt(self):
        """`cholesky` on float64 should be lifted to `GpuCholesky`."""
        A = matrix("A", dtype="float64")
        fn = aesara.function([A], cholesky(A), mode=mode_with_gpu)
        assert any(
            [isinstance(node.op, GpuCholesky) for node in fn.maker.fgraph.toposort()]
        )

    def test_invalid_input_fail_non_square(self):
        # Invalid Cholesky input test with non-square matrix as input.
        A_val = np.random.normal(size=(3, 2)).astype("float64")
        fn = self.get_gpu_cholesky_func(True, False)
        with pytest.raises(ValueError):
            fn(A_val)

    def test_invalid_input_fail_vector(self):
        # Invalid Cholesky input test with vector as input.
        def invalid_input_func():
            A = vector("A", dtype="float64")
            GpuCholesky(lower=True, inplace=False)(A)

        with pytest.raises(AssertionError):
            invalid_input_func()

    def test_invalid_input_fail_tensor3(self):
        # Invalid Cholesky input test with 3D tensor as input.
        def invalid_input_func():
            A = tensor3("A", dtype="float64")
            GpuCholesky(lower=True, inplace=False)(A)

        with pytest.raises(AssertionError):
            invalid_input_func()

    @utt.assertFailure_fast
    def test_diag_chol(self):
        # Diagonal matrix input Cholesky test.
        for lower in [True, False]:
            for inplace in [True, False]:
                # make sure all diagonal elements are positive so positive-definite
                A_val = np.diag(np.random.uniform(size=5).astype("float64") + 1)
                self.compare_gpu_cholesky_to_np(A_val, lower=lower, inplace=inplace)

    @utt.assertFailure_fast
    def test_dense_chol_lower(self):
        # Dense matrix input lower-triangular Cholesky test.
        for lower in [True, False]:
            for inplace in [True, False]:
                M_val = np.random.normal(size=(3, 3)).astype("float64")
                # A = M.dot(M) will be positive definite for all non-singular M
                A_val = M_val.dot(M_val.T)
                self.compare_gpu_cholesky_to_np(A_val, lower=lower, inplace=inplace)

    def test_invalid_input_fail_non_symmetric(self):
        # Invalid Cholesky input test with non-symmetric input.
        # (Non-symmetric real input must also be non-positive definite).
        A_val = None
        while True:
            A_val = np.random.normal(size=(3, 3)).astype("float64")
            if not np.allclose(A_val, A_val.T):
                break
        fn = self.get_gpu_cholesky_func(True, False)
        with pytest.raises(LinAlgError):
            fn(A_val)

    def test_invalid_input_fail_negative_definite(self):
        # Invalid Cholesky input test with negative-definite input.
        M_val = np.random.normal(size=(3, 3)).astype("float64")
        # A = -M.dot(M) will be negative definite for all non-singular M
        A_val = -M_val.dot(M_val.T)
        fn = self.get_gpu_cholesky_func(True, False)
        with pytest.raises(LinAlgError):
            fn(A_val)
@pytest.mark.skipif(
not config.magma__enabled, reason="Magma is not enabled, skipping test"
)
class TestMagma:
def test_magma_opt_float16(self):
    """float16 linear-algebra graphs should be rewritten to magma GPU ops."""
    ops_to_gpu = [
        (MatrixInverse(), GpuMagmaMatrixInverse),
        (SVD(), GpuMagmaSVD),
        (QRFull(mode="reduced"), GpuMagmaQR),
        # TODO: add support for float16 to Eigh numpy
        # (Eigh(), GpuMagmaEigh),
        (Cholesky(), GpuMagmaCholesky),
    ]
    for op, gpu_op in ops_to_gpu:
        A = matrix("A", dtype="float16")
        # cusolver is excluded so the magma variant is the only candidate.
        fn = aesara.function([A], op(A), mode=mode_with_gpu.excluding("cusolver"))
        assert any(
            [isinstance(node.op, gpu_op) for node in fn.maker.fgraph.toposort()]
        )
def test_gpu_matrix_inverse(self):
    """Invert a random 1000x1000 matrix on the GPU; check A^-1 @ A ~ I."""
    A = fmatrix("A")
    invert = aesara.function([A], gpu_matrix_inverse(A), mode=mode_with_gpu)
    N = 1000
    test_rng = np.random.default_rng(seed=1)
    # Copied from tests.tensor.utils.random.
    A_val = test_rng.random((N, N)).astype("float32") * 2 - 1
    utt.assert_allclose(np.eye(N), np.dot(invert(A_val), A_val), atol=1e-2)
@utt.assertFailure_fast
def test_gpu_matrix_inverse_inplace(self):
    """With an update on the shared input, the optimizer should pick the
    in-place magma inverse, and the shared variable ends up holding the
    inverse of its original value."""
    N = 1000
    test_rng = np.random.default_rng(seed=1)
    A_val_gpu = gpuarray_shared_constructor(
        test_rng.random((N, N)).astype("float32") * 2 - 1
    )
    # Keep a host-side copy of the original value for the final check.
    A_val_copy = A_val_gpu.get_value()
    A_val_gpu_inv = GpuMagmaMatrixInverse()(A_val_gpu)
    fn = aesara.function(
        [], A_val_gpu_inv, mode=mode_with_gpu, updates=[(A_val_gpu, A_val_gpu_inv)]
    )
    # The update lets the optimizer destroy the input, so an in-place
    # inverse node must appear in the compiled graph.
    assert any(
        [
            node.op.inplace
            for node in fn.maker.fgraph.toposort()
            if isinstance(node.op, GpuMagmaMatrixInverse)
        ]
    )
    fn()
    utt.assert_allclose(
        np.eye(N), np.dot(A_val_gpu.get_value(), A_val_copy), atol=5e-3
    )
@utt.assertFailure_fast
def test_gpu_matrix_inverse_inplace_opt(self):
    """The optimizer should select the in-place magma matrix inverse."""
    A = fmatrix("A")
    fn = aesara.function([A], matrix_inverse(A), mode=mode_with_gpu)
    inverse_ops = [
        node.op
        for node in fn.maker.fgraph.toposort()
        if isinstance(node.op, GpuMagmaMatrixInverse)
    ]
    assert any(op.inplace for op in inverse_ops)
def run_gpu_svd(self, A_val, full_matrices=True, compute_uv=True):
    """Compile `gpu_svd` with the given options and run it on `A_val`."""
    A = fmatrix("A")
    svd_graph = gpu_svd(A, full_matrices=full_matrices, compute_uv=compute_uv)
    f = aesara.function([A], svd_graph, mode=mode_with_gpu)
    return f(A_val)
def assert_column_orthonormal(self, Ot):
    """Check that the columns of `Ot` are orthonormal (Ot.T @ Ot == I)."""
    gram = np.dot(Ot.T, Ot)
    utt.assert_allclose(gram, np.eye(Ot.shape[1]))
def check_svd(self, A, U, S, VT, rtol=None, atol=None):
    """Verify that ``U @ diag(S) @ VT`` reconstructs ``A``."""
    S_full = np.zeros_like(A)
    np.fill_diagonal(S_full, S)
    reconstructed = np.dot(np.dot(U, S_full), VT)
    utt.assert_allclose(reconstructed, A, rtol=rtol, atol=atol)
def test_gpu_svd_wide(self):
    """SVD of a 100x50 matrix, full and reduced factorizations."""
    A = random(100, 50).astype("float32")
    M, N = A.shape
    U, S, VT = self.run_gpu_svd(A)
    self.assert_column_orthonormal(U)
    self.assert_column_orthonormal(VT.T)
    self.check_svd(A, U, S, VT)
    U, S, VT = self.run_gpu_svd(A, full_matrices=False)
    # BUG FIX: the original `assert U.shape[1], min(M, N)` used the comma
    # form, which makes `min(M, N)` the assertion *message* and leaves the
    # (always-truthy) shape as the condition, so the check could never
    # fail.  Compare the reduced-factor shapes explicitly.
    assert U.shape[1] == min(M, N)
    self.assert_column_orthonormal(U)
    assert VT.shape[0] == min(M, N)
    self.assert_column_orthonormal(VT.T)
def test_gpu_svd_tall(self):
    """SVD of a 50x100 matrix, full and reduced factorizations."""
    A = random(50, 100).astype("float32")
    M, N = A.shape
    U, S, VT = self.run_gpu_svd(A)
    self.assert_column_orthonormal(U)
    self.assert_column_orthonormal(VT.T)
    self.check_svd(A, U, S, VT)
    U, S, VT = self.run_gpu_svd(A, full_matrices=False)
    # BUG FIX: `assert expr, msg` treats the second operand as the
    # assertion message, so the original `assert U.shape[1], min(M, N)`
    # was always true.  Compare the reduced-factor shapes explicitly.
    assert U.shape[1] == min(M, N)
    self.assert_column_orthonormal(U)
    assert VT.shape[0] == min(M, N)
    self.assert_column_orthonormal(VT.T)
def test_gpu_singular_values(self):
    """Singular values only (compute_uv=False) must match CPU vs GPU."""
    A = fmatrix("A")
    f_cpu = aesara.function(
        [A], aesara.tensor.nlinalg.svd(A, compute_uv=False), mode=mode_without_gpu
    )
    f_gpu = aesara.function([A], gpu_svd(A, compute_uv=False), mode=mode_with_gpu)
    # Check both a tall and a wide input.
    for shape in ((50, 100), (100, 50)):
        A_val = random(*shape).astype("float32")
        utt.assert_allclose(f_cpu(A_val), f_gpu(A_val))
def run_gpu_cholesky(self, A_val, lower=True):
    """Compile `GpuMagmaCholesky` (cusolver excluded) and run it on `A_val`."""
    A = fmatrix("A")
    chol_graph = GpuMagmaCholesky(lower=lower)(A)
    f = aesara.function([A], chol_graph, mode=mode_with_gpu.excluding("cusolver"))
    return f(A_val)
def rand_symmetric(self, N):
A = random(N, N).astype("float32")
# ensure that eigenvalues are not too small which sometimes results in
# magma cholesky failure due to gpu limited numerical precision
D, W = np.linalg.eigh(A)
D[D < 1] = 1
V_m = np.zeros_like(A)
np.fill_diagonal(V_m, D)
return np.dot(np.dot(W.T, V_m), W)
def check_cholesky(self, N, lower=True, rtol=None, atol=None):
A = self.rand_symmetric(N)
L = self.run_gpu_cholesky(A, lower=lower)
if not lower:
L = L.T
utt.assert_allclose(np.dot(L, L.T), A, rtol=rtol, atol=atol)
def test_gpu_cholesky(self):
self.check_cholesky(1000, atol=1e-3)
self.check_cholesky(1000, lower=False, atol=1e-3)
def test_gpu_cholesky_opt(self):
A = matrix("A", dtype="float32")
fn = aesara.function([A], cholesky(A), mode=mode_with_gpu.excluding("cusolver"))
assert any(
[
isinstance(node.op, GpuMagmaCholesky)
for node in fn.maker.fgraph.toposort()
]
)
@utt.assertFailure_fast
def test_gpu_cholesky_inplace(self):
A = self.rand_symmetric(1000)
A_gpu = gpuarray_shared_constructor(A)
A_copy = A_gpu.get_value()
C = GpuMagmaCholesky()(A_gpu)
fn = aesara.function([], C, mode=mode_with_gpu, updates=[(A_gpu, C)])
assert any(
[
node.op.inplace
for node in fn.maker.fgraph.toposort()
if isinstance(node.op, GpuMagmaCholesky)
]
)
fn()
L = A_gpu.get_value()
utt.assert_allclose(np.dot(L, L.T), A_copy, atol=1e-3)
@utt.assertFailure_fast
def test_gpu_cholesky_inplace_opt(self):
A = fmatrix("A")
fn = aesara.function([A], GpuMagmaCholesky()(A), mode=mode_with_gpu)
assert any(
[
node.op.inplace
for node in fn.maker.fgraph.toposort()
if isinstance(node.op, GpuMagmaCholesky)
]
)
def run_gpu_qr(self, A_val, complete=True):
A = fmatrix("A")
fn = aesara.function([A], gpu_qr(A, complete=complete), mode=mode_with_gpu)
return fn(A_val)
def check_gpu_qr(self, M, N, complete=True, rtol=None, atol=None):
A = random(M, N).astype("float32")
if complete:
Q_gpu, R_gpu = self.run_gpu_qr(A, complete=complete)
else:
R_gpu = self.run_gpu_qr(A, complete=complete)
Q_np, R_np = np.linalg.qr(A, mode="reduced")
utt.assert_allclose(R_np, R_gpu, rtol=rtol, atol=atol)
if complete:
utt.assert_allclose(Q_np, Q_gpu, rtol=rtol, atol=atol)
def test_gpu_qr(self):
self.check_gpu_qr(1000, 500, atol=1e-3)
self.check_gpu_qr(1000, 500, complete=False, atol=1e-3)
self.check_gpu_qr(500, 1000, atol=1e-3)
self.check_gpu_qr(500, 1000, complete=False, atol=1e-3)
def test_gpu_qr_opt(self):
A = fmatrix("A")
fn = aesara.function([A], qr(A), mode=mode_with_gpu)
assert any(
[
isinstance(node.op, GpuMagmaQR) and node.op.complete
for node in fn.maker.fgraph.toposort()
]
)
def test_gpu_qr_incomplete_opt(self):
A = fmatrix("A")
fn = aesara.function([A], qr(A, mode="r"), mode=mode_with_gpu)
assert any(
[
isinstance(node.op, GpuMagmaQR) and not node.op.complete
for node in fn.maker.fgraph.toposort()
]
)
def run_gpu_eigh(self, A_val, UPLO="L", compute_v=True):
A = fmatrix("A")
fn = aesara.function(
[A], GpuMagmaEigh(UPLO=UPLO, compute_v=compute_v)(A), mode=mode_with_gpu
)
return fn(A_val)
def check_gpu_eigh(self, N, UPLO="L", compute_v=True, rtol=None, atol=None):
A = random(N, N).astype("float32")
A = np.dot(A.T, A)
d_np, v_np = np.linalg.eigh(A, UPLO=UPLO)
if compute_v:
d_gpu, v_gpu = self.run_gpu_eigh(A, UPLO=UPLO, compute_v=compute_v)
else:
d_gpu = self.run_gpu_eigh(A, UPLO=UPLO, compute_v=False)
utt.assert_allclose(d_np, d_gpu, rtol=rtol, atol=atol)
if compute_v:
utt.assert_allclose(np.eye(N), np.dot(v_gpu, v_gpu.T), rtol=rtol, atol=atol)
D_m = np.zeros_like(A)
np.fill_diagonal(D_m, d_gpu)
utt.assert_allclose(
A, np.dot(np.dot(v_gpu, D_m), v_gpu.T), rtol=rtol, atol=atol
)
def test_gpu_eigh(self):
self.check_gpu_eigh(1000, UPLO="L", atol=1e-3)
self.check_gpu_eigh(1000, UPLO="U", atol=1e-3)
self.check_gpu_eigh(1000, UPLO="L", compute_v=False, atol=1e-3)
self.check_gpu_eigh(1000, UPLO="U", compute_v=False, atol=1e-3)
def test_gpu_eigh_opt(self):
A = fmatrix("A")
fn = aesara.function([A], eigh(A), mode=mode_with_gpu)
assert any(
[isinstance(node.op, GpuMagmaEigh) for node in fn.maker.fgraph.toposort()]
)
# mostly copied from aesara/tensor/tests/test_slinalg.py
def test_cholesky_grad():
    """Gradient of the GPU Cholesky: default, explicit lower, explicit upper."""
    rng = np.random.default_rng(utt.fetch_seed())
    r = rng.standard_normal((5, 5)).astype(config.floatX)
    # The dots keep the argument symmetric positive definite inside the graph.
    variants = (
        gpu_cholesky,  # check the default
        GpuCholesky(lower=True),  # explicit lower-triangular
        GpuCholesky(lower=False),  # explicit upper-triangular
    )
    for chol_op in variants:
        utt.verify_grad(lambda r, op=chol_op: op(r.dot(r.T)), [r], 3, rng)
def test_cholesky_grad_indef():
    """The Cholesky gradient raises ``LinAlgError`` on an indefinite matrix."""
    x = matrix()
    mat = np.array([[1, 0.2], [0.2, -2]]).astype(config.floatX)
    # Local name chosen so it does not shadow the imported `cholesky`.
    chol_op = GpuCholesky(lower=True)
    chol_f = aesara.function([x], aesara.gradient.grad(chol_op(x).sum(), [x]))
    with pytest.raises(LinAlgError):
        chol_f(mat)
    # cholesky = GpuCholesky(lower=True, on_error='nan')
    # chol_f = function([x], grad(gpu_cholesky(x).sum(), [x]))
    # assert np.all(np.isnan(chol_f(matrix)))
def test_lower_triangular_and_cholesky_grad():
    """Gradient check of a graph chaining Cholesky and a lower-triangular solve."""
    # Random lower triangular system is ill-conditioned.
    #
    # Reference
    # -----------
    # Viswanath, Divakar, and L. N. Trefethen. "Condition numbers of random triangular matrices."
    # SIAM Journal on Matrix Analysis and Applications 19.2 (1998): 564-581.
    #
    # Use smaller number of N when using float32
    if config.floatX == "float64":
        N = 100
    else:
        N = 5
    rng = np.random.default_rng(utt.fetch_seed())
    r = rng.standard_normal((N, N)).astype(config.floatX)
    y = rng.random((N, 1)).astype(config.floatX)

    def f(r, y):
        # Build a PD matrix, factor, solve, and reduce to a scalar loss.
        PD = r.dot(r.T)
        L = gpu_cholesky(PD)
        A = gpu_solve_lower_triangular(L, y)
        AAT = aesara.tensor.dot(A, A.T)
        B = AAT + aesara.tensor.eye(N)
        LB = gpu_cholesky(B)
        return aesara.tensor.sum(aesara.tensor.log(aesara.tensor.diag(LB)))

    utt.verify_grad(f, [r, y], 3, rng)
# Tests that could normally live outside `gpuarray`; they are placed here so
# that all gpuarray tests stay in the same directory.
import numpy as np
import aesara
from aesara.compile.nanguardmode import NanGuardMode
from aesara.tensor.type import vector
from tests.gpuarray.config import mode_with_gpu
def test_nan_guard_mode():
    """``NanGuardMode`` works with the GPU optimizer; also exercises the C code
    of ``abs`` for uint* and bool inputs."""
    for dtype in ("uint8", "int64", "bool"):
        x = vector(dtype=dtype)
        guarded = NanGuardMode(nan_is_error=True, optimizer=mode_with_gpu.optimizer)
        f = aesara.function([x], x + 1, mode=guarded)
        data = np.asarray([23, 7]).astype(dtype)
        assert np.allclose(f(data), data + 1)
import numpy as np
import pytest
import aesara
import tests.unittest_tools as utt
from aesara import function
from aesara.configdefaults import config
from aesara.gpuarray.multinomial import (
GPUAChoiceFromUniform,
GPUAMultinomialFromUniform,
)
from aesara.sandbox import multinomial
from aesara.sandbox.rng_mrg import MRG_RandomStream as RandomStream
from aesara.tensor.type import fmatrix, frow, fvector, iscalar, matrix, vector
from tests.gpuarray.config import mode_with_gpu
def test_multinomial_output_dtype():
    """``MultinomialFromUniform`` is lifted to the GPU for several output
    dtypes and draws the expected samples."""
    # This tests the MultinomialFromUniform Op directly, not going through the
    # multinomial() call in GPU random generation.
    p = fmatrix()
    u = fvector()
    for dtype in ["int64", "float32", "float16", "float64", "int32", "auto"]:
        m = aesara.sandbox.multinomial.MultinomialFromUniform(dtype)(p, u)
        # the m*2 allows the multinomial to reuse output
        f = function([p, u], m * 2, allow_input_downcast=True, mode=mode_with_gpu)
        assert any(
            [
                type(node.op) is GPUAMultinomialFromUniform
                for node in f.maker.fgraph.toposort()
            ]
        )
        # test that both first and second samples can be drawn
        utt.assert_allclose(f([[1, 0], [0, 1]], [0.1, 0.1]), [[2, 0], [0, 2]])
        # test that both second labels can be drawn
        r = f([[0.2, 0.8], [0.3, 0.7]], [0.31, 0.31])
        utt.assert_allclose(r, [[0, 2], [0, 2]])
        # test that both first labels can be drawn
        r = f([[0.2, 0.8], [0.3, 0.7]], [0.21, 0.21])
        utt.assert_allclose(r, [[0, 2], [2, 0]])
        # change the size to make sure output gets reallocated ok
        # and also make sure that the GPU version doesn't screw up the
        # transposed-ness
        r = f([[0.2, 0.8]], [0.25])
        utt.assert_allclose(r, [[0, 2]])
def test_multinomial_input_dtype():
    """``MultinomialFromUniform`` works for all supported combinations of
    input and output dtypes."""
    # This tests the MultinomialFromUniform Op directly, not going through the
    # multinomial() call in GPU random generation.
    for idtype in ["float32", "float16", "float64"]:
        for odtype in ["float32", "float16", "float64", "int32"]:
            p = matrix("p", idtype)
            u = vector("u", idtype)
            # p = dmatrix('p')
            # u = dvector('u')
            m = aesara.sandbox.multinomial.MultinomialFromUniform(odtype)(p, u)
            # the m*2 allows the multinomial to reuse output
            f = function([p, u], m * 2, allow_input_downcast=True, mode=mode_with_gpu)
            assert any(
                [
                    type(node.op) is GPUAMultinomialFromUniform
                    for node in f.maker.fgraph.toposort()
                ]
            )
            # test that both first and second samples can be drawn
            utt.assert_allclose(f([[1, 0], [0, 1]], [0.1, 0.1]), [[2, 0], [0, 2]])
            # test that both second labels can be drawn
            r = f([[0.2, 0.8], [0.3, 0.7]], [0.31, 0.31])
            utt.assert_allclose(r, [[0, 2], [0, 2]])
            # test that both first labels can be drawn
            r = f([[0.2, 0.8], [0.3, 0.7]], [0.21, 0.21])
            utt.assert_allclose(r, [[0, 2], [2, 0]])
            # change the size to make sure output gets reallocated ok
            # and also make sure that the GPU version doesn't screw up the
            # transposed-ness
            r = f([[0.2, 0.8]], [0.25])
            utt.assert_allclose(r, [[0, 2]])
# TODO: check a bigger example (make sure blocking on GPU is handled correctly)
def test_multinomial_large():
    """Large-input multinomial: output shape and dtype follow the cast policy."""
    # DEBUG_MODE will test this on GPU
    p = fmatrix()
    u = fvector()
    m = aesara.sandbox.multinomial.MultinomialFromUniform("auto")(p, u)
    f = function([p, u], m * 2, allow_input_downcast=True, mode=mode_with_gpu)
    assert any(
        [
            type(node.op) is GPUAMultinomialFromUniform
            for node in f.maker.fgraph.toposort()
        ]
    )
    pval = np.arange(10000 * 4, dtype="float32").reshape((10000, 4)) + 0.1
    pval = pval / pval.sum(axis=1)[:, None]
    uval = np.ones_like(pval[:, 0]) * 0.5
    mval = f(pval, uval)
    assert mval.shape == pval.shape
    if config.cast_policy == "custom":
        assert mval.dtype == pval.dtype
    elif config.cast_policy == "numpy+floatX":
        assert mval.dtype == config.floatX
    elif config.cast_policy == "numpy":
        assert mval.dtype == "float64"
    else:
        raise NotImplementedError(config.cast_policy)
    utt.assert_allclose(mval.sum(axis=1), 2)
    asdf = np.asarray([0, 0, 2, 0]) + 0 * pval
    utt.assert_allclose(mval, asdf)  # broadcast over all rows
def test_gpu_opt_dtypes():
    """The GPU multinomial op returns samples of the requested dtype."""
    for dtype in ("uint32", "float32", "int64", "float64"):
        p = fmatrix()
        u = fvector()
        sampled = aesara.sandbox.multinomial.MultinomialFromUniform(dtype)(p, u)
        f = function([p, u], sampled, allow_input_downcast=True, mode=mode_with_gpu)
        assert any(
            type(node.op) is GPUAMultinomialFromUniform
            for node in f.maker.fgraph.toposort()
        )
        pval = np.arange(10000 * 4, dtype="float32").reshape((10000, 4)) + 0.1
        pval = pval / pval.sum(axis=1)[:, None]
        uval = np.ones_like(pval[:, 0]) * 0.5
        samples = f(pval, uval)
        assert samples.dtype == dtype, f"{samples.dtype} != {dtype}"
def test_gpu_opt():
    """``MultinomialFromUniform`` is moved to the GPU when its output is;
    also covers a row (1 x n) input that used to fail."""
    # Does have some overlap with test_multinomial_0
    # We test the case where we put the op on the gpu when the output
    # is moved to the gpu.
    p = fmatrix()
    u = fvector()
    m = aesara.sandbox.multinomial.MultinomialFromUniform("auto")(p, u)
    assert m.dtype == "float32", m.dtype
    f = function([p, u], m, allow_input_downcast=True, mode=mode_with_gpu)
    assert any(
        [
            type(node.op) is GPUAMultinomialFromUniform
            for node in f.maker.fgraph.toposort()
        ]
    )
    pval = np.arange(10000 * 4, dtype="float32").reshape((10000, 4)) + 0.1
    pval = pval / pval.sum(axis=1)[:, None]
    uval = np.ones_like(pval[:, 0]) * 0.5
    f(pval, uval)
    # Test with a row, it was failing in the past.
    r = frow()
    m = aesara.sandbox.multinomial.MultinomialFromUniform("auto")(r, u)
    assert m.dtype == "float32", m.dtype
    f = function([r, u], m, allow_input_downcast=True, mode=mode_with_gpu)
    assert any(
        [
            type(node.op) is GPUAMultinomialFromUniform
            for node in f.maker.fgraph.toposort()
        ]
    )
    pval = np.arange(1 * 4, dtype="float32").reshape((1, 4)) + 0.1
    pval = pval / pval.sum(axis=1)[:, None]
    uval = np.ones_like(pval[:, 0]) * 0.5
    f(pval, uval)
class TestOPWor:
    """Tests for the ``ChoiceFromUniform`` op (sampling without replacement)."""

    def test_select_distinct(self):
        # Tests that ChoiceFromUniform always selects distinct elements
        p = fmatrix()
        u = fvector()
        n = iscalar()
        m = multinomial.ChoiceFromUniform(odtype="auto")(p, u, n)
        f = function([p, u, n], m, allow_input_downcast=True)
        n_elements = 1000
        all_indices = range(n_elements)
        np.random.seed(12345)
        for i in [5, 10, 50, 100, 500, n_elements]:
            uni = np.random.rand(i).astype(config.floatX)
            pvals = np.random.randint(1, 100, (1, n_elements)).astype(config.floatX)
            pvals /= pvals.sum(1)
            res = f(pvals, uni, i)
            res = np.squeeze(res)
            assert len(res) == i, res
            assert np.all(np.in1d(np.unique(res), all_indices)), res

    def test_fail_select_alot(self):
        # Tests that ChoiceFromUniform fails when asked to sample more
        # elements than the actual number of elements
        p = fmatrix()
        u = fvector()
        n = iscalar()
        m = multinomial.ChoiceFromUniform(odtype="auto")(p, u, n)
        f = function([p, u, n], m, allow_input_downcast=True)
        n_elements = 100
        n_selected = 200
        np.random.seed(12345)
        uni = np.random.rand(n_selected).astype(config.floatX)
        pvals = np.random.randint(1, 100, (1, n_elements)).astype(config.floatX)
        pvals /= pvals.sum(1)
        with pytest.raises(ValueError):
            f(pvals, uni, n_selected)

    def test_select_proportional_to_weight(self):
        # Tests that ChoiceFromUniform selects elements, on average,
        # proportional to their probabilities
        p = fmatrix()
        u = fvector()
        n = iscalar()
        m = multinomial.ChoiceFromUniform(odtype="auto")(p, u, n)
        f = function([p, u, n], m, allow_input_downcast=True)
        n_elements = 100
        n_selected = 10
        mean_rtol = 0.0005
        np.random.seed(12345)
        pvals = np.random.randint(1, 100, (1, n_elements)).astype(config.floatX)
        pvals /= pvals.sum(1)
        avg_pvals = np.zeros((n_elements,), dtype=config.floatX)
        for rep in range(10000):
            uni = np.random.rand(n_selected).astype(config.floatX)
            res = f(pvals, uni, n_selected)
            res = np.squeeze(res)
            avg_pvals[res] += 1
        avg_pvals /= avg_pvals.sum()
        avg_diff = np.mean(abs(avg_pvals - pvals))
        assert avg_diff < mean_rtol, avg_diff
class TestFunctionWor:
    """Tests for ``MRG_RandomStream.multinomial_wo_replacement``."""

    def test_select_distinct(self):
        # Tests that multinomial_wo_replacement always selects distinct elements
        th_rng = RandomStream(12345)
        p = fmatrix()
        n = iscalar()
        m = th_rng.multinomial_wo_replacement(pvals=p, n=n)
        f = function([p, n], m, allow_input_downcast=True)
        n_elements = 1000
        all_indices = range(n_elements)
        np.random.seed(12345)
        for i in [5, 10, 50, 100, 500, n_elements]:
            pvals = np.random.randint(1, 100, (1, n_elements)).astype(config.floatX)
            pvals /= pvals.sum(1)
            res = f(pvals, i)
            res = np.squeeze(res)
            assert len(res) == i
            assert np.all(np.in1d(np.unique(res), all_indices)), res

    def test_fail_select_alot(self):
        # Tests that multinomial_wo_replacement fails when asked to sample more
        # elements than the actual number of elements
        th_rng = RandomStream(12345)
        p = fmatrix()
        n = iscalar()
        m = th_rng.multinomial_wo_replacement(pvals=p, n=n)
        f = function([p, n], m, allow_input_downcast=True)
        n_elements = 100
        n_selected = 200
        np.random.seed(12345)
        pvals = np.random.randint(1, 100, (1, n_elements)).astype(config.floatX)
        pvals /= pvals.sum(1)
        with pytest.raises(ValueError):
            f(pvals, n_selected)

    def test_select_proportional_to_weight(self):
        # Tests that multinomial_wo_replacement selects elements, on average,
        # proportional to their probabilities
        th_rng = RandomStream(12345)
        p = fmatrix()
        n = iscalar()
        m = th_rng.multinomial_wo_replacement(pvals=p, n=n)
        f = function([p, n], m, allow_input_downcast=True)
        n_elements = 100
        n_selected = 10
        mean_rtol = 0.0005
        np.random.seed(12345)
        pvals = np.random.randint(1, 100, (1, n_elements)).astype(config.floatX)
        pvals /= pvals.sum(1)
        avg_pvals = np.zeros((n_elements,), dtype=config.floatX)
        for rep in range(10000):
            res = f(pvals, n_selected)
            res = np.squeeze(res)
            avg_pvals[res] += 1
        avg_pvals /= avg_pvals.sum()
        avg_diff = np.mean(abs(avg_pvals - pvals))
        assert avg_diff < mean_rtol
def test_gpu_opt_wor():
    """``ChoiceFromUniform`` is lifted to ``GPUAChoiceFromUniform`` when its
    output is moved to the GPU, with and without replacement."""
    # We test the case where we put the op on the gpu when the output
    # is moved to the gpu.
    p = fmatrix()
    u = fvector()
    n = iscalar()
    for replace in [False, True]:
        m = multinomial.ChoiceFromUniform(odtype="auto", replace=replace)(p, u, n)
        assert m.dtype == "int64", m.dtype
        f = function([p, u, n], m, allow_input_downcast=True, mode=mode_with_gpu)
        assert any(
            [
                type(node.op) is GPUAChoiceFromUniform
                for node in f.maker.fgraph.toposort()
            ]
        )
        n_samples = 3
        pval = np.arange(10000 * 4, dtype="float32").reshape((10000, 4)) + 0.1
        pval = pval / pval.sum(axis=1)[:, None]
        uval = np.ones(pval.shape[0] * n_samples) * 0.5
        f(pval, uval, n_samples)
        # Test with a row, it was failing in the past.
        r = frow()
        m = multinomial.ChoiceFromUniform("auto", replace=replace)(r, u, n)
        assert m.dtype == "int64", m.dtype
        f = function([r, u, n], m, allow_input_downcast=True, mode=mode_with_gpu)
        assert any(
            [
                type(node.op) is GPUAChoiceFromUniform
                for node in f.maker.fgraph.toposort()
            ]
        )
        pval = np.arange(1 * 4, dtype="float32").reshape((1, 4)) + 0.1
        pval = pval / pval.sum(axis=1)[:, None]
        uval = np.ones_like(pval[:, 0]) * 0.5
        f(pval, uval, 1)
from aesara.gpuarray.neighbours import GpuImages2Neibs
from tests.gpuarray.config import mode_with_gpu
from tests.tensor.nnet import test_neighbours
class TestGpuImages2Neibs(test_neighbours.TestImages2Neibs):
    """Run the shared Images2Neibs test suite against the GPU implementation."""

    mode = mode_with_gpu
    op = GpuImages2Neibs
    dtypes = ["int64", "float32", "float64"]
import numpy as np
import aesara
import aesara.tensor as at
import tests.unittest_tools as utt
from aesara.gpuarray.nnet import (
GpuCrossentropySoftmax1HotWithBiasDx,
GpuCrossentropySoftmaxArgmax1HotWithBias,
GpuSoftmax,
GpuSoftmaxWithBias,
)
from aesara.gradient import grad
from aesara.tensor.math import argmax, log, mean
from aesara.tensor.nnet import crossentropy_softmax_1hot_with_bias_dx
from aesara.tensor.type import fmatrix, fvector, lvector, matrix, vector
from tests.gpuarray.config import mode_with_gpu, mode_without_gpu
mode_wo_cudnn = mode_with_gpu.excluding("cudnn")
def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
    """Basic test for GpuCrossentropySoftmaxArgmax1HotWithBias.

    We check that the kernel loops when there are too many threads and that
    CPU and GPU results agree.
    """
    n_in = 1000
    batch_size = 4097
    n_out = 1250
    if not isinstance(mode_with_gpu, aesara.compile.debugmode.DebugMode):
        n_in = 4098
        n_out = 4099
    y = lvector("y")
    b = fvector("b")
    # We precompute the dot product with a big shape so that the test of
    # GpuCrossentropySoftmax1HotWithBiasDx does not fail with "the launch
    # timed out and was terminated" on GPU cards that are not powerful
    # enough. The big shape is needed to exercise the corner case.
    dot_result = fmatrix("dot_result")
    xx = np.asarray(np.random.rand(batch_size, n_in), dtype=np.float32)
    yy = np.ones((batch_size,), dtype="int32")
    b_values = np.zeros((n_out,), dtype="float32")
    W_values = np.asarray(np.random.rand(n_in, n_out), dtype="float32")
    dot_value = np.asarray(np.dot(xx, W_values), dtype="float32")
    del W_values
    p_y_given_x = aesara.tensor.nnet.softmax(dot_result + b)
    y_pred = argmax(p_y_given_x, axis=-1)
    loss = -mean(log(p_y_given_x)[at.arange(y.shape[0]), y])
    dW = grad(loss, dot_result)
    classify = aesara.function(
        inputs=[y, b, dot_result], outputs=[loss, y_pred, dW], mode=mode_without_gpu
    )
    classify_gpu = aesara.function(
        inputs=[y, b, dot_result], outputs=[loss, y_pred, dW], mode=mode_with_gpu
    )
    assert any(
        [
            isinstance(
                node.op, aesara.tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias
            )
            for node in classify.maker.fgraph.toposort()
        ]
    )
    assert any(
        [
            isinstance(node.op, GpuCrossentropySoftmaxArgmax1HotWithBias)
            for node in classify_gpu.maker.fgraph.toposort()
        ]
    )
    out = classify(yy, b_values, dot_value)
    gout = classify_gpu(yy, b_values, dot_value)
    assert len(out) == len(gout) == 3
    utt.assert_allclose(out[0], gout[0])
    utt.assert_allclose(out[2], gout[2], atol=3e-6)
    utt.assert_allclose(out[1], gout[1])
def test_GpuCrossentropySoftmax1HotWithBiasDx():
    """Basic test for GpuCrossentropySoftmax1HotWithBiasDx.

    We check that the kernel loops when there are too many threads and that
    CPU and GPU results agree.
    """
    batch_size = 4097
    n_out = 1250
    if not isinstance(mode_with_gpu, aesara.compile.debugmode.DebugMode):
        n_out = 4099
    softmax_output_value = np.random.rand(batch_size, n_out).astype("float32")
    dnll_value = np.asarray(np.random.rand(batch_size), dtype="float32")
    y_idx_value = np.random.randint(low=0, high=5, size=batch_size)
    softmax_output = fmatrix()
    # BUG FIX: the per-row sums have length ``shape[0]`` (the batch size), so
    # the reshape target must be ``(shape[0], 1)``.  The original used
    # ``shape[1]``, which is a shape mismatch whenever batch_size != n_out
    # (4097 vs 4099 here).  Note the normalized variable is itself used as the
    # function input below, so the normalization graph is effectively cut off.
    softmax_output /= softmax_output.sum(axis=1).reshape(softmax_output.shape[0], 1)
    op = crossentropy_softmax_1hot_with_bias_dx(dnll_value, softmax_output, y_idx_value)
    cpu_f = aesara.function([softmax_output], op, mode=mode_without_gpu)
    gpu_f = aesara.function([softmax_output], op, mode=mode_with_gpu)
    # aesara.printing.debugprint(cpu_f)
    # aesara.printing.debugprint(gpu_f)
    assert any(
        isinstance(node.op, aesara.tensor.nnet.CrossentropySoftmax1HotWithBiasDx)
        for node in cpu_f.maker.fgraph.toposort()
    )
    assert any(
        isinstance(node.op, GpuCrossentropySoftmax1HotWithBiasDx)
        for node in gpu_f.maker.fgraph.toposort()
    )
    cpu_out = cpu_f(softmax_output_value)
    gpu_out = gpu_f(softmax_output_value)
    rtol = 1e-5
    atol = 1e-6
    utt.assert_allclose(cpu_out, gpu_out, rtol=rtol, atol=atol)
def test_softmax_with_bias_float16():
    """GpuSoftmaxWithBias with float16 input/bias (also mixed with float32)."""
    for dtype_in, dtype_b in (
        ("float16", "float32"),
        ("float16", "float16"),
        ("float32", "float16"),
    ):
        softmax_with_bias_unittest_template(dtypeInput=dtype_in, dtypeBias=dtype_b)
def test_softmax_with_bias_float32():
    """GpuSoftmaxWithBias with float32 input and bias."""
    softmax_with_bias_unittest_template(dtypeInput="float32", dtypeBias="float32")
def test_softmax_with_bias_float64():
    """GpuSoftmaxWithBias with float64 input/bias (also mixed with float32)."""
    for dtype_in, dtype_b in (
        ("float32", "float64"),
        ("float64", "float32"),
        ("float64", "float64"),
    ):
        softmax_with_bias_unittest_template(dtypeInput=dtype_in, dtypeBias=dtype_b)
def softmax_with_bias_unittest_template(dtypeInput, dtypeBias):
    """Basic test for GpuSoftmaxWithBias with the given input/bias dtypes.

    We check that the kernel loops when there are too many blocks.

    TODO: check that we loop when there are too many threads. (THIS IS
    NOT IMPLEMENTED)
    """
    x = matrix("x", dtype=dtypeInput)
    b = vector("b", dtype=dtypeBias)
    z = aesara.tensor.nnet.softmax_with_bias(x, b)
    f = aesara.function([x, b], z, mode=mode_without_gpu)
    f_gpu = aesara.function([x, b], z, mode=mode_with_gpu)
    assert f.maker.fgraph.toposort()[-1].op == aesara.tensor.nnet.softmax_with_bias
    assert isinstance(f_gpu.maker.fgraph.toposort()[-2].op, GpuSoftmaxWithBias)

    def cmp(n, m):
        # Compare CPU and GPU outputs on a random (n, m) input.
        data = np.random.uniform(1e-7, 1, (n, m)).astype(dtype=dtypeInput)
        b_data = np.random.uniform(1e-7, 1, (m,)).astype(dtype=dtypeBias)
        out = f(data, b_data)
        gout = f_gpu(data, b_data)
        utt.assert_allclose(out, gout)

    cmp(2, 5)
    # we need to test n>32*1024 to check that we make the block loop.
    cmp(2 << 15, 5)
    cmp(4074, 400)
    cmp(784, 784)
    cmp(4, 1000)
    cmp(4, 1024)
    cmp(4, 2000)
    cmp(4, 2024)
    # GTX285 don't have enough shared mem for this case.
    cmp(4, 4074)
    # The GTX580, 680 and kepler don't have enough shared memory.
    cmp(2, 10000)
    cmp(128, 16 * 1024)
    cmp(128, 64 * 1024)
def test_softmax_float16():
    """GpuSoftmax on float16 input."""
    softmax_unittest_template(dtypeInput="float16")
def test_softmax_float32():
    """GpuSoftmax on float32 input."""
    softmax_unittest_template(dtypeInput="float32")
def test_softmax_float64():
    """GpuSoftmax on float64 input."""
    softmax_unittest_template(dtypeInput="float64")
def softmax_unittest_template(dtypeInput):
    """Basic test for GpuSoftmax with the given input dtype.

    We check that the kernel loops when there are too many blocks, and that
    slower code is used when there isn't enough shared memory.
    """
    x = matrix("x", dtype=dtypeInput)
    z = aesara.tensor.nnet.softmax(x)
    f = aesara.function([x], z, mode=mode_without_gpu)
    f_gpu = aesara.function([x], z, mode=mode_wo_cudnn)
    assert f.maker.fgraph.toposort()[-1].op == aesara.tensor.nnet.softmax_legacy
    assert isinstance(f_gpu.maker.fgraph.toposort()[-2].op, GpuSoftmax)

    def cmp(n, m):
        # Compare CPU and GPU outputs on a random (n, m) input.
        data = np.random.uniform(0, 1, (n, m)).astype(dtype=dtypeInput)
        out = f(data)
        gout = f_gpu(data)
        utt.assert_allclose(out, gout)

    # we need to test n>32*1024 to check that we make the block loop.
    cmp(2, 5)
    cmp(2 << 15, 5)
    cmp(4074, 400)
    cmp(784, 784)
    cmp(4, 1000)
    cmp(4, 1024)
    cmp(4, 2000)
    cmp(4, 2024)
    # The GTX285 don't have enough shared memory.
    cmp(4, 4074)
    # The GTX580, 680 and kepler don't have enough shared memory.
    cmp(2, 10000)
    cmp(128, 16 * 1024)
    cmp(128, 64 * 1024)
class TestSoftMax:
    """Shared checks for GpuSoftmax (and, via subclassing, GpuDnnSoftmax)."""

    gpu_op = GpuSoftmax
    mode = mode_wo_cudnn

    def _test_softmax(self, x, x_gpu, f_z, f_gpu_z, cmp):
        # This is basic test for GpuSoftmax and GpuDnnSoftmax
        #
        # We check that we loop when there are too many blocks
        # We use slower code when there isn't enough shared memory
        f_z_out = f_z(x)
        f_gpu_z_out = f_gpu_z(x_gpu)
        f = aesara.function([x], f_z_out, mode=mode_without_gpu)
        f_gpu = aesara.function([x_gpu], f_gpu_z_out, mode=self.mode)
        self._check_types(f, f_gpu, aesara.tensor.nnet.Softmax, self.gpu_op)
        # we need to test n>32*1024 to check that we make the block loop.
        cmp(1, 5, f, f_gpu)
        cmp(2, 5, f, f_gpu)
        cmp(10, 5, f, f_gpu)
        cmp(100, 5, f, f_gpu)
        cmp(1000, 5, f, f_gpu)
        cmp(10000, 5, f, f_gpu)
        cmp(4074, 400, f, f_gpu)
        cmp(784, 784, f, f_gpu)
        cmp(4, 1000, f, f_gpu)
        cmp(4, 1024, f, f_gpu)
        cmp(4, 2000, f, f_gpu)
        cmp(4, 2024, f, f_gpu)
        # The GTX285 don't have enough shared memory.
        cmp(4, 4074, f, f_gpu)
        # The GTX580, 680 and kepler don't have enough shared memory.
        cmp(2, 10000, f, f_gpu)
        cmp(128, 16 * 1024, f, f_gpu)
        cmp(128, 64 * 1024, f, f_gpu)
        # cudnn permits no more than 2^15 - 1 rows
        cmp((2 << 15) - 1, 5, f, f_gpu)
        cmp(5, 2 << 15, f, f_gpu)
        return f, f_gpu

    def _cmp(self, n, m, f, f_gpu):
        # Compare CPU and GPU outputs on an (n, m) arange input.
        data = np.arange(n * m, dtype="float32").reshape(n, m)
        out = f(data)
        gout = f_gpu(data)
        utt.assert_allclose(out, gout)

    def _check_types(self, graph, graph_gpu, f_type, f_gpu_type):
        # The CPU graph ends with `f_type`; the GPU graph has exactly one
        # node of `f_gpu_type`.
        assert isinstance(graph.maker.fgraph.toposort()[-1].op, f_type)
        assert (
            len(
                [
                    node
                    for node in graph_gpu.maker.fgraph.toposort()
                    if isinstance(node.op, f_gpu_type)
                ]
            )
            == 1
        )

    def test_softmax(self):
        x = fmatrix("x")
        z = aesara.tensor.nnet.softmax_legacy
        f, f_gpu = self._test_softmax(x, x, z, z, self._cmp)
        self._cmp(2 << 15, 5, f, f_gpu)

    def test_softmax_shape_0(self):
        x = fmatrix("x")
        z = aesara.tensor.nnet.softmax_legacy
        f, f_gpu = self._test_softmax(x, x, z, z, self._cmp)
        # Aesara can handle that case, but cudnn can't
        self._cmp(0, 10, f, f_gpu)
import numpy as np
import pytest
import aesara
import aesara.gpuarray
import aesara.tensor.slinalg as slinalg
from aesara import tensor as at
from aesara.breakpoint import PdbBreakpoint
from aesara.configdefaults import config
from aesara.gpuarray import basic_ops, blas, dnn, opt
from aesara.gpuarray.basic_ops import (
GpuAlloc,
GpuAllocEmpty,
GpuFromHost,
GpuReshape,
HostFromGpu,
host_from_gpu,
)
from aesara.gpuarray.blas import GpuGemm
from aesara.gpuarray.dnn import GpuDnnReduction
from aesara.gpuarray.elemwise import (
Elemwise,
GpuCAReduceCPY,
GpuCAReduceCuda,
GpuElemwise,
max_inputs_to_GpuElemwise,
)
from aesara.gpuarray.linalg import GpuCholesky, GpuCusolverSolve, cusolver_available
from aesara.gpuarray.subtensor import GpuSubtensor
from aesara.gpuarray.type import GpuArrayType, get_context, gpuarray_shared_constructor
from aesara.graph.opt import check_stack_trace
from aesara.raise_op import Assert, assert_op
from aesara.tensor.basic import Alloc, AllocEmpty, MakeVector, Rebroadcast
from aesara.tensor.blas import batched_dot
from aesara.tensor.math import dot, eq, exp, gt, tanh
from aesara.tensor.nnet import abstract_conv
from aesara.tensor.type import (
TensorType,
bmatrix,
cscalar,
fmatrix,
fscalar,
ftensor4,
iscalar,
ivector,
lscalar,
lvector,
matrix,
scalar,
tensor3,
vector,
)
from tests import unittest_tools as utt
from tests.gpuarray.config import mode_with_gpu, mode_without_gpu, test_ctx_name
from tests.tensor.test_basic import TestSpecifyShape
from tests.test_ifelse import TestIfelse
def _check_stack_trace(thing):
    """Check stack traces while ignoring ops that routinely lack them."""
    from aesara.tensor.shape import Shape, Shape_i

    ignored_ops = (
        Shape_i,
        Shape,
        aesara.compile.ops.DeepCopyOp,
        MakeVector,
        aesara.tensor.subtensor.Subtensor,
        aesara.tensor.elemwise.Elemwise,
        aesara.ifelse.IfElse,
        GpuFromHost,
        HostFromGpu,
    )

    def _ops_to_check(op):
        if not isinstance(op, aesara.graph.op.Op):
            op = op.op  # assume it is an apply node
        return not isinstance(op, ignored_ops)

    return check_stack_trace(thing, ops_to_check=_ops_to_check, bug_print="ignore")
def test_local_assert():
    """``Assert`` stays in the graph and its input is moved to the GPU."""
    x = fmatrix()
    checked = assert_op(x, eq(x, 0).any())
    f = aesara.function([x], checked, mode=mode_with_gpu)
    assert_nodes = [
        n for n in f.maker.fgraph.toposort() if isinstance(n.op, Assert)
    ]
    assert len(assert_nodes) == 1
    assert isinstance(assert_nodes[0].inputs[0].type, GpuArrayType)
def test_local_remove_all_assert():
    """The ``unsafe`` tag controls whether ``Assert`` ops are removed."""
    x = fmatrix()
    a = assert_op(x, eq(x, 0).any())

    def n_asserts(mode):
        # Compile with the given mode and count the remaining Assert nodes.
        f = aesara.function([x], a, mode=mode)
        return len(
            [n for n in f.maker.fgraph.toposort() if isinstance(n.op, Assert)]
        )

    # By default `unsafe` should not be there, so the assert is kept.
    assert n_asserts(mode_with_gpu.excluding("unsafe")) == 1
    # With `unsafe` the assert is removed.
    assert n_asserts(mode_with_gpu.including("unsafe")) == 0
    # Excluding `unsafe` again restores the assert.
    assert n_asserts(mode_with_gpu.excluding("unsafe")) == 1
def test_local_gpu_contiguous_gpu_contiguous():
    """Nested ``gpu_contiguous`` calls collapse into a single op."""
    a = fmatrix()
    once = basic_ops.gpu_contiguous(a)
    twice = basic_ops.gpu_contiguous(once)
    for out in (once, twice):
        f = aesara.function([a], out, mode=mode_with_gpu)
        contiguous_nodes = [
            node
            for node in f.maker.fgraph.toposort()
            if isinstance(node.op, basic_ops.GpuContiguous)
        ]
        assert len(contiguous_nodes) == 1
        assert _check_stack_trace(f)
def test_local_gpu_contiguous():
    """``cpu_contiguous`` is replaced by a single ``GpuContiguous``."""
    a = fmatrix()
    out = aesara.tensor.extra_ops.cpu_contiguous(a)
    f = aesara.function([a], out, mode=mode_with_gpu)
    contiguous_nodes = [
        node
        for node in f.maker.fgraph.toposort()
        if isinstance(node.op, basic_ops.GpuContiguous)
    ]
    assert len(contiguous_nodes) == 1
    f([[2.0]])
    assert _check_stack_trace(f)
def test_flatten():
    """``flatten`` is implemented with ``GpuReshape`` for 1-D and n-D targets."""
    m = fmatrix()
    f = aesara.function([m], m.flatten(), mode=mode_with_gpu)
    val = np.random.rand(10, 11).astype("float32")
    res = f(val)
    utt.assert_allclose(res, val.flatten())
    assert res.shape == val.flatten().shape
    assert GpuReshape in [type(node.op) for node in f.maker.fgraph.toposort()]
    val = np.random.rand(10, 11).astype("float32")
    res = f(val)
    utt.assert_allclose(res, val.flatten())
    assert res.shape == val.flatten().shape
    assert GpuReshape in [type(node.op) for node in f.maker.fgraph.toposort()]
    assert _check_stack_trace(f)
    # flatten(ndim=2) on a matrix is a no-op reshape; keep the reshape in the
    # graph by disabling the useless-reshape rewrite.
    f = aesara.function(
        [m], m.flatten(ndim=2), mode=mode_with_gpu.excluding("local_useless_reshape")
    )
    val = np.random.rand(10, 11).astype("float32")
    res = f(val)
    utt.assert_allclose(res, val)
    assert res.shape == val.shape
    assert GpuReshape in [type(node.op) for node in f.maker.fgraph.toposort()]
    assert _check_stack_trace(f)
    m = tensor3()
    f = aesara.function([m], m.flatten(ndim=2), mode=mode_with_gpu)
    val = np.random.rand(10, 11, 12).astype("float32")
    res = f(val)
    utt.assert_allclose(res, val.reshape(10, -1))
    assert res.shape == val.reshape(10, -1).shape
    assert GpuReshape in [type(node.op) for node in f.maker.fgraph.toposort()]
    assert _check_stack_trace(f)
def test_reduce():
    """sum/prod/max/min over axis 0 are lifted to GPU reduction ops
    (except max/min on OpenCL, where they are not supported)."""
    kind = get_context(test_ctx_name).kind
    for method, param in [
        ("sum", dict(acc_dtype="float32")),
        ("prod", dict(acc_dtype="float32")),
        ("max", {}),
        ("min", {}),
    ]:
        m = fmatrix()
        f = aesara.function(
            [m], getattr(m, method)(axis=0, **param), mode=mode_with_gpu
        )
        # assert _check_stack_trace(f) this op is ok but since
        # it is using GpuCAReduceCuda that has an empty stack
        # trace, this assertion gives error.
        val = np.random.rand(10, 11).astype("float32")
        res = f(val)
        utt.assert_allclose(res, getattr(val, method)(axis=0))
        assert res.shape == (11,)
        topo = f.maker.fgraph.toposort()
        ops = [type(node.op) for node in topo]
        if kind == b"opencl" and method in ["max", "min"]:
            assert not (
                GpuCAReduceCuda in ops
                or GpuCAReduceCPY in ops
                or GpuDnnReduction in ops
            )
        else:
            assert (
                GpuCAReduceCuda in ops
                or GpuCAReduceCPY in ops
                or GpuDnnReduction in ops
            )
def test_local_gpualloc_memset_0():
    """Check when `Alloc` is lifted to `GpuAlloc` and when memset is used.

    A CPU `Alloc` whose only client is the output should stay on the CPU;
    one feeding further computation should become a `GpuAlloc`.  The
    ``memset_0`` flag must be set only when the fill value is 0.
    """
    i = iscalar()
    z = np.zeros((1,), dtype="float32")
    o = np.ones((1,), dtype="float32")
    ones = np.ones((2,), dtype="float32")
    # Test with 0 from CPU op.
    # Should not be transferred as the only client is the output
    a = at.alloc(z, i)
    f = aesara.function([i], a, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 1
    assert isinstance(topo[0].op, Alloc)
    assert (np.asarray(f(6)) == 0).all()
    assert _check_stack_trace(f)
    # Test with 0 from CPU op.
    # Should be transferred as it is used by another op.
    a = at.alloc(z, i)
    f = aesara.function([i], a.cumsum(), mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 3
    assert isinstance(topo[0].op, GpuAlloc)
    assert (np.asarray(f(6)) == 0).all()
    assert _check_stack_trace(f)
    # Test with 0
    a = GpuAlloc(test_ctx_name)(z, i)
    f = aesara.function([i], a, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 1
    # Filling with 0 should use the fast memset path.
    assert isinstance(topo[0].op, GpuAlloc) and topo[0].op.memset_0
    assert (np.asarray(f(6)) == 0).all()
    assert _check_stack_trace(f)
    # Test with 1
    a = GpuAlloc(test_ctx_name)(o, i)
    f = aesara.function([i], a, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 1
    assert isinstance(topo[0].op, GpuAlloc)
    # Non-zero fill values cannot use memset.
    assert not topo[0].op.memset_0
    assert (np.asarray(f(6)) == 1).all()
    assert _check_stack_trace(f)
    # Test with 1, 1
    a = GpuAlloc(test_ctx_name)(ones, i)
    f = aesara.function([i], a, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 1
    assert isinstance(topo[0].op, GpuAlloc)
    assert not topo[0].op.memset_0
    assert (np.asarray(f(2)) == 1).all()
    assert _check_stack_trace(f)
def test_local_gpualloc_empty():
    """Check when `AllocEmpty` is lifted to `GpuAllocEmpty`.

    Like `Alloc`, an `AllocEmpty` whose only client is the output should
    stay on the CPU; one feeding further computation should be moved to
    the GPU.  Since the data is uninitialized, only shapes are checked.
    """
    i = iscalar()
    ii = iscalar()
    # Test with vector
    # Should not be moved as the only client is the output
    a = AllocEmpty("float32")(i)
    f = aesara.function([i], a, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 1
    assert isinstance(topo[0].op, AllocEmpty)
    # This return not initialized data, so we can only check the shape
    assert f(3).shape == (3,)
    assert _check_stack_trace(f)
    # Test with vector
    # Should be moved
    a = AllocEmpty("float32")(i)
    f = aesara.function([i], a.cumsum(), mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 3
    assert isinstance(topo[0].op, GpuAllocEmpty)
    # This return not initialized data, so we can only check the shape
    assert f(3).shape == (3,)
    assert _check_stack_trace(f)
    # Test with matrix
    a = AllocEmpty("float32")(i, ii)
    f = aesara.function([i, ii], a.cumsum(axis=0), mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 3
    assert isinstance(topo[0].op, GpuAllocEmpty)
    # This return not initialized data, so we can only check the shape
    assert f(3, 4).shape == (3, 4)
    assert _check_stack_trace(f)
def test_rebroadcast():
    """A single `Rebroadcast` should remain, operating on GPU-typed data."""
    data = np.random.rand(10, 10).astype("float32")
    x = fmatrix()
    unbroadcasted = at.unbroadcast(x.sum().dimshuffle("x", "x"), 0, 1)
    fn = aesara.function([x], [unbroadcasted], mode=mode_with_gpu)
    fn(data)
    nodes = fn.maker.fgraph.toposort()
    rebroadcast_nodes = [n for n in nodes if isinstance(n.op, Rebroadcast)]
    assert len(rebroadcast_nodes) == 1
    only_rebroadcast = rebroadcast_nodes[0]
    # Both sides of the Rebroadcast must be GPU variables.
    assert isinstance(only_rebroadcast.inputs[0].type, GpuArrayType)
    assert isinstance(only_rebroadcast.outputs[0].type, GpuArrayType)
    assert _check_stack_trace(fn)
class TestSpecifyShape(TestSpecifyShape):
    """Re-run the imported `TestSpecifyShape` suite (which this class
    shadows) in GPU mode with GPU-typed inputs."""

    mode = mode_with_gpu
    input_type = GpuArrayType
class TestGpuIfelse(TestIfelse):
    """Run the inherited `IfElse` tests on the GPU backend."""

    mode = mode_with_gpu

    @staticmethod
    def cast_output(v):
        # The inherited tests compare outputs as GPU variables.
        return basic_ops.as_gpuarray_variable(v, test_ctx_name)

    shared = staticmethod(gpuarray_shared_constructor)

    def get_ifelse(self, n):
        # Use the GPU-enabled, view-returning variant of IfElse.
        return aesara.ifelse.IfElse(n, gpu=True, as_view=True)

    def test_lifter_with_inputs_of_graph(self):
        """The ifelse lifter must work when the condition and branches
        are direct inputs of the graph (both int and float conditions)."""
        x = vector()
        cond = iscalar()
        f = aesara.function(
            [x, cond], aesara.ifelse.ifelse(cond, x.mean(), x.sum()), mode=mode_with_gpu
        )
        assert f(np.float32([1, 2, 3]), 0) == 6
        assert _check_stack_trace(f)
        x = vector()
        cond = scalar()
        f = aesara.function(
            [x, cond], aesara.ifelse.ifelse(cond, x.mean(), x.sum()), mode=mode_with_gpu
        )
        assert f(np.float32([1, 2, 3]), 0) == 6
        assert _check_stack_trace(f)

    def test_lifter_with_shared_var(self):
        """The lifter must handle a GPU shared variable mixed with a CPU
        constant branch without raising an optimization error."""
        x = lscalar("x")
        y = gpuarray_shared_constructor(
            np.asarray(1, dtype="float32"), target=test_ctx_name
        )
        z = at.constant(2.0)
        a = aesara.ifelse.ifelse(x, y, z)
        with config.change_flags(on_opt_error="raise"):
            aesara.function([x], [a], mode=mode_with_gpu)
def test_print_op():
    """Print ops must not prevent the surrounding graph from being moved
    to the GPU: transfer, print, GPU elemwise, transfer back."""
    x = fmatrix()
    fn = aesara.function([x], aesara.printing.Print()(x) * 2, mode=mode_with_gpu)
    nodes = fn.maker.fgraph.toposort()
    assert isinstance(nodes[0].op, GpuFromHost)
    assert isinstance(nodes[1].op, aesara.printing.Print)
    assert isinstance(nodes[2].op, GpuElemwise)
    assert nodes[3].op == host_from_gpu
    assert _check_stack_trace(fn)
    fn(np.random.random((5, 5)).astype("float32"))
def test_pdbbreakpoint_op():
    """`PdbBreakpoint` ops must not block GPU optimization of the
    computation that follows them."""
    # Test that PdbBreakpoint ops don't block gpu optimization
    b = fmatrix()
    # Create a function composed of a breakpoint followed by
    # some computation
    condition = gt(b.sum(), 0)
    b_monitored = PdbBreakpoint(name="TestBreakpoint")(condition, b)
    output = b_monitored**2
    f = aesara.function([b], output, mode=mode_with_gpu)
    # Ensure that, in the compiled function, the computation following the
    # breakpoint has been moved to the gpu.
    topo = f.maker.fgraph.toposort()
    assert isinstance(topo[-2].op, GpuElemwise)
    assert topo[-1].op == host_from_gpu
    assert _check_stack_trace(f)
def test_local_gpu_elemwise_careduce():
    """Check that a scalar op preceding a reduction is fused into
    `GpuCAReduceCuda` via its ``pre_scalar_op`` attribute."""
    # cuDNN is excluded so the CAReduce path is taken.
    mode_with_gpu_no_cudnn = mode_with_gpu.excluding("cudnn")
    x = matrix()

    def fn_sum_square(x, axis):
        # sum(x**2): the square should be absorbed as pre_scalar_op.
        return (x * x).sum(axis=axis)

    def fn_sum_abs(x, axis):
        return abs(x).sum(axis=axis)

    def fn_max_abs(x, axis):
        return abs(x).max(axis=axis)

    for fn, pre_scalar_op in (
        (fn_sum_square, aesara.scalar.sqr),
        (fn_sum_abs, aesara.scalar.abs_),
        (fn_max_abs, aesara.scalar.abs_),
    ):
        for axis in (None, 0, 1):
            o = fn(x, axis)
            f = aesara.function([x], o, mode=mode_with_gpu_no_cudnn)
            topo = f.maker.fgraph.toposort()
            assert len(topo) == 3
            assert isinstance(topo[1].op, GpuCAReduceCuda)
            assert topo[1].op.pre_scalar_op == pre_scalar_op
            assert _check_stack_trace(f)
            data = np.random.rand(3, 4).astype(config.floatX)
            utt.assert_allclose(fn(data, axis), f(data))
def test_local_lift_dot22scalar():
    """`Dot22Scalar` nodes must be lifted to `GpuGemm` on the GPU."""
    x = matrix()
    y = matrix()
    alpha = scalar()
    out = aesara.tensor.blas.Dot22Scalar()(x, y, alpha)
    f_cpu = aesara.function([x, y, alpha], out)
    f_gpu = aesara.function([x, y, alpha], out, mode=mode_with_gpu)
    gpu_nodes = f_gpu.maker.fgraph.apply_nodes
    # The CPU op must be gone from the GPU graph ...
    assert not any(
        isinstance(node.op, aesara.tensor.blas.Dot22Scalar) for node in gpu_nodes
    )
    # ... replaced by a GPU GEMM.
    assert any(isinstance(node.op, GpuGemm) for node in gpu_nodes)
    x_val = np.random.random((2, 3)).astype(config.floatX)
    y_val = np.random.random((3, 4)).astype(config.floatX)
    alpha_val = 0.5
    utt.assert_allclose(
        f_cpu(x_val, y_val, alpha_val), f_gpu(x_val, y_val, alpha_val)
    )
    assert _check_stack_trace(f_gpu)
def test_local_gpu_subtensor():
    """Check when `Subtensor` stays on the CPU vs. moves to the GPU.

    The heuristic: slicing stays on the CPU unless its input has multiple
    clients, in which case slicing on the GPU avoids extra transfers.
    """
    # Test shared forced on CPU.
    t = aesara.shared(np.zeros(20, "float32"))
    f = aesara.function([], t[3:4], mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert any(type(node.op) is aesara.tensor.subtensor.Subtensor for node in topo)
    assert not any(isinstance(node.op, GpuSubtensor) for node in topo)
    assert _check_stack_trace(f)
    # Test graph input.
    t = fmatrix()
    f = aesara.function([t], t[3:4], mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert any(type(node.op) is aesara.tensor.subtensor.Subtensor for node in topo)
    assert not any(isinstance(node.op, GpuSubtensor) for node in topo)
    assert _check_stack_trace(f)
    # Test multiple use of the input
    # We want the subtensor to be on the GPU to prevent multiple transfer.
    t = fmatrix()
    f = aesara.function([t], [t[3:4], t + 1], mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert not any(type(node.op) is aesara.tensor.subtensor.Subtensor for node in topo)
    assert any(isinstance(node.op, GpuSubtensor) for node in topo)
    assert _check_stack_trace(f)
    # Test multiple use of the input + input as output
    # We want the subtensor to be on the GPU to prevent multiple transfer.
    t = fmatrix()
    f = aesara.function([t], [t[3:4], t + 1, t], mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert not any(type(node.op) is aesara.tensor.subtensor.Subtensor for node in topo)
    assert any(isinstance(node.op, GpuSubtensor) for node in topo)
    assert _check_stack_trace(f)
    # Test shared forced on CPU end we do computation on the output of
    # the subtensor.
    t = aesara.shared(np.zeros(20, "float32"))
    f = aesara.function([], t[3:4] + 1, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert any(type(node.op) is aesara.tensor.subtensor.Subtensor for node in topo)
    assert not any(isinstance(node.op, GpuSubtensor) for node in topo)
    # Our optimizer isn't smart enough to move to the GPU Elemwise.
    # If it where just a little bit smarter, it could wrongly move it to the GPU.
    # If it where super smart, it would know it should not move it to the GPU.
    assert any(isinstance(node.op, aesara.tensor.elemwise.Elemwise) for node in topo)
    assert _check_stack_trace(f)
def test_local_gpu_elemwise():
    """Check that mixed-dtype elemwise graphs fuse into one `GpuElemwise`.

    The tail of the test (multiple-output composites) is dead code kept
    behind an early ``return`` until multi-output `GpuElemwise` support
    is implemented.
    """
    # Test local_gpu_elemwise when there is a dtype upcastable to float32
    a = bmatrix()
    b = fmatrix()
    c = fmatrix()
    a_v = (np.random.rand(4, 5) * 10).astype("int8")
    b_v = (np.random.rand(4, 5) * 10).astype("float32")
    c_v = (np.random.rand(4, 5) * 10).astype("float32")
    # Due to optimization order, this composite is created when all
    # the op are on the gpu.
    f = aesara.function([a, b, c], a + b + c, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, GpuElemwise) for node in topo) == 1
    assert sum(type(node.op) == aesara.tensor.elemwise.Elemwise for node in topo) == 0
    utt.assert_allclose(f(a_v, b_v, c_v), a_v + b_v + c_v)
    assert _check_stack_trace(f)
    # Now test with the composite already on the cpu before we move it
    # to the gpu
    a_s = aesara.scalar.int8()
    b_s = aesara.scalar.float32()
    c_s = aesara.scalar.float32()
    out_s = aesara.scalar.Composite([a_s, b_s, c_s], [a_s + b_s + c_s])
    out_op = aesara.tensor.elemwise.Elemwise(out_s)
    f = aesara.function([a, b, c], out_op(a, b, c), mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, GpuElemwise) for node in topo) == 1
    assert sum(type(node.op) == aesara.tensor.elemwise.Elemwise for node in topo) == 0
    utt.assert_allclose(f(a_v, b_v, c_v), a_v + b_v + c_v)
    assert _check_stack_trace(f)
    return  # Not yet implemented
    # Test multiple output
    a_s = aesara.scalar.float32()
    a = fmatrix()
    from aesara.scalar.basic import identity

    out_s = aesara.scalar.Composite(
        [a_s, b_s, c_s], [identity(a_s), identity(c_s), identity(b_s)]
    )
    outs_op = aesara.tensor.elemwise.Elemwise(out_s)
    f = aesara.function([a, b, c], outs_op(a, b, c), mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, GpuElemwise) for node in topo) == 1
    assert sum(type(node.op) == aesara.tensor.elemwise.Elemwise for node in topo) == 0
    out = f(a_v, b_v, c_v)
    utt.assert_allclose(out[0], a_v)
    utt.assert_allclose(out[1], c_v)
    utt.assert_allclose(out[2], b_v)
    assert _check_stack_trace(f)
    # Test multiple output
    out_s = aesara.scalar.Composite([a_s, b_s, c_s], [a_s + b_s, a_s * b_s])
    outs_op = aesara.tensor.elemwise.Elemwise(out_s)
    f = aesara.function([a, b, c], outs_op(a, b, c), mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, GpuElemwise) for node in topo) == 1
    assert sum(type(node.op) == aesara.tensor.elemwise.Elemwise for node in topo) == 0
    out = f(a_v, b_v, c_v)
    utt.assert_allclose(out[0], a_v + b_v)
    # FIX: the composite's second output is ``a_s * b_s``, so the
    # expected value is ``a_v * b_v`` (it was wrongly compared to
    # ``a_v * c_v``).
    utt.assert_allclose(out[1], a_v * b_v)
    assert _check_stack_trace(f)
    # Test non-contiguous input
    c = gpuarray_shared_constructor(np.asarray(c_v, dtype="float32"))
    f = aesara.function([a, b], outs_op(a[::2], b[::2], c[::2]), mode=mode_with_gpu)
    out = f(a_v, b_v)
    utt.assert_allclose(out[0], a_v[::2] + b_v[::2])
    # FIX: same composite — the second output is the product of the
    # first two inputs, i.e. ``a_v[::2] * b_v[::2]``.
    utt.assert_allclose(out[1], a_v[::2] * b_v[::2])
    assert _check_stack_trace(f)
def test_many_arg_elemwise():
    """Check that ``+`` and ``*`` elemwise ops handle extremely large
    numbers of arguments on the GPU.

    The GPU graph is compared against the CPU graph, and the test makes
    sure both the "fits in one GpuElemwise" and the "exceeds the input
    limit" cases are exercised.
    """
    rng = np.random.default_rng([1, 2, 3])
    nb_of_inputs_overflows = []
    for num_args in [64]:
        for op_to_test in [aesara.tensor.add, aesara.tensor.mul]:
            for nb_dim in [2, 8]:
                shapes = [rng.integers(1, 5) for i in range(nb_dim)]
                # NOTE: ``np.cast`` was removed in NumPy 2.0; use
                # ``.astype`` to build the float32 inputs instead.
                args = [
                    rng.standard_normal(shapes).astype("float32")
                    for arg in range(0, num_args)
                ]
                symb_args = [
                    TensorType("float32", (False,) * nb_dim)()
                    for arg in range(0, num_args)
                ]
                outputs = []
                for mode in [mode_with_gpu, mode_without_gpu]:
                    # test the optimization local_gpua_elemwise
                    output = op_to_test(*symb_args)
                    f = aesara.function(symb_args, output, mode=mode)
                    outputs.append(f(*args))
                    # assert that the test was done on the gpu.
                    if mode is mode_with_gpu:
                        # Record by how much the argument count exceeds
                        # (negative) or stays under (non-negative) the
                        # GpuElemwise input limit.
                        nb_of_inputs_overflows.append(
                            max_inputs_to_GpuElemwise(output.owner) - num_args
                        )
                        nodelst = [node for node in f.maker.fgraph.apply_nodes]
                        assert any(isinstance(node.op, GpuElemwise) for node in nodelst)
                        assert not any(
                            isinstance(node.op, Elemwise)
                            for node in nodelst
                            if not isinstance(node.op, GpuElemwise)
                        )
                results_gpu, results_cpu = outputs
                utt.assert_allclose(results_gpu, results_cpu)
    # Make sure we test at least one case with no number of inputs overflow
    assert any(overflow >= 0 for overflow in nb_of_inputs_overflows)
    # Make sure we test at least one case with number of inputs overflow
    assert any(overflow < 0 for overflow in nb_of_inputs_overflows)
def test_not_useless_scalar_gpuelemwise():
    """A scalar elemwise whose result is consumed on the CPU must not be
    moved to the GPU (the transfer would be pure overhead)."""
    # We don't want to move elemwise on scalar on the GPU when the
    # result will not be used on the GPU!
    with config.change_flags(warn_float64="ignore"):
        X = fmatrix()
        x = np.random.standard_normal((32, 32)).astype(np.float32)
        m1 = aesara.shared(np.random.standard_normal((32, 32)).astype(np.float32))
        loss = (X - dot(X, m1)).norm(L=2)
        lr = aesara.shared(np.asarray(0.001, dtype=np.float32))
        grad = aesara.grad(loss, m1)
        train = aesara.function(
            inputs=[X], updates=[(m1, m1 - lr * grad)], mode=mode_with_gpu
        )
        train(x)
        topo = train.maker.fgraph.toposort()
        gemms = [app for app in topo if isinstance(app.op, GpuGemm)]
        assert len(gemms) == 2
        # The scalar `lr * grad` scaling must remain a CPU Elemwise
        # feeding the second GEMM.
        assert isinstance(gemms[1].inputs[1].owner.op, aesara.tensor.elemwise.Elemwise)
def test_local_lift_abstractconv_gpu_shape():
    """The grad-wrt-weights abstract conv must compile under GPU mode
    even when its shape argument is a symbolic vector."""
    with config.change_flags(on_opt_error="raise"):
        shape_vec = ivector()
        img = ftensor4()
        topgrad = ftensor4()
        conv_out = aesara.tensor.nnet.abstract_conv.AbstractConv2d_gradWeights()(
            img, topgrad, shape_vec
        )
        fn = aesara.function([shape_vec, img, topgrad], conv_out, mode=mode_with_gpu)
        assert _check_stack_trace(fn)
def test_local_assert_no_cpu_op():
    """Check the ``assert_no_cpu_op`` flag: raising when a CPU op remains
    in the graph, and compiling fine when the flag is ignored."""
    rng = np.random.default_rng(utt.fetch_seed())
    m = rng.uniform(-1, 1, (10, 10)).astype("float32")
    ms = gpuarray_shared_constructor(m, name="m_shared")
    out = tanh(ms).dot(ms.T)
    # Excluding the elemwise lifter forces `tanh` to stay on the CPU.
    mode_local_assert = mode_with_gpu.including("assert_no_cpu_op")
    mode_local_assert = mode_local_assert.excluding("local_gpua_elemwise")
    with config.change_flags(assert_no_cpu_op="raise", on_opt_error="ignore"):
        with pytest.raises(AssertionError):
            aesara.function([], out, mode=mode_local_assert)
    with config.change_flags(assert_no_cpu_op="ignore"):
        f = aesara.function([], out, mode=mode_local_assert)
        assert _check_stack_trace(f)
def test_no_complex():
    """Complex-typed intermediates must not break GPU compilation: this
    graph mixes a complex64 scalar with float32 scalars."""
    width = cscalar()
    freq = fscalar()
    signal = fscalar()
    stft_out = exp(width * freq) * signal
    fn = aesara.function([width, freq, signal], stft_out, mode=mode_with_gpu)
    assert _check_stack_trace(fn)
@utt.assertFailure_fast
@pytest.mark.skipif(not cusolver_available, reason="No cuSolver or SciPy")
def test_local_lift_solve():
    """Check that `Solve` is lifted to an inplace `GpuCusolverSolve` and
    that the GPU result matches the CPU reference."""
    A = fmatrix()
    b = fmatrix()
    o = slinalg.solve(A, b)
    # FIX: pass the mode by keyword (it was passed positionally), for
    # consistency with every other compilation in this file.
    f_cpu = aesara.function([A, b], o, mode=mode_without_gpu)
    f_gpu = aesara.function([A, b], o, mode=mode_with_gpu)
    # The CPU op must be gone from the GPU graph ...
    assert not any(
        isinstance(n.op, slinalg.Solve) for n in f_gpu.maker.fgraph.apply_nodes
    )
    # ... replaced by an inplace cuSolver solve (the input is not reused
    # elsewhere, so the inplace variant is expected).
    assert any(
        isinstance(n.op, GpuCusolverSolve) and n.op.inplace
        for n in f_gpu.maker.fgraph.apply_nodes
    )
    A_val = np.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
    b_val = np.random.uniform(-0.4, 0.4, (5, 3)).astype("float32")
    utt.assert_allclose(f_cpu(A_val, b_val), f_gpu(A_val, b_val))
    assert _check_stack_trace(f_gpu)
@pytest.mark.skipif(not cusolver_available, reason="No cuSolver or SciPy")
def test_gpu_solve_not_inplace():
    """When the solve input is reused by another op (`dot`), the lifted
    `GpuCusolverSolve` must NOT be inplace."""
    A = fmatrix()
    b = fmatrix()
    s = slinalg.solve(A, b)
    # Reusing `A` here forbids the inplace variant.
    o = dot(A, s)
    # FIX: pass the mode by keyword (it was passed positionally), for
    # consistency with every other compilation in this file.
    f_cpu = aesara.function([A, b], o, mode=mode_without_gpu)
    f_gpu = aesara.function([A, b], o, mode=mode_with_gpu)
    count_not_inplace = len(
        [
            n.op
            for n in f_gpu.maker.fgraph.apply_nodes
            if isinstance(n.op, GpuCusolverSolve) and not n.op.inplace
        ]
    )
    assert count_not_inplace == 1, count_not_inplace
    A_val = np.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
    b_val = np.random.uniform(-0.4, 0.4, (5, 3)).astype("float32")
    utt.assert_allclose(f_cpu(A_val, b_val), f_gpu(A_val, b_val))
@utt.assertFailure_fast
@pytest.mark.skipif(not cusolver_available, reason="No cuSolver or SciPy")
def test_local_lift_cholesky():
    """Check that `Cholesky` is lifted to an inplace `GpuCholesky` and
    that the GPU result matches the CPU reference."""
    A = fmatrix()
    o = slinalg.cholesky(A)
    f_cpu = aesara.function([A], o, mode=mode_without_gpu)
    f_gpu = aesara.function([A], o, mode=mode_with_gpu)
    assert not any(
        isinstance(n.op, slinalg.Cholesky) for n in f_gpu.maker.fgraph.apply_nodes
    )
    # GpuCholesky op in this graph should be inplace (as his input is not reused by other op).
    assert any(
        isinstance(n.op, GpuCholesky) and n.op.inplace
        for n in f_gpu.maker.fgraph.apply_nodes
    )
    M_val = np.random.normal(size=(3, 3)).astype("float32")
    # A = M.dot(M) will be positive definite for all non-singular M
    A_val = M_val.dot(M_val.T)
    utt.assert_allclose(f_cpu(A_val), f_gpu(A_val))
@pytest.mark.skipif(not cusolver_available, reason="No cuSolver or SciPy")
def test_gpu_cholesky_not_inplace():
    """When the Cholesky input is reused by another op, the lifted
    `GpuCholesky` must stay out-of-place."""
    A = fmatrix()
    A_squared = A**2
    # `A_squared` is reused below, so inplace factorization is forbidden.
    B = slinalg.cholesky(A_squared)
    D = B + A_squared
    f_cpu = aesara.function([A], D, mode=mode_without_gpu)
    f_gpu = aesara.function([A], D, mode=mode_with_gpu)
    not_inplace_nodes = [
        node
        for node in f_gpu.maker.fgraph.apply_nodes
        if isinstance(node.op, GpuCholesky) and not node.op.inplace
    ]
    assert len(not_inplace_nodes) == 1, len(not_inplace_nodes)
    M_val = np.random.normal(size=(3, 3)).astype("float32")
    # M.dot(M.T) is positive definite for any non-singular M.
    A_val = M_val.dot(M_val.T)
    utt.assert_allclose(f_cpu(A_val), f_gpu(A_val))
def test_local_gpua_advanced_incsubtensor():
    """Regression test for the advanced inc_subtensor lifter (gh-5589)."""
    # test a corner case reported at gh-5589
    target = ftensor4()
    y = target.dimshuffle(1, 0, 2, 3).flatten(ndim=1)
    w = at.ones_like(y)
    w = aesara.tensor.subtensor.set_subtensor(w[eq(y, 1.0).nonzero()], 100)
    w = aesara.tensor.subtensor.set_subtensor(w[eq(y, -1.0).nonzero()], 0)
    # NOTE(review): compiled with the default mode, unlike sibling tests
    # which pass mode=mode_with_gpu — confirm this is intentional.
    f = aesara.function([target], w)
    assert _check_stack_trace(f)
def test_batched_dot_lifter():
    """Check the `batched_dot` GPU lifter over mixed ranks and dtypes."""
    # The CPU Op accepts 2D and 3D inputs, as well as mixed dtypes.
    # Make sure the lifter adds the appropriate dimshuffles and casts
    rng = np.random.default_rng(utt.fetch_seed())

    def randX(*args):
        # Random array of the given shape in the configured float dtype.
        return rng.random(args).astype(config.floatX)

    cases = [
        (randX(3, 5, 7), randX(3, 7)),
        (randX(3, 5), randX(3, 5, 7)),
        (randX(3, 5), randX(3, 5)),
        (rng.random((3, 5, 7)).astype("float32"), randX(3, 7, 9)),
        (rng.random((3, 5, 7)).astype("float64"), randX(3, 7, 9)),
    ]
    for x_val, y_val in cases:
        x = TensorType(broadcastable=[s == 1 for s in x_val.shape], dtype=x_val.dtype)(
            "x"
        )
        y = TensorType(broadcastable=[s == 1 for s in y_val.shape], dtype=y_val.dtype)(
            "y"
        )
        z = batched_dot(x, y)
        f = aesara.function([x, y], z, mode=mode_with_gpu)
        f(x_val, y_val)
        # NOTE(review): this uses `check_stack_trace` (with ops_to_check)
        # rather than the `_check_stack_trace` helper used elsewhere in
        # this file — confirm both names are in scope and intended.
        assert check_stack_trace(f, ops_to_check="all")
def test_crossentropycategorical1hot_lifter():
    """The categorical cross-entropy op and its gradient must both be
    replaced by GPU equivalents after compilation."""
    rng = np.random.default_rng(utt.fetch_seed())
    probs = matrix()
    labels = lvector()
    loss = aesara.tensor.nnet.crossentropy_categorical_1hot(probs, labels)
    grad_probs = aesara.grad(loss.mean(), probs)
    fn = aesara.function([probs, labels], [loss, grad_probs], mode=mode_with_gpu)
    cpu_op_classes = (
        aesara.tensor.nnet.CrossentropyCategorical1Hot,
        aesara.tensor.nnet.CrossentropyCategorical1HotGrad,
    )
    assert not any(
        isinstance(node.op, cpu_op_classes)
        for node in fn.maker.fgraph.apply_nodes
    )
    fn(
        rng.uniform(0.1, 0.9, (13, 5)).astype(config.floatX),
        rng.integers(5, size=(13,)),
    )
class TestConv_opt:
    def optimizer_2d(
        self,
        input_shapes,
        direction,
        include_tags,
        exclude_tags,
        op,
        border_mode="valid",
        subsample=(1, 1),
        filter_dilation=(1, 1),
        num_groups=1,
        unshared=False,
        optimiser=None,
    ):
        """Build a 2d abstract convolution and check the meta optimizer.

        Parameters
        ----------
        input_shapes
            Shapes of the two shared inputs; ``input_shapes[2]`` is the
            extra shape fed to the gradient directions.
        direction
            0 = forward conv, 1 = grad wrt weights, 2 = grad wrt inputs.
        include_tags, exclude_tags
            Values for ``metaopt__optimizer_including`` /
            ``metaopt__optimizer_excluding``.
        op
            Op class expected in the optimized graph, or ``None`` to assert
            that ``optimiser.transform`` performs no replacement.
        optimiser
            Local optimizer probed directly when ``op`` is ``None``.
        """
        inp1 = aesara.shared(np.random.random(input_shapes[0]).astype(config.floatX))
        inp2 = aesara.shared(np.random.random(input_shapes[1]).astype(config.floatX))
        if op is None:
            # Probe the optimizer directly, so place inputs on the GPU.
            inp1 = basic_ops.as_gpuarray_variable(inp1, test_ctx_name)
            inp2 = basic_ops.as_gpuarray_variable(inp2, test_ctx_name)
        if direction == 0:
            conv_op = abstract_conv.AbstractConv2d(
                input_shapes[0],
                input_shapes[1],
                border_mode=border_mode,
                subsample=subsample,
                filter_dilation=filter_dilation,
                num_groups=num_groups,
                unshared=unshared,
            )(inp1, inp2)
        if direction == 1:
            conv_op = abstract_conv.AbstractConv2d_gradWeights(
                imshp=input_shapes[0],
                kshp=input_shapes[2],
                border_mode=border_mode,
                subsample=subsample,
                filter_dilation=filter_dilation,
                num_groups=num_groups,
                unshared=unshared,
            )(inp1, inp2, input_shapes[2][-2:])
        if direction == 2:
            conv_op = abstract_conv.AbstractConv2d_gradInputs(
                imshp=input_shapes[2],
                kshp=input_shapes[1],
                border_mode=border_mode,
                subsample=subsample,
                filter_dilation=filter_dilation,
                num_groups=num_groups,
                unshared=unshared,
            )(inp2, inp1, input_shapes[2][-2:])
        with config.change_flags(
            metaopt__optimizer_including=include_tags,
            metaopt__optimizer_excluding=exclude_tags,
        ):
            mode = (
                mode_with_gpu.including("conv_meta")
                .excluding("conv_dnn")
                .excluding("conv_gemm")
            )
            # All meta optimizer compile a new function. This need to know
            # the current linker, but this information is not available,
            # so it use the default mode.
            if op is None:
                # No convolutions optimization takes place
                assert optimiser.transform(None, conv_op.owner) is None
            else:
                ref_func = aesara.function([], conv_op, mode=mode_with_gpu)
                with config.change_flags(mode=mode):
                    conv_func = aesara.function([], conv_op, mode=mode)
                    assert any(
                        [
                            isinstance(node.op, op)
                            for node in conv_func.maker.fgraph.toposort()
                        ]
                    )
                    utt.assert_allclose(conv_func(), ref_func())
    def optimizer_3d(
        self,
        input_shapes,
        direction,
        include_tags,
        exclude_tags,
        op,
        border_mode="valid",
        subsample=(1, 1, 1),
        filter_dilation=(1, 1, 1),
        num_groups=1,
        optimiser=None,
    ):
        """3d analogue of `optimizer_2d`.

        ``op`` may also be the string ``"conv3d2d"``, in which case the
        conv3d2d path is compiled and only checked for numerical agreement
        with the reference function.
        """
        inp1 = aesara.shared(np.random.random(input_shapes[0]).astype(config.floatX))
        inp2 = aesara.shared(np.random.random(input_shapes[1]).astype(config.floatX))
        if op is None:
            # Probe the optimizer directly, so place inputs on the GPU.
            inp1 = basic_ops.as_gpuarray_variable(inp1, None)
            inp2 = basic_ops.as_gpuarray_variable(inp2, None)
        if direction == 0:
            conv_op = abstract_conv.AbstractConv3d(
                input_shapes[0],
                input_shapes[1],
                border_mode=border_mode,
                subsample=subsample,
                filter_dilation=filter_dilation,
                num_groups=num_groups,
            )(inp1, inp2)
        if direction == 1:
            conv_op = abstract_conv.AbstractConv3d_gradWeights(
                input_shapes[0],
                input_shapes[2],
                border_mode=border_mode,
                subsample=subsample,
                filter_dilation=filter_dilation,
                num_groups=num_groups,
            )(inp1, inp2, input_shapes[2][-3:])
        if direction == 2:
            conv_op = abstract_conv.AbstractConv3d_gradInputs(
                input_shapes[2],
                input_shapes[1],
                border_mode=border_mode,
                subsample=subsample,
                filter_dilation=filter_dilation,
                num_groups=num_groups,
            )(inp2, inp1, input_shapes[2][-3:])
        with config.change_flags(
            metaopt__optimizer_including=include_tags,
            metaopt__optimizer_excluding=exclude_tags,
        ):
            mode = (
                mode_with_gpu.including("conv_meta")
                .excluding("conv_dnn")
                .excluding("conv_gemm")
            )
            # All meta optimizer compile a new function. This need to know
            # the current linker, but this information is not available,
            # so it use the default mode.
            if op is None:
                # No convolutions optimization takes place
                assert optimiser.transform(None, conv_op.owner) is None
                return
            elif op != "conv3d2d":
                with config.change_flags(mode=mode):
                    conv_func = aesara.function([], conv_op, mode=mode)
                assert any(
                    [
                        isinstance(node.op, op)
                        for node in conv_func.maker.fgraph.toposort()
                    ]
                )
            else:
                with config.change_flags(mode=mode):
                    conv_func = aesara.function(
                        [], conv_op, mode=mode_with_gpu.including("conv_meta")
                    )
            ref_func = aesara.function([], conv_op, mode=mode_with_gpu)
            utt.assert_allclose(conv_func(), ref_func())
    @pytest.mark.skipif(config.cxx == "", reason="Need a c compiler.")
    def test_optimizers_2d(self):
        """Check the 2d meta-optimizer choice (GEMM vs cuDNN, default vs
        alternative) for all three conv directions."""
        imshp2d = [(2, 3, 5, 5), (2, 2, 5, 7), (2, 1, 3, 3)]
        kshp2d = [(4, 3, 3, 3), (3, 2, 3, 5), (4, 1, 1, 1)]
        tshp2d = [(2, 4, 3, 3), (2, 3, 3, 3), (2, 4, 3, 3)]
        for imshp, kshp, tshp in zip(imshp2d, kshp2d, tshp2d):
            # forward passes
            self.optimizer_2d(
                [imshp, kshp, tshp], 0, "", "conv_dnn:alternative", blas.GpuCorrMM
            )
            self.optimizer_2d(
                [imshp, kshp, tshp],
                0,
                "alternative",
                "conv_dnn:default",
                blas.GpuCorrMM_gradWeights,
            )
            self.optimizer_2d(
                [imshp, kshp, tshp], 0, "", "conv_gemm:alternative", dnn.GpuDnnConv
            )
            self.optimizer_2d(
                [imshp, kshp, tshp],
                0,
                "alternative",
                "conv_gemm:default",
                dnn.GpuDnnConvGradW,
            )
            # backwards wrt weights
            self.optimizer_2d(
                [imshp, tshp, kshp],
                1,
                "",
                "conv_dnn:alternative",
                blas.GpuCorrMM_gradWeights,
            )
            self.optimizer_2d(
                [imshp, tshp, kshp],
                1,
                "alternative",
                "conv_dnn:default",
                blas.GpuCorrMM,
            )
            self.optimizer_2d(
                [imshp, tshp, kshp], 1, "", "conv_gemm:alternative", dnn.GpuDnnConvGradW
            )
            self.optimizer_2d(
                [imshp, tshp, kshp],
                1,
                "alternative",
                "conv_gemm:default",
                dnn.GpuDnnConv,
            )
            # backwards wrt to inputs
            self.optimizer_2d(
                [tshp, kshp, imshp],
                2,
                "",
                "conv_dnn:alternative",
                blas.GpuCorrMM_gradInputs,
            )
            self.optimizer_2d(
                [tshp, kshp, imshp],
                2,
                "alternative",
                "conv_dnn:default",
                blas.GpuCorrMM,
            )
            self.optimizer_2d(
                [tshp, kshp, imshp], 2, "", "conv_gemm:alternative", dnn.GpuDnnConvGradI
            )
            self.optimizer_2d(
                [tshp, kshp, imshp],
                2,
                "alternative",
                "conv_gemm:default",
                dnn.GpuDnnConv,
            )
    @pytest.mark.skipif(config.cxx == "", reason="Need a c compiler.")
    def test_optimizers_3d(self):
        """Check the 3d meta-optimizer choice (GEMM vs cuDNN vs conv3d2d,
        default vs alternative) for all three conv directions."""
        imshp3d = [(2, 3, 5, 5, 5), (2, 2, 5, 7, 5), (2, 1, 3, 3, 3)]
        kshp3d = [(4, 3, 3, 3, 3), (3, 2, 3, 5, 3), (4, 1, 1, 1, 1)]
        tshp3d = [(2, 4, 3, 3, 3), (2, 3, 3, 3, 3), (2, 4, 3, 3, 3)]
        for imshp, kshp, tshp in zip(imshp3d, kshp3d, tshp3d):
            # forwards passes
            self.optimizer_3d(
                [imshp, kshp, tshp],
                0,
                "",
                "conv_dnn:alternative:conv3d2d",
                blas.GpuCorr3dMM,
            )
            self.optimizer_3d(
                [imshp, kshp, tshp],
                0,
                "alternative",
                "conv_dnn:default:conv3d2d",
                blas.GpuCorr3dMM_gradWeights,
            )
            self.optimizer_3d([imshp, kshp, tshp], 0, "conv3d2d", "default", "conv3d2d")
            self.optimizer_3d(
                [imshp, kshp, tshp],
                0,
                "alternative",
                "conv_gemm:default:conv3d2d",
                dnn.GpuDnnConvGradW,
            )
            self.optimizer_3d(
                [imshp, kshp, tshp],
                0,
                "",
                "conv_gemm:alternative:conv3d2d",
                dnn.GpuDnnConv,
            )
            # backward pass wrt weight
            self.optimizer_3d(
                [imshp, tshp, kshp],
                1,
                "",
                "conv_dnn:alternative",
                blas.GpuCorr3dMM_gradWeights,
            )
            self.optimizer_3d(
                [imshp, tshp, kshp],
                1,
                "alternative",
                "conv_dnn:default",
                blas.GpuCorr3dMM,
            )
            self.optimizer_3d(
                [imshp, tshp, kshp],
                1,
                "alternative",
                "conv_gemm:default",
                dnn.GpuDnnConv,
            )
            self.optimizer_3d(
                [imshp, tshp, kshp], 1, "", "conv_gemm:alternative", dnn.GpuDnnConvGradW
            )
            # backward pass wrt inputs
            self.optimizer_3d(
                [tshp, kshp, imshp],
                2,
                "",
                "conv_dnn:alternative",
                blas.GpuCorr3dMM_gradInputs,
            )
            self.optimizer_3d(
                [tshp, kshp, imshp],
                2,
                "alternative",
                "conv_dnn:default",
                blas.GpuCorr3dMM,
            )
            self.optimizer_3d(
                [tshp, kshp, imshp],
                2,
                "alternative",
                "conv_gemm:default",
                dnn.GpuDnnConv,
            )
            self.optimizer_3d(
                [tshp, kshp, imshp], 2, "", "conv_gemm:alternative", dnn.GpuDnnConvGradI
            )
    @pytest.mark.skipif(config.cxx == "", reason="Need a c compiler.")
    def test_optimizers_non_default(self):
        """Check the meta-optimizer on non-default configurations:
        ``border_mode="full"`` with dilation, grouped convolutions, and
        unshared 2d convolutions."""
        # conv2d forward pass with Non-default border_mode and filter_dilation
        imshp2d = [(2, 3, 5, 5), (4, 2, 5, 5)]
        kshp2d = [(4, 3, 3, 3), (3, 2, 3, 3)]
        filter_dilation = [(1, 1), (2, 2)]
        for imshp, kshp, fdil in zip(imshp2d, kshp2d, filter_dilation):
            self.optimizer_2d(
                [imshp, kshp],
                0,
                "",
                "conv_dnn:alternative",
                blas.GpuCorrMM,
                border_mode="full",
                filter_dilation=fdil,
            )
            self.optimizer_2d(
                [imshp, kshp],
                0,
                "alternative",
                "conv_dnn:default",
                blas.GpuCorrMM_gradInputs,
                border_mode="full",
                filter_dilation=fdil,
            )
            self.optimizer_2d(
                [imshp, kshp],
                0,
                "",
                "conv_gemm:alternative",
                dnn.GpuDnnConv,
                border_mode="full",
                filter_dilation=fdil,
            )
            self.optimizer_2d(
                [imshp, kshp],
                0,
                "alternative",
                "conv_gemm:default",
                dnn.GpuDnnConvGradI,
                border_mode="full",
                filter_dilation=fdil,
            )
        # conv3d forward pass with Non-default border_mode and filter_dilation
        imshp3d = [(2, 3, 5, 5, 5), (4, 2, 5, 5, 5)]
        kshp3d = [(4, 3, 3, 3, 3), (3, 2, 3, 3, 3)]
        filter_dilation = [(1, 1, 1), (2, 2, 2)]
        for imshp, kshp, fdil in zip(imshp3d, kshp3d, filter_dilation):
            self.optimizer_3d(
                [imshp, kshp],
                0,
                "",
                "conv_dnn:alternative:conv3d2d",
                blas.GpuCorr3dMM,
                border_mode="full",
                filter_dilation=fdil,
            )
            self.optimizer_3d(
                [imshp, kshp],
                0,
                "alternative",
                "conv_dnn:default:conv3d2d",
                blas.GpuCorr3dMM_gradInputs,
                border_mode="full",
                filter_dilation=fdil,
            )
            self.optimizer_3d(
                [imshp, kshp],
                0,
                "",
                "conv_gemm:alternative:conv3d2d",
                dnn.GpuDnnConv,
                border_mode="full",
                filter_dilation=fdil,
            )
            self.optimizer_3d(
                [imshp, kshp],
                0,
                "alternative",
                "conv_gemm:default:conv3d2d",
                dnn.GpuDnnConvGradI,
                border_mode="full",
                filter_dilation=fdil,
            )
        # test non default num_groups for default optimizers
        imshp2d = [(2, 6, 5, 5), (2, 4, 5, 5)]
        kshp2d = [(3, 2, 3, 3), (2, 2, 3, 3)]
        tshp2d = [(2, 3, 3, 3), (2, 2, 3, 3)]
        num_groups = [3, 2]
        for imshp, kshp, tshp, groups in zip(imshp2d, kshp2d, tshp2d, num_groups):
            # forward pass
            self.optimizer_2d(
                [imshp, kshp, tshp],
                0,
                "",
                "conv_dnn:alternative",
                blas.GpuCorrMM,
                num_groups=groups,
            )
            self.optimizer_2d(
                [imshp, kshp, tshp],
                0,
                "",
                "conv_gemm:alternative",
                dnn.GpuDnnConv,
                num_groups=groups,
            )
            # grad with respect to weights
            self.optimizer_2d(
                [imshp, tshp, kshp],
                1,
                "",
                "conv_dnn:alternative",
                blas.GpuCorrMM_gradWeights,
                num_groups=groups,
            )
            self.optimizer_2d(
                [imshp, tshp, kshp],
                1,
                "",
                "conv_gemm:alternative",
                dnn.GpuDnnConvGradW,
                num_groups=groups,
            )
            # grad with respect to inputs
            self.optimizer_2d(
                [tshp, kshp, imshp],
                2,
                "",
                "conv_dnn:alternative",
                blas.GpuCorrMM_gradInputs,
                num_groups=groups,
            )
            self.optimizer_2d(
                [tshp, kshp, imshp],
                2,
                "",
                "conv_gemm:alternative",
                dnn.GpuDnnConvGradI,
                num_groups=groups,
            )
        # test unshared for default optimizers
        imshp2d = [(2, 2, 4, 4), (3, 2, 5, 3)]
        kshp2d = [(2, 2, 2, 2, 3, 3), (2, 3, 1, 2, 3, 3)]
        tshp2d = [(2, 2, 2, 2), (3, 2, 3, 1)]
        # NOTE(review): `groups` is never used inside this unshared loop;
        # zipping with `num_groups` here only limits the iteration count.
        for imshp, kshp, tshp, groups in zip(imshp2d, kshp2d, tshp2d, num_groups):
            # forward pass
            self.optimizer_2d(
                [imshp, kshp, tshp], 0, "", "alternative", blas.GpuCorrMM, unshared=True
            )
            # grad with respect to weights
            self.optimizer_2d(
                [imshp, tshp, kshp],
                1,
                "",
                "alternative",
                blas.GpuCorrMM_gradWeights,
                unshared=True,
            )
            # grad with respect to inputs
            self.optimizer_2d(
                [tshp, kshp, imshp],
                2,
                "",
                "alternative",
                blas.GpuCorrMM_gradInputs,
                unshared=True,
            )
        imshp3d = [(2, 6, 5, 5, 5), (2, 4, 5, 5, 5)]
        kshp3d = [(3, 2, 3, 3, 3), (2, 2, 3, 3, 3)]
        tshp3d = [(2, 3, 3, 3, 3), (2, 2, 3, 3, 3)]
        num_groups = [3, 2]
        for imshp, kshp, tshp, groups in zip(imshp3d, kshp3d, tshp3d, num_groups):
            # forward pass
            self.optimizer_3d(
                [imshp, kshp, tshp],
                0,
                "",
                "conv_dnn:alternative:conv3d2d",
                blas.GpuCorr3dMM,
                num_groups=groups,
            )
            self.optimizer_3d(
                [imshp, kshp, tshp],
                0,
                "",
                "conv_gemm:alternative:conv3d2d",
                dnn.GpuDnnConv,
                num_groups=groups,
            )
            # grad with respect to weights
            self.optimizer_3d(
                [imshp, tshp, kshp],
                1,
                "",
                "conv_dnn:alternative:conv3d2d",
                blas.GpuCorr3dMM_gradWeights,
                num_groups=groups,
            )
            self.optimizer_3d(
                [imshp, tshp, kshp],
                1,
                "",
                "conv_gemm:alternative:conv3d2d",
                dnn.GpuDnnConvGradW,
                num_groups=groups,
            )
            # grad with respect to inputs
            self.optimizer_3d(
                [tshp, kshp, imshp],
                2,
                "",
                "conv_dnn:alternative:conv3d2d",
                blas.GpuCorr3dMM_gradInputs,
                num_groups=groups,
            )
            self.optimizer_3d(
                [tshp, kshp, imshp],
                2,
                "",
                "conv_gemm:alternative:conv3d2d",
                dnn.GpuDnnConvGradI,
                num_groups=groups,
            )
    @pytest.mark.skipif(config.cxx == "", reason="Need a c compiler.")
    def test_returns_none_2d(self):
        """Check that the alternative 2d conv optimizers decline (return
        ``None``) on configurations they do not support: non-default
        subsample, num_groups, half padding, dilation (direction 1),
        and unshared convolutions."""
        # values given don't matter since it returns None
        imshp = (2, 3, 5, 5)
        kshp = (4, 3, 3, 3)
        tshp = (2, 4, 3, 3)
        conv_direction = [0, 1, 2]
        optimisers = [
            [opt.local_abstractconv_gemm_alt, opt.local_abstractconv_cudnn_alt],
            [
                opt.local_abstractconv_gemm_gradweights_alt,
                opt.local_abstractconv_cudnn_alt,
            ],
            [
                opt.local_abstractconv_gradinputs_gemm_alt,
                opt.local_abstractconv_cudnn_alt,
            ],
        ]
        # test that non default subsample returns None
        for opt_direction, direction in zip(optimisers, conv_direction):
            for optimiser in opt_direction:
                self.optimizer_2d(
                    [imshp, kshp, tshp],
                    direction,
                    "",
                    "",
                    None,
                    subsample=(2, 2),
                    optimiser=optimiser,
                )
        # test that non default num_groups returns None
        for opt_direction, direction in zip(optimisers, conv_direction):
            for optimiser in opt_direction:
                self.optimizer_2d(
                    [imshp, kshp, tshp],
                    direction,
                    "",
                    "",
                    None,
                    num_groups=3,
                    optimiser=optimiser,
                )
        # test that border_mode=half returns None
        for opt_direction, direction in zip(optimisers, conv_direction):
            for optimiser in opt_direction:
                self.optimizer_2d(
                    [imshp, kshp, tshp],
                    direction,
                    "",
                    "",
                    None,
                    border_mode="half",
                    optimiser=optimiser,
                )
        # test that Non-default filter dilation return None for
        # direction 1
        for optimiser in optimisers[1]:
            self.optimizer_2d(
                [imshp, kshp, tshp],
                1,
                "",
                "",
                None,
                filter_dilation=(2, 2),
                optimiser=optimiser,
            )
        imshp = (2, 2, 4, 4)
        kshp = (2, 2, 2, 2, 3, 3)
        tshp = (2, 2, 2, 2)
        shape_perms = [[imshp, kshp, tshp], [imshp, tshp, kshp], [tshp, kshp, imshp]]
        # test unshared convolution returns None
        for opt_direction, direction, perms in zip(
            optimisers, conv_direction, shape_perms
        ):
            for optimiser in opt_direction:
                self.optimizer_2d(
                    perms, direction, "", "", None, unshared=True, optimiser=optimiser
                )
@pytest.mark.skipif(config.cxx == "", reason="Need a c compiler.")
def test_returns_none_3d(self):
imshp = (2, 3, 5, 5, 5)
kshp = (4, 3, 3, 3, 3)
tshp = (2, 4, 3, 3, 3)
conv_direction = [0, 1, 2]
optimisers = [
[opt.local_abstractconv3d_alt, opt.local_abstractconv3d_cudnn_alt],
[
opt.local_abstractconv3d_gemm_gradweights_alt,
opt.local_abstractconv3d_cudnn_alt,
],
[
opt.local_abstractconv3d_gradinputs_gemm_alt,
opt.local_abstractconv3d_cudnn_alt,
],
]
# test that non default subsample returns None
for opt_direction, direction in zip(optimisers, conv_direction):
for optimiser in opt_direction:
self.optimizer_3d(
[imshp, kshp, tshp],
direction,
"",
"",
None,
subsample=(2, 2, 2),
optimiser=optimiser,
)
# test that non default num_groups returns None
for opt_direction, direction in zip(optimisers, conv_direction):
for optimiser in opt_direction:
self.optimizer_3d(
[imshp, kshp, tshp],
direction,
"",
"",
None,
num_groups=3,
optimiser=optimiser,
)
# test that border_mode=half returns None
for opt_direction, direction in zip(optimisers, conv_direction):
for optimiser in opt_direction:
self.optimizer_3d(
[imshp, kshp, tshp],
direction,
"",
"",
None,
border_mode="half",
optimiser=optimiser,
)
# test that Non-default filter dilation return None for
# direction 1
for optimiser in optimisers[1]:
self.optimizer_3d(
[imshp, kshp, tshp],
1,
"",
"",
None,
filter_dilation=(2, 2, 2),
optimiser=optimiser,
)
import numpy as np
import pytest
pygpu = pytest.importorskip("pygpu")
from aesara.gpuarray.basic_ops import GpuFromHost, HostFromGpu
from aesara.gpuarray.type import (
GpuArraySharedVariable,
GpuArrayType,
get_context,
gpuarray_shared_constructor,
)
from aesara.misc.pkl_utils import dump, load
from tests.gpuarray.config import mode_with_gpu, test_ctx_name
from tests.misc.test_may_share_memory import may_share_memory_core
from tests.tensor import test_opt
class TestFusion(test_opt.TestFusion):
    """Run the CPU elemwise-fusion test suite on the GPU backend."""

    # GPU compilation mode with the cuDNN reduction rewrite disabled.
    mode = mode_with_gpu.excluding("local_dnn_reduction")
    # Shared-variable constructor that places values on the GPU.
    _shared = staticmethod(gpuarray_shared_constructor)
    # Host<->GPU transfer ops are ignored when checking graph topology.
    topo_exclude = (GpuFromHost, HostFromGpu)
def test_may_share_memory():
    """Run the generic may_share_memory checks on two GPU buffers."""
    context = get_context(test_ctx_name)
    first = pygpu.empty((5, 4), context=context)
    second = pygpu.empty((5, 4), context=context)
    may_share_memory_core(first, second)
def test_dump_load():
    """A GPU shared variable must survive a dump/load round trip."""
    shared_var = GpuArraySharedVariable(
        "x",
        GpuArrayType("float32", (1, 1), name="x", context_name=test_ctx_name),
        [[1]],
        False,
    )

    with open("test", "wb") as f:
        dump(shared_var, f)

    with open("test", "rb") as f:
        restored = load(f)

    # Name and value must be preserved across serialization.
    assert restored.name == "x"
    np.testing.assert_allclose(restored.get_value(), [[1]])
"""
Some pickle test when pygpu isn't there. The test when pygpu is
available are in test_type.py.
This is needed as we skip all the test file when pygpu isn't there in
regular test file.
"""
import os
import sys
from pickle import Unpickler
import numpy as np
import pytest
from aesara.configdefaults import config
from aesara.gpuarray.type import ContextNotDefined
# Record whether pygpu is importable; used below to skip tests that only
# make sense when pygpu is absent.
try:
    import pygpu  # noqa: F401

    have_pygpu = True
except ImportError:
    have_pygpu = False
@pytest.mark.skip(reason="These tests relied on saved/versioned pickled files.")
@pytest.mark.skipif(have_pygpu, reason="pygpu active")
def test_unpickle_gpuarray_as_numpy_ndarray_flag1():
    """With ``experimental__unpickle_gpu_on_cpu`` disabled and pygpu missing,
    unpickling a GpuArray must fail rather than silently convert."""
    oldflag = config.experimental__unpickle_gpu_on_cpu
    config.experimental__unpickle_gpu_on_cpu = False
    try:
        testfile_dir = os.path.dirname(os.path.realpath(__file__))
        fname = "GpuArray.pkl"
        with open(os.path.join(testfile_dir, fname), "rb") as fp:
            u = Unpickler(fp, encoding="latin1")
            # Without the CPU fallback, loading the GPU array must raise.
            with pytest.raises((ImportError, ContextNotDefined)):
                u.load()
    finally:
        # Always restore the config flag for other tests.
        config.experimental__unpickle_gpu_on_cpu = oldflag
@pytest.mark.skip(reason="These tests relied on saved/versioned pickled files.")
def test_unpickle_gpuarray_as_numpy_ndarray_flag2():
    """With ``experimental__unpickle_gpu_on_cpu`` enabled, a pickled GpuArray
    should unpickle as a plain ``numpy.ndarray`` holding the original data."""
    oldflag = config.experimental__unpickle_gpu_on_cpu
    config.experimental__unpickle_gpu_on_cpu = True
    try:
        testfile_dir = os.path.dirname(os.path.realpath(__file__))
        fname = "GpuArray.pkl"
        with open(os.path.join(testfile_dir, fname), "rb") as fp:
            u = Unpickler(fp, encoding="latin1")
            # NOTE: an earlier version wrapped this in a try/except that
            # captured sys.exc_info() into unused locals and re-raised the
            # ImportError on every path (a Windows-workaround remnant).  An
            # ImportError here is a genuine failure, so just let it propagate.
            mat = u.load()
        assert isinstance(mat, np.ndarray)
        assert mat[0] == -42.0
    finally:
        # Always restore the config flag for other tests.
        config.experimental__unpickle_gpu_on_cpu = oldflag
import copy
import itertools
import numpy as np
import pytest
import aesara
from aesara import tensor as at
from aesara.gpuarray.pool import (
GpuAveragePoolGrad,
GpuDownsampleFactorMaxGradGrad,
GpuMaxPoolGrad,
GpuPool,
)
from aesara.gradient import Lop, Rop, grad
from aesara.tensor.signal.pool import (
AveragePoolGrad,
DownsampleFactorMaxGradGrad,
MaxPoolGrad,
Pool,
)
from tests import unittest_tools as utt
from tests.gpuarray.config import mode_with_gpu, mode_without_gpu
from tests.gpuarray.test_basic_ops import random
class TestPool:
    """Argument validation for GpuPool via the Python and C interfaces."""

    def test_pool_py_interface(self):
        data = aesara.shared(random(2, 2, 2, 2), "a")
        data = at.as_tensor_variable(data)
        with pytest.raises(ValueError):
            # pad >= ws is invalid
            pool_op = GpuPool(ignore_border=True, ndim=2)
            pool_op(data, [2, 2], pad=[3, 3])
        with pytest.raises(ValueError):
            # pad > 0 requires ignore_border=True
            pool_op = GpuPool(ignore_border=False, ndim=2)
            pool_op(data, [2, 2], pad=[1, 1])

    def test_pool_c_interface(self):
        compile_mode = mode_with_gpu.excluding("cudnn")
        compile_mode.check_py_code = False
        data = aesara.shared(random(2, 2, 2, 2), "a")
        data = at.as_tensor_variable(data)
        with pytest.raises(ValueError):
            # pad > 0 with ignore_border=False must also fail in the C path
            pool_op = GpuPool(ignore_border=False, ndim=2)
            padding = at.as_tensor_variable([1, 1])
            fn = aesara.function(
                [], pool_op(data, [2, 2], pad=padding), mode=compile_mode
            )
            fn()

    def test_pool_big_ws(self):
        # A window larger than the input must still compile and run.
        compile_mode = mode_with_gpu.excluding("cudnn")
        compile_mode.check_py_code = False
        data = aesara.shared(random(2, 2, 2, 2), "a")
        data = at.as_tensor_variable(data)
        pool_op = GpuPool(ignore_border=False, mode="average_exc_pad", ndim=2)
        padding = at.as_tensor_variable([0, 0])
        fn = aesara.function(
            [], pool_op(data, [5, 5], stride=[1, 1], pad=padding), mode=compile_mode
        )
        fn()
def test_pool2d():
    """Compare GpuPool (and its gradient ops) against the CPU Pool ops on a
    sweep of shapes, window sizes, strides, paddings and pooling modes.

    For max pooling, the R-op and grad-of-grad graphs are also compared.
    """
    shps = [
        (1, 12),
        (1, 1, 12),
        (1, 1, 1, 12),
        (1, 1, 2, 2),
        (1, 1, 1, 1),
        (1, 1, 4, 4),
        (1, 1, 10, 11),
        (1, 2, 2, 2),
        (3, 5, 4, 4),
        (25, 1, 7, 7),
        (1, 1, 12, 12),
        (1, 1, 2, 14),
        (1, 1, 12, 14),
        (1, 1, 14, 14),
        (1, 1, 16, 16),
        (1, 1, 18, 18),
        (1, 1, 24, 24),
        (1, 6, 24, 24),
        (10, 1, 24, 24),
        (10, 6, 24, 24),
        (30, 6, 12, 12),
        (30, 2, 24, 24),
        (30, 6, 24, 24),
        (10, 10, 10, 11),
        (1, 1, 10, 1025),
        (1, 1, 10, 1023),
        (1, 1, 1025, 10),
        (1, 1, 1023, 10),
        (3, 2, 16, 16, 16),
        (3, 2, 6, 6, 6, 5),
        (3, 2, 6, 6, 6, 5, 7),
    ]
    # Deterministic shuffle (seeded by unittest_tools) keeps runs
    # reproducible while varying the execution order across test sessions.
    np.random.default_rng(utt.fetch_seed()).shuffle(shps)
    test_ws = (2, 2), (3, 2), (1, 1)
    test_st = (2, 2), (3, 2), (1, 1)
    test_mode = ["max", "sum", "average_inc_pad", "average_exc_pad"]

    ref_mode = copy.copy(mode_without_gpu)
    ref_mode.check_py_code = False
    gpu_mode = mode_with_gpu.excluding("cudnn")
    gpu_mode.check_py_code = False

    for shp in shps:
        for mode, ws, st in itertools.product(test_mode, test_ws, test_st):
            # Skip windows larger than the (last two) pooled dimensions.
            if ws[0] > shp[-2] or ws[1] > shp[-1]:
                continue
            for ignore_border, pad in zip((True, False), [(1, 1), (0, 0)]):
                # pad >= ws and average_exc_pad with padding are invalid.
                if pad[0] >= ws[0] or pad[1] >= ws[1]:
                    continue
                if mode == "average_exc_pad" and (pad[0] > 0 or pad[1] > 0):
                    continue
                # print('test_pool2d', shp, ws, st, pad, mode, ignore_border)
                ds_op = Pool(ndim=len(ws), mode=mode, ignore_border=ignore_border)

                a = aesara.shared(random(*shp), "a")
                a_pooled = ds_op(at.as_tensor_variable(a), ws, st, pad)

                # Forward pass: GPU op must be used on gpu_mode and the CPU
                # op on ref_mode; values must match.
                f = aesara.function([], a_pooled, mode=gpu_mode)
                f2 = aesara.function([], a_pooled, mode=ref_mode)

                assert any(
                    [isinstance(node.op, GpuPool) for node in f.maker.fgraph.toposort()]
                )
                assert any(
                    [isinstance(node.op, Pool) for node in f2.maker.fgraph.toposort()]
                )
                assert np.allclose(f(), f2()), (shp, ws, st, pad, mode, ignore_border)

                # Gradient: the matching grad op must appear on each backend.
                a_pooled_grad = grad(a_pooled.sum(), a)

                g = aesara.function([], a_pooled_grad, mode=gpu_mode)
                g2 = aesara.function([], a_pooled_grad, mode=ref_mode)

                if mode == "max":
                    gop = GpuMaxPoolGrad
                    gop2 = MaxPoolGrad
                else:
                    gop = GpuAveragePoolGrad
                    gop2 = AveragePoolGrad
                assert any(
                    [isinstance(node.op, gop) for node in g.maker.fgraph.toposort()]
                )
                assert any(
                    [isinstance(node.op, gop2) for node in g2.maker.fgraph.toposort()]
                )
                assert np.allclose(g(), g2()), (shp, ws, st, pad, mode, ignore_border)

                # test rop and grad grad for max pooling
                # for average pooling grad grad is just average pooling grad
                if mode != "max":
                    continue

                ea = aesara.shared(random(*shp), "ea")

                gr = aesara.function([], Rop(a_pooled, a, ea), mode=gpu_mode)
                gr2 = aesara.function([], Rop(a_pooled, a, ea), mode=ref_mode)

                assert any(
                    [
                        isinstance(node.op, GpuDownsampleFactorMaxGradGrad)
                        for node in gr.maker.fgraph.toposort()
                    ]
                )
                assert any(
                    [
                        isinstance(node.op, DownsampleFactorMaxGradGrad)
                        for node in gr2.maker.fgraph.toposort()
                    ]
                )
                assert np.allclose(gr(), gr2()), (shp, ws, st, pad, mode, ignore_border)

                ggf = Lop(grad((a_pooled**2).sum(), a), a, a)

                gg = aesara.function([], ggf, mode=gpu_mode)
                gg2 = aesara.function([], ggf, mode=ref_mode)

                assert any(
                    [
                        isinstance(node.op, GpuDownsampleFactorMaxGradGrad)
                        for node in gg.maker.fgraph.toposort()
                    ]
                )
                assert any(
                    [
                        isinstance(node.op, DownsampleFactorMaxGradGrad)
                        for node in gg2.maker.fgraph.toposort()
                    ]
                )
                assert np.allclose(gg(), gg2()), (shp, ws, st, pad, mode, ignore_border)
def test_pool3d():
    """3d variant of test_pool2d: compare GpuPool (and its gradient ops)
    against the CPU Pool ops on a sweep of shapes, windows, strides,
    paddings and pooling modes.

    For max pooling, the R-op and grad-of-grad graphs are also compared.
    """
    shps = [
        (1, 1, 12),
        (1, 1, 1, 1, 1),
        (1, 1, 1, 1, 1025),
        (1, 1, 2, 2, 2),
        (1, 1, 7, 7, 7),
        (1, 1, 9, 10, 11),
        (1, 6, 18, 18, 18),
        (1, 1, 6, 24, 24),
        (1, 10, 1, 24, 24),
        (1, 10, 6, 24, 24),
        (1, 30, 6, 12, 12),
        (1, 30, 2, 24, 24),
        (1, 30, 6, 24, 24),
        (1, 10, 10, 10, 11),
        (1, 1, 10, 10, 1025),
        (1, 1, 10, 10, 1023),
        (1, 1, 10, 1025, 10),
        (1, 1, 10, 1023, 10),
        (3, 2, 6, 6, 6, 5),
        (3, 2, 6, 6, 6, 5, 7),
    ]
    # Deterministic shuffle (seeded by unittest_tools) keeps runs
    # reproducible while varying the execution order across test sessions.
    np.random.default_rng(utt.fetch_seed()).shuffle(shps)
    test_ws = (2, 2, 2), (3, 2, 3), (1, 1, 1)
    test_st = (2, 2, 2), (2, 3, 2), (1, 1, 1)
    test_mode = ["max", "sum", "average_inc_pad", "average_exc_pad"]

    ref_mode = copy.copy(mode_without_gpu)
    ref_mode.check_py_code = False
    gpu_mode = mode_with_gpu.excluding("cudnn")
    gpu_mode.check_py_code = False

    for shp in shps:
        for mode, ws, st in itertools.product(test_mode, test_ws, test_st):
            # Skip windows larger than the (last three) pooled dimensions.
            if ws[0] > shp[-3] or ws[1] > shp[-2] or ws[2] > shp[-1]:
                continue
            for ignore_border, pad in zip((True, False), [(1, 1, 1), (0, 0, 0)]):
                # pad >= ws and average_exc_pad with padding are invalid.
                if pad[0] >= ws[0] or pad[1] >= ws[1] or pad[2] >= ws[2]:
                    continue
                if mode == "average_exc_pad" and (
                    pad[0] > 0 or pad[1] > 0 or pad[2] > 0
                ):
                    continue
                # print('test_pool3d', shp, ws, st, pad, mode, ignore_border)
                ds_op = Pool(ndim=len(ws), mode=mode, ignore_border=ignore_border)

                a = aesara.shared(random(*shp), "a")
                a_pooled = ds_op(at.as_tensor_variable(a), ws, st, pad)

                # Forward pass: GPU op must be used on gpu_mode and the CPU
                # op on ref_mode; values must match.
                f = aesara.function([], a_pooled, mode=gpu_mode)
                f2 = aesara.function([], a_pooled, mode=ref_mode)

                assert any(
                    [isinstance(node.op, GpuPool) for node in f.maker.fgraph.toposort()]
                )
                assert any(
                    [isinstance(node.op, Pool) for node in f2.maker.fgraph.toposort()]
                )
                assert np.allclose(f(), f2()), (shp, ws, st, pad, mode, ignore_border)

                # Gradient: the matching grad op must appear on each backend.
                a_pooled_grad = grad(a_pooled.sum(), a)

                g = aesara.function([], a_pooled_grad, mode=gpu_mode)
                g2 = aesara.function([], a_pooled_grad, mode=ref_mode)

                if mode == "max":
                    gop = GpuMaxPoolGrad
                    gop2 = MaxPoolGrad
                else:
                    gop = GpuAveragePoolGrad
                    gop2 = AveragePoolGrad
                assert any(
                    [isinstance(node.op, gop) for node in g.maker.fgraph.toposort()]
                )
                assert any(
                    [isinstance(node.op, gop2) for node in g2.maker.fgraph.toposort()]
                )
                assert np.allclose(g(), g2()), (shp, ws, st, pad, mode, ignore_border)

                # test rop and grad grad for max pooling
                # for average pooling grad grad is just average pooling grad
                if mode != "max":
                    continue

                ea = aesara.shared(random(*shp), "ea")

                gr = aesara.function([], Rop(a_pooled, a, ea), mode=gpu_mode)
                gr2 = aesara.function([], Rop(a_pooled, a, ea), mode=ref_mode)

                assert any(
                    [
                        isinstance(node.op, GpuDownsampleFactorMaxGradGrad)
                        for node in gr.maker.fgraph.toposort()
                    ]
                )
                assert any(
                    [
                        isinstance(node.op, DownsampleFactorMaxGradGrad)
                        for node in gr2.maker.fgraph.toposort()
                    ]
                )
                assert np.allclose(gr(), gr2()), (shp, ws, st, pad, mode, ignore_border)

                ggf = Lop(grad((a_pooled**2).sum(), a), a, a)

                gg = aesara.function([], ggf, mode=gpu_mode)
                gg2 = aesara.function([], ggf, mode=ref_mode)

                assert any(
                    [
                        isinstance(node.op, GpuDownsampleFactorMaxGradGrad)
                        for node in gg.maker.fgraph.toposort()
                    ]
                )
                assert any(
                    [
                        isinstance(node.op, DownsampleFactorMaxGradGrad)
                        for node in gg2.maker.fgraph.toposort()
                    ]
                )
                assert np.allclose(gg(), gg2()), (shp, ws, st, pad, mode, ignore_border)
import math
import numpy as np
import pytest
import aesara
import aesara.tensor as at
from aesara.gpuarray import GpuArrayType
from aesara.gpuarray.dnn import GpuDnnReduction
from aesara.gpuarray.reduction import GpuMaxAndArgmax
from aesara.tensor.math import argmax
from aesara.tensor.math import max as at_max
from tests import unittest_tools as utt
from tests.gpuarray.config import mode_with_gpu, mode_without_gpu
from tests.gpuarray.test_basic_ops import rand_gpuarray
# Number of values to be used in test tensors (except with 0-D tensors!).
test_size = 10000

# NB: This order of "unsorted axes" is arbitrary and is here
# just to have the same information on profile output
# from one test to another.
unsorted_axes = (2, 4, 0, 3, 1)

# Seed the legacy global NumPy RNG from OS entropy: each run uses fresh data.
np.random.seed()
def numpy_random_array(shapes):
    """Return an array of the given shape, dtype ``floatX``, filled with
    standard-normal samples from the global NumPy RNG."""
    total = 1
    for extent in shapes:
        total *= extent
    flat = np.random.normal(size=total)
    return flat.astype(aesara.config.floatX).reshape(shapes)
def numpy_maxandargmax(X, axis=None):
    """NumPy reference for max+argmax over one or several axes.

    Mirrors ``MaxAndArgmax.perform()``: NumPy's argmax only handles a single
    axis, so the reduced axes are moved to the back, collapsed into one
    dimension, and argmax is taken along it.
    """
    if axis is None:
        axis = list(range(X.ndim))
    elif not isinstance(axis, (tuple, list)):
        axis = [int(axis)]
    # Deduplicate and normalize the axis list into a sorted tuple.
    axis = tuple(sorted(set(axis)))
    ref_max = np.max(X, axis=axis)
    # Non-reduced axes first, reduced axes last.
    kept = [dim for dim in range(X.ndim) if dim not in axis]
    rearranged = np.transpose(X, kept + list(axis))
    lead_shape = rearranged.shape[: len(kept)]
    # Collapse all reduced axes into a single trailing dimension.
    collapsed = 1
    for extent in rearranged.shape[len(kept) :]:
        collapsed *= extent
    flat = rearranged.reshape(lead_shape + (collapsed,))
    return (ref_max, np.argmax(flat, axis=-1))
def check_if_gpu_reduce_in_graph(aesara_function):
    """Assert that at least one GPU reduction op appears in the compiled graph."""
    gpu_reduce_ops = (GpuMaxAndArgmax, GpuDnnReduction)
    nodes = aesara_function.maker.fgraph.apply_nodes
    assert any(isinstance(node.op, gpu_reduce_ops) for node in nodes)
def check_if_gpu_reduce_not_in_graph(aesara_function):
    """Assert that no GPU reduction op appears in the compiled graph."""
    gpu_reduce_ops = (GpuMaxAndArgmax, GpuDnnReduction)
    nodes = aesara_function.maker.fgraph.apply_nodes
    assert not any(isinstance(node.op, gpu_reduce_ops) for node in nodes)
class BaseTest:
    """Shared machinery for max+argmax reduction tests.

    Subclasses set ``tensor_size`` (the tensor rank) and optionally a fixed
    ``shape``.  Each test compiles the same max/argmax graph for CPU and GPU
    on identical random data and compares both against a NumPy reference.
    """

    # This attribute must be set in subclasses.
    tensor_size = None
    shape = None
    dtype = aesara.config.floatX

    def get_shape(self):
        # Spread roughly test_size elements evenly over the tensor's rank.
        if self.tensor_size == 0:
            return []
        return [
            int(math.ceil(math.pow(test_size, 1 / self.tensor_size)))
        ] * self.tensor_size

    def setup_method(self):
        # Skip subclasses that don't configure a supported rank.
        if not isinstance(self.tensor_size, int):
            pytest.skip("No tensor ndim defined.")
        if self.tensor_size < 0 or self.tensor_size > 5:
            pytest.skip(
                "We allow from 0 (included) to 5 (included) dimensons for these tests."
            )
        if self.shape is None:
            self.shape = self.get_shape()

    def get_host_tensor(self):
        # Symbolic CPU tensor of the configured rank.
        broadcastable = (False,) * self.tensor_size
        return at.tensor(self.dtype, broadcastable)

    def get_gpu_tensor(self):
        # Symbolic GPU tensor of the configured rank.
        broadcastable = (False,) * self.tensor_size
        return GpuArrayType(self.dtype, broadcastable)()

    def get_host_value(self):
        return numpy_random_array(self.shape)

    def get_gpu_value(self):
        return rand_gpuarray(*self.shape)

    # NB: In compute_host() and compute_gpu(),
    # the first call of the aesara function should be ignored in profiling,
    # with Aesara config flag profiling__ignore_first_call=True.
    def compute_host(self, test_tensor, axis):
        """Compile max/argmax on CPU, check the graph, compare to NumPy."""
        M = self.get_host_tensor()
        f = aesara.function(
            [M],
            [at_max(M, axis=axis), argmax(M, axis=axis)],
            name="shape:" + str(test_tensor.shape) + "/axis:" + str(axis) + "/HOST",
            mode=mode_without_gpu,
        )
        check_if_gpu_reduce_not_in_graph(f)
        # First (warm-up) call is deliberately discarded; see NB above.
        f(test_tensor)
        aesara_max, aesara_argmax = f(test_tensor)
        ref_max, ref_argmax = numpy_maxandargmax(test_tensor, axis=axis)
        utt.assert_allclose(ref_max, aesara_max)
        utt.assert_allclose(ref_argmax, aesara_argmax)

    def compute_gpu(self, test_gpu_tensor, test_host_tensor, axis):
        """Compile max/argmax on GPU, check the graph, compare to NumPy."""
        M = self.get_gpu_tensor()
        f = aesara.function(
            [M],
            [at_max(M, axis=axis), argmax(M, axis=axis)],
            name="shape:" + str(test_gpu_tensor.shape) + "/axis:" + str(axis) + "/GPU",
            mode=mode_with_gpu,
        )
        check_if_gpu_reduce_in_graph(f)
        # First (warm-up) call is deliberately discarded; see NB above.
        f(test_gpu_tensor)
        aesara_max, aesara_argmax = f(test_gpu_tensor)
        ref_max, ref_argmax = numpy_maxandargmax(test_host_tensor, axis=axis)
        utt.assert_allclose(ref_max, aesara_max)
        utt.assert_allclose(ref_argmax, aesara_argmax)

    def compute(self, axis=None):
        # We want to run CPU op and GPU op on the same tensor randomly generated.
        test_gpu_tensor = self.get_gpu_value()
        test_host_tensor = np.asarray(test_gpu_tensor)
        self.compute_host(test_host_tensor, axis)
        self.compute_gpu(test_gpu_tensor, test_host_tensor, axis)

    def compute_axis(self, pos):
        # Only run when the axis is valid for this rank (vectors excluded).
        if self.tensor_size != 1 and 0 <= pos < self.tensor_size:
            self.compute(pos)

    def compute_some_axes(self, count):
        # Reduce over `count` axes taken from the arbitrary unsorted order.
        if 0 <= count < self.tensor_size:
            self.compute([i for i in unsorted_axes if i < self.tensor_size][:count])

    # Equivalent to test reduction on all axes.
    def test_none(self):
        self.compute(None)

    def test_axis_1(self):
        self.compute_axis(0)

    def test_axis_2(self):
        self.compute_axis(1)

    def test_axis_3(self):
        self.compute_axis(2)

    def test_axis_4(self):
        self.compute_axis(3)

    def test_axis_5(self):
        self.compute_axis(4)

    # For the tests below, we expect CPU op to run with Python implementation.
    def test_2_axes(self):
        self.compute_some_axes(2)

    def test_3_axes(self):
        self.compute_some_axes(3)

    def test_4_axes(self):
        self.compute_some_axes(4)
class TestScalar(BaseTest):
    """Reduction of a 0-d tensor (scalar)."""

    tensor_size = 0
class TestVector(BaseTest):
    """Reduction of a 1-d tensor (vector)."""

    tensor_size = 1
# Special case
class TestRow(BaseTest):
    """Matrix with a broadcast-like first dimension of size 1 (row)."""

    tensor_size = 2
    shape = [1, test_size]
# Special case
class TestColumn(BaseTest):
    """Matrix with a broadcast-like second dimension of size 1 (column)."""

    tensor_size = 2
    shape = [test_size, 1]
class TestMatrix(BaseTest):
    """Reduction of a 2-d tensor (matrix)."""

    tensor_size = 2
class TestTensor5(BaseTest):
    """Reduction of a 5-d tensor (the maximum supported rank)."""

    tensor_size = 5
import functools
import numpy as np
import aesara
from aesara import tensor as at
from aesara.configdefaults import config
from aesara.gpuarray.rng_mrg import GPUA_mrg_uniform
from aesara.gpuarray.type import gpuarray_shared_constructor
from aesara.sandbox import rng_mrg
from aesara.sandbox.rng_mrg import MRG_RandomStream
from tests import unittest_tools as utt
from tests.gpuarray.config import mode_with_gpu as mode
from tests.sandbox.test_rng_mrg import java_samples, rng_mrg_overflow
from tests.sandbox.test_rng_mrg import test_f16_nonzero as cpu_f16_nonzero
def test_consistency_GPUA_serial():
    # Verify that the random numbers generated by GPUA_mrg_uniform, serially,
    # are the same as the reference (Java) implementation by L'Ecuyer et al.
    seed = 12345
    n_samples = 5
    n_streams = 12
    n_substreams = 7

    samples = []
    # MRG31k3p state: six 32-bit integers.
    curr_rstate = np.array([seed] * 6, dtype="int32")

    for i in range(n_streams):
        stream_rstate = curr_rstate.copy()
        for j in range(n_substreams):
            # One substream: draw one sample at a time, n_samples times.
            substream_rstate = np.array([stream_rstate.copy()], dtype="int32")
            # Transfer to device
            rstate = gpuarray_shared_constructor(substream_rstate)

            new_rstate, sample = GPUA_mrg_uniform.new(
                rstate, ndim=None, dtype="float32", size=(1,)
            )
            rstate.default_update = new_rstate

            # Not really necessary, just mimicking
            # rng_mrg.MRG_RandomStream' behavior
            sample.rstate = rstate
            sample.update = (rstate, new_rstate)

            # We need the sample back in the main memory
            cpu_sample = at.as_tensor_variable(sample)
            f = aesara.function([], cpu_sample, mode=mode)
            for k in range(n_samples):
                s = f()
                samples.append(s)

            # next substream
            stream_rstate = rng_mrg.ff_2p72(stream_rstate)

        # next stream
        curr_rstate = rng_mrg.ff_2p134(curr_rstate)

    samples = np.array(samples).flatten()
    # Compare against the reference values from the Java implementation.
    assert np.allclose(samples, java_samples)
def test_consistency_GPUA_parallel():
    # Verify that the random numbers generated by GPUA_mrg_uniform, in
    # parallel, are the same as the reference (Java) implementation by
    # L'Ecuyer et al.
    seed = 12345
    n_samples = 5
    n_streams = 12
    n_substreams = 7  # 7 samples will be drawn in parallel

    samples = []
    curr_rstate = np.array([seed] * 6, dtype="int32")

    for i in range(n_streams):
        stream_samples = []
        # Build the states for all substreams of this stream at once,
        # so the GPU op draws n_substreams values per call.
        rstate = [curr_rstate.copy()]
        for j in range(1, n_substreams):
            rstate.append(rng_mrg.ff_2p72(rstate[-1]))
        rstate = np.asarray(rstate)
        rstate = gpuarray_shared_constructor(rstate)

        new_rstate, sample = GPUA_mrg_uniform.new(
            rstate, ndim=None, dtype="float32", size=(n_substreams,)
        )
        rstate.default_update = new_rstate

        # Not really necessary, just mimicking
        # rng_mrg.MRG_RandomStream' behavior
        sample.rstate = rstate
        sample.update = (rstate, new_rstate)

        # We need the sample back in the main memory
        cpu_sample = at.as_tensor_variable(sample)
        f = aesara.function([], cpu_sample, mode=mode)

        for k in range(n_samples):
            s = f()
            stream_samples.append(s)

        # Transpose so the flattened order matches the serial reference.
        samples.append(np.array(stream_samples).T.flatten())

        # next stream
        curr_rstate = rng_mrg.ff_2p134(curr_rstate)

    samples = np.array(samples).flatten()
    assert np.allclose(samples, java_samples)
def test_GPUA_full_fill():
    # Make sure the whole sample buffer is filled. Also make sure
    # large samples are consistent with CPU results.

    # This needs to be large to trigger the problem on GPU
    size = (10, 1000)

    R = MRG_RandomStream(234)
    uni = R.uniform(size, nstreams=60 * 256)
    f_cpu = aesara.function([], uni)

    # Re-create the same generator state on the GPU and draw the same sample.
    rstate_gpu = gpuarray_shared_constructor(R.state_updates[-1][0].get_value())
    new_rstate, sample = GPUA_mrg_uniform.new(
        rstate_gpu, ndim=None, dtype="float32", size=size
    )
    rstate_gpu.default_update = new_rstate
    f_gpu = aesara.function([], sample, mode=mode)

    utt.assert_allclose(f_cpu(), f_gpu())
def test_overflow_gpu_new_backend():
    """GPUA_mrg_uniform.new must raise for sample sizes whose element count
    overflows, and accept reasonable sizes — including int32 ones."""
    seed = 12345
    n_substreams = 7
    base_state = np.array([seed] * 6, dtype="int32")
    states = [base_state.copy()]
    for _ in range(1, n_substreams):
        states.append(rng_mrg.ff_2p72(states[-1]))
    shared_state = gpuarray_shared_constructor(np.asarray(states))
    fct = functools.partial(
        GPUA_mrg_uniform.new, shared_state, ndim=None, dtype="float32"
    )
    # (list of sizes, whether an overflow error is expected)
    cases = [
        # should raise error as the size overflows
        ([(2**31,), (2**32,), (2**15, 2**16), (2, 2**15, 2**15)], True),
        # should not raise error
        ([(2**5,), (2**5, 2**5), (2**5, 2**5, 2**5)], False),
        # should support int32 sizes
        (
            [(np.int32(2**10),), (np.int32(2), np.int32(2**10), np.int32(2**10))],
            False,
        ),
    ]
    for sizes, expect_error in cases:
        rng_mrg_overflow(sizes, fct, mode, should_raise_error=expect_error)
def test_validate_input_types_gpuarray_backend():
    """mrg_uniform.new must accept a GPU-backed rstate when
    compute_test_value is set to 'raise'."""
    with config.change_flags(compute_test_value="raise"):
        state = gpuarray_shared_constructor(np.zeros((7, 6), dtype="int32"))
        rng_mrg.mrg_uniform.new(state, ndim=None, dtype="float32", size=(3,))
def test_f16_nonzero():
    """Run the CPU float16 nonzero test against the GPU uniform sampler."""
    try:
        # To have aesara.shared(x) try to move on the GPU
        aesara.compile.shared_constructor(gpuarray_shared_constructor)
        cpu_f16_nonzero(mode=mode, op_to_check=GPUA_mrg_uniform)
    finally:
        # Always unregister so other tests see the default constructor again.
        aesara.compile.shared_constructor(gpuarray_shared_constructor, remove=True)
def test_cpu_target_with_shared_variable():
    """uniform(target="cpu") must stay on the CPU even when the shared
    inputs live on the GPU."""
    srng = MRG_RandomStream()
    s = np.random.rand(2, 3).astype("float32")
    x = gpuarray_shared_constructor(s, name="x")
    try:
        # To have aesara.shared(x) try to move on the GPU
        aesara.compile.shared_constructor(gpuarray_shared_constructor)

        y = srng.uniform(x.shape, target="cpu")
        y.name = "y"

        z = (x * y).sum()
        z.name = "z"

        fz = aesara.function([], z, mode=mode)

        # The sampling op itself must not have been moved to the GPU.
        nodes = fz.maker.fgraph.toposort()
        assert not any(isinstance(node.op, GPUA_mrg_uniform) for node in nodes)
    finally:
        # Always unregister so other tests see the default constructor again.
        aesara.compile.shared_constructor(gpuarray_shared_constructor, remove=True)
import numpy as np
import pytest
import aesara
import aesara.sandbox.rng_mrg
from aesara import gpuarray
from aesara import tensor as at
from aesara.gpuarray.basic_ops import GpuFromHost, HostFromGpu
from aesara.gpuarray.elemwise import GpuElemwise
from aesara.scan.basic import scan
from aesara.scan.checkpoints import scan_checkpoints
from aesara.scan.op import Scan
from aesara.tensor.math import dot
from aesara.tensor.math import sum as at_sum
from aesara.tensor.type import fscalar, ftensor3, fvector, iscalar, vector
from tests import unittest_tools as utt
from tests.gpuarray.config import mode_with_gpu, test_ctx_name
# NOTE: this previously read importorskip("pygpy.gpuarray") — a typo that
# made pytest skip this whole file unconditionally, even with pygpu installed.
pygpu_gpuarray = pytest.importorskip("pygpu.gpuarray")

GpuArrayException = pygpu_gpuarray.GpuArrayException
# Use an optimizing compilation mode even when the configured default is
# FAST_COMPILE.
if aesara.config.mode == "FAST_COMPILE":
    mode_with_opt = aesara.compile.mode.get_mode("FAST_RUN")
else:
    mode_with_opt = aesara.compile.mode.get_default_mode()
# DebugMode is too slow for some of these graphs; fall back to FAST_RUN.
if aesara.config.mode in ("DEBUG_MODE", "DebugMode"):
    mode_nodebug = aesara.compile.mode.get_mode("FAST_RUN")
else:
    mode_nodebug = mode_with_opt
class TestScan:
    """Scan graphs compiled for the GPU: host<->GPU transfer placement and
    mixed-dtype output handling.

    BUG FIX: the graph checks used ``isinstance(node.op, scan.op.Scan)``,
    but ``scan`` here is the scan-building *function* (imported from
    ``aesara.scan.basic``), which has no ``op`` attribute — those checks
    raised AttributeError at runtime.  They now use the ``Scan`` op class
    imported from ``aesara.scan.op``.
    """

    def test_one_sequence_one_output_weights_gpu1(self):
        # Simple RNN step: x_t = u_t * W_in + x_{t-1} * W
        def f_rnn(u_t, x_tm1, W_in, W):
            return u_t * W_in + x_tm1 * W

        u = fvector("u")
        x0 = fscalar("x0")
        W_in = fscalar("win")
        W = fscalar("w")

        # Excluding InputToGpuOptimizer exercises the first case of the
        # scan GPU optimizer (explicit GpuFromHost on the output below).
        mode = mode_with_gpu.excluding("InputToGpuOptimizer")
        output, updates = scan(
            f_rnn,
            u,
            x0,
            [W_in, W],
            n_steps=None,
            truncate_gradient=-1,
            go_backwards=False,
            mode=mode,
        )

        output = GpuFromHost(test_ctx_name)(output)
        f2 = aesara.function(
            [u, x0, W_in, W],
            output,
            updates=updates,
            allow_input_downcast=True,
            mode=mode,
        )

        # get random initial values
        rng = np.random.default_rng(utt.fetch_seed())
        v_u = rng.uniform(size=(4,), low=-5.0, high=5.0)
        v_x0 = rng.uniform()
        W = rng.uniform()
        W_in = rng.uniform()

        v_u = np.asarray(v_u, dtype="float32")
        v_x0 = np.asarray(v_x0, dtype="float32")
        W = np.asarray(W, dtype="float32")
        W_in = np.asarray(W_in, dtype="float32")

        # compute the expected output in numpy
        v_out = np.zeros((4,))
        v_out[0] = v_u[0] * W_in + v_x0 * W
        for step in range(1, 4):
            v_out[step] = v_u[step] * W_in + v_out[step - 1] * W

        aesara_values = f2(v_u, v_x0, W_in, W)
        utt.assert_allclose(aesara_values, v_out)

        topo = f2.maker.fgraph.toposort()
        assert sum([isinstance(node.op, HostFromGpu) for node in topo]) == 0
        assert sum([isinstance(node.op, GpuFromHost) for node in topo]) == 4

        scan_node = [node for node in topo if isinstance(node.op, Scan)]
        assert len(scan_node) == 1
        scan_node = scan_node[0]
        scan_node_topo = scan_node.op.fn.maker.fgraph.toposort()

        # check that there is no gpu transfer in the inner loop.
        assert any(isinstance(node.op, GpuElemwise) for node in scan_node_topo)
        assert not any(isinstance(node.op, HostFromGpu) for node in scan_node_topo)
        assert not any(isinstance(node.op, GpuFromHost) for node in scan_node_topo)

    # This second version tests the second case in the optimizer to the gpu.
    def test_one_sequence_one_output_weights_gpu2(self):
        def f_rnn(u_t, x_tm1, W_in, W):
            return u_t * W_in + x_tm1 * W

        u = fvector("u")
        x0 = fscalar("x0")
        W_in = fscalar("win")
        W = fscalar("w")

        output, updates = scan(
            f_rnn,
            u,
            x0,
            [W_in, W],
            n_steps=None,
            truncate_gradient=-1,
            go_backwards=False,
            mode=mode_with_gpu,
        )

        f2 = aesara.function(
            [u, x0, W_in, W],
            output,
            updates=updates,
            allow_input_downcast=True,
            mode=mode_with_gpu,
        )

        # get random initial values
        rng = np.random.default_rng(utt.fetch_seed())
        v_u = rng.uniform(size=(4,), low=-5.0, high=5.0)
        v_x0 = rng.uniform()
        W = rng.uniform()
        W_in = rng.uniform()

        # compute the expected output in numpy
        v_out = np.zeros((4,))
        v_out[0] = v_u[0] * W_in + v_x0 * W
        for step in range(1, 4):
            v_out[step] = v_u[step] * W_in + v_out[step - 1] * W

        aesara_values = f2(v_u, v_x0, W_in, W)
        utt.assert_allclose(aesara_values, v_out)

        topo = f2.maker.fgraph.toposort()
        assert sum([isinstance(node.op, HostFromGpu) for node in topo]) == 1
        assert sum([isinstance(node.op, GpuFromHost) for node in topo]) == 4

        scan_node = [node for node in topo if isinstance(node.op, Scan)]
        assert len(scan_node) == 1
        scan_node = scan_node[0]
        scan_node_topo = scan_node.op.fn.maker.fgraph.toposort()

        # check that there is no gpu transfer in the inner loop.
        assert any(isinstance(node.op, GpuElemwise) for node in scan_node_topo)
        assert not any(isinstance(node.op, HostFromGpu) for node in scan_node_topo)
        assert not any(isinstance(node.op, GpuFromHost) for node in scan_node_topo)

    # This third test checks that scan can deal with a mixture of dtypes as
    # outputs when is running on GPU
    def test_gpu3_mixture_dtype_outputs(self):
        def f_rnn(u_t, x_tm1, W_in, W):
            return (u_t * W_in + x_tm1 * W, at.cast(u_t + x_tm1, "int64"))

        u = fvector("u")
        x0 = fscalar("x0")
        W_in = fscalar("win")
        W = fscalar("w")

        output, updates = scan(
            f_rnn,
            u,
            [x0, None],
            [W_in, W],
            n_steps=None,
            truncate_gradient=-1,
            go_backwards=False,
            mode=mode_with_gpu,
        )

        f2 = aesara.function(
            [u, x0, W_in, W],
            output,
            updates=updates,
            allow_input_downcast=True,
            mode=mode_with_gpu,
        )

        # get random initial values
        rng = np.random.default_rng(utt.fetch_seed())
        v_u = rng.uniform(size=(4,), low=-5.0, high=5.0)
        v_x0 = rng.uniform()
        W = rng.uniform()
        W_in = rng.uniform()

        # compute the expected outputs in numpy
        v_out1 = np.zeros((4,))
        v_out2 = np.zeros((4,), dtype="int64")
        v_out1[0] = v_u[0] * W_in + v_x0 * W
        v_out2[0] = v_u[0] + v_x0
        for step in range(1, 4):
            v_out1[step] = v_u[step] * W_in + v_out1[step - 1] * W
            v_out2[step] = np.int64(v_u[step] + v_out1[step - 1])

        aesara_out1, aesara_out2 = f2(v_u, v_x0, W_in, W)
        utt.assert_allclose(aesara_out1, v_out1)
        utt.assert_allclose(aesara_out2, v_out2)

        topo = f2.maker.fgraph.toposort()
        scan_node = [node for node in topo if isinstance(node.op, Scan)]
        assert len(scan_node) == 1
        scan_node = scan_node[0]
        # The scan op itself must have been moved to the GPU.
        assert scan_node.op.gpua

        scan_node_topo = scan_node.op.fn.maker.fgraph.toposort()

        # check that there is no gpu transfer in the inner loop.
        assert not any(isinstance(node.op, HostFromGpu) for node in scan_node_topo)
        assert not any(isinstance(node.op, GpuFromHost) for node in scan_node_topo)

    def test_gpu4_gibbs_chain(self):
        """A 10-step binomial Gibbs-style chain must compile and run on GPU."""
        rng = np.random.default_rng(utt.fetch_seed())
        v_vsample = np.array(
            rng.binomial(
                1,
                0.5,
                size=(3, 20),
            ),
            dtype="float32",
        )
        vsample = aesara.shared(v_vsample)
        trng = aesara.sandbox.rng_mrg.MRG_RandomStream(utt.fetch_seed())

        def f(vsample_tm1):
            return (
                trng.binomial(vsample_tm1.shape, n=1, p=0.3, dtype="float32")
                * vsample_tm1
            )

        aesara_vsamples, updates = scan(
            f,
            [],
            vsample,
            [],
            n_steps=10,
            truncate_gradient=-1,
            go_backwards=False,
            mode=mode_with_gpu,
        )
        my_f = aesara.function(
            [],
            aesara_vsamples[-1],
            updates=updates,
            allow_input_downcast=True,
            mode=mode_with_gpu,
        )

        # I leave this to tested by debugmode, this test was anyway
        # more of does the graph compile kind of test
        my_f()
class ScanGpuTests:
    """
    This class defines a number of tests for Scan on GPU as well as a few
    helper functions for these tests. The GPU tests defined in this class are
    independent of the GPU backend used. Because of this, a class inheriting
    from ScanGpuTests should define the following attributes and methods to
    make the tests run on a specific backend :
    - self.gpu_backend : Reference to the backend module
    - self.mode_with_opt : Compilation mode to force usage of the gpu backend
    - self.is_scan_on_gpu(node) : Method to determine is a scan node has been
      moved to run on a gpu under the specific
      backend. Returns a boolean.
    """

    def test_one_sequence_one_output_weights_gpu1(self):
        """First case of the GPU transfer optimizer: the scan output is
        explicitly moved to the GPU via ``gpu_from_host``."""
        def f_rnn(u_t, x_tm1, W_in, W):
            return u_t * W_in + x_tm1 * W
        u = fvector("u")
        x0 = fscalar("x0")
        W_in = fscalar("win")
        W = fscalar("w")
        # The following line is needed to have the first case being used
        # Otherwise, it is the second that is tested.
        mode = self.mode_with_gpu.excluding("InputToGpuOptimizer")
        output, updates = scan(
            f_rnn,
            u,
            x0,
            [W_in, W],
            n_steps=None,
            truncate_gradient=-1,
            go_backwards=False,
            mode=mode,
        )
        output = self.gpu_backend.gpu_from_host(output)
        f2 = aesara.function(
            [u, x0, W_in, W],
            output,
            updates=updates,
            allow_input_downcast=True,
            mode=self.mode_with_gpu,
        )
        # get random initial values
        rng = np.random.default_rng(utt.fetch_seed())
        v_u = rng.uniform(size=(4,), low=-5.0, high=5.0)
        v_x0 = rng.uniform()
        W = rng.uniform()
        W_in = rng.uniform()
        v_u = np.asarray(v_u, dtype="float32")
        v_x0 = np.asarray(v_x0, dtype="float32")
        W = np.asarray(W, dtype="float32")
        W_in = np.asarray(W_in, dtype="float32")
        # compute the output in numpy
        v_out = np.zeros((4,))
        v_out[0] = v_u[0] * W_in + v_x0 * W
        for step in range(1, 4):
            v_out[step] = v_u[step] * W_in + v_out[step - 1] * W
        aesara_values = f2(v_u, v_x0, W_in, W)
        utt.assert_allclose(aesara_values, v_out)
        # TODO: remove — this toposort/scan-node lookup is duplicated below.
        topo = f2.maker.fgraph.toposort()
        scan_node = [node for node in topo if isinstance(node.op, Scan)]
        assert len(scan_node) == 1
        scan_node = scan_node[0]
        topo = f2.maker.fgraph.toposort()
        # No transfer back to the host in the outer graph; exactly one
        # GpuFromHost per input.
        assert (
            sum([isinstance(node.op, self.gpu_backend.HostFromGpu) for node in topo])
            == 0
        )
        assert (
            sum([isinstance(node.op, self.gpu_backend.GpuFromHost) for node in topo])
            == 4
        )
        scan_node = [node for node in topo if isinstance(node.op, Scan)]
        assert len(scan_node) == 1
        scan_node = scan_node[0]
        scan_node_topo = scan_node.op.fn.maker.fgraph.toposort()
        # check that there is no gpu transfer in the inner loop.
        assert any(
            [
                isinstance(node.op, self.gpu_backend.GpuElemwise)
                for node in scan_node_topo
            ]
        )
        assert not any(
            [
                isinstance(node.op, self.gpu_backend.HostFromGpu)
                for node in scan_node_topo
            ]
        )
        assert not any(
            [
                isinstance(node.op, self.gpu_backend.GpuFromHost)
                for node in scan_node_topo
            ]
        )

    # This second version test the second case in the optimizer to the gpu.
    def test_one_sequence_one_output_weights_gpu2(self):
        """Second case of the GPU transfer optimizer: inputs are moved to
        the GPU by the backend's input-transfer optimization, not by an
        explicit ``gpu_from_host`` call."""
        def f_rnn(u_t, x_tm1, W_in, W):
            return u_t * W_in + x_tm1 * W
        u = fvector("u")
        x0 = fscalar("x0")
        W_in = fscalar("win")
        W = fscalar("w")
        output, updates = scan(
            f_rnn,
            u,
            x0,
            [W_in, W],
            n_steps=None,
            truncate_gradient=-1,
            go_backwards=False,
            mode=self.mode_with_gpu,
        )
        f2 = aesara.function(
            [u, x0, W_in, W],
            output,
            updates=updates,
            allow_input_downcast=True,
            mode=self.mode_with_gpu,
        )
        # get random initial values
        rng = np.random.default_rng(utt.fetch_seed())
        v_u = rng.uniform(size=(4,), low=-5.0, high=5.0)
        v_x0 = rng.uniform()
        W = rng.uniform()
        W_in = rng.uniform()
        # compute the output in numpy
        v_out = np.zeros((4,))
        v_out[0] = v_u[0] * W_in + v_x0 * W
        for step in range(1, 4):
            v_out[step] = v_u[step] * W_in + v_out[step - 1] * W
        aesara_values = f2(v_u, v_x0, W_in, W)
        utt.assert_allclose(aesara_values, v_out)
        topo = f2.maker.fgraph.toposort()
        # Exactly one transfer back to the host (the final output), and one
        # GpuFromHost per input.
        assert (
            sum([isinstance(node.op, self.gpu_backend.HostFromGpu) for node in topo])
            == 1
        )
        assert (
            sum([isinstance(node.op, self.gpu_backend.GpuFromHost) for node in topo])
            == 4
        )
        scan_node = [node for node in topo if isinstance(node.op, Scan)]
        assert len(scan_node) == 1
        scan_node = scan_node[0]
        scan_node_topo = scan_node.op.fn.maker.fgraph.toposort()
        # check that there is no gpu transfer in the inner loop.
        assert any(
            [
                isinstance(node.op, self.gpu_backend.GpuElemwise)
                for node in scan_node_topo
            ]
        )
        assert not any(
            [
                isinstance(node.op, self.gpu_backend.HostFromGpu)
                for node in scan_node_topo
            ]
        )
        assert not any(
            [
                isinstance(node.op, self.gpu_backend.GpuFromHost)
                for node in scan_node_topo
            ]
        )

    # This third test checks that scan can deal with a mixture of dtypes as
    # outputs when is running on GPU
    def test_gpu3_mixture_dtype_outputs(self):
        """Scan with float32 and int64 outputs must still be moved to the
        GPU by the backend (checked via ``self.is_scan_on_gpu``)."""
        def f_rnn(u_t, x_tm1, W_in, W):
            return (u_t * W_in + x_tm1 * W, at.cast(u_t + x_tm1, "int64"))
        u = fvector("u")
        x0 = fscalar("x0")
        W_in = fscalar("win")
        W = fscalar("w")
        output, updates = scan(
            f_rnn,
            u,
            [x0, None],
            [W_in, W],
            n_steps=None,
            truncate_gradient=-1,
            go_backwards=False,
            mode=self.mode_with_gpu,
        )
        f2 = aesara.function(
            [u, x0, W_in, W],
            output,
            updates=updates,
            allow_input_downcast=True,
            mode=self.mode_with_gpu,
        )
        # get random initial values
        rng = np.random.default_rng(utt.fetch_seed())
        v_u = rng.uniform(size=(4,), low=-5.0, high=5.0)
        v_x0 = rng.uniform()
        W = rng.uniform()
        W_in = rng.uniform()
        # compute the output in numpy
        v_out1 = np.zeros((4,))
        v_out2 = np.zeros((4,), dtype="int64")
        v_out1[0] = v_u[0] * W_in + v_x0 * W
        v_out2[0] = v_u[0] + v_x0
        for step in range(1, 4):
            v_out1[step] = v_u[step] * W_in + v_out1[step - 1] * W
            v_out2[step] = np.int64(v_u[step] + v_out1[step - 1])
        aesara_out1, aesara_out2 = f2(v_u, v_x0, W_in, W)
        utt.assert_allclose(aesara_out1, v_out1)
        utt.assert_allclose(aesara_out2, v_out2)
        topo = f2.maker.fgraph.toposort()
        scan_node = [node for node in topo if isinstance(node.op, Scan)]
        assert len(scan_node) == 1
        scan_node = scan_node[0]
        assert self.is_scan_on_gpu(scan_node)

    def test_gibbs_chain(self):
        """Smoke test: a 10-step Gibbs-style sampling scan with MRG random
        numbers compiles and runs under the GPU mode."""
        rng = np.random.default_rng(utt.fetch_seed())
        v_vsample = np.array(
            rng.binomial(
                1,
                0.5,
                size=(3, 20),
            ),
            dtype="float32",
        )
        vsample = aesara.shared(v_vsample)
        trng = aesara.sandbox.rng_mrg.MRG_RandomStream(utt.fetch_seed())
        def f(vsample_tm1):
            return (
                trng.binomial(vsample_tm1.shape, n=1, p=0.3, dtype="float32")
                * vsample_tm1
            )
        aesara_vsamples, updates = scan(
            f,
            [],
            vsample,
            [],
            n_steps=10,
            truncate_gradient=-1,
            go_backwards=False,
            mode=self.mode_with_gpu,
        )
        my_f = aesara.function(
            [],
            aesara_vsamples[-1],
            updates=updates,
            allow_input_downcast=True,
            mode=self.mode_with_gpu,
        )
        # Numerical checking is left to DebugMode; this test was anyway more
        # of a "does the graph compile" kind of test.
        my_f()

    def test_gpu_memory_usage(self):
        # This test validates that the memory usage of the defined aesara
        # function is reasonable when executed on the GPU. It checks for
        # a bug in which one of scan's optimization was not applied which
        # made the scan node compute large and unnecessary outputs which
        # brought memory usage on the GPU to ~12G.

        # Dimensionality of input and output data (not one-hot coded)
        n_in = 100
        n_out = 100
        # Number of neurons in hidden layer
        n_hid = 4000
        # Number of minibatches
        mb_size = 2
        # Time steps in minibatch
        mb_length = 200
        # Define input variables
        xin = ftensor3(name="xin")
        yout = ftensor3(name="yout")
        # Initialize the network parameters
        U = aesara.shared(np.zeros((n_in, n_hid), dtype="float32"), name="W_xin_to_l1")
        V = aesara.shared(np.zeros((n_hid, n_hid), dtype="float32"), name="W_l1_to_l1")
        W = aesara.shared(np.zeros((n_hid, n_out), dtype="float32"), name="W_l1_to_l2")
        nparams = [U, V, W]
        # Build the forward pass
        l1_base = dot(xin, U)
        def scan_l(baseline, last_step):
            return baseline + dot(last_step, V)
        zero_output = at.alloc(np.asarray(0.0, dtype="float32"), mb_size, n_hid)
        l1_out, _ = scan(
            scan_l,
            sequences=[l1_base],
            outputs_info=[zero_output],
            mode=self.mode_with_gpu_nodebug,
        )
        l2_out = dot(l1_out, W)
        # Compute the cost and take the gradient wrt params
        cost = at_sum((l2_out - yout) ** 2)
        grads = aesara.grad(cost, nparams)
        updates = list(zip(nparams, (n - g for n, g in zip(nparams, grads))))
        # Compile the aesara function
        feval_backprop = aesara.function(
            [xin, yout], cost, updates=updates, mode=self.mode_with_gpu_nodebug
        )
        # Validate that the PushOutScanOutput optimization has been applied
        # by checking the number of outputs of the grad Scan node in the
        # compiled function.
        nodes = feval_backprop.maker.fgraph.toposort()
        scan_nodes = [n for n in nodes if isinstance(n.op, Scan)]
        # The grad scan is always the 2nd one according to toposort. If the
        # optimization has been applied, it has 2 outputs, otherwise 3.
        grad_scan_node = scan_nodes[1]
        assert len(grad_scan_node.outputs) == 2, len(grad_scan_node.outputs)
        # Call the aesara function to ensure the absence of a memory error
        feval_backprop(
            np.zeros((mb_length, mb_size, n_in), dtype="float32"),
            np.zeros((mb_length, mb_size, n_out), dtype="float32"),
        )

    def test_memory_reuse_gpudimshuffle(self):
        # Test the memory pre-allocation feature in scan when one output is
        # the result of a GpuDimshuffle (because an optimization in
        # GpuDimshuffle can cause issues with the memory pre-allocation
        # where it falsely thinks that a pre-allocated memory region has
        # been used when it hasn't).
        def inner_fn(seq1, recurrent_out):
            temp = seq1 + recurrent_out.sum()
            output1 = temp.dimshuffle(1, 0)
            output2 = temp.sum() + recurrent_out
            return output1, output2
        input1 = ftensor3()
        init = ftensor3()
        outputs_info = [None, init]
        out, _ = scan(
            inner_fn,
            sequences=[input1],
            outputs_info=outputs_info,
            mode=self.mode_with_gpu,
        )
        out1 = out[0].flatten()
        out2 = out[1].flatten()
        fct = aesara.function([input1, init], [out1, out2], mode=self.mode_with_gpu)
        output = fct(
            np.ones((2, 1, 1), dtype="float32"), np.ones((1, 1, 1), dtype="float32")
        )
        # Hand-computed expected values for the all-ones inputs above.
        expected_output = (
            np.array([2, 4], dtype="float32"),
            np.array([3, 7], dtype="float32"),
        )
        utt.assert_allclose(output, expected_output)
class TestScanGpuarray(ScanGpuTests):
    """
    This class takes the gpu tests for scan that are defined in
    class ScanGpuTests and runs them using the gpuarray backend.
    """

    def setup_method(self):
        # Bind the backend module so the tests in ScanGpuTests stay
        # backend-agnostic.
        self.gpu_backend = gpuarray

        # This is unfortunate, but required
        def gpu_from_host(v):
            # None selects the default GPU context.
            return gpuarray.GpuFromHost(None)(v)

        self.gpu_backend.gpu_from_host = gpu_from_host
        self.mode_with_gpu = mode_with_opt.including("gpuarray", "scan")
        self.mode_with_gpu_nodebug = mode_nodebug.including("gpuarray", "scan")
        # Skip the test if pygpu is not available
        if not self.gpu_backend.pygpu_activated:
            pytest.skip("Optional package pygpu disabled")

    def is_scan_on_gpu(self, node):
        # Scan ops moved to the gpuarray backend carry a "gpua" flag in
        # their info dict.
        return node.op.info.get("gpua", False)
class TestScanCheckpoint:
    """Compare a plain ``scan`` against ``scan_checkpoints`` (which saves
    intermediate states only every ``save_every_N`` steps, trading compute
    for memory) on the same cumulative-product recurrence."""

    def setup_method(self):
        self.k = iscalar("k")
        self.A = vector("A")
        result, _ = scan(
            fn=lambda prior_result, A: prior_result * A,
            outputs_info=at.ones_like(self.A),
            non_sequences=self.A,
            n_steps=self.k,
        )
        result_check, _ = scan_checkpoints(
            fn=lambda prior_result, A: prior_result * A,
            outputs_info=at.ones_like(self.A),
            non_sequences=self.A,
            n_steps=self.k,
            save_every_N=100,
        )
        self.result = result[-1]
        self.result_check = result_check[-1]
        self.grad_A = aesara.grad(self.result.sum(), self.A)
        self.grad_A_check = aesara.grad(self.result_check.sum(), self.A)

    def test_memory(self):
        """The checkpointed scan's gradient must run on an input sized so
        that the un-checkpointed gradient exhausts free GPU memory."""
        from tests.gpuarray.config import mode_with_gpu  # noqa

        f = aesara.function(
            inputs=[self.A, self.k], outputs=self.grad_A, mode=mode_with_gpu
        )
        f_check = aesara.function(
            inputs=[self.A, self.k], outputs=self.grad_A_check, mode=mode_with_gpu
        )
        free_gmem = aesara.gpuarray.type._context_reg[None].free_gmem
        # Input sized relative to free GPU memory so that storing every
        # intermediate state does not fit but the checkpointed version does.
        data = np.ones(free_gmem // 3000, dtype=np.float32)
        # Check that it works with the checkpoints
        size = 1000
        if isinstance(mode_with_gpu, aesara.compile.debugmode.DebugMode):
            size = 100
        f_check(data, size)
        # Check that the basic scan fails in that case
        # Skip that check in DebugMode, as it can fail in different ways
        if not isinstance(mode_with_gpu, aesara.compile.debugmode.DebugMode):
            with pytest.raises(GpuArrayException):
                f(data, 1000)
from aesara.gpuarray.sort import GpuTopKOp
from tests.gpuarray.config import mode_with_gpu
from tests.tensor.test_sort import TestTopK
class TestGpuTopK(TestTopK):
    """Run the generic top-k test suite against the gpuarray backend op."""

    op_class = GpuTopKOp
    dtype = "float32"
    mode = mode_with_gpu
import numpy as np
import aesara
from aesara.compile import DeepCopyOp
from aesara.gpuarray.basic_ops import GpuContiguous, GpuFromHost, HostFromGpu
from aesara.gpuarray.elemwise import GpuDimShuffle
from aesara.gpuarray.subtensor import (
GpuAdvancedIncSubtensor,
GpuAdvancedIncSubtensor1,
GpuAdvancedIncSubtensor1_dev20,
GpuAdvancedSubtensor,
GpuAdvancedSubtensor1,
GpuAllocDiag,
GpuExtractDiag,
GpuIncSubtensor,
GpuSubtensor,
)
from aesara.gpuarray.type import gpuarray_shared_constructor
from aesara.tensor.basic import AllocDiag, ExtractDiag
from aesara.tensor.math import sum as at_sum
from aesara.tensor.subtensor import advanced_inc_subtensor1, inc_subtensor
from aesara.tensor.type import ivectors, matrix, tensor, tensor4, vector
from tests import unittest_tools as utt
from tests.gpuarray.config import mode_with_gpu, test_ctx_name
from tests.tensor.test_basic import TestAllocDiag
from tests.tensor.test_subtensor import TestAdvancedSubtensor, TestSubtensor
class TestGPUSubtensor(TestSubtensor):
    """Run the generic Subtensor test suite with the gpuarray ops (float32)."""

    def setup_method(self):
        def shared(x, **kwargs):
            # Force shared variables onto the test GPU context.
            return gpuarray_shared_constructor(x, target=test_ctx_name, **kwargs)

        self.shared = shared
        self.sub = GpuSubtensor
        self.inc_sub = GpuIncSubtensor
        self.adv_sub1 = GpuAdvancedSubtensor1
        self.adv_incsub1 = GpuAdvancedIncSubtensor1
        self.adv_sub = GpuAdvancedSubtensor
        self.dimshuffle = GpuDimShuffle
        self.mode = mode_with_gpu
        # avoid errors with limited devices
        self.dtype = "float32"
        # Transfer/copy ops the base suite should not count as graph content.
        self.ignore_topo = (HostFromGpu, GpuFromHost, DeepCopyOp, GpuContiguous)
        # GPU opt can't run in fast_compile only.
        self.fast_compile = False
        assert self.sub == GpuSubtensor
        super().setup_method()
class TestGPUSubtensorF16(TestSubtensor):
    """Same as TestGPUSubtensor but exercising the float16 path."""

    def setup_method(self):
        def shared(x, **kwargs):
            # Force shared variables onto the test GPU context.
            return gpuarray_shared_constructor(x, target=test_ctx_name, **kwargs)

        self.shared = shared
        self.sub = GpuSubtensor
        self.inc_sub = GpuIncSubtensor
        self.adv_sub1 = GpuAdvancedSubtensor1
        self.adv_incsub1 = GpuAdvancedIncSubtensor1
        self.adv_sub = GpuAdvancedSubtensor
        self.dimshuffle = GpuDimShuffle
        self.mode = mode_with_gpu
        # avoid errors with limited devices
        self.dtype = "float16"  # use floatX?
        self.ignore_topo = (HostFromGpu, GpuFromHost, DeepCopyOp, GpuContiguous)
        # GPU opt can't run in fast_compile only.
        self.fast_compile = False
        assert self.sub == GpuSubtensor
        super().setup_method()
def test_advinc_subtensor1():
    # Test the second case in the opt local_gpu_advanced_incsubtensor1
    for shape in [(3, 3), (3, 3, 3)]:
        base = np.arange(np.prod(shape), dtype="float32").reshape(shape) + 1
        inc = np.full((2,) + shape[1:], 10, dtype="float32")
        x = gpuarray_shared_constructor(base, name="x")
        y = tensor(dtype="float32", broadcastable=(False,) * len(shape), name="y")
        f = aesara.function(
            [y], advanced_inc_subtensor1(x, y, [0, 2]), mode=mode_with_gpu
        )
        # Exactly one GPU advanced-inc-subtensor op in the compiled graph.
        n_gpu_ops = sum(
            isinstance(node.op, GpuAdvancedIncSubtensor1)
            for node in f.maker.fgraph.toposort()
        )
        assert n_gpu_ops == 1
        # Compare against NumPy's buffered in-place add on rows 0 and 2.
        expected = base.copy()
        np.add.at(expected, [0, 2], inc)
        assert np.allclose(f(inc), expected)
def test_advinc_subtensor1_dtype():
    # Test the mixed dtype case
    shp = (3, 4)
    # (target dtype, increment dtype) pairs that should all hit the
    # dev20 GPU op.
    for dtype1, dtype2 in [
        ("float32", "int8"),
        ("float32", "float64"),
        ("uint64", "int8"),
        ("int64", "uint8"),
        ("float16", "int8"),
        ("float16", "float64"),
        ("float16", "float16"),
    ]:
        shared = gpuarray_shared_constructor
        xval = np.arange(np.prod(shp), dtype=dtype1).reshape(shp) + 1
        yval = np.empty((2,) + shp[1:], dtype=dtype2)
        yval[:] = 10
        x = shared(xval, name="x")
        y = tensor(dtype=yval.dtype, broadcastable=(False,) * len(yval.shape), name="y")
        expr = advanced_inc_subtensor1(x, y, [0, 2])
        f = aesara.function([y], expr, mode=mode_with_gpu)
        assert (
            sum(
                [
                    isinstance(node.op, GpuAdvancedIncSubtensor1_dev20)
                    for node in f.maker.fgraph.toposort()
                ]
            )
            == 1
        )
        rval = f(yval)
        # Reference: NumPy's buffered in-place add at rows 0 and 2.
        rep = xval.copy()
        np.add.at(rep, [[0, 2]], yval)
        assert np.allclose(rval, rep)
@aesara.config.change_flags(deterministic="more")
def test_deterministic_flag():
    """With ``deterministic="more"`` the plain GpuAdvancedIncSubtensor1 must
    be selected instead of the dev20 variant that the same dtype pair gets
    in test_advinc_subtensor1_dtype (presumably the dev20 op is
    non-deterministic — TODO confirm)."""
    shp = (3, 4)
    for dtype1, dtype2 in [("float32", "int8")]:
        shared = gpuarray_shared_constructor
        xval = np.arange(np.prod(shp), dtype=dtype1).reshape(shp) + 1
        yval = np.empty((2,) + shp[1:], dtype=dtype2)
        yval[:] = 10
        x = shared(xval, name="x")
        y = tensor(dtype=yval.dtype, broadcastable=(False,) * len(yval.shape), name="y")
        expr = advanced_inc_subtensor1(x, y, [0, 2])
        f = aesara.function([y], expr, mode=mode_with_gpu)
        assert (
            sum(
                [
                    isinstance(node.op, GpuAdvancedIncSubtensor1)
                    for node in f.maker.fgraph.toposort()
                ]
            )
            == 1
        )
        rval = f(yval)
        rep = xval.copy()
        np.add.at(rep, [[0, 2]], yval)
        assert np.allclose(rval, rep)
def test_advinc_subtensor1_vector_scalar():
    # Test the case where x is a vector and y a scalar
    shp = (3,)
    for dtype1, dtype2 in [
        ("float32", "int8"),
        ("float32", "float64"),
        ("float16", "int8"),
        ("float16", "float64"),
        ("float16", "float16"),
        ("int8", "int8"),
        ("int16", "int16"),
    ]:
        shared = gpuarray_shared_constructor
        xval = np.arange(np.prod(shp), dtype=dtype1).reshape(shp) + 1
        # 0-d increment; len(yval.shape) == 0 so y is a scalar tensor.
        yval = np.asarray(10, dtype=dtype2)
        x = shared(xval, name="x")
        y = tensor(dtype=yval.dtype, broadcastable=(False,) * len(yval.shape), name="y")
        expr = advanced_inc_subtensor1(x, y, [0, 2])
        f = aesara.function([y], expr, mode=mode_with_gpu)
        # Either GPU variant is acceptable here, but exactly one must appear.
        assert (
            sum(
                [
                    isinstance(
                        node.op,
                        (GpuAdvancedIncSubtensor1_dev20, GpuAdvancedIncSubtensor1),
                    )
                    for node in f.maker.fgraph.toposort()
                ]
            )
            == 1
        )
        rval = f(yval)
        rep = xval.copy()
        rep[[0, 2]] += yval
        assert np.allclose(rval, rep)
def test_incsub_f16():
    """float16 increments through both advanced_inc_subtensor1 and plain
    inc_subtensor must use the GPU ops and match NumPy references."""
    shp = (3, 3)
    shared = gpuarray_shared_constructor
    xval = np.arange(np.prod(shp), dtype="float16").reshape(shp) + 1
    yval = np.empty((2,) + shp[1:], dtype="float16")
    yval[:] = 2
    x = shared(xval, name="x")
    y = tensor(dtype="float16", broadcastable=(False,) * len(shp), name="y")
    # Advanced increment on rows 0 and 2.
    expr = advanced_inc_subtensor1(x, y, [0, 2])
    f = aesara.function([y], expr, mode=mode_with_gpu)
    assert (
        sum(
            [
                isinstance(node.op, GpuAdvancedIncSubtensor1)
                for node in f.maker.fgraph.toposort()
            ]
        )
        == 1
    )
    rval = f(yval)
    rep = xval.copy()
    np.add.at(rep, [[0, 2]], yval)
    assert np.allclose(rval, rep)
    # Basic slice increment on rows 1:.
    expr = inc_subtensor(x[1:], y)
    f = aesara.function([y], expr, mode=mode_with_gpu)
    assert (
        sum(
            [isinstance(node.op, GpuIncSubtensor) for node in f.maker.fgraph.toposort()]
        )
        == 1
    )
    rval = f(yval)
    rep = xval.copy()
    rep[1:] += yval
    assert np.allclose(rval, rep)
def test_incsub_offset():
    """Regression test for https://github.com/Theano/Theano/issues/5670."""
    floatX = aesara.config.floatX
    # A GPU shared variable whose slice ``view`` starts at a non-zero offset.
    base = gpuarray_shared_constructor(np.zeros(5, dtype=floatX))
    view = base[1:]
    # Increment a sub-slice of the offset view.
    inc = vector()
    result = inc_subtensor(view[2:], inc)
    # Route the result through ``updates`` so inc_subtensor may run inplace.
    fn = aesara.function([inc], result, updates={base: result}, mode=mode_with_gpu)
    utt.assert_allclose(fn([1, 2]), np.array([0, 0, 1, 2], dtype=floatX))
class TestGPUAdvancedSubtensor(TestAdvancedSubtensor):
    """Run the generic advanced-subtensor suite with the GPU ops (float32)."""

    def setup_method(self):
        self.shared = gpuarray_shared_constructor
        self.sub = GpuAdvancedSubtensor
        self.inc_sub = GpuAdvancedIncSubtensor
        self.mode = mode_with_gpu
        # avoid errors with limited devices
        self.dtype = "float32"  # floatX?
        # Transfer/copy ops the base suite should ignore in graph checks.
        self.ignore_topo = (HostFromGpu, GpuFromHost, DeepCopyOp)
        # GPU opt can't run in fast_compile only.
        self.fast_compile = False
        assert self.sub == GpuAdvancedSubtensor
        super().setup_method()
class TestGPUAdvancedSubtensorF16(TestAdvancedSubtensor):
    """Same as TestGPUAdvancedSubtensor but on the float16 path (read-only:
    no ``inc_sub`` is configured here)."""

    def setup_method(self):
        self.shared = gpuarray_shared_constructor
        self.sub = GpuAdvancedSubtensor
        self.mode = mode_with_gpu
        # avoid errors with limited devices
        self.dtype = "float16"  # floatX?
        self.ignore_topo = (HostFromGpu, GpuFromHost, DeepCopyOp)
        # GPU opt can't run in fast_compile only.
        self.fast_compile = False
        assert self.sub == GpuAdvancedSubtensor
        super().setup_method()
def test_adv_subtensor():
    # Test the advancedsubtensor on gpu.
    shp = (2, 3, 4)
    shared = gpuarray_shared_constructor
    xval = np.arange(np.prod(shp), dtype=aesara.config.floatX).reshape(shp)
    idx1, idx2 = ivectors("idx1", "idx2")
    # Mixed indexing: integer vectors, newaxis (None), and a basic slice.
    idxs = [idx1, None, slice(0, 2, 1), idx2, None]
    x = shared(xval, name="x")
    expr = x[idxs]
    f = aesara.function([idx1, idx2], expr, mode=mode_with_gpu)
    assert (
        sum(
            [
                isinstance(node.op, GpuAdvancedSubtensor)
                for node in f.maker.fgraph.toposort()
            ]
        )
        == 1
    )
    idx1_val = [0, 1]
    idx2_val = [0, 1]
    rval = f(idx1_val, idx2_val)
    # NumPy fancy indexing is the reference.
    rep = xval[idx1_val, None, slice(0, 2, 1), idx2_val, None]
    assert np.allclose(rval, rep)
class TestGpuExtractDiag:
    """GPU ExtractDiag: optimizer lifting plus value checks against
    ``numpy.diagonal`` on matrices and 4-d tensors."""

    def test_extractdiag_opt(self):
        # A CPU ExtractDiag should be lifted to GpuExtractDiag by the
        # optimizer under the GPU mode.
        x = matrix()
        fn = aesara.function([x], ExtractDiag()(x), mode=mode_with_gpu)
        assert any(
            [isinstance(node.op, GpuExtractDiag) for node in fn.maker.fgraph.toposort()]
        )

    def test_matrix(self):
        # Main diagonal plus positive and negative offsets on a 7x11 matrix.
        x = matrix()
        np_x = np.arange(77).reshape(7, 11).astype(aesara.config.floatX)
        fn = aesara.function([x], GpuExtractDiag()(x), mode=mode_with_gpu)
        assert np.allclose(fn(np_x), np_x.diagonal())
        fn = aesara.function([x], GpuExtractDiag(2)(x), mode=mode_with_gpu)
        assert np.allclose(fn(np_x), np_x.diagonal(2))
        fn = aesara.function([x], GpuExtractDiag(-3)(x), mode=mode_with_gpu)
        assert np.allclose(fn(np_x), np_x.diagonal(-3))

    def test_tensor(self):
        x = tensor4()
        np_x = np.arange(30107).reshape(7, 11, 17, 23).astype(aesara.config.floatX)
        # (offset, axis1, axis2) combinations covering positive/negative
        # offsets and a variety of axis orderings.
        for offset, axis1, axis2 in [
            (1, 0, 1),
            (-1, 0, 1),
            (0, 1, 0),
            (-2, 1, 0),
            (-3, 1, 0),
            (-2, 2, 0),
            (3, 3, 0),
            (-1, 3, 2),
            (2, 2, 3),
            (-1, 2, 1),
            (1, 3, 1),
            (-1, 1, 3),
        ]:
            assert np.allclose(
                GpuExtractDiag(offset, axis1, axis2)(x).eval({x: np_x}),
                np_x.diagonal(offset, axis1, axis2),
            )

    def test_tensor_float16(self):
        # Same cases as test_tensor, on the float16 path.
        x = tensor4()
        np_x = np.arange(30107).reshape(7, 11, 17, 23).astype("float16")
        for offset, axis1, axis2 in [
            (1, 0, 1),
            (-1, 0, 1),
            (0, 1, 0),
            (-2, 1, 0),
            (-3, 1, 0),
            (-2, 2, 0),
            (3, 3, 0),
            (-1, 3, 2),
            (2, 2, 3),
            (-1, 2, 1),
            (1, 3, 1),
            (-1, 1, 3),
        ]:
            assert np.allclose(
                GpuExtractDiag(offset, axis1, axis2)(x).eval({x: np_x}),
                np_x.diagonal(offset, axis1, axis2),
            )
class TestGpuAllocDiag(TestAllocDiag):
    """Run the generic AllocDiag suite with the GPU op, plus GPU-specific
    checks for optimizer lifting, forward values, and gradients."""

    def setup_method(self):
        self.alloc_diag = GpuAllocDiag
        self.mode = mode_with_gpu
        super().setup_method()

    def test_allocdiag_opt(self):
        # A CPU AllocDiag should be lifted to GpuAllocDiag by the optimizer.
        x = vector()
        fn = aesara.function([x], AllocDiag()(x), mode=mode_with_gpu)
        assert any(
            [isinstance(node.op, GpuAllocDiag) for node in fn.maker.fgraph.toposort()]
        )

    def test_matrix(self):
        # Forward values must match numpy.diag for zero/positive/negative
        # offsets.
        x = vector()
        np_x = np.arange(7).astype(aesara.config.floatX)
        fn = aesara.function([x], GpuAllocDiag()(x), mode=mode_with_gpu)
        assert np.allclose(fn(np_x), np.diag(np_x))
        fn = aesara.function([x], GpuAllocDiag(2)(x), mode=mode_with_gpu)
        assert np.allclose(fn(np_x), np.diag(np_x, 2))
        fn = aesara.function([x], GpuAllocDiag(-3)(x), mode=mode_with_gpu)
        assert np.allclose(fn(np_x), np.diag(np_x, -3))

    def _check_grad(self, offset):
        # Shared body for the gradient checks: d(sum)/dx must equal the
        # ``offset`` diagonal of d(sum)/d(GpuAllocDiag(offset)(x)).
        x = vector()
        np_x = np.random.randn(7).astype(aesara.config.floatX)
        mtx_x = GpuAllocDiag(offset)(x)
        sum_mtx_x = at_sum(mtx_x)
        grad_x = aesara.grad(sum_mtx_x, x)
        grad_mtx_x = aesara.grad(sum_mtx_x, mtx_x)
        fn_grad_x = aesara.function([x], grad_x, mode=mode_with_gpu)
        fn_grad_mtx_x = aesara.function([x], grad_mtx_x, mode=mode_with_gpu)
        computed_grad_x = fn_grad_x(np_x)
        computed_grad_mtx_x = fn_grad_mtx_x(np_x)
        true_grad_x = np.diagonal(computed_grad_mtx_x, offset)
        assert np.allclose(computed_grad_x, true_grad_x)

    def test_grad(self):
        # offset = 0, offset > 0 and offset < 0 cases — previously three
        # verbatim copies of the same 12-line check.
        for offset in (0, 2, -3):
            self._check_grad(offset)
# assert
import os
from pickle import Unpickler
import numpy as np
import pytest
import aesara
from aesara.compile.ops import DeepCopyOp, ViewOp
from aesara.configdefaults import config
from aesara.gpuarray.type import GpuArrayType, gpuarray_shared_constructor
from aesara.tensor.basic import Rebroadcast
from aesara.tensor.shape import specify_shape
from aesara.tensor.type import row
from tests.gpuarray.config import test_ctx_name
from tests.gpuarray.test_basic_ops import rand_gpuarray
pygpu = pytest.importorskip("pygpu")
# Disabled for now
# from tests.tensor.test_sharedvar import makeSharedTester
def test_deep_copy():
    """An identity function on a GPU input compiles to a DeepCopyOp and
    preserves the array's values, for float16 and float32."""
    for dtype in ("float16", "float32"):
        a = rand_gpuarray(20, dtype=dtype)
        g = GpuArrayType(dtype=dtype, broadcastable=(False,))("g")
        f = aesara.function([g], g)
        assert isinstance(f.maker.fgraph.toposort()[0].op, DeepCopyOp)
        res = f(a)
        assert GpuArrayType.values_eq(res, a)
def test_view():
    """A ViewOp on a GPU array survives compilation (with the view-removal
    optimization explicitly excluded) and returns equal values."""
    for dtype in ("float16", "float32"):
        a = rand_gpuarray(20, dtype=dtype)
        g = GpuArrayType(dtype=dtype, broadcastable=(False,))("g")
        # Exclude the opt that would otherwise strip the ViewOp.
        m = aesara.compile.get_default_mode().excluding("local_view_op")
        f = aesara.function([g], ViewOp()(g), mode=m)
        assert isinstance(f.maker.fgraph.toposort()[0].op, ViewOp)
        res = f(a)
        assert GpuArrayType.values_eq(res, a)
def test_rebroadcast():
    """Rebroadcast of dimension 0 works on a GPU array and keeps values."""
    for dtype in ("float16", "float32"):
        # Length-1 array so dimension 0 can legally be marked broadcastable.
        a = rand_gpuarray(1, dtype=dtype)
        g = GpuArrayType(dtype=dtype, broadcastable=(False,))("g")
        f = aesara.function([g], Rebroadcast((0, True))(g))
        assert isinstance(f.maker.fgraph.toposort()[0].op, Rebroadcast)
        res = f(a)
        assert GpuArrayType.values_eq(res, a)
def test_values_eq_approx():
    """values_eq_approx is reflexive and sensitive to a single changed element."""
    ref = rand_gpuarray(20, dtype="float32")
    # An array always compares approximately equal to itself.
    assert GpuArrayType.values_eq_approx(ref, ref)
    # Shifting one element breaks approximate equality.
    shifted = ref.copy()
    shifted[0] = np.asarray(shifted[0]) + 1.0
    assert not GpuArrayType.values_eq_approx(ref, shifted)
    # Flipping the sign of one element breaks it as well.
    negated = ref.copy()
    negated[0] = -np.asarray(negated[0])
    assert not GpuArrayType.values_eq_approx(ref, negated)
def test_specify_shape():
    """specify_shape accepts a GPU array whose run-time shape matches."""
    for dtype in ("float16", "float32"):
        a = rand_gpuarray(20, dtype=dtype)
        g = GpuArrayType(dtype=dtype, broadcastable=(False,))("g")
        f = aesara.function([g], specify_shape(g, [20]))
        f(a)
def test_filter_float():
    """A GPU shared scalar must accept a plain Python float in ``updates``."""
    # Temporarily register the GPU constructor as a default shared
    # constructor; undo it in ``finally`` so other tests are unaffected.
    aesara.compile.shared_constructor(gpuarray_shared_constructor)
    try:
        s = aesara.shared(np.array(0.0, dtype="float32"), target=test_ctx_name)
        aesara.function([], updates=[(s, 0.0)])
    finally:
        del aesara.compile.sharedvalue.shared.constructors[-1]
def test_filter_variable():
    # Test that filter_variable accepts more restrictive broadcast
    gpu_row = GpuArrayType(dtype=aesara.config.floatX, broadcastable=(True, False))
    gpu_matrix = GpuArrayType(dtype=aesara.config.floatX, broadcastable=(False, False))
    r = gpu_row()
    # A row (broadcastable first dim) is accepted where a matrix is expected.
    m = gpu_matrix.filter_variable(r)
    assert m.type == gpu_matrix
    # On CPU as well
    r = row()
    m = gpu_matrix.filter_variable(r)
    assert m.type == gpu_matrix
def test_gpuarray_shared_scalar():
    # By default, we don't put scalar as shared variable on the GPU
    with pytest.raises(TypeError):
        gpuarray_shared_constructor(np.asarray(1, dtype="float32"))
    # But we can force that
    gpuarray_shared_constructor(np.asarray(1, dtype="float32"), target=test_ctx_name)
def test_unpickle_gpuarray_as_numpy_ndarray_flag0():
    """With unpickle-on-CPU disabled, a pickled GpuArray must come back as a
    pygpu GpuArray with its original contents (fixture: GpuArray.pkl next to
    this test file, first element -42.0)."""
    # Test when pygpu isn't there for unpickle are in test_pickle.py
    oldflag = config.experimental__unpickle_gpu_on_cpu
    config.experimental__unpickle_gpu_on_cpu = False
    try:
        testfile_dir = os.path.dirname(os.path.realpath(__file__))
        fname = "GpuArray.pkl"
        with open(os.path.join(testfile_dir, fname), "rb") as fp:
            u = Unpickler(fp, encoding="latin1")
            mat = u.load()
        assert isinstance(mat, pygpu.gpuarray.GpuArray)
        assert np.asarray(mat)[0] == -42.0
    finally:
        # Always restore the flag, even if the assertions fail.
        config.experimental__unpickle_gpu_on_cpu = oldflag
# These tests are disabled because they expect the impossible
# @makeSharedTester(
# shared_constructor_=gpuarray_shared_constructor,
# dtype_=aesara.config.floatX,
# get_value_borrow_true_alias_=True,
# shared_borrow_true_alias_=True,
# set_value_borrow_true_alias_=True,
# set_value_inplace_=True,
# set_cast_value_inplace_=False,
# shared_constructor_accept_ndarray_=True,
# internal_type_=lambda v: pygpu.array(v, context=get_context(test_ctx_name),
# cls=pygpu._array.ndgpuarray),
# test_internal_type_=lambda a: isinstance(a, pygpu.gpuarray.GpuArray),
# aesara_fct_=aesara.tensor.exp,
# ref_fct_=np.exp,
# cast_value_=lambda v: pygpu.array(v, context=get_context(test_ctx_name),
# cls=pygpu._array.ndgpuarray))
# class TestSharedOptions(object):
# pass
# @makeSharedTester(
# shared_constructor_=gpuarray_shared_constructor,
# dtype_=aesara.config.floatX,
# get_value_borrow_true_alias_=False,
# shared_borrow_true_alias_=False,
# set_value_borrow_true_alias_=False,
# set_value_inplace_=True,
# set_cast_value_inplace_=True,
# shared_constructor_accept_ndarray_=True,
# internal_type_=lambda v: pygpu.array(v, context=get_context(test_ctx_name),
# cls=pygpu._array.ndgpuarray),
# test_internal_type_=lambda a: isinstance(a, pygpu.gpuarray.GpuArray),
# aesara_fct_=aesara.tensor.exp,
# ref_fct_=np.exp,
# cast_value_=lambda v: pygpu.array(v, context=get_context(test_ctx_name),
# cls=pygpu._array.ndgpuarray))
# class TestSharedOptions2(object):
# pass
def test_set_value_non_contiguous():
    """``set_value(..., borrow=True)`` with a strided (non-contiguous) GPU
    array must work, as must a subsequent plain ``set_value``."""
    s = gpuarray_shared_constructor(np.asarray([[1.0, 2.0], [1.0, 2.0], [5, 6]]))
    # Every-other-row view is non-contiguous.
    s.set_value(s.get_value(borrow=True, return_internal_type=True)[::2], borrow=True)
    assert not s.get_value(borrow=True, return_internal_type=True).flags["C_CONTIGUOUS"]
    # In the past, this failed
    s.set_value([[0, 0], [1, 1]])
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论