Commit 2a5fc594 authored by Maxim Kochurov, committed by Brandon T. Willard

Remove aesara.gpuarray

Parent b3ce3640
import logging
import os
import sys
import warnings
import aesara
from aesara.compile import optdb
from aesara.configdefaults import config
from aesara.tensor.basic import register_transfer
# Logger shared by the whole ``aesara.gpuarray`` package.
_logger_name = "aesara.gpuarray"
_logger = logging.getLogger(_logger_name)
error = _logger.error
info = _logger.info
# Set to True once a GPU context has been successfully initialized.
pygpu_activated = False
# Used to skip initialization checking when we are in the same process.
aesara_gpu_is_already_active = False
try:
    import pygpu
    import pygpu.gpuarray
except ImportError:
    # pygpu is optional: the module still imports, but GPU support is off.
    pygpu = None
from aesara.gpuarray import (
ctc,
dnn,
extra_ops,
fft,
multinomial,
opt,
reduction,
rng_mrg,
sort,
)
from aesara.gpuarray.basic_ops import as_gpuarray_variable
# This is for documentation not to depend on the availability of pygpu
from aesara.gpuarray.type import (
ContextNotDefined,
GpuArrayConstant,
GpuArraySharedVariable,
GpuArrayType,
GpuArrayVariable,
get_context,
gpuarray_shared_constructor,
reg_context,
)
def transfer(x, target):
    """
    Transfer hook registered with `register_transfer`.

    Moves `x` to the GPU context named `target` when such a context has
    been registered; returns None otherwise so other transfer handlers
    can be tried.
    """
    try:
        get_context(target)
        return as_gpuarray_variable(x, target)
    except ContextNotDefined:
        pass


register_transfer(transfer)
def pygpu_parse_version(version_string):
    """
    Split a pygpu version string into its numeric components.

    Returns a ``version_type`` namedtuple with integer ``major``,
    ``minor`` and ``patch`` fields plus the normalized ``fullversion``
    string.  The third component may carry a "+<git-commit>" or ".devN"
    suffix, which is kept in ``fullversion`` but excluded from ``patch``.
    """
    from collections import namedtuple

    version_type = namedtuple(
        "version_type", ("major", "minor", "patch", "fullversion")
    )
    pieces = version_string.split(".", 2)
    assert len(pieces) == 3, version_string
    major, minor = int(pieces[0]), int(pieces[1])
    tail = pieces[2]
    # The patch number is whatever precedes a "+git" or ".devN" suffix.
    separator = "+" if "+" in tail else "."
    patch = int(tail.split(separator, 1)[0])
    fullversion = f"{major}.{minor}.{tail}"
    return version_type(major=major, minor=minor, patch=patch, fullversion=fullversion)
def init_dev(dev, name=None, preallocate=None):
    """
    Initialize the GPU device `dev` and register its context under `name`.

    Parameters
    ----------
    dev : str
        Device specifier, e.g. "cuda", "cudaN" or "openclM:N".
    name : str or None
        Context name mapped to the device (None is the default context).
    preallocate : number or None
        Memory-preallocation setting; falls back to
        ``config.gpuarray__preallocate`` when None.  Values in (0, 1] are
        a fraction of total GPU memory, values > 1 are megabytes, and
        negative values disable the allocation cache.
    """
    global pygpu_activated
    global aesara_gpu_is_already_active
    # Refuse to initialize from a subprocess when the parent already did:
    # the driver state does not survive a fork.
    if (
        not aesara_gpu_is_already_active
        and os.environ.get("AESARA_GPU_IS_ALREADY_ACTIVE", "") == "Yes"
    ):
        raise RuntimeError(
            "You can't initialize the GPU in a subprocess if the parent process already did it"
        )
    if not config.cxx:
        raise RuntimeError("The new gpu-backend need a c++ compiler.")
    # Only the pygpu 0.7.x series is supported.
    pygpu_version = pygpu_parse_version(pygpu.__version__)
    if pygpu_version.major != 0 or pygpu_version.minor != 7 or pygpu_version.patch < 0:
        raise ValueError(
            "Your installed version of pygpu(%s) is too old, please upgrade to 0.7.0 or later (but below 0.8.0)"
            % pygpu_version.fullversion
        )
    # This is for the C headers API, we need to match the exact version.
    gpuarray_version_major_supported = 2
    gpuarray_version_major_detected = pygpu.gpuarray.api_version()[0]
    if gpuarray_version_major_detected != gpuarray_version_major_supported:
        raise ValueError(
            "Your installed version of libgpuarray is not in sync with the current Aesara"
            f" version. The installed libgpuarray version supports API version {int(gpuarray_version_major_detected)},"
            f" while current Aesara supports API version {int(gpuarray_version_major_supported)}. Change the version of"
            " libgpuarray or Aesara to fix this problem.",
        )
    if dev not in init_dev.devmap:
        # First time this device is seen: create and configure a context.
        args = dict()
        if config.gpuarray__cache_path != "":
            args["kernel_cache_path"] = config.gpuarray__cache_path
        if preallocate is None:
            preallocate = config.gpuarray__preallocate
        if preallocate < 0:
            args["max_cache_size"] = 0
        else:
            args["initial_cache_size"] = preallocate
        context = pygpu.init(
            dev,
            sched=config.gpuarray__sched,
            single_stream=config.gpuarray__single_stream,
            **args,
        )
        # Mark the GPU as active so that forked children refuse to re-init.
        os.environ["AESARA_GPU_IS_ALREADY_ACTIVE"] = "Yes"
        aesara_gpu_is_already_active = True
        context.dev = dev
        init_dev.devmap[dev] = context
        reg_context(name, context)
        MB = 1024 * 1024
        if dev.startswith("cuda"):
            avail = dnn.dnn_available(name)
            # If we try to enable cudnn and there isn't enough GPU
            # memory, there will be an unclear error message. So do
            # not even try a clear error.
            if avail and context.free_gmem < 75 * MB:
                raise RuntimeError(
                    f"Can not enable cuDNN as there is only {int(context.free_gmem / MB)} MB of free GPU memory."
                )
            elif avail:
                context.cudnn_handle = dnn._make_handle(context)
            elif config.dnn__enabled == "True":
                raise RuntimeError(
                    "You enabled cuDNN, but we aren't able to use it: %s"
                    % dnn.dnn_available.msg
                )
            if config.print_active_device:
                if avail:
                    print(
                        f"Using cuDNN version {int(dnn.version())} on context {name}",
                        file=sys.stderr,
                    )
                else:
                    print(
                        f"Can not use cuDNN on context {name}: {dnn.dnn_available.msg}",
                        file=sys.stderr,
                    )
        if preallocate < 0:
            print(f"Disabling allocation cache on {dev}")
        elif preallocate > 0:
            # <= 1 is a fraction of total memory (capped at 95%); > 1 is MB.
            if preallocate <= 1:
                gmem = min(preallocate, 0.95) * context.total_gmem
            else:
                gmem = preallocate * MB
            if gmem > context.free_gmem:
                raise RuntimeError(
                    f"Trying to preallocate {int(gmem / MB)} MB of GPU memory while only"
                    f" {int(context.free_gmem / MB)} MB are available."
                )
            elif gmem > context.free_gmem - 50 * MB:
                warnings.warn(
                    "Preallocating too much memory can prevent cudnn and cublas from working properly"
                )
            # This will allocate and immediately free an object of size gmem
            # which will reserve that amount of memory on the GPU.
            pygpu.empty((gmem,), dtype="int8", context=context)
            if config.print_active_device:
                print(
                    f"Preallocating {int(gmem // MB)}/{int(context.total_gmem // MB)} Mb ({gmem / context.total_gmem}) on {dev}",
                    file=sys.stderr,
                )
        # Initialise the blas kernels. We do this after the
        # preallocation to not fragment the heap accidentally.
        tmp = pygpu.empty((2, 2), dtype="float32", context=context)
        if dev.startswith("cuda"):
            # In OpenCL, BLAS isn't always available
            pygpu.blas.gemm(0, tmp, tmp, 0, tmp, overwrite_c=True)
        del tmp
    else:
        context = init_dev.devmap[dev]
    # This will map the context name to the real context object.
    if config.print_active_device:
        try:
            unique_id = "(" + context.unique_id + ")"
        except pygpu.gpuarray.UnsupportedException:
            unique_id = ""
        print(
            f"Mapped name {name} to device {dev}: {context.devname} {unique_id}",
            file=sys.stderr,
        )
    pygpu_activated = True


# This maps things like 'cuda0' to the context object on that device.
init_dev.devmap = {}
def use(
    device,
    force=False,
    default_to_move_computation_to_gpu=True,
    move_shared_to_gpu=True,
    preallocate=None,
):
    """
    Error and warning about CUDA should be displayed only when this
    function is called. We need to be able to load this module only
    to check if it is available!

    Parameters
    ----------
    device : string
        "cuda", "cuda0", "cudaN", "" (N is the device number to use).
        "" means do all the rest and don't init a device.
    force
        Will always raise an exception if we can't use the gpu.
    default_to_move_computation_to_gpu
        If gpu init succeeded, enable by default optimizations to move
        computations to the gpu.
    move_shared_to_gpu
        If gpu init succeeded, put new shared variables on the gpu.
    preallocate
        If specified, will use this value for preallocation instead of
        gpuarray__preallocate.

    """
    if force:
        if not (device.startswith("cuda") or device.startswith("opencl")):
            raise Exception("forced the init and bad device provided: " + device)
        else:
            # If we force, the device should not already be initialized.
            assert device not in init_dev.devmap
    if device:
        init_dev(device, preallocate=preallocate)
    if default_to_move_computation_to_gpu:
        # Enable the GPU graph optimizations under the standard modes.
        optdb.add_tags("gpuarray_opt", "fast_run", "fast_compile")
        optdb.add_tags("gpua_scanOp_make_inplace", "fast_run")
    if move_shared_to_gpu:
        # New shared variables will be constructed on the GPU by default.
        import aesara.compile

        aesara.compile.shared_constructor(gpuarray_shared_constructor)
if pygpu:
    try:
        # Initialize devices according to the user's configuration flags.
        if config.device.startswith("cuda") or config.device.startswith("opencl"):
            use(config.device)
        elif config.init_gpu_device.startswith(
            "cuda"
        ) or config.init_gpu_device.startswith("opencl"):
            # init_gpu_device only initializes the device; computations are
            # not moved there by default, so device must stay "cpu".
            if config.device != "cpu":
                raise ValueError("you must set device=cpu to use init_gpu_device.")
            if config.contexts != "":
                print(
                    "Using contexts will make init_gpu_device act like device and move all computations by default, which might not be what you want."
                )
            init_dev(config.init_gpu_device)
        if config.contexts != "":
            # config.contexts has the form "name0->dev0;name1->dev1;...".
            for n, d in (c.split("->") for c in config.contexts.split(";")):
                init_dev(d.strip(), n.strip())
            # To have shared var default on the GPU and opt to move to the GPU.
            use("")
    except Exception:
        error("Could not initialize pygpu, support disabled", exc_info=True)
    from .basic_ops import (
        GpuAlloc,
        GpuAllocEmpty,
        GpuContiguous,
        GpuEye,
        GpuFromHost,
        GpuJoin,
        GpuReshape,
        GpuSplit,
        HostFromGpu,
        host_from_gpu,
    )
    from .elemwise import GpuElemwise
    from .subtensor import GpuAdvancedIncSubtensor1, GpuIncSubtensor, GpuSubtensor
else:
    # pygpu could not be imported: complain only if the configuration
    # actually asked for a GPU.
    if (
        config.init_gpu_device.startswith("cuda")
        or config.init_gpu_device.startswith("opencl")
        or config.device.startswith("opencl")
        or config.device.startswith("cuda")
        or config.contexts != ""
    ):
        error(
            "pygpu was configured but could not be imported or is too old (version 0.7 or higher required)",
            exc_info=True,
        )
import copy
import os
import re
from collections import deque
from typing import Union
import numpy as np
import aesara
import aesara.tensor as at
from aesara.configdefaults import config
from aesara.gradient import grad_undefined
from aesara.graph.basic import Apply, Variable
from aesara.graph.op import Op, _NoPythonOp
from aesara.graph.opt import copy_stack_trace
from aesara.graph.utils import MethodNotDefined
from aesara.link.c.interface import HideC
from aesara.link.c.op import COp, ExternalCOp
from aesara.link.c.params_type import ParamsType
from aesara.link.c.type import CType
from aesara.scalar import bool as bool_t
from aesara.scalar import int32 as int32_t
from aesara.tensor.basic import Alloc, AllocEmpty, Join, Split, infer_broadcastable
from aesara.tensor.shape import Reshape
from aesara.tensor.type import TensorType, values_eq_approx_always_true
try:
import pygpu
from pygpu import gpuarray
except ImportError:
pass
from aesara.gpuarray.fp16_help import write_w
from aesara.gpuarray.type import (
EQ_MAP,
ContextNotDefined,
GpuArrayConstant,
GpuArrayType,
GpuContextType,
get_context,
gpu_context_type,
)
def as_gpuarray_variable(x, context_name):
    """
    This will attempt to convert `x` into a variable on the GPU.

    It can take either a value or another variable. If `x` is already
    suitable, it will be returned as-is.

    Parameters
    ----------
    x
        Object to convert
    context_name : str or None
        target context name for the result
    """
    # If this is already some form of variable, try to avoid an extra transfer
    if isinstance(x, Variable):
        while True:
            # If we are already a GpuArrayVariable in the right context
            # then there is nothing to do.
            if isinstance(x.type, GpuArrayType) and x.type.context_name == context_name:
                return x
            # If x is the result of a transfer, try to dig through.
            if getattr(x, "owner", None):
                if isinstance(x.owner.op, HostFromGpu):
                    x = x.owner.inputs[0]
                    continue
                if isinstance(x.owner.op, GpuFromHost):
                    x = x.owner.inputs[0]
                    continue
                if isinstance(x.owner.op, GpuToGpu):
                    x = x.owner.inputs[0]
                    continue
            # If none of the conditions were met, then continue with
            # the rest of the body
            break
        # If we couldn't deal with transfers, then maybe it's a tensor
        if isinstance(x.type, TensorType):
            return copy_stack_trace(x, GpuFromHost(context_name)(x))
    # Try _as_GpuArrayVariable if possible
    if hasattr(x, "_as_GpuArrayVariable"):
        return copy_stack_trace(x, x._as_GpuArrayVariable(context_name))
    # If it didn't work try for a constant
    ctx = get_context(context_name)
    if isinstance(x, gpuarray.GpuArray):
        # Move raw GpuArray data to the requested context if needed.
        if x.context.ptr != ctx.ptr:
            x = x.transfer(ctx)
    x = gpuarray.asarray(x, context=ctx)
    bcast = [(s == 1) for s in x.shape]
    return GpuArrayConstant(
        GpuArrayType(dtype=x.dtype, broadcastable=bcast, context_name=context_name), x
    )
def infer_context_name(*vars):
    """
    Infer the GPU context name to use from the given variables.

    Performs a breadth-first search over the variables (and single-input
    parents); the first context name found wins.  Falls back to the
    default (None) context when it is defined and raises ``ValueError``
    otherwise.
    """
    # extendleft + pop(right) makes the deque behave as a FIFO queue that
    # preserves the order of the given variables.
    queue = deque()
    queue.extendleft(vars)
    while queue:
        var = queue.pop()
        if isinstance(var.type, GpuArrayType):
            return var.type.context_name
        if hasattr(var.tag, "context_name"):
            return var.tag.context_name
        if var.owner:
            if isinstance(var.owner.op, HostFromGpu):
                return var.owner.inputs[0].type.context_name
            # Only dig through single-input nodes to keep the search cheap.
            if len(var.owner.inputs) == 1:
                queue.extendleft(var.owner.inputs)
    # If we can't find a context try None if it exists
    try:
        get_context(None)
    except ContextNotDefined:
        raise ValueError("Could not infer context from inputs")
    return None
def gpuarray_helper_inc_dir():
    """Return the directory holding this module's C support headers."""
    here = os.path.dirname(__file__)
    return os.path.join(here, "c_code")
class Kernel:
    """
    This class groups together all the attributes of a gpu kernel.

    `params` should contain the data type for each argument. Buffer
    arguments should use the GpuArray class as the data type and
    scalar should use their equivalent numpy dtype. For ga_size and
    ga_ssize, use gpuarray.SIZE and gpuarray.SSIZE.

    If the `ctypes` flags is set to `True` then it should be a C
    string which represent the typecode to use.

    `flags` can contain the following keys whose values are booleans:

        have_double
            the kernel uses double-typed variables somewhere
        have_small
            the kernel uses variables whose type takes less than 4
            bytes somewhere
        have_complex
            the kernel uses complex values somewhere
        have_half
            the kernel uses half-floats somewhere
        ctypes
            the `params` list consists of C typecodes

    It can also have the key `cflags` which is a string of C flag
    values like this `"GA_USE_DOUBLE|GA_USE_SMALL"`.

    Parameters
    ----------
    code: str
        The source code of the kernel.
    params: list
        list of parameter types.
    name: str
        the name of the kernel function in the source.
    flags: dict
        dictionary of flags
    codevar: str
        the name of the variable for the code object.
        (defaults to `kcode_` + name)
    objvar: str
        the name of the variable for the kernel object.
        (defaults to `k_` + name)
    fname: str
        the name of the function wrapper.
        (defaults to name + `_call`)
    sname: str
        the name of the scheduled call function
        (defaults to name + `_scall`)

    """

    def __init__(
        self,
        code,
        params,
        name,
        flags,
        codevar=None,
        objvar=None,
        fname=None,
        sname=None,
    ):
        self.code = code
        self.params = params
        self.name = name
        self.flags = flags
        # Derive the default C-level identifiers from the kernel name.
        self.codevar = codevar if codevar is not None else "kcode_" + name
        self.objvar = objvar if objvar is not None else "k_" + name
        self.fname = fname if fname is not None else name + "_call"
        self.sname = sname if sname is not None else name + "_scall"

    @staticmethod
    def get_flags(*types):
        """Compute the flags dict implied by the given argument types."""

        def as_dtype(t):
            if isinstance(t, str):
                return np.dtype(t)
            if isinstance(t, CType):
                return t.dtype
            if isinstance(t, Variable):
                return t.type.dtype
            raise TypeError(f"can't get a dtype from {type(t)}")

        dtypes = [as_dtype(t) for t in types]
        flags = {}
        if any(d == np.float64 for d in dtypes):
            flags["have_double"] = True
        if any(d.itemsize < 4 for d in dtypes):
            flags["have_small"] = True
        if any(d.kind == "c" for d in dtypes):
            flags["have_complex"] = True
        if any(d == np.float16 for d in dtypes):
            flags["have_half"] = True
        return flags

    def _get_c_flags(self):
        # Collect the C flag names in a fixed order: explicit cflags first,
        # then one entry per boolean feature flag.
        chunks = []
        if self.flags.get("cflags", "") != "":
            chunks.append(self.flags["cflags"])
        for key, cname in (
            ("have_double", "GA_USE_DOUBLE"),
            ("have_small", "GA_USE_SMALL"),
            ("have_complex", "GA_USE_COMPLEX"),
            ("have_half", "GA_USE_HALF"),
        ):
            if self.flags.get(key, False):
                chunks.append(cname)
        joined = "|".join(chunks)
        return joined if joined else "0"

    def _get_py_flags(self):
        # Translate any C-style cflags back into the boolean flag keys.
        res = dict(self.flags)
        cflags = res.pop("cflags", "")
        c_to_py = {
            "GA_USE_DOUBLE": "have_double",
            "GA_USE_SMALL": "have_small",
            "GA_USE_COMPLEX": "have_complex",
            "GA_USE_HALF": "have_half",
        }
        for fl in cflags.split("|"):
            key = c_to_py.get(fl.strip())
            if key is not None:
                res[key] = True
        return res

    def _get_c_types(self):
        # Map each parameter to its libgpuarray typecode.
        def typecode(t):
            if t == gpuarray.GpuArray:
                return "GA_BUFFER"
            return str(gpuarray.dtype_to_typecode(t))

        return ", ".join(typecode(t) for t in self.params)
def get_ctype(dtype):
    """Map a kernel parameter type to the matching C type name."""
    if dtype is gpuarray.GpuArray:
        return "gpudata *"
    if isinstance(dtype, np.dtype):
        return "npy_" + dtype.name
    if dtype == gpuarray.SIZE:
        return "size_t"
    if dtype == gpuarray.SSIZE:
        return "ssize_t"
    # Anything else should be coercible to a numpy dtype.
    return "npy_" + np.dtype(dtype).name
class GpuKernelBase:
    """
    Base class for operations that need to compile kernels.

    It is not mandatory to use this class, but it helps with a lot of
    the small things that you have to pay attention to.
    """

    # Ops needing extra parameters can override this with a ParamsType
    # that includes gpu_context_type.
    params_type: Union[ParamsType, GpuContextType] = gpu_context_type

    def get_params(self, node):
        # Default implementation, suitable for most sub-classes.
        # To be necessarily overridden in a subclass that uses a ParamsType.
        assert (
            self.params_type is gpu_context_type
            and node.inputs
            and isinstance(node.inputs[0].type, GpuArrayType)
        )
        return node.inputs[0].type.context

    def get_gpu_context(self, node):
        # Private method used to retrieve GPU context, instead of
        # directly using self.get_params(node), as this latter may be overridden.
        if isinstance(self.params_type, ParamsType) and self.params_type.has_type(
            gpu_context_type
        ):
            # Get field name of gpu_context_type into ParamsType object.
            gpu_context_field = self.params_type.get_field(gpu_context_type)
            # Get Params object (self.get_params() should have been overridden).
            wrap = self.get_params(node)
            # Get GPU context from Params object.
            return getattr(wrap, gpu_context_field)
        assert self.params_type is gpu_context_type
        return self.get_params(node)

    def get_gpu_context_c_name(self, params_c_name):
        # Private method used to retrieve C name of GPU context variable,
        # instead of directly using sub['params'], as params may not be a GPU context
        # (e.g. for sub-classes that use ParamsType).
        if isinstance(self.params_type, ParamsType) and self.params_type.has_type(
            gpu_context_type
        ):
            return f"({params_c_name}->{self.params_type.get_field(gpu_context_type)})"
        assert self.params_type is gpu_context_type
        return params_c_name

    def gpu_kernels(self, node, name):
        """
        This is the method to override. This should return an iterable
        of Kernel objects that describe the kernels this op will need.
        """
        raise MethodNotDefined("gpu_kernels")

    def c_headers(self, **kwargs):
        try:
            o = super().c_headers(**kwargs)
        except MethodNotDefined:
            o = []
        return o + ["gpuarray/types.h", "numpy/npy_common.h"]

    def c_header_dirs(self, **kwargs):
        try:
            o = super().c_header_dirs(**kwargs)
        except MethodNotDefined:
            o = []
        # We rely on the input types for the directory to gpuarray includes
        return o + [np.get_include()]

    def _generate_kernel_code(self, k):
        # Escape the kernel source into a C string literal: newlines become
        # literal "\n" sequences and double quotes are backslash-escaped.
        code = "\\n".join(l for l in k.code.split("\n"))
        code = code.replace('"', '\\"')
        return """static const char *%(cname)s_unsigned = "%(code)s";
static const char *%(cname)s = (char *)%(cname)s_unsigned;
""" % dict(
            cname=k.codevar, code=code
        )

    def _generate_kernel_vars(self, k):
        return f"""GpuKernel {k.objvar};"""

    def _generate_kernel_wrap(self, k):
        # Emit a C wrapper that sets the kernel arguments and launches it,
        # plus a "scheduled" variant that lets libgpuarray pick the launch
        # dimensions for 1-d problems.
        args = []
        setargs = []
        for i, p in enumerate(k.params):
            args.append(f"{get_ctype(p)} arg{i}")
            # Buffers are passed by value, scalars by address.
            if p is gpuarray.GpuArray:
                setarg = "GpuKernel_setarg(&{0}, {1}, arg{1});"
            else:
                setarg = "GpuKernel_setarg(&{0}, {1}, &arg{1});"
            setargs.append(setarg.format(k.objvar, i))
        args = ", ".join(args)
        setargs = "\n ".join(setargs)
        return """
int {fname}(unsigned int _nd, size_t *_gdim, size_t *_ldim, size_t _shared,
            {args}) {{
  {setargs}
  return GpuKernel_call(&{kname}, _nd, _gdim, _ldim, _shared, NULL);
}}
int {sname}(unsigned int _nd, size_t *_n, size_t _shared, {args}) {{
  size_t _gs = 0;
  size_t _ls = 0;
  int _err;
  if (_nd != 1) return GA_UNSUPPORTED_ERROR;
  _err = GpuKernel_sched(&{kname}, _n[0], &_gs, &_ls);
  if (_err != GA_NO_ERROR)
    return _err;
  {setargs}
  return GpuKernel_call(&{kname}, 1, &_gs, &_ls, _shared, NULL);
}}
""".format(
            args=args, fname=k.fname, setargs=setargs, sname=k.sname, kname=k.objvar
        )

    def c_support_code_apply(self, node, name):
        kernels = self.gpu_kernels(node, name)
        codes = "\n".join(self._generate_kernel_code(k) for k in kernels)
        return codes

    def c_support_code_struct(self, node, name):
        kernels = self.gpu_kernels(node, name)
        kvars = "\n".join(self._generate_kernel_vars(k) for k in kernels)
        wrappers = "\n".join(self._generate_kernel_wrap(k) for k in kernels)
        return kvars + "\n" + wrappers

    def _generate_zeros(self, k):
        # Zero the kernel struct so cleanup is safe even if init fails.
        return f"""memset(&{k.objvar}, 0, sizeof({k.objvar}));"""

    def _generate_kernel_init(self, k, fail, ctx):
        return """{
  int err;
  int types[%(numargs)u] = {%(types)s};
  if ((err = GpuKernel_init(&%(ovar)s, %(ctx)s->ctx, 1,
                            &%(cname)s, NULL, "%(kname)s", %(numargs)u,
                            types, %(flags)s, NULL)) != GA_NO_ERROR) {
    PyErr_Format(PyExc_RuntimeError, "GpuKernel_init error %%d: %%s",
                 err, gpucontext_error(%(ctx)s->ctx, err));
    %(fail)s
  }
}""" % dict(
            numargs=len(k.params),
            types=k._get_c_types(),
            ovar=k.objvar,
            kname=k.name,
            cname=k.codevar,
            flags=k._get_c_flags(),
            fail=fail,
            ctx=ctx,
        )

    def c_init_code_struct(self, node, name, sub):
        ctx = self.get_gpu_context_c_name(sub["params"])
        kernels = self.gpu_kernels(node, name)
        inits_0 = "\n".join(self._generate_zeros(k) for k in kernels)
        inits = "\n".join(
            self._generate_kernel_init(k, sub["fail"], ctx) for k in kernels
        )
        return "\n".join([inits_0, inits])

    def _generate_kernel_cleanup(self, k):
        return f"GpuKernel_clear(&{k.objvar});"

    def c_cleanup_code_struct(self, node, name):
        kernels = self.gpu_kernels(node, name)
        cleanups = "\n".join(self._generate_kernel_cleanup(k) for k in kernels)
        return cleanups

    # This is a shorthand for if your op only has a fixed version
    # You can reimplement it, but make sure to call kernel_version()
    def c_code_cache_version_apply(self, node):
        v = self.c_code_cache_version()
        if not v:
            return ()
        return (v, self.kernel_version(node))

    def kernel_version(self, node):
        """
        If you override :meth:`c_code_cache_version_apply`, call this
        method to have the version of the kernel support code.

        Parameters
        ----------
        node : apply node
            The node that we need the cache version for.
        """
        return (9,)
class GpuKernelBaseCOp(GpuKernelBase, COp):
    """`COp` with GPU kernel compilation support."""

    pass
class GpuKernelBaseExternalCOp(GpuKernelBase, ExternalCOp):
    """`ExternalCOp` with GPU kernel compilation support."""

    pass
def forward_string_meth(name):
    """
    Build a method `name` that concatenates `GpuKernelBase`'s output for
    that method with `ExternalCOp`'s (when the latter defines it).

    Used by `CGpuKernelBase` to merge the generated-kernel C code with the
    user-provided C sections.
    """

    def f(*args):
        out = getattr(GpuKernelBase, name)(*args)
        try:
            # Append the ExternalCOp section when it exists.
            out = out + "\n" + getattr(ExternalCOp, name)(*args)
        except MethodNotDefined:
            pass
        return out

    f.__name__ = name
    return f
def get_dtype(s):
    """
    Translate a type string from a ``#kernel`` spec into a Kernel
    parameter type: "*" is a GPU buffer, "size"/"ssize" are the
    libgpuarray size types, anything else is a numpy dtype name.
    """
    if s == "*":
        return gpuarray.GpuArray
    if s == "size":
        return gpuarray.SIZE
    if s == "ssize":
        return gpuarray.SSIZE
    return np.dtype(s)
class CGpuKernelBase(GpuKernelBaseExternalCOp, _NoPythonOp):
    """
    Class to combine GpuKernelBase and ExternalCOp.

    It adds a new section type 'kernels' where you can define kernels
    with the '#kernel' tag
    """

    SECTIONS = copy.copy(ExternalCOp.SECTIONS)
    SECTIONS.add("kernels")
    # Matches "#kernel name:types:flags" lines inside the kernels section.
    kernel_re = re.compile(r"^#kernel ([a-zA-Z_].*?)$", re.MULTILINE)
    get_params = GpuKernelBase.get_params
    # Merge GpuKernelBase's generated code with the user's C sections.
    c_support_code_apply = forward_string_meth("c_support_code_apply")
    c_support_code_struct = forward_string_meth("c_support_code_struct")
    c_init_code_struct = forward_string_meth("c_init_code_struct")
    c_cleanup_code_struct = forward_string_meth("c_cleanup_code_struct")

    def c_code_cache_version_apply(self, node):
        return GpuKernelBase.c_code_cache_version_apply(self, node)

    def _type_macros(self, node):
        # Build #define/#undef pairs exposing input/output dtypes to the
        # kernel source as DTYPE_INPUT_i / DTYPE_OUTPUT_i.
        define_template = "#define %s %s\n"
        undef_template = "#undef %s\n"
        define_macros = []
        undef_macros = []
        for i, v in enumerate(node.inputs):
            if isinstance(v.type, GpuArrayType):
                macro_name = f"DTYPE_INPUT_{i}"
                macro_value = pygpu.gpuarray.dtype_to_ctype(v.dtype)
                define_macros.append(define_template % (macro_name, macro_value))
                undef_macros.append(undef_template % macro_name)
        for i, v in enumerate(node.outputs):
            if isinstance(v.type, GpuArrayType):
                macro_name = f"DTYPE_OUTPUT_{i}"
                macro_value = pygpu.gpuarray.dtype_to_ctype(v.dtype)
                define_macros.append(define_template % (macro_name, macro_value))
                undef_macros.append(undef_template % macro_name)
        return "".join(define_macros), "".join(undef_macros)

    def gpu_kernels(self, node, name):
        if hasattr(self, "_cached_kernels"):
            return self._cached_kernels
        if "kernels" in self.code_sections:
            code = self.code_sections["kernels"]
            # kernel_re.split yields [prefix, spec0, body0, spec1, body1, ...]
            split = self.kernel_re.split(code)
            if split[0].strip() != "":
                raise ValueError(
                    "Stray code in kernels section before the "
                    "first #kernel statement."
                )
            def_macros, undef_macros = self._type_macros(node)
            n = 1
            res = []
            while n < len(split):
                kspec = split[n]
                kcode = split[n + 1]
                splt2 = kspec.split(":")
                if len(splt2) != 3:
                    raise ValueError(f"Bad kernel spec: {kspec}")
                kname = splt2[0].strip()
                ktypes = [get_dtype(s.strip()) for s in splt2[1].split(",")]
                kflags = splt2[2].strip()
                # Wrap the kernel body in the dtype macros.
                kcode = def_macros + "\n" + kcode + "\n" + undef_macros
                res.append(Kernel(kcode, ktypes, kname, flags=dict(cflags=kflags)))
                n += 2
            self._cached_kernels = res
            return res
        else:
            return GpuKernelBase.gpu_kernels(self, node, name)
class HostFromGpu(COp):
    """
    Transfer data to CPU.
    """

    __props__ = ()
    # This op can handle float16 data.
    _f16_ok = True

    def __str__(self):
        return "HostFromGpu(gpuarray)"

    def make_node(self, x):
        if not isinstance(x.type, GpuArrayType):
            raise TypeError(x)
        out_var = TensorType(dtype=x.dtype, broadcastable=x.broadcastable)()
        # Keep the special comparison if there is one.
        values_eq_approx = getattr(x.tag, "values_eq_approx", None)
        if values_eq_approx:
            out_var.tag.values_eq_approx = EQ_MAP.get(
                values_eq_approx, values_eq_approx
            )
        return Apply(self, [x], [out_var])

    def perform(self, node, inp, out):
        (x,) = inp
        (z,) = out
        z[0] = np.asarray(x)

    def c_code(self, node, name, inputs, outputs, sub):
        # Copies the device buffer into a freshly-allocated ndarray, going
        # through a contiguous GPU-side copy when the input is not a single
        # memory segment.
        return """
        GpuArray %(name)s_ga_s;
        GpuArray *%(name)s_ga = NULL;
        int %(name)serr;
        PyArray_Descr *%(name)s_dtype;
        if (!GpuArray_ISONESEGMENT(&%(inp)s->ga)) {
          if (GpuArray_copy(&%(name)s_ga_s, &%(inp)s->ga, GA_C_ORDER) != GA_NO_ERROR) {
            PyErr_SetString(PyExc_RuntimeError, "Can't make contiguous copy");
            %(fail)s;
          }
          %(name)s_ga = &%(name)s_ga_s;
        } else {
          %(name)s_ga = &%(inp)s->ga;
        }
        %(name)s_dtype = typecode_to_dtype(%(name)s_ga->typecode);
        Py_XDECREF(%(out)s);
        // PyArray_Empty below steals a reference to the dtype we pass it
        // so we need an extra one to spare.
        Py_INCREF(%(name)s_dtype);
        %(out)s = (PyArrayObject *)PyArray_Empty(%(inp)s->ga.nd,
                                (npy_intp *)%(inp)s->ga.dimensions,
                                %(name)s_dtype,
                                (%(inp)s->ga.flags & GA_F_CONTIGUOUS) &&
                                !(%(inp)s->ga.flags & GA_C_CONTIGUOUS));
        if (%(out)s == NULL) {
          if (%(name)s_ga == &%(name)s_ga_s) GpuArray_clear(%(name)s_ga);
          %(fail)s
        }
        Py_BEGIN_ALLOW_THREADS
        %(name)serr = GpuArray_read(PyArray_DATA(%(out)s),
                                    PyArray_NBYTES(%(out)s),
                                    %(name)s_ga);
        Py_END_ALLOW_THREADS
        if (%(name)s_ga == &%(name)s_ga_s) GpuArray_clear(%(name)s_ga);
        if (%(name)serr != GA_NO_ERROR) {
          PyErr_SetString(PyExc_RuntimeError, "Could not read device data.");
          %(fail)s
        }
        """ % {
            "name": name,
            "fail": sub["fail"],
            "inp": inputs[0],
            "out": outputs[0],
        }

    def c_code_cache_version(self):
        return (2,)

    def grad(self, inputs, grads):
        (gz,) = grads
        # The gradient of a GPU->host transfer is a host->GPU transfer on
        # the same context.
        return [GpuFromHost(inputs[0].type.context_name)(gz)]

    def R_op(self, inputs, eval_points):
        (ev,) = eval_points
        return [self(ev)]

    def infer_shape(self, fgraph, node, xshp):
        return xshp


# Singleton instance; HostFromGpu has no parameters.
host_from_gpu = HostFromGpu()
class GpuFromHost(COp):
    """
    Transfer data to GPU.
    """

    __props__ = ("context_name",)
    # This op can handle float16 data.
    _f16_ok = True
    params_type = gpu_context_type

    def __init__(self, context_name):
        self.context_name = context_name

    def __str__(self):
        return f"GpuFromHost<{self.context_name}>"

    def make_node(self, x):
        if not isinstance(x.type, TensorType):
            raise TypeError(x)
        if "complex" in x.dtype:
            raise TypeError("complex not supported in the new gpuarray back-end.", x)
        out_var = GpuArrayType(
            broadcastable=x.broadcastable, context_name=self.context_name, dtype=x.dtype
        )()
        # Keep the special comparison if there is one.
        values_eq_approx = getattr(x.tag, "values_eq_approx", None)
        if values_eq_approx:
            out_var.tag.values_eq_approx = EQ_MAP.get(
                values_eq_approx, values_eq_approx
            )
        return Apply(self, [x], [out_var])

    def get_params(self, node):
        return get_context(self.context_name)

    def perform(self, node, inp, out, ctx):
        (x,) = inp
        (z,) = out
        z[0] = gpuarray.array(x, context=ctx)

    def grad(self, inputs, grads):
        (gz,) = grads
        # The gradient of a host->GPU transfer is a GPU->host transfer.
        return [
            as_gpuarray_variable(gz, context_name=self.context_name).transfer("cpu")
        ]

    def R_op(self, inputs, eval_points):
        (ev,) = eval_points
        return [self(ev)]

    def infer_shape(self, fgraph, node, xshp):
        return xshp

    def c_headers(self, **kwargs):
        return ["gpuarray_helper.h"]

    def c_header_dirs(self, **kwargs):
        return [gpuarray_helper_inc_dir()]

    def c_code(self, node, name, inputs, outputs, sub):
        # Reuses the output buffer when it already has the right shape,
        # typecode and C-contiguity; otherwise allocates a fresh one.
        return """
        PyArrayObject *%(name)s_tmp;
        %(name)s_tmp = PyArray_GETCONTIGUOUS(%(inp)s);
        int err;
        if (%(name)s_tmp == NULL)
          %(fail)s
        if (%(out)s == NULL || !GpuArray_IS_C_CONTIGUOUS(&%(out)s->ga) ||
            !aesara_size_check(%(out)s, PyArray_NDIM(%(name)s_tmp),
                               (size_t *)PyArray_DIMS(%(name)s_tmp),
                               get_typecode((PyObject *)PyArray_DESCR(%(name)s_tmp)))) {
          Py_XDECREF(%(out)s);
          %(out)s = pygpu_empty(PyArray_NDIM(%(name)s_tmp),
                                (size_t *)PyArray_DIMS(%(name)s_tmp),
                                get_typecode((PyObject *)PyArray_DESCR(%(name)s_tmp)),
                                GA_C_ORDER, %(ctx)s, Py_None);
          if (%(out)s == NULL) {
            Py_DECREF(%(name)s_tmp);
            %(fail)s;
          }
        }
        Py_BEGIN_ALLOW_THREADS
        err = GpuArray_write(&%(out)s->ga, PyArray_DATA(%(name)s_tmp),
                             PyArray_NBYTES(%(name)s_tmp));
        Py_END_ALLOW_THREADS
        Py_DECREF(%(name)s_tmp);
        if (err != GA_NO_ERROR) {
          PyErr_Format(PyExc_RuntimeError, "Could not write data to gpu");
          %(fail)s;
        }
        """ % {
            "name": name,
            "inp": inputs[0],
            "ctx": sub["params"],
            "out": outputs[0],
            "fail": sub["fail"],
        }

    def c_code_cache_version(self):
        return (10,)
class GpuToGpu(COp):
    """
    Transfer data between GPUs.

    Parameters
    ----------
    context_name : str
        Name of the destination GPU context.
    """

    __props__ = ("context_name",)
    # This op can handle float16 data.
    _f16_ok = True
    params_type = gpu_context_type

    def __init__(self, context_name):
        self.context_name = context_name

    def __str__(self):
        return f"GpuToGpu<{self.context_name}>"

    def make_node(self, x):
        if not isinstance(x.type, GpuArrayType):
            raise TypeError(x)
        return Apply(
            self,
            [x],
            [
                GpuArrayType(
                    broadcastable=x.broadcastable,
                    context_name=self.context_name,
                    dtype=x.dtype,
                )()
            ],
        )

    def get_params(self, node):
        return get_context(self.context_name)

    def perform(self, node, inp, out, ctx):
        (x,) = inp
        (z,) = out
        z[0] = x.transfer(ctx)

    def grad(self, inputs, grads):
        (gz,) = grads
        # Transfer the gradient back to the input's context.
        return [GpuToGpu(inputs[0].type.context_name)(gz)]

    def R_op(self, inputs, eval_points):
        # Fix: R_op must return a *list* of output perturbations (one per
        # output), matching HostFromGpu.R_op and GpuFromHost.R_op above.
        # Previously a bare Variable was returned, which breaks callers
        # (e.g. aesara.gradient.Rop) that index the result as a list.
        return [self(eval_points[0])]

    def infer_shape(self, fgraph, node, xshp):
        return xshp

    def c_code(self, node, name, inputs, outputs, sub):
        # Allocates an output on the destination context (preserving the
        # input's memory order) and issues a device-to-device transfer.
        return """
        Py_XDECREF(%(out)s);
        %(out)s = pygpu_empty(%(inp)s->ga.nd,
                              %(inp)s->ga.dimensions,
                              %(inp)s->ga.typecode,
                              GpuArray_IS_C_CONTIGUOUS(&(%(inp)s->ga)) ? GA_C_ORDER:GA_F_ORDER,
                              %(ctx)s, Py_None);
        if (%(out)s == NULL) {
          %(fail)s
        }
        if (pygpu_transfer(%(out)s, %(inp)s)) {
          %(fail)s
        }
        """ % {
            "inp": inputs[0],
            "ctx": sub["params"],
            "out": outputs[0],
            "fail": sub["fail"],
        }

    def c_code_cache_version(self):
        return (1,)
class GpuAlloc(HideC, Alloc):
    """
    Allocate initialized memory on the GPU.

    Parameters
    ----------
    context_name : str
        The name of the context in which to allocate memory
    memset_0 : bool
        It's only an optimized version. True, it means the
        value is always 0, so the c code call memset as it is faster.

    """

    __props__ = ("memset_0", "context_name")
    _f16_ok = True
    params_type = ParamsType(context=gpu_context_type, memset_0=bool_t)

    def __init__(self, context_name, memset_0=False):
        self.context_name = context_name
        self.memset_0 = memset_0

    def get_params(self, node):
        # Resolve the context lazily (by name) so the op itself stays
        # hashable/picklable.
        return self.params_type.get_params(
            context=get_context(self.context_name), memset_0=self.memset_0
        )

    def __str__(self):
        # Hide the memset parameter when not used to prevent confusion.
        if self.memset_0:
            m = "{memset_0=True}"
        else:
            m = ""
        return f"{self.__class__.__name__}<{self.context_name}>{m}"

    def make_node(self, value, *shape):
        value = as_gpuarray_variable(value, context_name=self.context_name)
        sh, bcast = infer_broadcastable(shape)
        if value.ndim > len(sh):
            # BUG FIX: the original code constructed this TypeError but never
            # raised it, so an over-dimensioned value silently slipped through.
            raise TypeError(
                "The GpuAlloc value to use has more dimensions "
                "than the specified shape",
                value.ndim,
                len(sh),
            )
        otype = value.type.clone(broadcastable=bcast)
        return Apply(self, [value] + sh, [otype()])

    def c_headers(self, **kwargs):
        return ["<numpy_compat.h>"]

    def perform(self, node, inputs, outs, params):
        (out,) = outs
        v = inputs[0]
        sh = tuple(map(int, inputs[1:]))
        if out[0] is None or out[0].shape != sh:
            if self.memset_0:
                # All-zero fill: pygpu.zeros is faster than empty + broadcast.
                out[0] = gpuarray.zeros(sh, dtype=v.dtype, context=params.context)
            else:
                out[0] = gpuarray.empty(sh, dtype=v.dtype, context=params.context)
                out[0][...] = v
        else:
            # Reuse the previously allocated buffer; just refill it.
            out[0][...] = v

    def c_code(self, node, name, inp, out, sub):
        vv = inp[0]
        ndim = len(inp[1:])
        (zz,) = out
        # Read each scalar shape input from its host-side ndarray.
        code = """
        int i;
        size_t %(name)s_shape[%(ndim)s];
        """ % dict(
            name=name, ndim=ndim
        )
        for i, shp_i in enumerate(inp[1:]):
            code += """
        %(name)s_shape[%(i)s] = ((dtype_%(shp_i)s *)PyArray_DATA(%(shp_i)s))[0];
        """ % dict(
                name=name, i=i, shp_i=shp_i
            )
        code += """
        int need_new_out = (NULL == %(zz)s || %(zz)s->ga.nd != %(ndim)s);
        if (!need_new_out)
            for (i = 0; i < %(ndim)s; i++)
                need_new_out |= %(zz)s->ga.dimensions[i] != %(name)s_shape[i];
        if (need_new_out && (%(params)s->memset_0)) {
            //pygpu_zeros can be faster then empty followed by memset.
            Py_XDECREF(%(zz)s);
            %(zz)s = pygpu_zeros(%(ndim)s, %(name)s_shape,
                                 %(vv)s->ga.typecode, GA_C_ORDER,
                                 %(params)s->context, Py_None);
            if (!%(zz)s) {
                %(fail)s
            }
        } else {
            if (need_new_out) {
                Py_XDECREF(%(zz)s);
                %(zz)s = pygpu_empty(%(ndim)s, %(name)s_shape,
                                     %(vv)s->ga.typecode, GA_C_ORDER,
                                     %(params)s->context, Py_None);
                if (!%(zz)s) {
                    %(fail)s
                }
            }
            if (%(params)s->memset_0 && GpuArray_ISONESEGMENT(&%(zz)s->ga))
            {
                int err = GpuArray_memset(&%(zz)s->ga, 0);
                if (err != GA_NO_ERROR)
                {
                    PyErr_Format(PyExc_MemoryError,
                                 "GpuAlloc: Error memsetting %%llu"
                                 " element of device memory to 0.",
                                 (unsigned long long)PyGpuArray_SIZE(%(zz)s));
                    %(fail)s;
                }
            }
            else if (GpuArray_setarray(&%(zz)s->ga, &%(vv)s->ga) !=
                     GA_NO_ERROR) {
                PyErr_SetString(PyExc_ValueError, "setarray failed");
                %(fail)s
            }
        }
        """ % dict(
            name=name, ndim=ndim, zz=zz, vv=vv, params=sub["params"], fail=sub["fail"]
        )
        return code

    def c_code_cache_version(self):
        return (4,)

    def do_constant_folding(self, fgraph, node):
        from . import blas, subtensor

        for client in fgraph.clients[node.outputs[0]]:
            if client[0] == "output":
                # If the output is a constant, it will have to be deepcopied
                # each time the function is called. So we do not fold.
                return False
            # The following ops work inplace of their input id 0.
            elif (
                client[1] == 0
                and
                # Ops that will work inplace on the Alloc. So if they
                # get constant_folded, they would copy the
                # constant and this is less efficients.
                # Not doing the constant folding could also lower
                # the peak memory usage, as we the "constant" won't
                # always exists.
                isinstance(
                    client[0].op,
                    (
                        subtensor.GpuIncSubtensor,
                        subtensor.GpuAdvancedIncSubtensor1,
                        subtensor.GpuAdvancedIncSubtensor1_dev20,
                        subtensor.GpuAdvancedIncSubtensor,
                        blas.GpuGemm,
                        blas.GpuGemv,
                        blas.GpuGer,
                    ),
                )
            ):
                return False
            # If the clients is a transfer, we don't want to fold. We
            # let the moving opt finish before deciding what to do.
            elif isinstance(client[0].op, HostFromGpu):
                return False
        return True
class GpuAllocEmpty(HideC, AllocEmpty):
    """
    Allocate uninitialized memory on the GPU.

    """

    __props__ = ("dtype", "context_name")
    _f16_ok = True
    params_type = ParamsType(context=gpu_context_type, typecode=int32_t)

    def __init__(self, dtype, context_name):
        self.dtype = dtype
        self.context_name = context_name

    @property
    def typecode(self):
        # libgpuarray typecode corresponding to self.dtype.
        return gpuarray.dtype_to_typecode(self.dtype)

    def get_params(self, node):
        return self.params_type.get_params(
            context=get_context(self.context_name), typecode=self.typecode
        )

    def make_node(self, *shape):
        sh, bcast = infer_broadcastable(shape)
        output = GpuArrayType(
            dtype=self.dtype, broadcastable=bcast, context_name=self.context_name
        )()
        output.tag.values_eq_approx = values_eq_approx_always_true
        # The output can contain nan/inf.
        output.type.filter_checks_isfinite = False
        output.tag.nan_guard_mode_check = False
        return Apply(self, sh, [output])

    def debug_perform(self, node, inputs, out_, params):
        # Fill with a sentinel so reads of "uninitialized" memory are obvious
        # in DebugMode.
        self.perform(node, inputs, out_, params)
        out_[0][0][:] = -123456789

    def perform(self, node, inputs, out_, params):
        out = out_[0]
        # BUG FIX: ``out[0].shape`` is a tuple; the original compared it to a
        # list, which is never equal, so the buffer was reallocated every call.
        sh = tuple(int(i) for i in inputs)
        if out[0] is None or out[0].shape != sh:
            out[0] = pygpu.empty(sh, dtype=self.dtype, context=params.context)
        # if out[0] is the right shape, we just return it

    def c_headers(self, **kwargs):
        return ["<gpuarray_helper.h>"]

    def c_header_dirs(self, **kwargs):
        return [gpuarray_helper_inc_dir()]

    def c_code(self, node, name, inp, out, sub):
        ndim = len(inp)
        zz = out[0]
        fail = sub["fail"]
        code = [
            f"""
        int i;
        size_t shape[{ndim}];
        """
        ]
        # Read each scalar shape input from its host-side ndarray.
        for i, shp_i in enumerate(inp):
            code.append(
                """
        shape[%(i)s] = ((dtype_%(shp_i)s *)PyArray_DATA(%(shp_i)s))[0];
        """
                % dict(i=i, shp_i=shp_i)
            )
        # aesara_prep_output reuses the existing buffer when compatible.
        code.append(
            """
        if (aesara_prep_output(&%(zz)s, %(ndim)s, shape, %(params)s->typecode, GA_C_ORDER,
                               %(params)s->context)) {
            %(fail)s
        }
        """
            % dict(zz=zz, ndim=ndim, fail=fail, params=sub["params"])
        )
        return "".join(code)

    def c_code_cache_version(self):
        return (2,)

    def do_constant_folding(self, fgraph, node):
        # Folding would materialize (and copy) uninitialized memory: pointless.
        return False

    def infer_shape(self, fgraph, node, input_shapes):
        # The inputs *are* the output shape.
        return [node.inputs]

    def grad(self, *args):
        # Don't reuse the grad implementation from Alloc
        raise NotImplementedError("grad disabled")
def empty_like(var):
    """Return an uninitialized GPU array shaped like *var* (same dtype/context)."""
    vtype = var.type
    alloc_op = GpuAllocEmpty(vtype.dtype, vtype.context_name)
    return alloc_op(*var.shape)
class GpuContiguous(Op):
    """
    Return a C contiguous version of the input.

    This may either pass the object as-is (if already C contiguous) or
    make a copy.

    """

    __props__ = ()
    # Output may alias input (the pass-through case), hence the view_map.
    view_map = {0: [0]}
    _f16_ok = True

    def grad(self, inputs, dout):
        # Identity gradient: just move dout onto the input's context.
        (x,) = inputs
        (dout,) = dout
        dout = as_gpuarray_variable(dout, context_name=infer_context_name(x))
        return [dout]

    def make_node(self, input):
        input = as_gpuarray_variable(input, context_name=infer_context_name(input))
        return Apply(self, [input], [input.type()])

    def c_header_dirs(self, **kwargs):
        return [gpuarray_helper_inc_dir()]

    def c_headers(self, **kwargs):
        return ["<gpuarray_helper.h>"]

    def c_code_cache_version(self):
        return (4,)

    def c_code(self, node, name, inp, out, sub):
        # Three cases: input already contiguous -> alias it; no reusable
        # output -> fresh C-ordered copy; compatible output -> in-place move.
        return """
        {
            if (GpuArray_IS_C_CONTIGUOUS(&(%(input)s->ga))) {
                Py_XDECREF(%(z)s);
                %(z)s = %(input)s;
                Py_INCREF(%(z)s);
            } else if (NULL == %(z)s
                || !aesara_size_check(%(z)s, PyGpuArray_NDIM(%(input)s), PyGpuArray_DIMS(%(input)s),
                                      %(input)s->ga.typecode)
                || !GpuArray_IS_C_CONTIGUOUS(&(%(z)s->ga)))
            {
                Py_XDECREF(%(z)s);
                %(z)s = pygpu_copy(%(input)s, GA_C_ORDER);
                if (!%(z)s)
                {
                    %(fail)s;
                }
            } else if(pygpu_move(%(z)s, %(input)s) == -1) {
                %(fail)s;
            }
        }
        """ % dict(
            input=inp[0], z=out[0], fail=sub["fail"]
        )

    def perform(self, node, inp, out_):
        (x,) = inp
        (out,) = out_
        # pygpu returns x itself when already contiguous (matches view_map).
        out[0] = pygpu.ascontiguousarray(x)
# Module-level singleton: GpuContiguous has no parameters (__props__ = ()).
gpu_contiguous = GpuContiguous()
class GpuReshape(HideC, Reshape):
    """
    Reshape for GPU variables.

    """

    _f16_ok = True

    # __hash__, __eq__, __str__ come from Reshape
    def make_node(self, x, shp):
        ctx_name = infer_context_name(x)
        x = as_gpuarray_variable(x, context_name=ctx_name)
        shp = at.as_tensor_variable(shp)
        # Delegate dtype/broadcastable inference to the CPU Reshape by
        # building a throw-away CPU graph, then rebuild the type on the GPU.
        res = x.transfer("cpu").reshape(shp, ndim=self.ndim)
        otype = GpuArrayType(
            dtype=res.dtype, broadcastable=res.broadcastable, context_name=ctx_name
        )
        return Apply(self, [x, shp], [otype()])

    def perform(self, node, inp, out_, params):
        x, shp = inp
        (out,) = out_
        if len(shp) != self.ndim:
            raise ValueError(
                "shape argument to GpuReshape.perform"
                " has incorrect length %i"
                ", should be %i" % (len(shp), self.ndim),
                shp,
            )
        if shp.prod() != x.size:
            # We need to do check here to raise the same error as NumPy.
            # We should make pygpu do the same.
            ss = 1
            nb_m1 = 0
            for i in shp:
                if i == -1:
                    nb_m1 += 1
                else:
                    ss *= i
            if nb_m1 > 1:
                raise ValueError("Only one -1 is accepted in the new shape")
            elif nb_m1 == 1:
                if (x.size % ss) != 0:
                    raise ValueError(
                        "When using -1 in new shape, the computed new shape must be an multiple of the original shape."
                    )
            else:
                raise ValueError("total size of new array must be unchanged")
        out[0] = x.reshape(tuple(shp))

    def c_code_cache_version(self):
        return (3,)

    def c_code(self, node, name, inputs, outputs, sub):
        x, shape = inputs
        (output,) = outputs
        sdtype = node.inputs[1].type.dtype_specs()[1]
        just_fail = sub["fail"]
        # "fail" frees new_dims first; "just_fail" is for paths where it was
        # never allocated (or is freed by the caller path).
        fail = """{
            free(new_dims);
            %(just_fail)s
        }""" % dict(
            just_fail=just_fail
        )
        params = sub["params"]
        return (
            """
        size_t old_size = 1, new_size = 1;
        size_t* new_dims = NULL;
        int compute_axis = -1;

        assert (PyArray_NDIM(%(shape)s) == 1);
        if (PyArray_DIM(%(shape)s, 0) != %(params)s->ndim)
        {
            PyErr_Format(PyExc_ValueError,
                         "GpuReshape: given shape is of incorrect "
                         "length (%%d should be %%d).",
                         PyArray_DIM(%(shape)s, 0), %(params)s->ndim);
            %(just_fail)s;
        }

        new_dims = (size_t*) malloc(sizeof(size_t) * %(params)s->ndim);
        if (new_dims == NULL) {
            PyErr_NoMemory();
            %(just_fail)s
        }

        for (size_t i = 0; i < %(x)s->ga.nd; ++i)
            old_size *= %(x)s->ga.dimensions[i];

        for (size_t i = 0; i < %(params)s->ndim; ++i)
        {
            new_dims[i] = ((%(sdtype)s*)(
                    PyArray_BYTES(%(shape)s) +
                    i * PyArray_STRIDES(%(shape)s)[0]))[0];
            if (new_dims[i] == -1)
            {
                if (compute_axis != -1)
                {
                    PyErr_Format(PyExc_ValueError,
                                 "GpuReshape: only one -1 is accepted "
                                 "in the new shape, but got two at "
                                 "indices %%d and %%zu.",
                                 compute_axis, i);
                    %(fail)s;
                }
                compute_axis = i;
            }
            else
                new_size *= new_dims[i];
        }

        if (compute_axis == -1 && new_size != old_size)
        {
            PyErr_Format(PyExc_ValueError,
                         "GpuReshape: trying to reshape an array of "
                         "total size %%zu into an array of total size "
                         "%%zu.", old_size, new_size);
            %(fail)s;
        }
        else if (compute_axis != -1 && old_size %% new_size != 0)
        {
            PyErr_Format(PyExc_ValueError,
                         "GpuReshape: -1 axis found at index %%d in "
                         "new shape but the total size of the array "
                         "(%%zu) is not divisible by the given shapes "
                         "(%%zu).", compute_axis, old_size, new_size);
            %(fail)s;
        }

        Py_XDECREF(%(output)s);
        %(output)s = pygpu_reshape(%(x)s, %(params)s->ndim, new_dims,
                                   GA_C_ORDER, 0, compute_axis);
        free(new_dims);
        if (%(output)s == NULL)
        {
            %(just_fail)s;
        }
        """
            % locals()
        )
class GpuJoin(HideC, Join):
    """
    Join for GPU.

    """

    _f16_ok = True
    __props__ = ("view",)
    params_type = gpu_context_type

    def __init__(self, view=-1):
        # view >= 0 means: if every *other* tensor is empty along the join
        # axis, return tensor `view` as a view instead of concatenating.
        self.view = view
        if view != -1:
            # since the first input is always the axis, the tensors
            # start from index 1.
            self.view_map = {0: [1 + view]}

    def __str__(self):
        return Join.__str__(self)

    def make_node(self, axis, *tensors):
        # Reuse the CPU Join's validation/inference, then move the tensors
        # onto a common GPU context.
        node = Join.make_node(self, axis, *tensors)

        ctx_name = infer_context_name(*tensors)

        def agv(v):
            return as_gpuarray_variable(v, context_name=ctx_name)

        return Apply(
            self,
            [node.inputs[0]] + list(map(agv, tensors)),
            [
                GpuArrayType(
                    broadcastable=node.outputs[0].broadcastable,
                    dtype=node.outputs[0].dtype,
                    context_name=ctx_name,
                )()
            ],
        )

    def get_params(self, node):
        return node.outputs[0].type.context

    def perform(self, node, axis_and_tensors, out_, ctx):
        (out,) = out_
        view = self.view
        axis = int(axis_and_tensors[0])
        tensors = axis_and_tensors[1:]

        if axis < -axis_and_tensors[1].ndim:
            raise IndexError
        if axis < 0:
            axis += axis_and_tensors[1].ndim
        # we check these tensors for being empty.
        if (view != -1) and np.all(
            [
                tensor.shape[axis] == 0
                for tensor in tensors[0:view] + tensors[view + 1 :]
            ]
        ):
            out[0] = tensors[view]
        else:
            out[0] = pygpu.concatenate(tensors, axis=axis, context=ctx).astype(
                node.outputs[0].dtype
            )

    def c_code_cache_version(self):
        return (3,)

    def c_support_code(self, **kwargs):
        return """
        #if PY_MAJOR_VERSION >= 3
        #define PyInt_AsLong PyLong_AsLong
        #endif
        """

    def c_headers(self, **kwargs):
        return ["<numpy_compat.h>"]

    def c_code(self, node, name, inputs, out_, sub):
        axis, tensors = inputs[0], inputs[1:]
        copy_to_list = []
        restype = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
        view = self.view
        # NOTE(review): with view == -1 this picks tensors[-1], but the C code
        # only uses it inside the `view != -1` guard — confirm it stays dead.
        non_empty_tensor = tensors[view]
        for i, inp in enumerate(tensors):
            copy_to_list.append(f"als[{i}] = &{inp}->ga;")

        n = len(tensors)
        fail = sub["fail"]
        out = out_[0]
        copy_inputs_to_list = "\n".join(copy_to_list)
        ctx = sub["params"]

        code = (
            """
        const GpuArray **als = (const GpuArray **)PyMem_Malloc(sizeof(GpuArray *) *
                                                               %(n)s);
        if (als == NULL) {
            PyErr_NoMemory();
            %(fail)s
        }
        %(copy_inputs_to_list)s
        Py_XDECREF(%(out)s);
        {
            int axis = PyInt_AsLong((PyObject *)%(axis)s);
            if (axis < 0) {
                if (axis == -1 && PyErr_Occurred()) {
                    %(fail)s
                }
                axis += als[0]->nd;
                if (axis < 0) {
                    PyErr_SetString(PyExc_IndexError, "invalid axis");
                    %(fail)s
                }
            }

            int tensors_lens_sum;
            if(%(view)s != -1) {
                tensors_lens_sum = 0;
                for(int i=0; i < %(n)s; i++){
                    tensors_lens_sum += als[i]->dimensions[axis];
                }
                tensors_lens_sum -= PyGpuArray_DIM(%(non_empty_tensor)s, axis);
            }

            if(%(view)s != -1 && tensors_lens_sum == 0) {
                Py_INCREF(%(non_empty_tensor)s);
                %(out)s = %(non_empty_tensor)s;
            }else{
                %(out)s = pygpu_concatenate(als, %(n)s, axis,
                                            %(restype)s, (PyObject *)&PyGpuArrayType,
                                            %(ctx)s);
            }
        }
        PyMem_Free(als);
        if (%(out)s == NULL)
            %(fail)s

        """
            % locals()
        )
        return code
# Module-level singleton with the default (no-view) behavior.
gpu_join = GpuJoin()
class GpuSplit(HideC, Split, _NoPythonOp):
    """
    Split for GPU.

    """

    _f16_ok = True

    def __init__(self, len_splits):
        super().__init__(len_splits)
        # The GPU version of Split returns splits as views of the input.
        self.view_map = {}
        for i in range(self.len_splits):
            self.view_map[i] = [0]

    def make_node(self, x, axis, splits):
        # Reuse the CPU Split's validation, then retype outputs on the GPU.
        node = Split.make_node(self, x, axis, splits)
        x = as_gpuarray_variable(x, infer_context_name(x))
        outs = [
            GpuArrayType(
                dtype=o.dtype,
                broadcastable=o.broadcastable,
                context_name=x.type.context_name,
            )()
            for o in node.outputs
        ]
        return Apply(self, [x] + node.inputs[1:], outs)

    # we reuse the perform of the CPU op, which is suitable

    def c_code_cache_version(self):
        return (2,)

    def c_headers(self, **kwargs):
        return ["<numpy_compat.h>", "<gpuarray_helper.h>"]

    def c_header_dirs(self, **kwargs):
        return [pygpu.get_include(), gpuarray_helper_inc_dir()]

    def c_code(self, node, name, inputs, outputs, sub):
        if self.len_splits == 0:
            # There are no outputs, then nothing to do.
            return ""

        # outputs_pointers lists the addresses of the pointers to the outputs.
        outputs_pointers = "&" + (", &".join(outputs))
        x, axis, splits = inputs
        fail = sub["fail"]
        splits_dtype = node.inputs[2].type.dtype_specs()[1]
        axis_dtype = node.inputs[1].type.dtype_specs()[1]
        expected_splits_count = self.len_splits

        # NOTE(review): the error messages below print size_t values with
        # %%ld; %%zu would be the portable specifier — confirm before reuse.
        main_code = """
        int ndim = PyGpuArray_NDIM(%(x)s);
        int axis = (int)(*(%(axis_dtype)s*)PyArray_GETPTR1(%(axis)s, 0));
        int splits_count = PyArray_DIM(%(splits)s, 0);
        size_t len_along_axis, sum_of_splits = 0;
        %(splits_dtype)s current_split_length;
        size_t* split_points = NULL;
        GpuArray* split_views = NULL;
        GpuArray** split_views_pointers = NULL;
        int i, j;
        PyGpuArrayObject** outputs[] = {%(outputs_pointers)s};

        /* Check inputs. */

        if (splits_count != %(expected_splits_count)s) {
            PyErr_Format(PyExc_ValueError,
                "GpuSplit: splits count (%%d) != expected count (%%d).", splits_count, %(expected_splits_count)s);
            %(fail)s
        }
        if (axis < 0) {
            axis += ndim;
        }
        if (axis < 0 || axis >= ndim) {
            PyErr_Format(PyExc_IndexError, "GpuSplit: invalid axis %%d for a %%d-D array.", axis, ndim);
            %(fail)s
        }
        len_along_axis = PyGpuArray_DIM(%(x)s, axis);
        for (i = 0; i < splits_count; ++i) {
            current_split_length = *(%(splits_dtype)s*)PyArray_GETPTR1(%(splits)s, i);
            if (current_split_length < 0) {
                PyErr_Format(PyExc_ValueError,
                    "GpuSplit: you try to take a negative number (%%ld) of elements.", current_split_length);
                %(fail)s
            }
            sum_of_splits += current_split_length;
        }
        if (sum_of_splits != len_along_axis) {
            PyErr_Format(PyExc_ValueError, "GpuSplit: the splits sums to %%ld, expected %%ld.", sum_of_splits, len_along_axis);
            %(fail)s
        }

        /* Compute splits views. */

        split_points = (size_t*) malloc((splits_count - 1) * sizeof(size_t));
        if (split_points == NULL) {
            PyErr_NoMemory();
            %(fail)s
        }
        split_points[0] = (size_t) (* (%(splits_dtype)s*) PyArray_GETPTR1(%(splits)s, 0) );
        for(i = 1; i < splits_count - 1; ++i) {
            split_points[i] = split_points[i - 1] + (size_t) (* (%(splits_dtype)s*) PyArray_GETPTR1(%(splits)s, i) );
        }
        split_views = (GpuArray*) malloc(splits_count * sizeof(GpuArray));
        split_views_pointers = (GpuArray**) malloc(splits_count * sizeof(GpuArray*));
        if (split_views == NULL || split_views_pointers == NULL) {
            PyErr_NoMemory();
            free(split_views_pointers);
            free(split_views);
            free(split_points);
            %(fail)s
        }
        for (i = 0; i < splits_count; ++i) {
            split_views_pointers[i] = split_views + i;
        }
        if (GpuArray_split(split_views_pointers, &%(x)s->ga, splits_count - 1, split_points, axis) != GA_NO_ERROR) {
            PyErr_SetString(PyExc_RuntimeError, "GpuSplit: unable to compute split.");
            for (i = 0; i < splits_count; ++i) {
                GpuArray_clear(split_views_pointers[i]);
            }
            free(split_views_pointers);
            free(split_views);
            free(split_points);
            %(fail)s
        }

        /* Put split views into outputs. */
        for (i = 0; i < splits_count; ++i) {
            PyGpuArrayObject** output = outputs[i];
            Py_XDECREF(*output);
            *output = pygpu_fromgpudata(
                split_views[i].data,
                split_views[i].offset,
                split_views[i].typecode,
                split_views[i].nd,
                split_views[i].dimensions,
                split_views[i].strides,
                %(x)s->context,
                1, // output is writable
                Py_None, Py_None
            );
            if (*output == NULL) {
                PyErr_SetString(PyExc_RuntimeError, "GpuSplit: unable to update an output from a split view.");
                for (j = 0; j < splits_count; ++j) {
                    GpuArray_clear(split_views_pointers[j]);
                }
                free(split_views_pointers);
                free(split_views);
                free(split_points);
                %(fail)s
            }
        }

        /* Free memory. */
        for (i = 0; i < splits_count; ++i) {
            GpuArray_clear(split_views_pointers[i]);
        }
        free(split_views_pointers);
        free(split_views);
        free(split_points);
        """

        return main_code % locals()
@aesara.compile.profiling.register_profiler_printer
def profile_printer(
    message, compile_time, fct_call_time, apply_time, apply_cimpl, outputs_size, file
):
    """
    Print GPU-specific profiling information (cpu/gpu/transfer time split,
    float64 inputs, and applies that introduce float64 outputs).

    Only prints anything when at least one profiled apply is a GPU op.
    """
    # Idiom fix: use generator expressions instead of materialized lists.
    if any(
        x.op.__class__.__name__.lower().startswith("gpu")
        for (fgraph, x) in apply_time.keys()
    ):
        local_time = sum(apply_time.values())
        print("", file=file)
        print("Some info useful for gpu:", file=file)

        fgraphs = {fgraph for fgraph, node in apply_time.keys()}

        cpu = 0
        gpu = 0
        trans = 0
        for (fgraph, node), t in apply_time.items():
            if isinstance(node.op, (HostFromGpu, GpuFromHost)):
                trans += t
            elif node.op.__class__.__name__.lower().startswith("gpu"):
                gpu += t
            else:
                cpu += t
        print("", file=file)
        # NOTE(review): local_time could theoretically be 0 — kept as-is.
        print(
            # Typo fix: "transfert" -> "transfer".
            "    Spent %.3fs(%.2f%%) in cpu Op, %.3fs(%.2f%%) in gpu Op and %.3fs(%.2f%%) transfer Op"
            % (
                cpu,
                cpu / local_time * 100,
                gpu,
                gpu / local_time * 100,
                trans,
                trans / local_time * 100,
            ),
            file=file,
        )

        print("", file=file)
        print("    Aesara function input that are float64", file=file)
        print("    <fct name> <input name> <input type> <str input>", file=file)
        for fg in fgraphs:
            for i in fg.inputs:
                if hasattr(i.type, "dtype") and i.type.dtype == "float64":
                    print("        ", fg.name, i.name, i.type, i, file=file)

        print("", file=file)
        print(
            "    List of apply that don't have float64 as input but have float64 in outputs",
            file=file,
        )
        print(
            "    (Useful to know if we forgot some cast when using floatX=float32 or gpu code)",
            file=file,
        )
        print(
            "    <Apply> <Apply position> <fct name> <inputs type> <outputs type>",
            file=file,
        )
        for fg in fgraphs:
            for idx, node in enumerate(fg.toposort()):
                if any(
                    hasattr(i, "dtype") and i.dtype == "float64" for i in node.outputs
                ) and not any(
                    hasattr(i, "dtype") and i.dtype == "float64" for i in node.inputs
                ):
                    print("        ", str(node), idx, fg.name, end=" ", file=file)
                    print(
                        str([getattr(i, "dtype", None) for i in node.inputs]),
                        end=" ",
                        file=file,
                    )
                    print(
                        str([getattr(i, "dtype", None) for i in node.outputs]),
                        file=file,
                    )
        print("", file=file)
class GpuEye(GpuKernelBaseCOp, _NoPythonOp):
    """
    Eye for GPU.

    """

    __props__ = ("dtype", "context_name")
    _f16_ok = True

    def __init__(self, dtype=None, context_name=None):
        if dtype is None:
            dtype = config.floatX
        self.dtype = dtype
        self.context_name = context_name

    def get_params(self, node):
        return get_context(self.context_name)

    def make_node(self, n, m, k):
        # n, m: output dimensions; k: diagonal offset (scalars).
        n = at.as_tensor_variable(n)
        m = at.as_tensor_variable(m)
        k = at.as_tensor_variable(k)
        assert n.ndim == 0
        assert m.ndim == 0
        assert k.ndim == 0
        otype = GpuArrayType(
            dtype=self.dtype,
            broadcastable=(False, False),
            context_name=self.context_name,
        )

        return Apply(self, [n, m, k], [otype()])

    def infer_shape(self, fgraph, node, in_shapes):
        out_shape = [node.inputs[0], node.inputs[1]]
        return [out_shape]

    def grad(self, inp, grads):
        # Integer/index inputs: gradient is undefined everywhere.
        return [grad_undefined(self, i, inp[i]) for i in range(3)]

    def gpu_kernels(self, node, name):
        # One work-group walks the diagonal with stride LDIM_0.
        code = """#include "cluda.h"

KERNEL void eye(GLOBAL_MEM %(ctype)s *a, ga_size a_off,
                ga_size n, ga_size m, ga_ssize k) {
    a = (GLOBAL_MEM %(ctype)s *)(((GLOBAL_MEM char *)a) + a_off);
    ga_ssize coff = max(k, (ga_ssize) 0);
    ga_ssize roff = -min(k, (ga_ssize) 0);
    ga_size nb = (ga_size) min(n - roff, m - coff);
    for (ga_size i = LID_0; i < nb; i += LDIM_0) {
        a[(i + roff)*m + i + coff] = %(write_a)s(1);
    }
}""" % dict(
            ctype=pygpu.gpuarray.dtype_to_ctype(self.dtype),
            name=name,
            write_a=write_w(self.dtype),
        )
        return [
            Kernel(
                code=code,
                name="eye",
                params=[
                    gpuarray.GpuArray,
                    gpuarray.SIZE,
                    gpuarray.SIZE,
                    gpuarray.SIZE,
                    gpuarray.SSIZE,
                ],
                flags=Kernel.get_flags(self.dtype),
                objvar="k_eye_" + name,
            )
        ]

    def c_code(self, node, name, inp, out, sub):
        # NOTE(review): make_node always supplies 3 inputs; the 2-input branch
        # below would leave %(k)s as a plain int and break the template.
        if len(inp) == 2:
            n, m = inp
            k = 0
        elif len(inp) == 3:
            n, m, k = inp

        (z,) = out
        fail = sub["fail"]
        ctx = sub["params"]
        typecode = pygpu.gpuarray.dtype_to_typecode(self.dtype)
        kname = self.gpu_kernels(node, name)[0].objvar
        s = (
            """
        size_t dims[2] = {0, 0};
        size_t ls, gs;
        ssize_t k;
        size_t col_off;
        size_t row_off;
        int err;

        dims[0] = ((dtype_%(n)s*)PyArray_DATA(%(n)s))[0];
        dims[1] = ((dtype_%(m)s*)PyArray_DATA(%(m)s))[0];
        k = ((dtype_%(k)s*)PyArray_DATA(%(k)s))[0];

        Py_CLEAR(%(z)s);

        %(z)s = pygpu_zeros(2, dims,
                            %(typecode)s,
                            GA_C_ORDER,
                            %(ctx)s, Py_None);
        if (%(z)s == NULL) {
            %(fail)s
        }

        ls = 1;
        gs = 256;
        /* Launch only when the diagonal intersects the output. */
        col_off = (size_t) (k > 0?k:0);
        row_off = (size_t) (k < 0?-k:0);
        if (row_off < dims[0] && col_off < dims[1]) {
            err = eye_call(1, &gs, &ls, 0, %(z)s->ga.data, %(z)s->ga.offset,
                           dims[0], dims[1], k);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "gpuarray error: kEye: %%s. n=%%lu, m=%%lu.",
                             GpuKernel_error(&%(kname)s, err),
                             (unsigned long)dims[0], (unsigned long)dims[1]);
                %(fail)s;
            }
        }

        """
            % locals()
        )
        return s

    def c_code_cache_version(self):
        # BUG FIX above: error message printed "n%lu" (missing '='); bumped
        # the version since the generated C changed.
        return (11,)
class GpuTri(GpuKernelBaseCOp, _NoPythonOp):
    """
    Tri for GPU.

    """

    __props__ = ("dtype", "context_name")
    _f16_ok = True

    def __init__(self, dtype=None, context_name=None):
        if dtype is None:
            dtype = config.floatX
        self.dtype = dtype
        self.context_name = context_name

    def get_params(self, node):
        return get_context(self.context_name)

    def make_node(self, n, m, k):
        # n, m: output dimensions; k: diagonal offset (scalars).
        n = at.as_tensor_variable(n)
        m = at.as_tensor_variable(m)
        k = at.as_tensor_variable(k)
        assert n.ndim == 0
        assert m.ndim == 0
        assert k.ndim == 0
        otype = GpuArrayType(
            dtype=self.dtype,
            broadcastable=(False, False),
            context_name=self.context_name,
        )

        return Apply(self, [n, m, k], [otype()])

    def infer_shape(self, fgraph, node, in_shapes):
        out_shape = [node.inputs[0], node.inputs[1]]
        return [out_shape]

    def grad(self, inp, grads):
        # Integer/index inputs: gradient is undefined everywhere.
        return [grad_undefined(self, i, inp[i]) for i in range(3)]

    def gpu_kernels(self, node, name):
        # One work-group fills rows with stride LDIM_0; each row writes ones
        # up to (and including) its diagonal column.
        code = """#include "cluda.h"

KERNEL void tri(GLOBAL_MEM %(ctype)s *a, ga_size a_off,
                ga_size n, ga_size m, ga_ssize k) {
    a = (GLOBAL_MEM %(ctype)s *)(((GLOBAL_MEM char *)a) + a_off);
    ga_ssize coff = max(k, (ga_ssize) 0);
    ga_ssize roff = -min(k, (ga_ssize) 0);
    for (ga_size i = LID_0; i < min(n - roff,n); i += LDIM_0) {
        for (ga_size j = 0; j <= min(i + coff,m-1); j++) {
            a[(i + roff)*m + j] = %(write_a)s(1);
        }
    }
}""" % dict(
            ctype=pygpu.gpuarray.dtype_to_ctype(self.dtype),
            name=name,
            write_a=write_w(self.dtype),
        )
        return [
            Kernel(
                code=code,
                name="tri",
                params=[
                    gpuarray.GpuArray,
                    gpuarray.SIZE,
                    gpuarray.SIZE,
                    gpuarray.SIZE,
                    gpuarray.SSIZE,
                ],
                flags=Kernel.get_flags(self.dtype),
                objvar="k_tri_" + name,
            )
        ]

    def c_code(self, node, name, inp, out, sub):
        # NOTE(review): make_node always supplies 3 inputs; the 2-input branch
        # below would leave %(k)s as a plain int and break the template.
        if len(inp) == 2:
            n, m = inp
            k = 0
        elif len(inp) == 3:
            n, m, k = inp

        (z,) = out
        fail = sub["fail"]
        ctx = sub["params"]
        typecode = pygpu.gpuarray.dtype_to_typecode(self.dtype)
        kname = self.gpu_kernels(node, name)[0].objvar
        s = (
            """
        size_t dims[2] = {0, 0};
        size_t ls, gs;
        ssize_t k;
        int err;

        dims[0] = ((dtype_%(n)s*)PyArray_DATA(%(n)s))[0];
        dims[1] = ((dtype_%(m)s*)PyArray_DATA(%(m)s))[0];
        k = ((dtype_%(k)s*)PyArray_DATA(%(k)s))[0];

        Py_CLEAR(%(z)s);

        %(z)s = pygpu_zeros(2, dims,
                            %(typecode)s,
                            GA_C_ORDER,
                            %(ctx)s, Py_None);
        if (%(z)s == NULL) {
            %(fail)s
        }

        ls = 1;
        gs = 256;
        err = tri_call(1, &gs, &ls, 0, %(z)s->ga.data, %(z)s->ga.offset,
                       dims[0], dims[1], k);
        if (err != GA_NO_ERROR) {
            PyErr_Format(PyExc_RuntimeError,
                         "gpuarray error: kTri: %%s. n=%%lu, m=%%lu.",
                         GpuKernel_error(&%(kname)s, err),
                         (unsigned long)dims[0], (unsigned long)dims[1]);
            %(fail)s;
        }

        """
            % locals()
        )
        return s

    def c_code_cache_version(self):
        # BUG FIX above: error message printed "n%lu" (missing '='); bumped
        # the version since the generated C changed.
        return (2,)
import aesara
from aesara.compile import optdb
from aesara.gpuarray.basic_ops import (
CGpuKernelBase,
GpuArrayType,
as_gpuarray_variable,
gpu_contiguous,
gpuarray_helper_inc_dir,
infer_context_name,
)
from aesara.gpuarray.opt_util import inplace_allocempty
from aesara.graph.basic import Apply
from aesara.graph.opt import LocalOptGroup, in2out
from aesara.link.c.op import _NoPythonCOp
from aesara.link.c.params_type import ParamsType
from aesara.scalar import bool as bool_t
from aesara.tensor.basic import as_tensor_variable
try:
import pygpu
from pygpu import blas
except ImportError:
# To make sure aesara is importable
pass
class BlasOp(_NoPythonCOp):
    """Base class for GPU BLAS ops: shared C headers and pygpu BLAS init."""

    def c_headers(self, **kwargs):
        return ["<blas_api.h>", "<numpy_compat.h>", "<gpuarray_helper.h>"]

    def c_header_dirs(self, **kwargs):
        return [pygpu.get_include(), gpuarray_helper_inc_dir()]

    def c_init_code(self, **kwargs):
        # Imports the pygpu BLAS C API (pygpu_blas_r* functions) at
        # module-init time so the generated C can call it.
        return ["import_pygpu__blas();"]
class GpuGemv(BlasOp):
    """
    Gemv on the GPU.

    Computes ``out = alpha * dot(A, x) + beta * y`` (BLAS GEMV).
    """

    params_type = ParamsType(inplace=bool_t)
    __props__ = ("inplace",)

    def __init__(self, inplace=False):
        self.inplace = inplace
        if self.inplace:
            # Output overwrites input 0 (y).
            self.destroy_map = {0: [0]}

    def make_node(self, y, alpha, A, x, beta):
        ctx_name = infer_context_name(y, A, x)
        A = as_gpuarray_variable(A, ctx_name)
        x = as_gpuarray_variable(x, ctx_name)
        y = as_gpuarray_variable(y, ctx_name)
        alpha = as_tensor_variable(alpha)
        beta = as_tensor_variable(beta)
        assert alpha.ndim == 0
        assert beta.ndim == 0
        assert A.ndim == 2
        assert x.ndim == 1
        assert y.ndim == 1
        assert A.dtype == x.dtype == y.dtype
        # float16 not supported
        expected = A.dtype
        assert aesara.scalar.upcast(alpha.dtype, beta.dtype, expected) == expected
        alpha = alpha.astype(expected)
        beta = beta.astype(expected)
        return Apply(self, [y, alpha, A, x, beta], [y.type()])

    def perform(self, node, inputs, out_storage, params):
        y, alpha, A, x, beta = inputs
        inplace = params.inplace
        # Negative strides are not safe to overwrite in place.
        if inplace and y.strides[0] < 0:
            inplace = False
        if A.shape[1] == 0:
            # Empty inner dimension: result is defined to be all zeros.
            out_storage[0][0] = pygpu.zeros(y.shape, dtype=y.dtype, context=y.context)
        else:
            out_storage[0][0] = blas.gemv(alpha, A, x, beta, y, overwrite_y=inplace)

    def c_code(self, node, name, inp, out, sub):
        vars = dict(
            out=out[0],
            y=inp[0],
            alpha=inp[1],
            A=inp[2],
            x=inp[3],
            beta=inp[4],
            fail=sub["fail"],
            name=name,
            params=sub["params"],
        )
        # First: select output buffer (copy of y, or y itself when inplace).
        code = (
            """
        if (!%(params)s->inplace || %(y)s->ga.strides[0] <= 0) {
            %(out)s = aesara_try_copy(%(out)s, %(y)s);
            if (%(out)s == NULL) {
                %(fail)s
            }
        } else {
            Py_XDECREF(%(out)s);
            %(out)s = %(y)s;
            Py_INCREF(%(out)s);
        }
        """
            % vars
        )
        # in case of possible speed up using blas dot,
        # temporary hack A to 1D for vector-vector dot
        # (A's nd/dims/strides are mutated, used, then restored — the
        # statement order here is load-bearing).
        code += (
            """
        if (PyGpuArray_DIM(%(A)s, 1) == 0) {
            int code;
            code = GpuArray_memset(&%(out)s->ga, 0);
            if (code != GA_NO_ERROR) {
                PyErr_SetString(PyExc_RuntimeError, "Memset failed");
                %(fail)s
            }
        } else if ( PyGpuArray_DIM(%(A)s, 0) == 1
            &&((dtype_%(alpha)s*)PyArray_DATA(%(alpha)s))[0] == (dtype_%(alpha)s)1.
            &&((dtype_%(beta)s*)PyArray_DATA(%(beta)s))[0] == (dtype_%(beta)s)0.
        ) {
            %(out)s->ga.nd = 0;
            %(A)s->ga.nd = 1;
            %(A)s->ga.dimensions[0] = %(A)s->ga.dimensions[1];
            ssize_t a_stride0 = %(A)s->ga.strides[0];
            %(A)s->ga.strides[0] = %(A)s->ga.strides[1];
            if (pygpu_blas_rdot(%(x)s, %(A)s, %(out)s, 0) == -1) {
                %(fail)s
            }
            %(A)s->ga.strides[0] = a_stride0;
            %(out)s->ga.nd = 1;
            %(A)s->ga.nd = 2;
            %(A)s->ga.dimensions[0] = 1;
        } else if (
            pygpu_blas_rgemv(cb_no_trans,
                             ((dtype_%(alpha)s *)PyArray_DATA(%(alpha)s))[0],
                             %(A)s, %(x)s,
                             ((dtype_%(beta)s *)PyArray_DATA(%(beta)s))[0],
                             %(out)s, 0) == -1) {
            %(fail)s
        }
        """
            % vars
        )
        return code

    def c_code_cache_version(self):
        return (10,)
# Module-level singletons: the graph optimizer swaps the no_inplace variant
# for the inplace one when safe.
gpugemv_no_inplace = GpuGemv(inplace=False)
gpugemv_inplace = GpuGemv(inplace=True)
class GpuGemm(BlasOp):
    """
    Gemm on the GPU.

    Computes ``out = alpha * dot(A, B) + beta * C`` (BLAS GEMM).
    """

    params_type = ParamsType(inplace=bool_t)
    __props__ = ("inplace",)
    _f16_ok = True

    def __init__(self, inplace=False):
        self.inplace = inplace
        if self.inplace:
            # Output overwrites input 0 (C).
            self.destroy_map = {0: [0]}

    def make_node(self, C, alpha, A, B, beta):
        ctx_name = infer_context_name(C, A, B)
        A = as_gpuarray_variable(A, ctx_name)
        B = as_gpuarray_variable(B, ctx_name)
        C = as_gpuarray_variable(C, ctx_name)
        alpha = as_tensor_variable(alpha)
        beta = as_tensor_variable(beta)
        if not (A.dtype == B.dtype == C.dtype):
            raise TypeError(
                aesara.tensor.blas.Gemm.E_mixed,
                (A.dtype, B.dtype, C.dtype, alpha.dtype, beta.dtype),
            )
        if not A.dtype.startswith("float"):
            raise TypeError(aesara.tensor.blas.Gemm.E_float, (A.dtype))
        # float16 matrices use float32 scalars for alpha/beta.
        if A.dtype == "float16":
            expected = "float32"
        else:
            expected = A.dtype
        assert aesara.scalar.upcast(alpha.dtype, beta.dtype, expected) == expected
        alpha = alpha.astype(expected)
        beta = beta.astype(expected)
        assert alpha.ndim == 0
        assert beta.ndim == 0
        assert A.ndim == 2
        assert B.ndim == 2
        assert C.ndim == 2
        return Apply(self, [C, alpha, A, B, beta], [C.type()])

    def perform(self, node, inputs, outputs, params):
        C, alpha, A, B, beta = inputs
        inplace = params.inplace
        # Only overwrite C when it is one contiguous segment.
        if inplace and not C.flags.forc:
            inplace = False
        outputs[0][0] = blas.gemm(alpha, A, B, beta, C, overwrite_c=inplace)

    def c_code(self, node, name, inp, out, sub):
        vars = dict(
            out=out[0],
            C=inp[0],
            alpha=inp[1],
            A=inp[2],
            B=inp[3],
            beta=inp[4],
            fail=sub["fail"],
            name=name,
            params=sub["params"],
        )
        # Select output buffer (copy of C, or C itself when inplace), then
        # run GEMM into it.
        code = (
            """
        if (!%(params)s->inplace || !GpuArray_ISONESEGMENT(&%(C)s->ga)) {
            %(out)s = aesara_try_copy(%(out)s, %(C)s);
            if (%(out)s == NULL) {
                %(fail)s
            }
        } else {
            Py_XDECREF(%(out)s);
            %(out)s = %(C)s;
            Py_INCREF(%(out)s);
        }
        if (pygpu_blas_rgemm(cb_no_trans, cb_no_trans,
                             ((dtype_%(alpha)s *)PyArray_DATA(%(alpha)s))[0],
                             %(A)s, %(B)s,
                             ((dtype_%(beta)s *)PyArray_DATA(%(beta)s))[0],
                             %(out)s, 0) == -1) {
            %(fail)s
        }
        """
            % vars
        )
        return code

    def c_code_cache_version(self):
        return (7,)
# Module-level singletons; the inplace variant is substituted by optimization.
gpugemm_no_inplace = GpuGemm(inplace=False)
gpugemm_inplace = GpuGemm(inplace=True)
class GpuGer(BlasOp):
    """
    Ger on the GPU.

    Computes ``out = A + alpha * outer(x, y)`` (BLAS GER rank-1 update).
    """

    params_type = ParamsType(inplace=bool_t)
    __props__ = ("inplace",)

    def __init__(self, inplace=False):
        self.inplace = inplace
        if self.inplace:
            # Output overwrites input 0 (A).
            self.destroy_map = {0: [0]}

    def make_node(self, A, alpha, x, y):
        ctx_name = infer_context_name(A, x, y)
        A = as_gpuarray_variable(A, ctx_name)
        x = as_gpuarray_variable(x, ctx_name)
        y = as_gpuarray_variable(y, ctx_name)
        alpha = as_tensor_variable(alpha)
        if not (A.dtype == x.dtype == y.dtype):
            raise TypeError(
                "ger requires matching dtypes", (A.dtype, alpha.dtype, x.dtype, y.dtype)
            )
        assert aesara.scalar.upcast(alpha.dtype, A.dtype) == A.dtype
        alpha = alpha.astype(A.dtype)
        assert alpha.ndim == 0
        assert A.ndim == 2
        assert x.ndim == 1
        assert y.ndim == 1
        return Apply(self, [A, alpha, x, y], [A.type()])

    def perform(self, node, inp, out, params):
        A, alpha, x, y = inp
        inplace = params.inplace
        # Only overwrite A when it is one contiguous segment.
        if inplace and not A.flags.forc:
            inplace = False
        out[0][0] = blas.ger(alpha, x, y, A, overwrite_a=inplace)

    def c_code(self, node, name, inp, out, sub):
        vars = dict(
            out=out[0],
            A=inp[0],
            alpha=inp[1],
            x=inp[2],
            y=inp[3],
            fail=sub["fail"],
            name=name,
            params=sub["params"],
        )
        # Select output buffer (copy of A, or A itself when inplace), then
        # run GER into it.
        code = (
            """
        if (!%(params)s->inplace || !GpuArray_ISONESEGMENT(&%(A)s->ga)) {
            %(out)s = aesara_try_copy(%(out)s, %(A)s);
            if (%(out)s == NULL) {
                %(fail)s
            }
        } else {
            Py_XDECREF(%(out)s);
            %(out)s = %(A)s;
            Py_INCREF(%(out)s);
        }
        if (pygpu_blas_rger(((dtype_%(alpha)s *)PyArray_DATA(%(alpha)s))[0],
                            %(x)s, %(y)s, %(out)s, 0) == -1) {
            %(fail)s
        }
        """
            % vars
        )
        return code

    def c_code_cache_version(self):
        return (5,)
# Module-level singletons; the inplace variant is substituted by optimization.
gpuger_no_inplace = GpuGer(inplace=False)
gpuger_inplace = GpuGer(inplace=True)
class GpuDot22(BlasOp):
    """
    Dot22 on the GPU.

    Matrix-matrix product of two 2d arrays: ``out = dot(x, y)``
    (GEMM with alpha=1, beta=0).
    """

    _f16_ok = True
    __props__ = ()

    def make_node(self, x, y):
        ctx_name = infer_context_name(x, y)
        x = as_gpuarray_variable(x, ctx_name)
        y = as_gpuarray_variable(y, ctx_name)
        assert x.ndim == 2
        assert y.ndim == 2
        assert x.dtype == y.dtype
        otype = x.type.clone(
            broadcastable=(x.type.broadcastable[0], y.type.broadcastable[1])
        )
        return Apply(self, [x, y], [otype()])

    def perform(self, node, inputs, outputs):
        x, y = inputs

        out = pygpu.empty((x.shape[0], y.shape[1]), dtype=x.dtype, context=x.context)
        # GEMM with beta=0 overwrites the uninitialized buffer.
        outputs[0][0] = blas.gemm(1.0, x, y, 0.0, out, overwrite_c=True)

    def c_code(self, node, name, inputs, outputs, sub):
        dtype = node.inputs[0].dtype
        typecode = pygpu.gpuarray.dtype_to_typecode(dtype)
        vars = dict(
            A=inputs[0],
            B=inputs[1],
            dtype=dtype,
            out=outputs[0],
            typecode=typecode,
            fail=sub["fail"],
            name=name,
        )
        code = (
            """
        double one = 1.;
        double zero = 0.;

        size_t dims[] = {0, 0};
        dims[0] = PyGpuArray_DIMS(%(A)s)[0];
        dims[1] = PyGpuArray_DIMS(%(B)s)[1];

        if (aesara_prep_output(&%(out)s, 2, dims, %(typecode)s, GA_C_ORDER,
                               %(A)s->context)) {
            %(fail)s
        }

        if (pygpu_blas_rgemm(cb_no_trans, cb_no_trans,
                             one,
                             %(A)s, %(B)s,
                             zero,
                             %(out)s, 0) == -1) {
            %(fail)s
        }
        """
            % vars
        )
        return code

    def c_code_cache_version(self):
        return (5,)
gpu_dot22 = GpuDot22()
class GpuGemmBatch(BlasOp, _NoPythonCOp):
    """
    Batched GEMM on the GPU: for each index ``i`` along the leading (batch)
    dimension, ``out[i] = beta * C[i] + alpha * dot(A[i], B[i])``.
    """
    params_type = ParamsType(inplace=bool_t)
    __props__ = ("inplace",)
    _f16_ok = True
    def __init__(self, inplace=False):
        # When `inplace`, output 0 reuses (destroys) input 0, i.e. C.
        self.inplace = inplace
        if self.inplace:
            self.destroy_map = {0: [0]}
    def make_node(self, C, alpha, A, B, beta):
        """
        Build the Apply node. `A`, `B` and `C` must be 3D with a common
        dtype; `alpha` and `beta` are scalars (float16 scalars are upcast
        to float32, matching the assertions below).
        """
        ctx_name = infer_context_name(C, A, B)
        A = as_gpuarray_variable(A, ctx_name)
        B = as_gpuarray_variable(B, ctx_name)
        C = as_gpuarray_variable(C, ctx_name)
        alpha = as_tensor_variable(alpha)
        if alpha.dtype == "float16":
            alpha = alpha.astype("float32")
        beta = as_tensor_variable(beta)
        if beta.dtype == "float16":
            beta = beta.astype("float32")
        assert alpha.ndim == 0
        assert beta.ndim == 0
        assert A.ndim == 3
        assert B.ndim == 3
        assert C.ndim == 3
        assert A.dtype == B.dtype == C.dtype
        if A.dtype in ("float32", "float64"):
            assert A.dtype == alpha.dtype == beta.dtype
        else:
            # Non-float32/64 data (e.g. float16) still uses float32 scalars.
            assert "float32" == alpha.dtype == beta.dtype
        return Apply(self, [C, alpha, A, B, beta], [C.type()])
    def c_headers(self, **kwargs):
        return super().c_headers(**kwargs) + ["<gpuarray/blas.h>"]
    def c_code(self, node, name, inp, out, sub):
        # Substitution mapping for the C template below.
        vars = dict(
            out=out[0],
            C=inp[0],
            alpha=inp[1],
            A=inp[2],
            B=inp[3],
            beta=inp[4],
            params=sub["params"],
            fail=sub["fail"],
            name=name,
        )
        code = (
            """
        int err;
        if (%(params)s->inplace){
          if (!GpuArray_ISONESEGMENT(&%(C)s->ga)) {
            %(out)s = aesara_try_copy(%(out)s, %(C)s);
            if (%(out)s == NULL) {
              %(fail)s
            }
          } else {
            Py_XDECREF(%(out)s);
            %(out)s = %(C)s;
            Py_INCREF(%(out)s);
          }
        } else {
          %(out)s = aesara_try_copy(%(out)s, %(C)s);
          if (%(out)s == NULL) {
            %(fail)s
          }
        }
        err = GpuArray_rgemmBatch_3d(
          cb_no_trans, cb_no_trans,
          ((dtype_%(alpha)s *)PyArray_DATA(%(alpha)s))[0],
          &%(A)s->ga, &%(B)s->ga,
          ((dtype_%(beta)s *)PyArray_DATA(%(beta)s))[0],
          &%(out)s->ga, 0);
        if (err != GA_NO_ERROR) {
          PyErr_Format(PyExc_RuntimeError,
                       "%%s", GpuArray_error(&%(A)s->ga, err));
          %(fail)s;
        }
        """
            % vars
        )
        return code
    def c_code_cache_version(self):
        # Bump whenever the C template above changes.
        return (4,)
# Module-level GpuGemmBatch instances: out-of-place and in-place variants.
gpugemmbatch_no_inplace = GpuGemmBatch(inplace=False)
gpugemmbatch_inplace = GpuGemmBatch(inplace=True)
class BaseGpuCorrMM(CGpuKernelBase):
    """
    Base class for `GpuCorrMM`, `GpuCorrMM_gradWeights` and
    `GpuCorrMM_gradInputs`. Cannot be used directly.
    Parameters
    ----------
    border_mode : {'valid', 'full', 'half'}
        Additionally, the padding size could be directly specified by an integer,
        a pair of integers, or two pairs of integers.
    subsample
        Perform subsampling of the output (default: (1, 1)).
    filter_dilation
        Perform subsampling of the input, also known as dilation (default: (1, 1)).
    num_groups :
        Divides the image, kernel and output tensors into num_groups
        separate groups. Each which carry out convolutions separately (default : 1).
    unshared
        Perform unshared correlation (default: False)
    """
    check_broadcast = False
    __props__ = (
        "border_mode",
        "subsample",
        "filter_dilation",
        "num_groups",
        "unshared",
    )
    _f16_ok = True
    def __init__(
        self,
        border_mode="valid",
        subsample=(1, 1),
        filter_dilation=(1, 1),
        num_groups=1,
        unshared=False,
    ):
        # A single non-negative int means symmetric padding on both axes.
        if isinstance(border_mode, int):
            if border_mode < 0:
                raise ValueError(
                    "invalid border_mode {}, which must be a "
                    "non-negative integer".format(border_mode)
                )
            border_mode = ((border_mode, border_mode),) * 2
        elif isinstance(border_mode, tuple):
            if len(border_mode) != 2:
                raise ValueError(
                    "invalid border_mode {} which must be a "
                    "tuple of length 2".format(border_mode)
                )
            # Normalize each axis entry to an explicit (begin, end) pad pair.
            border = ()
            for mode in border_mode:
                if isinstance(mode, tuple) and len(mode) == 2 and min(mode) >= 0:
                    border += ((int(mode[0]), int(mode[1])),)
                elif mode >= 0:
                    border += ((int(mode), int(mode)),)
                else:
                    raise ValueError(
                        "invalid border mode {}. The tuple can only contain "
                        "integers or tuples of length 2".format(border_mode)
                    )
            border_mode = border
        elif border_mode not in ("valid", "full", "half"):
            raise ValueError(
                "invalid border_mode {}, which must be either "
                '"valid", "full", "half", an integer or a tuple '
                "of length 2".format(border_mode)
            )
        self.border_mode = border_mode
        if len(subsample) != 2:
            raise ValueError("subsample must have two elements")
        if len(filter_dilation) != 2:
            raise ValueError("filter_dilation must have two elements")
        self.subsample = tuple(subsample)
        self.filter_dilation = tuple(filter_dilation)
        if num_groups < 1:
            raise ValueError("Number of groups should be greater than 0")
        self.num_groups = num_groups
        # The GEMM-based correlation kernel lives in this C support file.
        CGpuKernelBase.__init__(self, ["c_code/corr_gemm.c"])
        self.unshared = unshared
    @property
    def pad(self):
        # Explicit per-axis (begin, end) padding; 'valid' means no padding.
        if self.border_mode != "valid":
            return self.border_mode
        return ((0, 0),) * 2
    def __str__(self):
        # Readable Op signature including all hyper-parameters.
        return "{}{{{}, {}, {}, {}, {}}}".format(
            self.__class__.__name__,
            self.border_mode,
            str(self.subsample),
            str(self.filter_dilation),
            str(self.num_groups),
            str(self.unshared),
        )
    def __setstate__(self, d):
        # Unpickling support: states pickled before `num_groups` existed
        # get the backward-compatible default of 1.
        self.__dict__.update(d)
        if not hasattr(self, "num_groups"):
            self.num_groups = 1
    def flops(self, inp, outp):
        """
        Useful with the hack in profilemode to print the MFlops.
        """
        # if the output shape is correct, then this gives the correct
        # flops for any direction, sampling, padding, and border mode
        inputs, filters = inp
        (outputs,) = outp
        assert inputs[1] == (filters[1] * self.num_groups)
        # nb mul and add by output pixel
        flops = filters[2] * filters[3] * 2
        # nb flops by output image
        flops *= outputs[2] * outputs[3]
        # nb patch multiplied
        flops *= inputs[1] * filters[0] * inputs[0] / self.num_groups
        return flops
    def c_headers(self, **kwargs):
        return ["<gpuarray/array.h>", "<gpuarray/blas.h>", "gpuarray_helper.h"]
    def c_header_dirs(self, **kwargs):
        return [gpuarray_helper_inc_dir()]
    def c_code_cache_version(self):
        # Raise this whenever modifying the C code (including the file).
        return (12,)
    def c_code_helper(
        self, bottom, weights, top, direction, sub, height=None, width=None
    ):
        """
        This generates the C code for GpuCorrMM (direction="forward"),
        GpuCorrMM_gradWeights (direction="backprop weights"), and
        GpuCorrMM_gradInputs (direction="backprop inputs").
        Depending on the direction, one of bottom, weights, top will
        receive the output, while the other two serve as inputs.
        Parameters
        ----------
        bottom
            Variable name of the input images in the forward pass,
            or the gradient of the input images in backprop wrt. inputs
        weights
            Variable name of the filters in the forward pass,
            or the gradient of the filters in backprop wrt. weights
        top
            Variable name of the output images / feature maps in the
            forward pass, or the gradient of the outputs in the backprop passes
        direction : {'forward', 'backprop weights', 'backprop inputs'}
            "forward" to correlate bottom with weights and store results in top,
            "backprop weights" to do a valid convolution of bottom with top
            (swapping the first two dimensions) and store results in weights,
            and "backprop inputs" to do a full convolution of top with weights
            (swapping the first two dimensions) and store results in bottom.
        sub
            Dictionary of substitutions usable to help generating the C code.
        height
            Required if self.subsample[0] != 1, a variable giving the height of
            the filters for direction="backprop weights" or the height of the
            input images for direction="backprop inputs".
            Required if self.border_mode == 'half', a variable giving the height
            of the filters for direction="backprop weights".
            Not required otherwise, but if a value is given this will be checked.
        width
            Required if self.subsample[1] != 1, a variable giving the width of
            the filters for direction="backprop weights" or the width of the
            input images for direction="backprop inputs".
            Required if self.border_mode == 'half', a variable giving the width
            of the filters for direction="backprop weights".
            Not required otherwise, but if a value is given this will be checked.
        """
        dH, dW = self.subsample
        dilH, dilW = self.filter_dilation
        numgroups = self.num_groups
        unshared = int(self.unshared)
        # Padding is encoded for the C code as: -1 = "half", -2 = "full",
        # otherwise the explicit non-negative per-side amounts.
        if self.border_mode == "half":
            padH_l = padH_r = padW_l = padW_r = -1
        elif self.border_mode == "full":
            padH_l = padH_r = padW_l = padW_r = -2
        elif isinstance(self.border_mode, tuple):
            (padH_l, padH_r), (padW_l, padW_r) = self.border_mode
        else:
            assert self.border_mode == "valid"
            padH_l = padH_r = padW_l = padW_r = 0
        # Map direction to the integer code used by the C template and pick
        # which of the three variables receives the Op's output.
        if direction == "forward":
            direction = 0
            out = top
        elif direction == "backprop weights":
            direction = 1
            out = weights
        elif direction == "backprop inputs":
            direction = 2
            out = bottom
        else:
            raise ValueError(
                "direction must be one of 'forward', "
                "'backprop weights', 'backprop inputs'"
            )
        # When subsampling, we cannot unambiguously infer the height and width
        # of bottom and weights from top, so we require them to be given.
        # Similarly, when pad="half", we cannot infer the weight size.
        if height:
            height = f"(*(npy_int*)(PyArray_DATA({height})))"
        else:
            if ((direction != 0) and (dH != 1)) or (
                (direction == 1) and (padH_l == -1 or padH_r == -1)
            ):
                raise ValueError(
                    "height must be given for backprop with vertical sampling or pad='half'"
                )
            height = "-1"
        if width:
            width = f"(*(npy_int*)(PyArray_DATA({width})))"
        else:
            if ((direction != 0) and (dW != 1)) or (
                (direction == 1) and (padW_l == -1 or padW_r == -1)
            ):
                raise ValueError(
                    "width must be given for backprop with horizontal sampling or pad='half'"
                )
            width = "-1"
        # Expose all locals (dH, dW, pad*, out, ...) to the C template below.
        sub = sub.copy()
        sub.update(locals())
        return (
            """
        // Mandatory args
        int direction = %(direction)s; // forward, bprop weights, bprop inputs
        // Optional args
        size_t dH = %(dH)s;
        size_t dW = %(dW)s;
        size_t dilH = %(dilH)s;
        size_t dilW = %(dilW)s;
        int padH_l = %(padH_l)s;
        int padH_r = %(padH_r)s;
        int padW_l = %(padW_l)s;
        int padW_r = %(padW_r)s;
        int numgroups = %(numgroups)s;
        int unshared = %(unshared)s;
        PyGpuArrayObject * bottom = %(bottom)s;
        PyGpuArrayObject * weights = %(weights)s;
        PyGpuArrayObject * top = %(top)s;
        PyGpuArrayObject * out2 = NULL;
        int wdim, odim;
        wdim = unshared ? 6 : 4;
        odim = 4; //Can be set to 6 later for unshared backprop wrt weights
        // Obtain or infer kernel width and height
        // (we need to know it early to be able to handle auto-padding)
        size_t kH, kW, dil_kH, dil_kW;
        if (direction != 1) {
            // weight is an input variable, we can just read its shape
            kH = PyGpuArray_DIMS(weights)[wdim-2];
            kW = PyGpuArray_DIMS(weights)[wdim-1];
        }
        else {
            if (%(height)s != -1) {
                // kernel height is specified (perhaps vertical subsampling or half padding)
                kH = %(height)s;
            }
            else if (padH_l == -2 || padH_r == -2) {
                // vertical full padding, we can infer the kernel height
                kH = (2 - PyGpuArray_DIMS(bottom)[2] + (PyGpuArray_DIMS(top)[2] - 1) * dH - 1) / dilH + 1;
            }
            else {
                // explicit padding, we can infer the kernel height
                kH = (PyGpuArray_DIMS(bottom)[2] + padH_l + padH_r - (PyGpuArray_DIMS(top)[2] - 1) * dH - 1) / dilH + 1 ;
            }
            if (%(width)s != -1) {
                kW = %(width)s;
            }
            else if (padW_l == -2 || padW_r == -2) {
                kW = (2 - PyGpuArray_DIMS(bottom)[3] + (PyGpuArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
            }
            else {
                kW = (PyGpuArray_DIMS(bottom)[3] + padW_l + padW_r - (PyGpuArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
            }
        }
        // Implicit dilated kernel size
        dil_kH = (kH - 1) * dilH + 1;
        dil_kW = (kW - 1) * dilW + 1;
        // Auto-padding if requested
        if (padH_l == -1 || padH_r == -1) { // vertical half padding
            padH_l = padH_r = dil_kH / 2;
        }
        else if (padH_l == -2 || padH_r == -2) { // vertical full padding
            padH_l = padH_r = dil_kH - 1;
        }
        else if (padH_l < 0 || padH_r < 0) {
            PyErr_SetString(PyExc_ValueError, "BaseGpuCorrMM: padH must be >= -2");
            %(fail)s
        }
        if (padW_l == -1 || padW_r == -1) { // horizontal half padding
            padW_l = padW_r = dil_kW / 2;
        }
        else if (padW_l == -2 || padW_r == -2) { // horizontal full padding
            padW_l = padW_r = dil_kW - 1;
        }
        else if (padW_l < 0 || padW_r < 0) {
            PyErr_SetString(PyExc_ValueError, "BaseGpuCorrMM: padW must be >= -2");
            %(fail)s
        }
        // Infer output shape and type
        // The inferred shape can be negative.
        long long out_dim[6];
        size_t out_dim_size[6];
        out_dim[4] = out_dim[5] = 0; //Only used for unshared backprop wrt weights
        out_dim_size[4] = out_dim_size[5] = 0; //Same
        int out_typecode;
        PyGpuContextObject *out_context;
        switch(direction) {
        case 0: // forward pass
            // output is top: (batchsize, num_filters, height, width)
            // height and width: top = (bottom + pad_l + pad_r - ((weight-1)*dil + 1)) / sample + 1
            out_dim[0] = PyGpuArray_DIMS(bottom)[0];
            out_dim[1] = PyGpuArray_DIMS(weights)[0];
            out_dim[2] = (PyGpuArray_DIMS(bottom)[2] + padH_l + padH_r - ((PyGpuArray_DIMS(weights)[wdim-2]-1)*dilH + 1)) / dH + 1;
            out_dim[3] = (PyGpuArray_DIMS(bottom)[3] + padW_l + padW_r - ((PyGpuArray_DIMS(weights)[wdim-1]-1)*dilW + 1)) / dW + 1;
            out_typecode = bottom->ga.typecode;
            out_context = bottom->context;
            if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
            {
                if (unshared) {
                    PyErr_Format(PyExc_ValueError,
                                 "GpuCorrMM: impossible output shape\\n"
                                 "  bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
                                 "  weights shape: %%ld x %%ld x %%ld x %%ld x %%ld x %%ld\\n"
                                 "  top shape: %%ld x %%ld x %%ld x %%ld\\n",
                                 PyGpuArray_DIMS(bottom)[0], PyGpuArray_DIMS(bottom)[1],
                                 PyGpuArray_DIMS(bottom)[2], PyGpuArray_DIMS(bottom)[3],
                                 PyGpuArray_DIMS(weights)[0], PyGpuArray_DIMS(weights)[1],
                                 PyGpuArray_DIMS(weights)[2], PyGpuArray_DIMS(weights)[3],
                                 PyGpuArray_DIMS(weights)[4], PyGpuArray_DIMS(weights)[5],
                                 out_dim[0], out_dim[1], out_dim[2], out_dim[3]);
                    %(fail)s
                }
                else {
                    PyErr_Format(PyExc_ValueError,
                                 "GpuCorrMM: impossible output shape\\n"
                                 "  bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
                                 "  weights shape: %%ld x %%ld x %%ld x %%ld\\n"
                                 "  top shape: %%ld x %%ld x %%ld x %%ld\\n",
                                 PyGpuArray_DIMS(bottom)[0], PyGpuArray_DIMS(bottom)[1],
                                 PyGpuArray_DIMS(bottom)[2], PyGpuArray_DIMS(bottom)[3],
                                 PyGpuArray_DIMS(weights)[0], PyGpuArray_DIMS(weights)[1],
                                 PyGpuArray_DIMS(weights)[2], PyGpuArray_DIMS(weights)[3],
                                 out_dim[0], out_dim[1], out_dim[2], out_dim[3]);
                    %(fail)s
                }
            }
            break;
        case 1: // backprop wrt. weights
            // output is weights: (num_filters, num_channels, height, width) or
            // (num_filters, top_height, top_width, num_channels, height, width) -> for unshared
            // height and width: weights = (bottom + 2*pad - (top - 1) * sample - 1) / dil + 1
            out_dim[0] = PyGpuArray_DIMS(top)[1];
            if (unshared){
                odim = 6;
                out_dim[1] = PyGpuArray_DIMS(top)[2];
                out_dim[2] = PyGpuArray_DIMS(top)[3];
            }
            out_dim[wdim-3] = PyGpuArray_DIMS(bottom)[1] / numgroups;
            out_dim[wdim-2] = kH; // already inferred further above
            out_dim[wdim-1] = kW; // how convenient
            out_typecode = top->ga.typecode;
            out_context = top->context;
            if (unshared) {
                if (out_dim[0] < 0 || out_dim[1] <= 0 || out_dim[2] <= 0 || out_dim[3] < 0
                        || out_dim[4] <= 0 || out_dim[5] <= 0){
                    PyErr_Format(PyExc_ValueError,
                                 "GpuCorrMM backprop wrt. weights: impossible output shape\\n"
                                 "  bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
                                 "  weights shape: %%ld x %%ld x %%ld x %%ld x %%ld x %%ld\\n"
                                 "  top shape: %%ld x %%ld x %%ld x %%ld\\n",
                                 PyGpuArray_DIMS(bottom)[0], PyGpuArray_DIMS(bottom)[1],
                                 PyGpuArray_DIMS(bottom)[2], PyGpuArray_DIMS(bottom)[3],
                                 out_dim[0], out_dim[1], out_dim[2], out_dim[3],
                                 out_dim[4], out_dim[5],
                                 PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
                                 PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3]);
                    %(fail)s
                }
            }
            else {
                if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
                {
                    PyErr_Format(PyExc_ValueError,
                                 "GpuCorrMM backprop wrt. weights: impossible output shape\\n"
                                 "  bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
                                 "  weights shape: %%ld x %%ld x %%ld x %%ld\\n"
                                 "  top shape: %%ld x %%ld x %%ld x %%ld\\n",
                                 PyGpuArray_DIMS(bottom)[0], PyGpuArray_DIMS(bottom)[1],
                                 PyGpuArray_DIMS(bottom)[2], PyGpuArray_DIMS(bottom)[3],
                                 out_dim[0], out_dim[1], out_dim[2], out_dim[3],
                                 PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
                                 PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3]);
                    %(fail)s
                }
            }
            break;
        case 2: // backprop wrt. inputs
            // output is bottom: (batchsize, num_channels, height, width)
            // height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
            out_dim[0] = PyGpuArray_DIMS(top)[0];
            out_dim[1] = PyGpuArray_DIMS(weights)[wdim-3] * numgroups;
            out_dim[2] = (%(height)s != -1) ? %(height)s : (PyGpuArray_DIMS(top)[2] - 1) * dH + (PyGpuArray_DIMS(weights)[wdim-2]-1)*dilH + 1 - padH_l - padH_r;
            out_dim[3] = (%(width)s != -1) ? %(width)s : (PyGpuArray_DIMS(top)[3] - 1) * dW + (PyGpuArray_DIMS(weights)[wdim-1]-1)*dilW + 1 - padW_l - padW_r;
            out_typecode = top->ga.typecode;
            out_context = top->context;
            if (unshared) {
                if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
                {
                    PyErr_Format(PyExc_ValueError,
                                 "GpuCorrMM backprop wrt. inputs: impossible output shape\\n"
                                 "  bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
                                 "  weight shape: %%ld x %%ld x %%ld x %%ld x %%ld x %%ld\\n"
                                 "  top shape: %%ld x %%ld x %%ld x %%ld\\n",
                                 out_dim[0], out_dim[1], out_dim[2], out_dim[3],
                                 PyGpuArray_DIMS(weights)[0], PyGpuArray_DIMS(weights)[1],
                                 PyGpuArray_DIMS(weights)[2], PyGpuArray_DIMS(weights)[3],
                                 PyGpuArray_DIMS(weights)[4], PyGpuArray_DIMS(weights)[5],
                                 PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
                                 PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3]);
                    %(fail)s
                }
            }
            else {
                if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
                {
                    PyErr_Format(PyExc_ValueError,
                                 "GpuCorrMM backprop wrt. inputs: impossible output shape\\n"
                                 "  bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
                                 "  weight shape: %%ld x %%ld x %%ld x %%ld\\n"
                                 "  top shape: %%ld x %%ld x %%ld x %%ld\\n",
                                 out_dim[0], out_dim[1], out_dim[2], out_dim[3],
                                 PyGpuArray_DIMS(weights)[0], PyGpuArray_DIMS(weights)[1],
                                 PyGpuArray_DIMS(weights)[2], PyGpuArray_DIMS(weights)[3],
                                 PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
                                 PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3]);
                    %(fail)s
                }
            }
            break;
        default:
            PyErr_SetString(PyExc_ValueError, "BaseGpuCorrMM: direction must be 0, 1, or 2\\n");
            %(fail)s
        }
        out_dim_size[0] = (size_t)out_dim[0];
        out_dim_size[1] = (size_t)out_dim[1];
        out_dim_size[2] = (size_t)out_dim[2];
        out_dim_size[3] = (size_t)out_dim[3];
        if (odim == 6) {
            out_dim_size[4] = (size_t)out_dim[4];
            out_dim_size[5] = (size_t)out_dim[5];
        }
        // Prepare output array
        if (aesara_prep_output(&%(out)s, odim, out_dim_size, out_typecode, GA_C_ORDER, out_context) != 0)
        {
            if (odim == 4) {
                PyErr_Format(PyExc_RuntimeError,
                             "BaseGpuCorrMM: Failed to allocate output of %%lld x %%lld x %%lld x %%lld",
                             out_dim[0], out_dim[1], out_dim[2], out_dim[3]);
            }
            if (odim == 6) {
                PyErr_Format(PyExc_RuntimeError,
                             "BaseGpuCorrMM: Failed to allocate output of %%lld x %%lld x %%lld x %%lld %%lld %%lld",
                             out_dim[0], out_dim[1], out_dim[2], out_dim[3], out_dim[4], out_dim[5]);
            }
            %(fail)s
        }
        if (!GpuArray_IS_C_CONTIGUOUS(&%(out)s->ga)) {
            PyErr_SetString(PyExc_ValueError, "Only contiguous outputs are supported.");
            %(fail)s
        }
        // Call GPU code
        out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dilH, dilW,
                      padH_l, padH_r, padW_l, padW_r, numgroups, unshared);
        if (out2==NULL){
           %(fail)s
        }
        assert (out2 == %(out)s);
        """
            % sub
        )
class GpuCorrMM(BaseGpuCorrMM, _NoPythonCOp):
    """
    GEMM-based correlation on the GPU (forward pass).

    Parameters
    ----------
    border_mode
        Implicit zero-padding of the input: a string shortcut
        (``'valid'`` = ``(0, 0)``, i.e. no padding; ``'full'`` =
        ``(kernel_rows - 1, kernel_columns - 1)``; ``'half'`` =
        ``(kernel_rows // 2, kernel_columns // 2)``, "same" convolution
        for odd-sized kernels), a single integer used on all sides, a
        pair of per-axis widths each applied to both sides, or two pairs
        of integers giving the exact ((left, right), (top, bottom))
        padding.
    subsample
        Pair of output strides; ``(sv, sh)`` is equivalent to
        ``GpuCorrMM(...)(...)[:, :, ::sv, ::sh]`` but faster.
        ``(1, 1)`` disables subsampling.
    filter_dilation
        Pair of filter dilation factors; ``(1, 1)`` disables dilation.
    num_groups
        Integer number of groups the image and kernel are divided into
        for grouped convolution; ``1`` disables grouping.
    unshared
        Perform unshared correlation (default: False)

    Notes
    -----
    The Op currently requires inputs, filters and outputs to be
    C-contiguous; wrap arguments with :func:`gpu_contiguous
    <aesara.gpuarray.basic_ops.gpu_contiguous>` if needed.
    Either enable the Aesara flag ``optimizer_including=conv_gemm`` to
    substitute this Op (and its gradients) for convolution operations
    automatically, or call ``GpuCorrMM(subsample=...)(image, filters)``
    directly as a replacement for :func:`conv2d
    <aesara.tensor.nnet.conv.conv2d>`. Note that it computes a
    *correlation*; for a convolution, flip the filters as
    ``filters[:, :, ::-1, ::-1]``.
    """

    def __init__(
        self,
        border_mode="valid",
        subsample=(1, 1),
        filter_dilation=(1, 1),
        num_groups=1,
        unshared=False,
    ):
        super().__init__(border_mode, subsample, filter_dilation, num_groups, unshared)

    def make_node(self, img, kern):
        """Validate ranks and build the Apply node for the forward pass."""
        ctx = infer_context_name(img, kern)
        img = as_gpuarray_variable(img, ctx)
        kern = as_gpuarray_variable(kern, ctx)
        if img.type.ndim != 4:
            raise TypeError("img must be 4D tensor")
        # Unshared correlation carries per-position filters -> 6D kernel.
        if self.unshared:
            if kern.type.ndim != 6:
                raise TypeError("kern must be 6D tensor")
        elif kern.type.ndim != 4:
            raise TypeError("kern must be 4D tensor")
        out_bcast = [
            img.type.broadcastable[0],
            kern.type.broadcastable[0],
            False,
            False,
        ]
        out_type = GpuArrayType(
            dtype=img.dtype, context_name=ctx, broadcastable=out_bcast
        )
        return Apply(self, [img, kern], [out_type()])

    def c_code(self, node, nodename, inp, out_, sub):
        bottom, weights = inp
        (top,) = out_
        return super().c_code_helper(bottom, weights, top, "forward", sub)

    def grad(self, inp, grads):
        """Gradients wrt images and filters via the two backprop Ops."""
        bottom, weights = inp
        (top,) = grads
        top = gpu_contiguous(top)
        conv_args = (
            self.border_mode,
            self.subsample,
            self.filter_dilation,
            self.num_groups,
            self.unshared,
        )
        d_bottom = GpuCorrMM_gradInputs(*conv_args)(weights, top, bottom.shape[-2:])
        d_weights = GpuCorrMM_gradWeights(*conv_args)(bottom, top, weights.shape[-2:])
        return d_bottom, d_weights
class GpuCorrMM_gradWeights(BaseGpuCorrMM, _NoPythonCOp):
    """
    Gradient of `GpuCorrMM` with respect to the filters.

    Notes
    -----
    This Op is not meant to be called directly; Aesara's automatic
    differentiation or graph optimizations insert it where needed.
    """

    def __init__(
        self,
        border_mode="valid",
        subsample=(1, 1),
        filter_dilation=(1, 1),
        num_groups=1,
        unshared=False,
    ):
        super().__init__(border_mode, subsample, filter_dilation, num_groups, unshared)

    def make_node(self, img, topgrad, shape=None):
        """Build the Apply node; `shape` supplies the filter height/width
        when they cannot be inferred (strided or 'half'-padded case)."""
        ctx = infer_context_name(img, topgrad)
        img = as_gpuarray_variable(img, ctx)
        topgrad = as_gpuarray_variable(topgrad, ctx)
        if img.type.ndim != 4:
            raise TypeError("img must be 4D tensor")
        if topgrad.type.ndim != 4:
            raise TypeError("topgrad must be 4D tensor")
        if shape is None:
            if self.subsample != (1, 1) or self.border_mode == "half":
                raise ValueError(
                    'shape must be given if subsample != (1, 1) or border_mode == "half"'
                )
            height_width = []
        else:
            assert shape[0].ndim == 0
            assert shape[1].ndim == 0
            height_width = [shape[0], shape[1]]
        if self.unshared:
            out_bcast = [
                topgrad.type.broadcastable[1],
                False,
                False,
                img.type.broadcastable[1],
                False,
                False,
            ]
        else:
            out_bcast = [
                topgrad.type.broadcastable[1],
                img.type.broadcastable[1],
                False,
                False,
            ]
        out_type = GpuArrayType(
            dtype=img.dtype, context_name=ctx, broadcastable=out_bcast
        )
        return Apply(self, [img, topgrad] + height_width, [out_type()])

    def c_code(self, node, nodename, inp, out_, sub):
        bottom, top = inp[:2]
        height, width = inp[2:] or (None, None)
        (weights,) = out_
        return super().c_code_helper(
            bottom, weights, top, "backprop weights", sub, height, width
        )

    def grad(self, inp, grads):
        bottom, top = inp[:2]
        (weights,) = grads
        weights = gpu_contiguous(weights)
        conv_args = (
            self.border_mode,
            self.subsample,
            self.filter_dilation,
            self.num_groups,
            self.unshared,
        )
        d_bottom = GpuCorrMM_gradInputs(*conv_args)(weights, top, bottom.shape[-2:])
        d_top = GpuCorrMM(*conv_args)(bottom, weights)
        if len(inp) == 4:
            # The optional height/width inputs are pure shape information.
            disc = aesara.gradient.DisconnectedType()
            return (d_bottom, d_top, disc(), disc())
        return (d_bottom, d_top)

    def connection_pattern(self, node):
        if node.nin == 2:
            return [[1], [1]]
        # No gradient flows through the height/width shape inputs.
        return [[1], [1], [0], [0]]
class GpuCorrMM_gradInputs(BaseGpuCorrMM, _NoPythonCOp):
    """
    Gradient of `GpuCorrMM` with respect to the input images.

    Notes
    -----
    This Op is not meant to be called directly; Aesara's automatic
    differentiation or graph optimizations insert it where needed.
    """

    def __init__(
        self,
        border_mode="valid",
        subsample=(1, 1),
        filter_dilation=(1, 1),
        num_groups=1,
        unshared=False,
    ):
        super().__init__(border_mode, subsample, filter_dilation, num_groups, unshared)

    def make_node(self, kern, topgrad, shape=None):
        """Build the Apply node; `shape` supplies the input height/width
        when subsampling makes them ambiguous."""
        ctx = infer_context_name(kern, topgrad)
        kern = as_gpuarray_variable(kern, ctx)
        topgrad = as_gpuarray_variable(topgrad, ctx)
        # Unshared correlation carries per-position filters -> 6D kernel.
        if self.unshared:
            if kern.type.ndim != 6:
                raise TypeError("kern must be 6D tensor")
        elif kern.type.ndim != 4:
            raise TypeError("kern must be 4D tensor")
        if topgrad.type.ndim != 4:
            raise TypeError("topgrad must be 4D tensor")
        if shape is None:
            if self.subsample != (1, 1):
                raise ValueError("shape must be given if subsample != (1, 1)")
            height_width = []
        else:
            assert shape[0].ndim == 0
            assert shape[1].ndim == 0
            height_width = [shape[0], shape[1]]
        if self.num_groups > 1:
            out_bcast = [topgrad.type.broadcastable[0], False, False, False]
        else:
            out_bcast = [
                topgrad.type.broadcastable[0],
                kern.type.broadcastable[-3],
                False,
                False,
            ]
        out_type = GpuArrayType(
            dtype=topgrad.dtype,
            context_name=ctx,
            broadcastable=out_bcast,
        )
        return Apply(self, [kern, topgrad] + height_width, [out_type()])

    def c_code(self, node, nodename, inp, out_, sub):
        weights, top = inp[:2]
        height, width = inp[2:] or (None, None)
        (bottom,) = out_
        return super().c_code_helper(
            bottom, weights, top, "backprop inputs", sub, height, width
        )

    def grad(self, inp, grads):
        weights, top = inp[:2]
        (bottom,) = grads
        bottom = gpu_contiguous(bottom)
        conv_args = (
            self.border_mode,
            self.subsample,
            self.filter_dilation,
            self.num_groups,
            self.unshared,
        )
        d_weights = GpuCorrMM_gradWeights(*conv_args)(bottom, top, weights.shape[-2:])
        d_top = GpuCorrMM(*conv_args)(bottom, weights)
        if len(inp) == 4:
            # The optional height/width inputs are pure shape information.
            disc = aesara.gradient.DisconnectedType()
            return (d_weights, d_top, disc(), disc())
        return (d_weights, d_top)

    def connection_pattern(self, node):
        if node.nin == 2:
            return [[1], [1]]
        # No gradient flows through the height/width shape inputs.
        return [[1], [1], [0], [0]]
class BaseGpuCorr3dMM(CGpuKernelBase, _NoPythonCOp):
"""
Base class for `GpuCorr3dMM`, `GpuCorr3dMM_gradWeights` and
`GpuCorr3dMM_gradInputs`. Cannot be used directly.
Parameters
----------
border_mode : {'valid', 'full', 'half'}
Additionally, the padding size could be directly specified by an integer
or a pair of integers
subsample
Perform subsampling of the output (default: (1, 1, 1)).
filter_dilation
Perform subsampling of the input, also known as dilation (default: (1, 1, 1)).
num_groups :
Divides the image, kernel and output tensors into num_groups
separate groups. Each which carry out convolutions separately (default : 1).
"""
check_broadcast = False
__props__ = ("border_mode", "subsample", "filter_dilation", "num_groups")
_f16_ok = True
def __init__(
self,
border_mode="valid",
subsample=(1, 1, 1),
filter_dilation=(1, 1, 1),
num_groups=1,
):
if isinstance(border_mode, int):
border_mode = (border_mode, border_mode, border_mode)
if isinstance(border_mode, tuple):
pad_h, pad_w, pad_d = map(int, border_mode)
border_mode = (pad_h, pad_w, pad_d)
if not (
(isinstance(border_mode, tuple) and min(border_mode) >= 0)
or border_mode in ("valid", "full", "half")
):
raise ValueError(
"invalid border_mode {}, which must be either "
'"valid", "full", "half", an integer or a tuple of'
" three integers".format(border_mode)
)
self.border_mode = border_mode
if len(subsample) != 3:
raise ValueError("subsample must have three elements")
if len(filter_dilation) != 3:
raise ValueError("filter_dilation must have three elements")
self.subsample = tuple(subsample)
self.filter_dilation = tuple(filter_dilation)
if num_groups < 1:
raise ValueError("Number of groups should be greater than 0")
self.num_groups = num_groups
CGpuKernelBase.__init__(self, ["c_code/corr3d_gemm.c"])
@property
def pad(self):
if self.border_mode != "valid":
return self.border_mode
return (0, 0, 0)
def __str__(self):
return "{}{{{}, {}, {}, {}}}".format(
self.__class__.__name__,
self.border_mode,
str(self.subsample),
str(self.filter_dilation),
str(self.num_groups),
)
    def __setstate__(self, d):
        # Unpickling support: states pickled before `num_groups` existed
        # get the backward-compatible default of 1.
        self.__dict__.update(d)
        if not hasattr(self, "num_groups"):
            self.num_groups = 1
def flops(self, inp, outp):
"""
Useful with the hack in profilemode to print the MFlops.
"""
# if the output shape is correct, then this gives the correct
# flops for any direction, sampling, padding, and border mode
inputs, filters = inp
(outputs,) = outp
assert inputs[1] == (filters[1] * self.num_groups)
# nb mul and add by output pixel
flops = filters[2] * filters[3] * filters[4] * 2
# nb flops by output image
flops *= outputs[2] * outputs[3] * outputs[4]
# nb patch multiplied
flops *= inputs[1] * filters[0] * inputs[0] / self.num_groups
return flops
def c_headers(self, **kwargs):
return ["<gpuarray/array.h>", "<gpuarray/blas.h>", "gpuarray_helper.h"]
    def c_header_dirs(self, **kwargs):
        # Directory that contains the gpuarray_helper.h listed in c_headers.
        return [gpuarray_helper_inc_dir()]
def c_code_cache_version(self):
# raise this whenever modifying the code below.
return (8,)
def c_code_helper(
    self, bottom, weights, top, direction, sub, height=None, width=None, depth=None
):
    """
    This generates the C code for GpuCorr3dMM (direction="forward"),
    GpuCorr3dMM_gradWeights (direction="backprop weights"), and
    GpuCorr3dMM_gradInputs (direction="backprop inputs").
    Depending on the direction, one of bottom, weights, top will
    receive the output, while the other two serve as inputs.

    Parameters
    ----------
    bottom
        Variable name of the input images in the forward pass,
        or the gradient of the input images in backprop wrt. inputs
    weights
        Variable name of the filters in the forward pass,
        or the gradient of the filters in backprop wrt. weights
    top
        Variable name of the output images / feature maps in the
        forward pass, or the gradient of the outputs in the backprop passes
    direction : {'forward', 'backprop weights', 'backprop inputs'}
        "forward" to correlate bottom with weights and store results in top,
        "backprop weights" to do a valid convolution of bottom with top
        (swapping the first two dimensions) and store results in weights,
        and "backprop inputs" to do a full convolution of top with weights
        (swapping the first two dimensions) and store results in bottom.
    sub
        Dictionary of substitutions usable to help generating the C code.
    height
        Required if self.subsample[0] != 1, a variable giving the height of
        the filters for direction="backprop weights" or the height of the
        input images for direction="backprop inputs".
        Required if self.border_mode == 'half', a variable giving the height
        of the filters for direction="backprop weights".
        Not required otherwise, but if a value is given this will be checked.
    width
        Required if self.subsample[1] != 1, a variable giving the width of
        the filters for direction="backprop weights" or the width of the
        input images for direction="backprop inputs".
        Required if self.border_mode == 'half', a variable giving the width
        of the filters for direction="backprop weights".
        Not required otherwise, but if a value is given this will be checked.
    depth
        Required if self.subsample[2] != 1, a variable giving the depth of
        the filters for direction="backprop weights" or the depth of the
        input images for direction="backprop inputs".
        Required if self.border_mode == 'half', a variable giving the depth
        of the filters for direction="backprop weights".
        Not required otherwise, but if a value is given this will be checked.

    Raises
    ------
    ValueError
        If `direction` is not one of the three accepted strings, or if a
        required `height`/`width`/`depth` variable is missing.
    """
    # These local names are substituted into the C template via
    # ``sub.update(locals())`` below — do not rename them.
    dH, dW, dD = self.subsample
    dilH, dilW, dilD = self.filter_dilation
    numgroups = self.num_groups
    # Negative pad values are sentinels resolved at runtime in the C code:
    # -1 means "half" padding, -2 means "full" padding.
    if self.border_mode == "half":
        padH = padW = padD = -1
    elif self.border_mode == "full":
        padH = padW = padD = -2
    elif isinstance(self.border_mode, tuple):
        padH, padW, padD = self.border_mode
    else:
        assert self.border_mode == "valid"
        padH = padW = padD = 0
    if direction == "forward":
        direction = 0
        out = top
    elif direction == "backprop weights":
        direction = 1
        out = weights
    elif direction == "backprop inputs":
        direction = 2
        out = bottom
    else:
        raise ValueError(
            "direction must be one of 'forward', "
            "'backprop weights', 'backprop inputs'"
        )
    # When subsampling, we cannot unambiguously infer the height and width
    # of bottom and weights from top, so we require them to be given.
    # Similarly, when pad="half", we cannot infer the weight size.
    if height:
        height = f"(*(npy_int*)(PyArray_DATA({height})))"
    else:
        if ((direction != 0) and (dH != 1)) or ((direction == 1) and (padH == -1)):
            raise ValueError(
                "height must be given for backprop with vertical sampling or pad='half'"
            )
        height = "-1"
    if width:
        width = f"(*(npy_int*)(PyArray_DATA({width})))"
    else:
        if ((direction != 0) and (dW != 1)) or ((direction == 1) and (padW == -1)):
            raise ValueError(
                "width must be given for backprop with horizontal sampling or pad='half'"
            )
        width = "-1"
    if depth:
        depth = f"(*(npy_int*)(PyArray_DATA({depth})))"
    else:
        if ((direction != 0) and (dD != 1)) or ((direction == 1) and (padD == -1)):
            # BUG FIX: this message previously said "horizontal sampling",
            # copy-pasted from the width branch; this branch is about depth.
            raise ValueError(
                "depth must be given for backprop with depth sampling or pad='half'"
            )
        depth = "-1"
    sub = sub.copy()
    sub.update(locals())
    # NOTE: the allocation-failure message below previously said
    # "BaseGpuCorrMM"; fixed to "BaseGpuCorr3dMM" for consistency with the
    # other messages emitted by this class.
    return (
        """
// Mandatory args
int direction = %(direction)s; // forward, bprop weights, bprop inputs
// Optional args
size_t dH = %(dH)s;
size_t dW = %(dW)s;
size_t dD = %(dD)s;
size_t dilH = %(dilH)s;
size_t dilW = %(dilW)s;
size_t dilD = %(dilD)s;
int padH = %(padH)s;
int padW = %(padW)s;
int padD = %(padD)s;
int numgroups = %(numgroups)s;
PyGpuArrayObject * bottom = %(bottom)s;
PyGpuArrayObject * weights = %(weights)s;
PyGpuArrayObject * top = %(top)s;
PyGpuArrayObject * out2 = NULL;
// Obtain or infer kernel height, width and depth
// (we need to know it early to be able to handle auto-padding)
size_t kH, kW, kD, dil_kH, dil_kW, dil_kD;
if (direction != 1) {
    // weight is an input variable, we can just read its shape
    kH = PyGpuArray_DIMS(weights)[2];
    kW = PyGpuArray_DIMS(weights)[3];
    kD = PyGpuArray_DIMS(weights)[4];
}
else {
    if (%(height)s != -1) {
        // kernel height is specified (perhaps vertical subsampling or half padding)
        kH = %(height)s;
    }
    else if (padH == -2) {
        // vertical full padding, we can infer the kernel height
        kH = (2 - PyGpuArray_DIMS(bottom)[2] + (PyGpuArray_DIMS(top)[2] - 1) * dH - 1) / dilH + 1;
    }
    else {
        // explicit padding, we can infer the kernel height
        kH = (PyGpuArray_DIMS(bottom)[2] + 2*padH - (PyGpuArray_DIMS(top)[2] - 1) * dH - 1) / dilH + 1 ;
    }
    if (%(width)s != -1) {
        kW = %(width)s;
    }
    else if (padW == -2) {
        kW = (2 - PyGpuArray_DIMS(bottom)[3] + (PyGpuArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
    }
    else {
        kW = (PyGpuArray_DIMS(bottom)[3] + 2*padW - (PyGpuArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
    }
    if (%(depth)s != -1) {
        kD = %(depth)s;
    }
    else if (padD == -2) {
        kD = (2 - PyGpuArray_DIMS(bottom)[4] + (PyGpuArray_DIMS(top)[4] - 1) * dD - 1) / dilD + 1;
    }
    else {
        kD = (PyGpuArray_DIMS(bottom)[4] + 2*padD - (PyGpuArray_DIMS(top)[4] - 1) * dD - 1) / dilD + 1;
    }
}
// Implicit dilated kernel size
dil_kH = (kH - 1) * dilH + 1;
dil_kW = (kW - 1) * dilW + 1;
dil_kD = (kD - 1) * dilD + 1;
// Auto-padding if requested
if (padH == -1) { // vertical half padding
    padH = dil_kH / 2;
}
else if (padH == -2) { // vertical full padding
    padH = dil_kH - 1;
}
else if (padH < 0) {
    PyErr_SetString(PyExc_ValueError, "BaseGpuCorr3dMM: padH must be >= -2");
    %(fail)s
}
if (padW == -1) { // horizontal half padding
    padW = dil_kW / 2;
}
else if (padW == -2) { // horizontal full padding
    padW = dil_kW - 1;
}
else if (padW < 0) {
    PyErr_SetString(PyExc_ValueError, "BaseGpuCorr3dMM: padW must be >= -2");
    %(fail)s
}
if (padD == -1) { // depth half padding
    padD = dil_kD / 2;
}
else if (padD == -2) { // depth full padding
    padD = dil_kD - 1;
}
else if (padD < 0) {
    PyErr_SetString(PyExc_ValueError, "BaseGpuCorr3dMM: padD must be >= -2");
    %(fail)s
}
// Infer output shape and type
// The inferred shape can be negative.
long long out_dim[5];
size_t out_dim_size[5];
int out_typecode;
PyGpuContextObject *out_context;
switch(direction) {
case 0:  // forward pass
    // output is top: (batchsize, num_filters, height, width, depth)
    // height, width and depth: top = (bottom + 2*pad - ((weight-1)*dil + 1)) / sample + 1
    out_dim[0] = PyGpuArray_DIMS(bottom)[0];
    out_dim[1] = PyGpuArray_DIMS(weights)[0];
    out_dim[2] = (PyGpuArray_DIMS(bottom)[2] + 2*padH - ((PyGpuArray_DIMS(weights)[2]-1)*dilH + 1)) / dH + 1;
    out_dim[3] = (PyGpuArray_DIMS(bottom)[3] + 2*padW - ((PyGpuArray_DIMS(weights)[3]-1)*dilW + 1)) / dW + 1;
    out_dim[4] = (PyGpuArray_DIMS(bottom)[4] + 2*padD - ((PyGpuArray_DIMS(weights)[4]-1)*dilD + 1)) / dD + 1;
    out_typecode = bottom->ga.typecode;
    out_context = bottom->context;
    if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0 || out_dim[4] <= 0)
    {
        PyErr_Format(PyExc_ValueError,
                     "GpuCorr3dMM: impossible output shape\\n"
                     "  bottom shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n"
                     "  weights shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n"
                     "  top shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n",
                     PyGpuArray_DIMS(bottom)[0], PyGpuArray_DIMS(bottom)[1],
                     PyGpuArray_DIMS(bottom)[2], PyGpuArray_DIMS(bottom)[3],
                     PyGpuArray_DIMS(bottom)[4],
                     PyGpuArray_DIMS(weights)[0], PyGpuArray_DIMS(weights)[1],
                     PyGpuArray_DIMS(weights)[2], PyGpuArray_DIMS(weights)[3],
                     PyGpuArray_DIMS(weights)[4],
                     out_dim[0], out_dim[1], out_dim[2], out_dim[3], out_dim[4]);
        %(fail)s
    }
    break;
case 1:  // backprop wrt. weights
    // output is weights: (num_filters, num_channels, height, width, depth)
    // height, width and depth: weights = (bottom + 2*pad - (top - 1) * sample - 1) / dil + 1
    out_dim[0] = PyGpuArray_DIMS(top)[1];
    out_dim[1] = PyGpuArray_DIMS(bottom)[1] / numgroups;
    out_dim[2] = kH;  // already inferred further above
    out_dim[3] = kW;  // how convenient
    out_dim[4] = kD;
    out_typecode = top->ga.typecode;
    out_context = top->context;
    if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0 || out_dim[4] <= 0)
    {
        PyErr_Format(PyExc_ValueError,
                     "GpuCorr3dMM backprop wrt. weights: impossible output shape\\n"
                     "  bottom shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n"
                     "  weights shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n"
                     "  top shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n",
                     PyGpuArray_DIMS(bottom)[0], PyGpuArray_DIMS(bottom)[1],
                     PyGpuArray_DIMS(bottom)[2], PyGpuArray_DIMS(bottom)[3],
                     PyGpuArray_DIMS(bottom)[4],
                     out_dim[0], out_dim[1], out_dim[2], out_dim[3], out_dim[4],
                     PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
                     PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3],
                     PyGpuArray_DIMS(top)[4]);
        %(fail)s
    }
    break;
case 2:  // backprop wrt. inputs
    // output is bottom: (batchsize, num_channels, height, width, depth)
    // height, width and depth: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
    out_dim[0] = PyGpuArray_DIMS(top)[0];
    out_dim[1] = PyGpuArray_DIMS(weights)[1] * numgroups;
    out_dim[2] = (%(height)s != -1) ? %(height)s : (PyGpuArray_DIMS(top)[2] - 1) * dH + (PyGpuArray_DIMS(weights)[2]-1)*dilH + 1 - 2*padH;
    out_dim[3] = (%(width)s != -1) ? %(width)s : (PyGpuArray_DIMS(top)[3] - 1) * dW + (PyGpuArray_DIMS(weights)[3]-1)*dilW + 1 - 2*padW;
    out_dim[4] = (%(depth)s != -1) ? %(depth)s : (PyGpuArray_DIMS(top)[4] - 1) * dD + (PyGpuArray_DIMS(weights)[4]-1)*dilD + 1 - 2*padD;
    out_typecode = top->ga.typecode;
    out_context = top->context;
    if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0 || out_dim[4] <= 0)
    {
        PyErr_Format(PyExc_ValueError,
                     "GpuCorr3dMM backprop wrt. inputs: impossible output shape\\n"
                     "  bottom shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n"
                     "  weights shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n"
                     "  top shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n",
                     out_dim[0], out_dim[1], out_dim[2], out_dim[3], out_dim[4],
                     PyGpuArray_DIMS(weights)[0], PyGpuArray_DIMS(weights)[1],
                     PyGpuArray_DIMS(weights)[2], PyGpuArray_DIMS(weights)[3],
                     PyGpuArray_DIMS(weights)[4],
                     PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
                     PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3],
                     PyGpuArray_DIMS(top)[4]);
        %(fail)s
    }
    break;
default:
    PyErr_SetString(PyExc_ValueError, "BaseGpuCorr3dMM: direction must be 0, 1, or 2\\n");
    %(fail)s
}
out_dim_size[0] = (size_t)out_dim[0];
out_dim_size[1] = (size_t)out_dim[1];
out_dim_size[2] = (size_t)out_dim[2];
out_dim_size[3] = (size_t)out_dim[3];
out_dim_size[4] = (size_t)out_dim[4];
// Prepare output array
if (aesara_prep_output(&%(out)s, 5, out_dim_size, out_typecode, GA_C_ORDER, out_context) != 0)
{
    PyErr_Format(PyExc_RuntimeError,
                 "BaseGpuCorr3dMM: Failed to allocate output of %%lld x %%lld x %%lld x %%lld x %%lld",
                 out_dim[0], out_dim[1], out_dim[2], out_dim[3], out_dim[4]);
    %(fail)s
}
if (!GpuArray_IS_C_CONTIGUOUS(&%(out)s->ga)) {
    PyErr_SetString(PyExc_ValueError, "Only contiguous outputs are supported.");
    %(fail)s
}
// Call GPU code
out2 = corr3dMM(%(bottom)s, %(weights)s, %(top)s, direction,
                dH, dW, dD, dilH, dilW, dilD, padH, padW, padD, numgroups);
if (out2==NULL){
    %(fail)s
}
assert (out2 == %(out)s);
"""
        % sub
    )
class GpuCorr3dMM(BaseGpuCorr3dMM, _NoPythonCOp):
    """
    GPU 3D correlation implementation using matrix multiplication (GEMM).

    Parameters
    ----------
    border_mode
        The width of a border of implicit zeros to pad the
        input with. Must be a tuple with 3 elements giving the width of
        the padding on each side, or a single integer to pad the same
        on all sides, or a string shortcut setting the padding at runtime:
        ``'valid'`` for ``(0, 0, 0)`` (valid convolution, no padding), ``'full'``
        for ``(kernel_rows - 1, kernel_columns - 1, kernel_depth - 1)``
        (full convolution), ``'half'`` for ``(kernel_rows // 2,
        kernel_columns // 2, kernel_depth // 2)`` (same convolution for
        odd-sized kernels). Note that the three widths are each
        applied twice, once per side (left and right, top and bottom, front
        and back).
    subsample
        The subsample operation applied to each output image. Should be a tuple
        with 3 elements. `(sv, sh, sl)` is equivalent to
        `GpuCorrMM(...)(...)[:,:,::sv, ::sh, ::sl]`, but faster.
        Set to `(1, 1, 1)` to disable subsampling.
    filter_dilation
        The filter dilation operation applied to each input image.
        Should be a tuple with 3 elements.
        Set to `(1, 1, 1)` to disable filter dilation.
    num_groups
        The number of distinct groups the image and kernel must be
        divided into; an int. Set to 1 to disable grouped convolution.

    Notes
    -----
    Currently, the Op requires the inputs, filters and outputs to be
    C-contiguous. Use :func:`gpu_contiguous
    <aesara.gpuarray.basic_ops.gpu_contiguous>` on these arguments
    if needed.

    You can either enable the Aesara flag `optimizer_including=conv_gemm`
    to automatically replace all convolution operations with `GpuCorr3dMM`
    or one of its gradients, or you can use it as a replacement for
    :func:`conv2d <aesara.tensor.nnet.conv.conv2d>`, called as
    `GpuCorr3dMM(subsample=...)(image, filters)`. The latter is currently
    faster, but note that it computes a correlation -- if you need to
    compute a convolution, flip the filters as `filters[:,:,::-1,::-1,::-1]`.
    """

    def __init__(
        self,
        border_mode="valid",
        subsample=(1, 1, 1),
        filter_dilation=(1, 1, 1),
        num_groups=1,
    ):
        super().__init__(border_mode, subsample, filter_dilation, num_groups)

    def make_node(self, img, kern):
        """Build the Apply node correlating `img` (5D) with `kern` (5D)."""
        ctx_name = infer_context_name(img, kern)
        img = as_gpuarray_variable(img, ctx_name)
        kern = as_gpuarray_variable(kern, ctx_name)
        if img.type.ndim != 5:
            raise TypeError("img must be 5D tensor")
        if kern.type.ndim != 5:
            raise TypeError("kern must be 5D tensor")
        # Batch and filter axes keep their broadcast flags; the three
        # spatial output axes are never broadcastable.
        broadcastable = [
            img.type.broadcastable[0],
            kern.type.broadcastable[0],
        ] + [False] * 3
        out_type = GpuArrayType(
            dtype=img.dtype, context_name=ctx_name, broadcastable=broadcastable
        )
        return Apply(self, [img, kern], [out_type()])

    def c_code(self, node, nodename, inp, out_, sub):
        bottom, weights = inp
        (top,) = out_
        return super().c_code_helper(bottom, weights, top, "forward", sub)

    def grad(self, inp, grads):
        bottom, weights = inp
        (top,) = grads
        top = gpu_contiguous(top)
        op_args = (
            self.border_mode,
            self.subsample,
            self.filter_dilation,
            self.num_groups,
        )
        d_bottom = GpuCorr3dMM_gradInputs(*op_args)(weights, top, bottom.shape[-3:])
        d_weights = GpuCorr3dMM_gradWeights(*op_args)(bottom, top, weights.shape[-3:])
        return d_bottom, d_weights
class GpuCorr3dMM_gradWeights(BaseGpuCorr3dMM, _NoPythonCOp):
    """
    Gradient wrt. filters for `GpuCorr3dMM`.

    Notes
    -----
    You will not want to use this directly, but rely on Aesara's automatic
    differentiation or graph optimization to use it as needed.
    """

    def __init__(
        self,
        border_mode="valid",
        subsample=(1, 1, 1),
        filter_dilation=(1, 1, 1),
        num_groups=1,
    ):
        super().__init__(border_mode, subsample, filter_dilation, num_groups)

    def make_node(self, img, topgrad, shape=None):
        """Build the Apply node computing the filter gradient from `img` and `topgrad` (both 5D)."""
        ctx_name = infer_context_name(img, topgrad)
        img = as_gpuarray_variable(img, ctx_name)
        topgrad = as_gpuarray_variable(topgrad, ctx_name)
        if img.type.ndim != 5:
            raise TypeError("img must be 5D tensor")
        if topgrad.type.ndim != 5:
            raise TypeError("topgrad must be 5D tensor")
        if shape is None:
            # Without subsampling (and not "half" padding), the kernel shape
            # can be inferred in the C code, so no shape inputs are needed.
            if self.subsample != (1, 1, 1) or self.border_mode == "half":
                raise ValueError(
                    "shape must be given if subsample != (1, 1, 1)"
                    ' or border_mode == "half"'
                )
            height_width_depth = []
        else:
            # Three scalar (0-d) variables giving the filter's spatial extents.
            height_width_depth = [shape[0], shape[1], shape[2]]
            assert shape[0].ndim == 0
            assert shape[1].ndim == 0
            assert shape[2].ndim == 0
        # Output is the filters: (num_filters, num_channels, kH, kW, kD).
        broadcastable = [
            topgrad.type.broadcastable[1],
            img.type.broadcastable[1],
            False,
            False,
            False,
        ]
        return Apply(
            self,
            [img, topgrad] + height_width_depth,
            [
                GpuArrayType(
                    dtype=img.dtype, context_name=ctx_name, broadcastable=broadcastable
                )()
            ],
        )

    def c_code(self, node, nodename, inp, out_, sub):
        bottom, top = inp[:2]
        # inp[2:] is either empty (shape not given) or the three shape scalars.
        height, width, depth = inp[2:] or (None, None, None)
        (weights,) = out_
        direction = "backprop weights"
        return super().c_code_helper(
            bottom, weights, top, direction, sub, height, width, depth
        )

    def grad(self, inp, grads):
        bottom, top = inp[:2]
        (weights,) = grads
        weights = gpu_contiguous(weights)
        d_bottom = GpuCorr3dMM_gradInputs(
            self.border_mode, self.subsample, self.filter_dilation, self.num_groups
        )(weights, top, bottom.shape[-3:])
        d_top = GpuCorr3dMM(
            self.border_mode, self.subsample, self.filter_dilation, self.num_groups
        )(bottom, weights)
        # The optional shape inputs carry no gradient.
        d_height_width_depth = (
            (aesara.gradient.DisconnectedType()(),) * 3 if len(inp) == 5 else ()
        )
        return (d_bottom, d_top) + d_height_width_depth

    def connection_pattern(self, node):
        if node.nin == 2:
            return [[1], [1]]
        else:
            return [[1], [1], [0], [0], [0]]  # no connection to height, width, depth
class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM, _NoPythonCOp):
    """
    Gradient wrt. inputs for `GpuCorr3dMM`.

    Notes
    -----
    You will not want to use this directly, but rely on Aesara's automatic
    differentiation or graph optimization to use it as needed.
    """

    def __init__(
        self,
        border_mode="valid",
        subsample=(1, 1, 1),
        filter_dilation=(1, 1, 1),
        num_groups=1,
    ):
        super().__init__(border_mode, subsample, filter_dilation, num_groups)

    def make_node(self, kern, topgrad, shape=None):
        """Build the Apply node computing the image gradient from `kern` and `topgrad` (both 5D)."""
        ctx_name = infer_context_name(kern, topgrad)
        kern = as_gpuarray_variable(kern, ctx_name)
        topgrad = as_gpuarray_variable(topgrad, ctx_name)
        if kern.type.ndim != 5:
            raise TypeError("kern must be 5D tensor")
        if topgrad.type.ndim != 5:
            raise TypeError("topgrad must be 5D tensor")
        if shape is None:
            # Without subsampling, the input's spatial shape can be
            # inferred in the C code, so no shape inputs are needed.
            if self.subsample != (1, 1, 1):
                raise ValueError("shape must be given if subsample != (1, 1, 1)")
            height_width_depth = []
        else:
            # Three scalar (0-d) variables giving the input image's spatial extents.
            height_width_depth = [shape[0], shape[1], shape[2]]
            assert shape[0].ndim == 0
            assert shape[1].ndim == 0
            assert shape[2].ndim == 0
        if self.num_groups > 1:
            # With grouped convolution the output channel count is kern's
            # channel dim times num_groups, so its broadcast flag cannot be
            # reused.
            broadcastable = [topgrad.type.broadcastable[0], False, False, False, False]
        else:
            broadcastable = [
                topgrad.type.broadcastable[0],
                kern.type.broadcastable[-4],
                False,
                False,
                False,
            ]
        return Apply(
            self,
            [kern, topgrad] + height_width_depth,
            [
                GpuArrayType(
                    dtype=topgrad.dtype,
                    context_name=ctx_name,
                    broadcastable=broadcastable,
                )()
            ],
        )

    def c_code(self, node, nodename, inp, out_, sub):
        weights, top = inp[:2]
        # inp[2:] is either empty (shape not given) or the three shape scalars.
        height, width, depth = inp[2:] or (None, None, None)
        (bottom,) = out_
        direction = "backprop inputs"
        return super().c_code_helper(
            bottom, weights, top, direction, sub, height, width, depth
        )

    def grad(self, inp, grads):
        weights, top = inp[:2]
        (bottom,) = grads
        bottom = gpu_contiguous(bottom)
        d_weights = GpuCorr3dMM_gradWeights(
            self.border_mode, self.subsample, self.filter_dilation, self.num_groups
        )(bottom, top, weights.shape[-3:])
        d_top = GpuCorr3dMM(
            self.border_mode, self.subsample, self.filter_dilation, self.num_groups
        )(bottom, weights)
        # The optional shape inputs carry no gradient.
        d_height_width_depth = (
            (aesara.gradient.DisconnectedType()(),) * 3 if len(inp) == 5 else ()
        )
        return (d_weights, d_top) + d_height_width_depth

    def connection_pattern(self, node):
        if node.nin == 2:
            return [[1], [1]]
        else:
            return [[1], [1], [0], [0], [0]]  # no connection to height, width, depth
@inplace_allocempty(GpuGemv, 0)
def local_inplace_gpuagemv(node, inputs):
    """Local rewrite: replace a ``GpuGemv`` node with its in-place variant."""
    replacement = gpugemv_inplace(*inputs)
    return [replacement]
@inplace_allocempty(GpuGemm, 0)
def local_inplace_gpuagemm(node, inputs):
    """Local rewrite: replace a ``GpuGemm`` node with its in-place variant."""
    replacement = gpugemm_inplace(*inputs)
    return [replacement]
@inplace_allocempty(GpuGer, 0)
def local_inplace_gpuager(node, inputs):
    """Local rewrite: replace a ``GpuGer`` node with its in-place variant."""
    replacement = gpuger_inplace(*inputs)
    return [replacement]
@inplace_allocempty(GpuGemmBatch, 0)
def local_inplace_gpuagemmbatch(node, inputs):
    """Local rewrite: replace a ``GpuGemmBatch`` node with its in-place variant."""
    replacement = gpugemmbatch_inplace(*inputs)
    return [replacement]
# Bundle the four in-place BLAS rewrites above into a single in2out pass.
gpuablas_opt_inplace = in2out(
    LocalOptGroup(
        local_inplace_gpuagemv,
        local_inplace_gpuagemm,
        local_inplace_gpuager,
        local_inplace_gpuagemmbatch,
    ),
    name="gpuablas_opt_inplace",
)
# Register the pass with the optimizer database under the usual tags;
# position 70.0 controls where it runs relative to other passes.
optdb.register(
    "InplaceGpuaBlasOpt",
    gpuablas_opt_inplace,
    "fast_run",
    "inplace",
    "gpuarray",
    position=70.0,
)
import logging
import numpy as np
from aesara import tensor as at
from aesara.gpuarray.basic_ops import (
as_gpuarray_variable,
gpuarray_helper_inc_dir,
infer_context_name,
)
from aesara.gpuarray.type import gpu_context_type
from aesara.gradient import grad_undefined
from aesara.graph.basic import Apply
from aesara.link.c.op import _NoPythonExternalCOp
from aesara.link.c.params_type import ParamsType
from aesara.scalar import bool as bool_t
from aesara.tensor import as_tensor_variable
from aesara.tensor.type import discrete_dtypes
_logger = logging.getLogger("aesara.gpuarray.blocksparse")
class GpuSparseBlockGemv(_NoPythonExternalCOp):
    """
    GPU version of SparseBlockGemv. Check SparseBlockGemv's docstring for more
    information.

    This should not be directly called since the interface is subject
    to change without notice. Use the sandbox.blocksparse.sparse_block_dot()
    function for a stable interface.
    """

    __props__ = ("inplace",)
    params_type = ParamsType(inplace=bool_t, context=gpu_context_type)

    # NB: DTYPE_INPUT_* is used in C code, so I think we should not set check_input to False.
    def __init__(self, inplace=False):
        """Wrap the external C implementation; `inplace` reuses the first input's storage."""
        super().__init__("c_code/blockgemv.c", "APPLY_SPECIFIC(blockgemv)")
        self.inplace = inplace
        if inplace:
            # Output 0 destroys input 0.
            self.destroy_map = {0: [0]}

    def get_params(self, node):
        # The GPU context is taken from the node's first (GPU) input.
        ctx = node.inputs[0].type.context
        return self.params_type.get_params(self, context=ctx)

    def c_header_dirs(self, **kwargs):
        return [gpuarray_helper_inc_dir()]

    def c_headers(self, **kwargs):
        return [
            "<gpuarray/buffer_blas.h>",
            "<gpuarray/buffer.h>",
            "<gpuarray_helper.h>",
        ]

    def make_node(self, o, W, h, inputIdx, outputIdx):
        ctx = infer_context_name(o, W, h)
        o = as_gpuarray_variable(o, ctx)
        W = as_gpuarray_variable(W, ctx)
        h = as_gpuarray_variable(h, ctx)
        inputIdx = as_tensor_variable(inputIdx)
        outputIdx = as_tensor_variable(outputIdx)
        # Expected ranks: o and h are 3D, W is 4D, and the two index
        # arrays are 2D integer matrices.
        assert o.ndim == 3
        assert W.ndim == 4
        assert h.ndim == 3
        assert inputIdx.ndim == 2
        assert outputIdx.ndim == 2
        assert inputIdx.type.dtype in discrete_dtypes
        assert outputIdx.type.dtype in discrete_dtypes
        return Apply(self, [o, W, h, inputIdx, outputIdx], [o.type()])

    def infer_shape(self, fgraph, node, input_shapes):
        # Output has the same shape as the first input `o`.
        return [input_shapes[0]]

    def grad(self, inputs, grads):
        o, W, h, inputIdx, outputIdx = inputs
        go = grads[0]
        Wgrad = gpu_sparse_block_outer(W.zeros_like(), h, go, inputIdx, outputIdx)
        hgrad = gpu_sparse_block_gemv(
            h.zeros_like(), W.dimshuffle((1, 0, 3, 2)), go, outputIdx, inputIdx
        )
        # The index inputs are integer-valued, so their gradient is undefined.
        idx_grads = [
            grad_undefined(self, 3, inputIdx, "grad of inputIdx makes no sense"),
            grad_undefined(self, 4, outputIdx, "grad of outputIdx makes no sense"),
        ]
        return [go, Wgrad, hgrad] + idx_grads
# Module-level singletons: the non-destructive op for general use and the
# in-place variant (substituted by the in-place optimizations).
gpu_sparse_block_gemv = GpuSparseBlockGemv(False)
gpu_sparse_block_gemv_inplace = GpuSparseBlockGemv(True)
class GpuSparseBlockOuter(_NoPythonExternalCOp):
    """
    GPU version of SparseBlockOuter. See SparseBlockOuter's docstring for more
    information.

    This op should not be called directly since its interface is
    subject to change without notice. It is involved in the gradient
    of GpuSparseBlockGemv. The gradient is not implemented.
    """

    __props__ = ("inplace",)
    params_type = ParamsType(inplace=bool_t, context=gpu_context_type)

    def __init__(self, inplace=False):
        """Wrap the external C implementation; `inplace` reuses the first input's storage."""
        super().__init__(["c_code/blockger.c"], "APPLY_SPECIFIC(blockger)")
        self.inplace = inplace
        if self.inplace:
            # Output 0 destroys input 0.
            self.destroy_map = {0: [0]}

    def get_params(self, node):
        # The GPU context is taken from the node's first (GPU) input.
        return self.params_type.get_params(self, context=node.inputs[0].type.context)

    def make_node(self, o, x, y, xIdx, yIdx, alpha=None):
        """Build the Apply node; `alpha` defaults to a float32 constant 1.0."""
        ctx = infer_context_name(o, x, y)
        one = at.constant(np.asarray(1.0, dtype="float32"))
        o = as_gpuarray_variable(o, ctx)
        x = as_gpuarray_variable(x, ctx)
        y = as_gpuarray_variable(y, ctx)
        xIdx = as_tensor_variable(xIdx)
        yIdx = as_tensor_variable(yIdx)
        if alpha is None:
            alpha = one
        return Apply(self, [o, x, y, xIdx, yIdx, alpha], [o.type()])

    def infer_shape(self, fgraph, node, input_shapes):
        # Output has the same shape as the first input `o`.
        return [input_shapes[0]]

    def c_header_dirs(self, **kwargs):
        return [gpuarray_helper_inc_dir()]

    def c_headers(self, **kwargs):
        return [
            "<gpuarray/buffer_blas.h>",
            "<gpuarray/buffer.h>",
            "<gpuarray_helper.h>",
        ]
# Module-level singletons: the non-destructive op for general use and the
# in-place variant (substituted by the in-place optimizations).
gpu_sparse_block_outer = GpuSparseBlockOuter(False)
gpu_sparse_block_outer_inplace = GpuSparseBlockOuter(True)
#section support_code_apply
/* Batched block-sparse gemv.
 *
 * Accumulates, for every batch element b and output block j (alpha = beta = 1
 * in the gemvBatch calls below):
 *   out[b, j] = o[b, j] + sum_i W[inputIdx[b, i], outputIdx[b, j]] * h[b, i]
 * by building flat pointer/offset lists for every (i, j, b) combination and
 * issuing a single batched BLAS call.
 *
 * Returns 0 on success, -1 (with a Python exception set) on failure.
 */
int APPLY_SPECIFIC(blockgemv)(PyGpuArrayObject *o, PyGpuArrayObject *W,
                              PyGpuArrayObject *h, PyArrayObject *inputIdx,
                              PyArrayObject *outputIdx,
                              PyGpuArrayObject **_out,
                              PARAMS_TYPE* params) {
  PyGpuArrayObject *out = *_out;
  if (params->inplace) {
    /* Accumulate directly into o's storage. */
    Py_XDECREF(out);
    out = o;
    Py_INCREF(out);
  } else {
    /* Work on a copy so o itself is left untouched. */
    out = aesara_try_copy(out, o);
    if (out == NULL) {
      // Error already set
      return -1;
    }
  }
  gpudata **W_list = NULL;
  gpudata **inp_list = NULL;
  gpudata **out_list = NULL;
  size_t *offW = NULL;
  size_t *offInp = NULL;
  size_t *offOut = NULL;
  int err;
  err = gpublas_setup(params->context->ctx);
  if (err != GA_NO_ERROR) {
    PyErr_SetString(PyExc_RuntimeError, "Can't setup blas");
    return -1;
  }
  /* Prepare lists for the batch */
  /* One gemv per (input block i, output block j, batch b) triple. */
  size_t maxi = PyGpuArray_DIMS(h)[1];
  size_t maxj = PyGpuArray_DIMS(out)[1];
  size_t maxb = PyGpuArray_DIMS(out)[0];
  ssize_t h_str_0 = PyGpuArray_STRIDES(h)[0];
  ssize_t h_str_1 = PyGpuArray_STRIDES(h)[1];
  ssize_t o_str_0 = PyGpuArray_STRIDES(out)[0];
  ssize_t o_str_1 = PyGpuArray_STRIDES(out)[1];
  ssize_t W_str_0 = PyGpuArray_STRIDES(W)[0];
  ssize_t W_str_1 = PyGpuArray_STRIDES(W)[1];
  W_list = (gpudata **)calloc(sizeof(gpudata *), maxi * maxj * maxb);
  offW = (size_t *)calloc(sizeof(size_t), maxi * maxj * maxb);
  inp_list = (gpudata **)calloc(sizeof(gpudata *), maxi * maxj * maxb);
  offInp = (size_t *)calloc(sizeof(size_t), maxi * maxj * maxb);
  out_list = (gpudata **)calloc(sizeof(gpudata *), maxi * maxj * maxb);
  offOut = (size_t *)calloc(sizeof(size_t), maxi * maxj * maxb);
  if (W_list == NULL || offW == NULL ||
      inp_list == NULL || offInp == NULL ||
      out_list == NULL || offOut == NULL) {
    /* free(NULL) is a no-op, so unconditional frees are safe here. */
    free(W_list);
    free(offW);
    free(inp_list);
    free(offInp);
    free(out_list);
    free(offOut);
    PyErr_NoMemory();
    return -1;
  }
  /* Fill the per-gemv buffer/offset lists. Offsets combine the arrays'
     base offsets with their strides; W's block is selected through the
     per-(batch, block) entries of inputIdx/outputIdx. */
  for (size_t i = 0; i < maxi; i++) {
    for (size_t j = 0; j < maxj; j++) {
      for (size_t b = 0; b < maxb; b++) {
        size_t p = i + j * maxi + b * maxi * maxj;
        inp_list[p] = h->ga.data;
        offInp[p] = b * h_str_0 + i * h_str_1 + h->ga.offset;
        out_list[p] = out->ga.data;
        offOut[p] = b * o_str_0 + j * o_str_1 + out->ga.offset;
        W_list[p] = W->ga.data;
        offW[p] = *(DTYPE_INPUT_3 *)PyArray_GETPTR2(inputIdx, b, i) * W_str_0 +
                  *(DTYPE_INPUT_4 *)PyArray_GETPTR2(outputIdx, b, j) * W_str_1 +
                  W->ga.offset;
      }
    }
  }
  /* Pick the leading dimension from whichever of W's last two axes is
     non-unit; the transpose flag compensates for the layout. */
  cb_transpose transA = cb_no_trans;
  size_t lda = PyGpuArray_STRIDES(W)[2] / gpuarray_get_elsize(W->ga.typecode);
  if (lda == 1) {
    transA = cb_trans;
    lda = PyGpuArray_STRIDES(W)[3] / gpuarray_get_elsize(W->ga.typecode);
  }
  if (out->ga.typecode == GA_FLOAT) {
    err = gpublas_sgemvBatch(cb_fortran, transA,
                             PyGpuArray_DIMS(out)[2],
                             PyGpuArray_DIMS(h)[2], 1,
                             W_list, offW, lda,
                             inp_list, offInp, PyGpuArray_STRIDES(h)[2] / gpuarray_get_elsize(h->ga.typecode),
                             1, out_list, offOut, PyGpuArray_STRIDES(out)[2] / gpuarray_get_elsize(out->ga.typecode),
                             PyGpuArray_DIMS(out)[1] * PyGpuArray_DIMS(h)[1] * PyGpuArray_DIMS(out)[0], 0);
  } else if (out->ga.typecode == GA_DOUBLE) {
    err = gpublas_dgemvBatch(cb_fortran, transA,
                             PyGpuArray_DIMS(out)[2],
                             PyGpuArray_DIMS(h)[2], 1,
                             W_list, offW, lda,
                             inp_list, offInp, PyGpuArray_STRIDES(h)[2] / gpuarray_get_elsize(h->ga.typecode),
                             1, out_list, offOut, PyGpuArray_STRIDES(out)[2] / gpuarray_get_elsize(out->ga.typecode),
                             PyGpuArray_DIMS(out)[1] * PyGpuArray_DIMS(h)[1] * PyGpuArray_DIMS(out)[0], 0);
  } else if (out->ga.typecode == GA_HALF) {
    /* NOTE(review): half-precision data is dispatched to the float32
       sgemvBatch here, while blockger below uses hgerBatch for GA_HALF —
       confirm this asymmetry is intentional. */
    err = gpublas_sgemvBatch(cb_fortran, transA,
                             PyGpuArray_DIMS(out)[2],
                             PyGpuArray_DIMS(h)[2], 1,
                             W_list, offW, lda,
                             inp_list, offInp, PyGpuArray_STRIDES(h)[2] / gpuarray_get_elsize(h->ga.typecode),
                             1, out_list, offOut, PyGpuArray_STRIDES(out)[2] / gpuarray_get_elsize(out->ga.typecode),
                             PyGpuArray_DIMS(out)[1] * PyGpuArray_DIMS(h)[1] * PyGpuArray_DIMS(out)[0], 0);
  } else {
    err = GA_INVALID_ERROR;
  }
  free(W_list);
  free(offW);
  free(inp_list);
  free(offInp);
  free(out_list);
  free(offOut);
  if (err != GA_NO_ERROR) {
    PyErr_SetString(PyExc_RuntimeError, "gemvBatch failed");
    return -1;
  }
  *_out = out;
  return 0;
}
#section support_code_apply
/* Batched block-sparse rank-1 update (ger).
 *
 * For every batch element b and every (i, j) block pair, accumulates the
 * scaled outer product of x[b, i] and y[b, j] into the output block selected
 * by xIdx[b, i] / yIdx[b, j], via a single batched BLAS gerBatch call.
 * The output starts as o (or as o's storage itself when inplace).
 *
 * Returns 0 on success, -1 (with a Python exception set) on failure.
 */
int APPLY_SPECIFIC(blockger)(PyGpuArrayObject *o, PyGpuArrayObject *x,
                             PyGpuArrayObject *y, PyArrayObject *xIdx,
                             PyArrayObject *yIdx, PyArrayObject *alpha,
                             PyGpuArrayObject **_out,
                             PARAMS_TYPE* params) {
  PyGpuArrayObject *out = *_out;
  gpudata **o_list = NULL;
  gpudata **x_list = NULL;
  gpudata **y_list = NULL;
  size_t *offOut = NULL;
  size_t *offX = NULL;
  size_t *offY = NULL;
  int err;
  err = gpublas_setup(params->context->ctx);
  if (err != GA_NO_ERROR) {
    PyErr_SetString(PyExc_RuntimeError, "Can't setup blas");
    return -1;
  }
  if (params->inplace) {
    /* Accumulate directly into o's storage. */
    Py_XDECREF(out);
    out = o;
    Py_INCREF(out);
  } else {
    /* Work on a copy so o itself is left untouched. */
    out = aesara_try_copy(out, o);
    if (out == NULL)
      return -1;
  }
  /* One ger per (input block i, output block j, batch b) triple. */
  size_t maxi = PyGpuArray_DIMS(x)[1];
  size_t maxj = PyGpuArray_DIMS(y)[1];
  size_t maxb = PyGpuArray_DIMS(x)[0];
  ssize_t x_str_0 = PyGpuArray_STRIDES(x)[0];
  ssize_t x_str_1 = PyGpuArray_STRIDES(x)[1];
  ssize_t y_str_0 = PyGpuArray_STRIDES(y)[0];
  ssize_t y_str_1 = PyGpuArray_STRIDES(y)[1];
  ssize_t o_str_0 = PyGpuArray_STRIDES(out)[0];
  ssize_t o_str_1 = PyGpuArray_STRIDES(out)[1];
  o_list = (gpudata **)calloc(sizeof(gpudata *), maxi * maxj * maxb);
  offOut = (size_t *)calloc(sizeof(size_t), maxi * maxj * maxb);
  x_list = (gpudata **)calloc(sizeof(gpudata *), maxi * maxj * maxb);
  offX = (size_t *)calloc(sizeof(size_t), maxi * maxj * maxb);
  y_list = (gpudata **)calloc(sizeof(gpudata *), maxi * maxj * maxb);
  offY = (size_t *)calloc(sizeof(size_t), maxi * maxj * maxb);
  if (o_list == NULL || offOut == NULL ||
      x_list == NULL || offX == NULL ||
      y_list == NULL || offY == NULL) {
    /* free(NULL) is a no-op, so unconditional frees are safe here. */
    free(o_list);
    free(offOut);
    free(x_list);
    free(offX);
    free(y_list);
    free(offY);
    PyErr_NoMemory();
    return -1;
  }
  /* Fill the per-ger buffer/offset lists; the output block is selected
     through the per-(batch, block) entries of xIdx/yIdx. */
  for (size_t i = 0; i < maxi; i++) {
    for (size_t j = 0; j < maxj; j++) {
      for (size_t b = 0; b < maxb; b++) {
        size_t p = i + j * maxi + b * maxi * maxj;
        x_list[p] = x->ga.data;
        offX[p] = b * x_str_0 + i * x_str_1 + x->ga.offset;
        y_list[p] = y->ga.data;
        offY[p] = b * y_str_0 + j * y_str_1 + y->ga.offset;
        o_list[p] = out->ga.data;
        offOut[p] = *(DTYPE_INPUT_3 *)PyArray_GETPTR2(xIdx, b, i) * o_str_0 + *(DTYPE_INPUT_4 *)PyArray_GETPTR2(yIdx, b, j) * o_str_1 + out->ga.offset;
      }
    }
  }
  /* Element (not byte) strides along the innermost axis for BLAS. */
  ssize_t str_y = PyGpuArray_STRIDES(y)[2] / gpuarray_get_elsize(y->ga.typecode);
  ssize_t str_x = PyGpuArray_STRIDES(x)[2] / gpuarray_get_elsize(x->ga.typecode);
  ssize_t str_out = PyGpuArray_STRIDES(out)[2] / gpuarray_get_elsize(out->ga.typecode);
  if (out->ga.typecode == GA_FLOAT) {
    err = gpublas_sgerBatch(cb_fortran,
                            PyGpuArray_DIMS(y)[2], PyGpuArray_DIMS(x)[2],
                            *(float *)PyArray_GETPTR1(alpha, 0),
                            y_list, offY, str_y, x_list, offX, str_x,
                            o_list, offOut, str_out,
                            PyGpuArray_DIMS(x)[0] * PyGpuArray_DIMS(x)[1] * PyGpuArray_DIMS(y)[1], 0);
  } else if (out->ga.typecode == GA_DOUBLE) {
    err = gpublas_dgerBatch(cb_fortran,
                            PyGpuArray_DIMS(y)[2], PyGpuArray_DIMS(x)[2],
                            *(double *)PyArray_GETPTR1(alpha, 0),
                            y_list, offY, str_y, x_list, offX, str_x,
                            o_list, offOut, str_out,
                            PyGpuArray_DIMS(x)[0] * PyGpuArray_DIMS(x)[1] * PyGpuArray_DIMS(y)[1], 0);
  } else if (out->ga.typecode == GA_HALF) {
    /* Half-precision variant; alpha is still read as float32. */
    err = gpublas_hgerBatch(cb_fortran,
                            PyGpuArray_DIMS(y)[2], PyGpuArray_DIMS(x)[2],
                            *(float *)PyArray_GETPTR1(alpha, 0),
                            y_list, offY, str_y, x_list, offX, str_x,
                            o_list, offOut, str_out,
                            PyGpuArray_DIMS(x)[0] * PyGpuArray_DIMS(x)[1] * PyGpuArray_DIMS(y)[1], 0);
  } else {
    err = GA_INVALID_ERROR;
  }
  free(o_list);
  free(offOut);
  free(x_list);
  free(offX);
  free(y_list);
  free(offY);
  if (err != GA_NO_ERROR) {
    PyErr_SetString(PyExc_RuntimeError, "gerBatch failed");
    return -1;
  }
  *_out = out;
  return 0;
}
#section support_code_apply
/* Set the group count on a cuDNN convolution descriptor.
 *
 * Grouped convolution requires cuDNN 7+; on older versions this is a no-op
 * (the compile-time guard below drops the call entirely).
 * Returns 0 on success, -1 with a Python exception set on cuDNN error. */
static int c_set_groups_for_conv(cudnnConvolutionDescriptor_t desc, int groups) {
#if CUDNN_MAJOR >= 7
  cudnnStatus_t err = cudnnSetConvolutionGroupCount(desc, groups);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "error setting groups for convolution : %s",
                 cudnnGetErrorString(err));
    return -1;
  }
#endif
  return 0;
}
/* Build a cuDNN Nd convolution descriptor from the filter shape and the
 * op parameters (padding, strides, dilation, border mode, groups).
 *
 * filt_shp: 1-D npy_int64 array holding the full filter shape; entries
 *           2..(nb_dims+1) are the spatial kernel sizes.
 * desc:     output; on success holds a newly created descriptor owned by
 *           the caller.  On failure -1 is returned with a Python exception
 *           set and nothing is leaked.
 */
int APPLY_SPECIFIC(conv_desc)(PyArrayObject *filt_shp,
                              cudnnConvolutionDescriptor_t *desc,
                              PARAMS_TYPE* params) {
  cudnnStatus_t err;
  int pad[3] = {params->pad0, params->pad1, params->pad2};
  int strides[3] = {params->sub0, params->sub1, params->sub2};
  int dilation[3] = {params->dil0, params->dil1, params->dil2};

  /* BUG FIX: validate the filter-shape length BEFORE dereferencing its
   * entries — the border-mode branches below read indices 2..4 of
   * filt_shp, which were out-of-bounds reads for a malformed shape when
   * this check came last.  Also report consistent values (both sides of
   * the comparison are total dimension counts). */
  if (PyArray_DIM(filt_shp, 0) - 2 != params->nb_dims) {
    PyErr_Format(PyExc_ValueError, "Filter shape has the wrong number of "
                 "dimensions: expected %d, got %lld.", params->nb_dims + 2,
                 (long long)PyArray_DIM(filt_shp, 0));
    return -1;
  }

  if (params->bmode == BORDER_MODE_FULL) {
    /* "full" border: pad by the dilated kernel extent minus one. */
    pad[0] = (*(npy_int64 *)PyArray_GETPTR1(filt_shp, 2) - 1) * dilation[0];
    pad[1] = (*(npy_int64 *)PyArray_GETPTR1(filt_shp, 3) - 1) * dilation[1];
    if (params->nb_dims > 2) {
      pad[2] = (*(npy_int64 *)PyArray_GETPTR1(filt_shp, 4) - 1) * dilation[2];
    }
  } else if (params->bmode == BORDER_MODE_HALF) {
    /* "half" border: pad by half the dilated kernel extent. */
    pad[0] = ((*(npy_int64 *)PyArray_GETPTR1(filt_shp, 2) - 1) * dilation[0] + 1) / 2;
    pad[1] = ((*(npy_int64 *)PyArray_GETPTR1(filt_shp, 3) - 1) * dilation[1] + 1) / 2;
    if (params->nb_dims > 2) {
      pad[2] = ((*(npy_int64 *)PyArray_GETPTR1(filt_shp, 4) - 1) * dilation[2] + 1) / 2;
    }
  }

  err = cudnnCreateConvolutionDescriptor(desc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_MemoryError, "could not allocate convolution "
                 "descriptor: %s", cudnnGetErrorString(err));
    return -1;
  }
  err = cudnnSetConvolutionNdDescriptor(*desc, params->nb_dims, pad, strides,
                                        dilation, params->conv_mode, params->precision);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_MemoryError, "could not set convolution "
                 "descriptor: %s", cudnnGetErrorString(err));
    /* BUG FIX: do not leak the descriptor we just created. */
    cudnnDestroyConvolutionDescriptor(*desc);
    return -1;
  }
  if (c_set_groups_for_conv(*desc, params->num_groups) == -1) {
    /* BUG FIX: same — release the descriptor on failure. */
    cudnnDestroyConvolutionDescriptor(*desc);
    return -1;
  }
  return 0;
}
#section kernels
#kernel dilated_im3d2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
// This uses a lot of code from Caffe (http://caffe.berkeleyvision.org/);
// sources are clearly marked. Below we reproduce the original license of
// the Caffe software.
/*
Copyright (c) 2014, The Regents of the University of California (Regents)
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/util/im2col.cu)
// Kernels for fast unfold + copy
// GPU kernel for the case of dilation
/* im2col for 3-D inputs with dilation: one grid-stride work item per
 * element of the (channels, height_col, width_col, depth_col) grid; each
 * item copies the full set of kernel taps for its output position into
 * data_col, zero-filling taps that land in the padding. */
KERNEL void dilated_im3d2col_kernel(const ga_size n,
                                    GLOBAL_MEM const DTYPE_INPUT_0 * data_im,
                                    const ga_size offset_im,
                                    const ga_size data_im_offset,
                                    // offset_im is the pointer offset for data_im.
                                    // data_im_offset is an offset of elements in the array
                                    const ga_size height, const ga_size width, const ga_size depth,
                                    const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_d,
                                    const ga_size dilation_h, const ga_size dilation_w, const ga_size dilation_d,
                                    const ga_size pad_h, const ga_size pad_w, const ga_size pad_d,
                                    const ga_size stride_h, const ga_size stride_w, const ga_size stride_d,
                                    const ga_size height_col, const ga_size width_col, const ga_size depth_col,
                                    GLOBAL_MEM DTYPE_INPUT_0 * data_col,
                                    const ga_size offset_col) {
  /* Apply the raw byte offsets to both buffer pointers. */
  data_im = (GLOBAL_MEM DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)data_im) + offset_im);
  data_col = (GLOBAL_MEM DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)data_col) + offset_col);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < (n); index += LDIM_0 * GDIM_0) {
    /* Decompose the flat index:
     * index = ((c_im * height_col + h_col) * width_col + w_col) * depth_col + d_col */
    const ga_size w_index = index / depth_col;
    const ga_size h_index = w_index / width_col;
    const ga_size d_col = index % depth_col;
    const ga_size h_col = h_index % height_col;
    const ga_size w_col = w_index % width_col;
    const ga_size c_im = h_index / height_col;
    /* First col-row written for this channel's kernel taps. */
    const ga_size c_col = c_im * kernel_h * kernel_w * kernel_d;
    /* Top-left-front corner of the receptive field (may fall in padding). */
    const ga_size h_offset = h_col * stride_h - pad_h;
    const ga_size w_offset = w_col * stride_w - pad_w;
    const ga_size d_offset = d_col * stride_d - pad_d;
    GLOBAL_MEM DTYPE_INPUT_0 * data_col_ptr = data_col;
    data_col_ptr += c_col * (height_col * width_col * depth_col) +
                    h_col * (width_col * depth_col) + w_col * depth_col + d_col;
    GLOBAL_MEM const DTYPE_INPUT_0 * data_im_ptr = data_im + data_im_offset;
    data_im_ptr += c_im * (height * width * depth) +
                   h_offset * (width * depth) + w_offset * depth + d_offset;
    /* Walk the kernel taps; taps in the padding write 0.
     * NOTE(review): ga_size looks like an unsigned type, which would make
     * the `>= 0` tests below vacuous — left-side out-of-range coordinates
     * would instead wrap around and fail the `< height/width/depth`
     * tests.  Confirm against the gpuarray typedef. */
    for (ga_size i = 0; i < kernel_h; ++i) {
      ga_size h_im = h_offset + i * dilation_h;
      for (ga_size j = 0; j < kernel_w; ++j) {
        ga_size w_im = w_offset + j * dilation_w;
        for (ga_size k = 0; k < kernel_d; ++k) {
          ga_size d_im = d_offset + k * dilation_d;
          *data_col_ptr = (h_im >= 0 && w_im >= 0 && d_im >= 0 &&
                           h_im < height && w_im < width && d_im < depth) ?
            data_im_ptr[i * dilation_h * (width * depth) +
                        j * dilation_w * depth +
                        k * dilation_d] : 0;
          /* Successive taps are one full col-plane apart. */
          data_col_ptr += height_col * width_col * depth_col;
        }
      }
    }
  }
}
#kernel im3d2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
/* im2col for 3-D inputs without dilation — same layout and indexing as
 * dilated_im3d2col_kernel with all dilations fixed at 1. */
KERNEL void im3d2col_kernel(const ga_size n,
                            GLOBAL_MEM const DTYPE_INPUT_0 * data_im,
                            const ga_size offset_im,
                            const ga_size data_im_offset,
                            // offset_im is the pointer offset for data_im.
                            // data_im_offset is an offset of elements in the array
                            const ga_size height, const ga_size width, const ga_size depth,
                            const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_d,
                            const ga_size pad_h, const ga_size pad_w, const ga_size pad_d,
                            const ga_size stride_h, const ga_size stride_w, const ga_size stride_d,
                            const ga_size height_col, const ga_size width_col, const ga_size depth_col,
                            GLOBAL_MEM DTYPE_INPUT_0 * data_col,
                            const ga_size offset_col) {
  /* Apply the raw byte offsets to both buffer pointers. */
  data_im = (GLOBAL_MEM DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)data_im) + offset_im);
  data_col = (GLOBAL_MEM DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)data_col) + offset_col);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < (n); index += LDIM_0 * GDIM_0) {
    /* index = ((c_im * height_col + h_col) * width_col + w_col) * depth_col + d_col */
    const ga_size w_index = index / depth_col;
    const ga_size h_index = w_index / width_col;
    const ga_size d_col = index % depth_col;
    const ga_size h_col = h_index % height_col;
    const ga_size w_col = w_index % width_col;
    const ga_size c_im = h_index / height_col;
    /* First col-row written for this channel's kernel taps. */
    const ga_size c_col = c_im * kernel_h * kernel_w * kernel_d;
    /* Top-left-front corner of the receptive field (may fall in padding). */
    const ga_size h_offset = h_col * stride_h - pad_h;
    const ga_size w_offset = w_col * stride_w - pad_w;
    const ga_size d_offset = d_col * stride_d - pad_d;
    GLOBAL_MEM DTYPE_INPUT_0 * data_col_ptr = data_col;
    data_col_ptr += c_col * (height_col * width_col * depth_col) +
                    h_col * (width_col * depth_col) + w_col * depth_col + d_col;
    GLOBAL_MEM const DTYPE_INPUT_0 * data_im_ptr = data_im + data_im_offset;
    data_im_ptr += c_im * (height * width * depth) +
                   h_offset * (width * depth) + w_offset * depth + d_offset;
    /* Copy kernel taps; taps in the padding write 0.
     * NOTE(review): ga_size looks unsigned, making the `>= 0` tests
     * vacuous; out-of-range-left is presumably caught by wraparound
     * failing the `<` tests — confirm against the gpuarray typedef. */
    for (ga_size i = 0; i < kernel_h; ++i) {
      ga_size h_im = h_offset + i;
      for (ga_size j = 0; j < kernel_w; ++j) {
        ga_size w_im = w_offset + j;
        for (ga_size k = 0; k < kernel_d; ++k) {
          ga_size d_im = d_offset + k;
          *data_col_ptr = (h_im >= 0 && w_im >= 0 && d_im >= 0 &&
                           h_im < height && w_im < width && d_im < depth) ?
            data_im_ptr[i * (width * depth) + j * depth + k] : 0;
          /* Successive taps are one full col-plane apart. */
          data_col_ptr += height_col * width_col * depth_col;
        }
      }
    }
  }
}
// GPU kernel for the case of dilation
#kernel dilated_col2im3d_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size :
#include "cluda.h"
/* col2im for 3-D with dilation: one grid-stride work item per INPUT
 * element; each item gathers (sums) every data_col entry whose receptive
 * field covered that element, avoiding atomics entirely. */
KERNEL void dilated_col2im3d_kernel(const ga_size n,
                                    GLOBAL_MEM const DTYPE_INPUT_0 * data_col,
                                    const ga_size offset_col,
                                    const ga_size height, const ga_size width, const ga_size depth,
                                    const ga_size channels,
                                    const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_d,
                                    const ga_size dilation_h, const ga_size dilation_w, const ga_size dilation_d,
                                    const ga_size pad_h, const ga_size pad_w, const ga_size pad_d,
                                    const ga_size stride_h, const ga_size stride_w, const ga_size stride_d,
                                    const ga_size height_col, const ga_size width_col, const ga_size depth_col,
                                    GLOBAL_MEM DTYPE_INPUT_0 * data_im,
                                    const ga_size offset_im,
                                    const ga_size data_im_offset) {
  // offset_im is the pointer offset for data_im.
  // data_im_offset is an offset of elements in the array
  data_im = (GLOBAL_MEM DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)data_im) + offset_im);
  data_col = (GLOBAL_MEM DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)data_col) + offset_col);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < (n); index += LDIM_0 * GDIM_0) {
    DTYPE_INPUT_0 val = 0;
    /* Padded coordinates of this input element
     * (index = ((c_im * height + h) * width + w) * depth + d). */
    const ga_size d_im = index % depth + pad_d;
    const ga_size w_index = index / depth;
    const ga_size w_im = w_index % width + pad_w;
    const ga_size h_index = w_index / width;
    const ga_size h_im = h_index % height + pad_h;
    const ga_size c_im = h_index / height;
    /* Effective ("dilated") kernel extents. */
    ga_size kernel_extent_w = (kernel_w - 1) * dilation_w + 1;
    ga_size kernel_extent_h = (kernel_h - 1) * dilation_h + 1;
    ga_size kernel_extent_d = (kernel_d - 1) * dilation_d + 1;
    // compute the start and end of the output
    const ga_size d_col_start =
      (d_im < kernel_extent_d) ? 0 : (d_im - kernel_extent_d) / stride_d + 1;
    const ga_size d_col_end = min(d_im / stride_d + 1, depth_col);
    const ga_size w_col_start =
      (w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1;
    const ga_size w_col_end = min(w_im / stride_w + 1, width_col);
    const ga_size h_col_start =
      (h_im < kernel_extent_h) ? 0 : (h_im - kernel_extent_h) / stride_h + 1;
    const ga_size h_col_end = min(h_im / stride_h + 1, height_col);
    // TODO: use LCM of stride and dilation to avoid unnecessary loops
    for (ga_size d_col = d_col_start; d_col < d_col_end; ++d_col) {
      for (ga_size h_col = h_col_start; h_col < h_col_end; ++h_col) {
        for (ga_size w_col = w_col_start; w_col < w_col_end; ++w_col) {
          /* Kernel-tap displacement for this output position; it only
           * contributes when it lands exactly on the dilation grid. */
          ga_size h_k = (h_im - h_col * stride_h);
          ga_size w_k = (w_im - w_col * stride_w);
          ga_size d_k = (d_im - d_col * stride_d);
          if (h_k % dilation_h == 0 && w_k % dilation_w == 0 && d_k % dilation_d == 0) {
            h_k /= dilation_h;
            w_k /= dilation_w;
            d_k /= dilation_d;
            /* Row = ((c_im*kH + h_k)*kW + w_k)*kD + d_k; column =
             * (h_col*width_col + w_col)*depth_col + d_col. */
            ga_size data_col_index = c_im * kernel_h * kernel_w * kernel_d * height_col * width_col * depth_col +
              h_k * kernel_w * kernel_d * height_col * width_col * depth_col +
              w_k * kernel_d * height_col * width_col * depth_col +
              d_k * height_col * width_col * depth_col +
              h_col * width_col * depth_col +
              w_col * depth_col +
              d_col;
            val += data_col[data_col_index];
          }
        }
      }
    }
    data_im[data_im_offset + index] = val;
  }
}
#kernel col2im3d_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size :
#include "cluda.h"
/* col2im for 3-D without dilation: one grid-stride work item per INPUT
 * element gathers all contributing data_col entries (no atomics).  Uses
 * Caffe's closed-form col indexing via a base offset plus per-axis
 * coefficients instead of recomputing the full index each iteration. */
KERNEL void col2im3d_kernel(const ga_size n,
                            GLOBAL_MEM const DTYPE_INPUT_0 * data_col,
                            const ga_size offset_col,
                            const ga_size height, const ga_size width, const ga_size depth,
                            const ga_size channels,
                            const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_d,
                            const ga_size pad_h, const ga_size pad_w, const ga_size pad_d,
                            const ga_size stride_h, const ga_size stride_w, const ga_size stride_d,
                            const ga_size height_col, const ga_size width_col, const ga_size depth_col,
                            GLOBAL_MEM DTYPE_INPUT_0 * data_im,
                            const ga_size offset_im,
                            const ga_size data_im_offset) {
  // offset_im is the pointer offset for data_im.
  // data_im_offset is an offset of elements in the array
  data_im = (GLOBAL_MEM DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)data_im) + offset_im);
  data_col = (GLOBAL_MEM DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)data_col) + offset_col);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < (n); index += LDIM_0 * GDIM_0) {
    DTYPE_INPUT_0 val = 0;
    /* Padded coordinates of this input element. */
    const ga_size d_im = index % depth + pad_d;
    const ga_size w_index = index / depth;
    const ga_size w_im = w_index % width + pad_w;
    const ga_size h_index = w_index / width;
    const ga_size h_im = h_index % height + pad_h;
    const ga_size c_im = h_index / height;
    // compute the start and end of the output
    const ga_size d_col_start = (d_im < kernel_d) ? 0 : (d_im - kernel_d) / stride_d + 1;
    const ga_size d_col_end = min(d_im / stride_d + 1, depth_col);
    const ga_size w_col_start = (w_im < kernel_w) ? 0 : (w_im - kernel_w) / stride_w + 1;
    const ga_size w_col_end = min(w_im / stride_w + 1, width_col);
    const ga_size h_col_start = (h_im < kernel_h) ? 0 : (h_im - kernel_h) / stride_h + 1;
    const ga_size h_col_end = min(h_im / stride_h + 1, height_col);
    /* Base col index for (h_col, w_col, d_col) = (0, 0, 0); the coeff_*
     * terms fold "advance one output position = retreat `stride` kernel
     * taps" into a single multiply per axis (borrowed from Caffe). */
    ga_size offset =
      (c_im * kernel_h * kernel_w * kernel_d + h_im * kernel_w * kernel_d +
       w_im * kernel_d + d_im) * height_col * width_col * depth_col;
    ga_size coeff_h_col = (1 - stride_h * kernel_w * kernel_d * height_col) * width_col * depth_col;
    ga_size coeff_w_col = (1 - stride_w * kernel_d * height_col * width_col) * depth_col;
    ga_size coeff_d_col = (1 - stride_d * height_col * width_col * depth_col);
    for (ga_size d_col = d_col_start; d_col < d_col_end; ++d_col) {
      for (ga_size h_col = h_col_start; h_col < h_col_end; ++h_col) {
        for (ga_size w_col = w_col_start; w_col < w_col_end; ++w_col) {
          val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col + d_col * coeff_d_col];
        }
      }
    }
    data_im[data_im_offset + index] = val;
  }
}
#section support_code
/* Dispatch a GEMM on raw GpuArray buffers by element type.
 * offA/offB/offC are extra offsets in elements; each array's own byte
 * offset is converted to elements and folded in.  All three operands are
 * assumed to share A's typecode.  Returns a gpublas error code;
 * GA_UNSUPPORTED_ERROR for element types other than half/float/double. */
int rgemm(cb_order o, cb_transpose tA, cb_transpose tB,
          size_t M, size_t N, size_t K, double alpha,
          GpuArray *A, size_t offA, size_t lda,
          GpuArray *B, size_t offB, size_t ldb,
          double beta, GpuArray *C, size_t offC, size_t ldc) {
  size_t elsize;
  switch (A->typecode) {
  case GA_FLOAT:  elsize = 4; break;
  case GA_DOUBLE: elsize = 8; break;
  case GA_HALF:   elsize = 2; break;
  default:
    return GA_UNSUPPORTED_ERROR;
  }
  /* Convert the arrays' byte offsets to element offsets once. */
  const size_t a_off = (A->offset / elsize) + offA;
  const size_t b_off = (B->offset / elsize) + offB;
  const size_t c_off = (C->offset / elsize) + offC;
  if (A->typecode == GA_FLOAT)
    return gpublas_sgemm(o, tA, tB,
                         M, N, K, alpha,
                         A->data, a_off, lda,
                         B->data, b_off, ldb,
                         beta,
                         C->data, c_off, ldc);
  if (A->typecode == GA_DOUBLE)
    return gpublas_dgemm(o, tA, tB,
                         M, N, K, alpha,
                         A->data, a_off, lda,
                         B->data, b_off, ldb,
                         beta,
                         C->data, c_off, ldc);
  /* GA_HALF is the only remaining possibility. */
  return gpublas_hgemm(o, tA, tB,
                       M, N, K, alpha,
                       A->data, a_off, lda,
                       B->data, b_off, ldb,
                       beta,
                       C->data, c_off, ldc);
}
#section support_code_struct
/* Unfold one image (a single batch element) of a 3-D input into the
 * column buffer so a correlation can be computed as one GEMM.
 * Dispatches to the dilated or the plain kernel as appropriate.
 * Returns GA_NO_ERROR on success; on failure sets a Python RuntimeError
 * and returns the gpuarray error code. */
int im3d2col(
    GpuArray *data_im, const size_t data_im_offset, const size_t channels,
    const size_t height, const size_t width, const size_t depth,
    const size_t kernel_h, const size_t kernel_w, const size_t kernel_d,
    const size_t dilation_h, const size_t dilation_w, const size_t dilation_d,
    const size_t pad_h, const size_t pad_w, const size_t pad_d,
    const size_t stride_h, const size_t stride_w, const size_t stride_d,
    GpuArray *data_col) {
  /* Effective ("dilated") kernel extents and resulting output sizes. */
  const size_t dil_kernel_h = (kernel_h - 1) * dilation_h + 1;
  const size_t dil_kernel_w = (kernel_w - 1) * dilation_w + 1;
  const size_t dil_kernel_d = (kernel_d - 1) * dilation_d + 1;
  const size_t height_col = (height + 2 * pad_h - dil_kernel_h) / stride_h + 1;
  const size_t width_col = (width + 2 * pad_w - dil_kernel_w) / stride_w + 1;
  const size_t depth_col = (depth + 2 * pad_d - dil_kernel_d) / stride_d + 1;
  /* One work item per element of the unfolded (c, h, w, d) output grid. */
  size_t num_kernels = channels * height_col * width_col * depth_col;
  const int dilated =
      (dilation_h != 1 || dilation_w != 1 || dilation_d != 1);
  int status;
  if (dilated) {
    status = dilated_im3d2col_kernel_scall(
        1, &num_kernels, 0,
        num_kernels, data_im->data, data_im->offset,
        data_im_offset, height, width, depth,
        kernel_h, kernel_w, kernel_d, dilation_h, dilation_w, dilation_d,
        pad_h, pad_w, pad_d, stride_h, stride_w, stride_d, height_col,
        width_col, depth_col, data_col->data, data_col->offset);
    if (status != GA_NO_ERROR)
      PyErr_Format(PyExc_RuntimeError,
                   "gpuarray error: dilated_im3d2col_kernel: %s.",
                   GpuKernel_error(&k_dilated_im3d2col_kernel, status));
  } else {
    /* Specialized kernel without the per-tap dilation multiplies. */
    status = im3d2col_kernel_scall(
        1, &num_kernels, 0,
        num_kernels, data_im->data, data_im->offset,
        data_im_offset, height, width, depth,
        kernel_h, kernel_w, kernel_d, pad_h, pad_w, pad_d,
        stride_h, stride_w, stride_d, height_col, width_col, depth_col,
        data_col->data, data_col->offset);
    if (status != GA_NO_ERROR)
      PyErr_Format(PyExc_RuntimeError,
                   "gpuarray error: im3d2col_kernel: %s.",
                   GpuKernel_error(&k_im3d2col_kernel, status));
  }
  return status;
}
/* Fold a column buffer back into one image (a single batch element) of a
 * 3-D array, summing overlapping contributions.  One work item is
 * launched per destination element so no atomic adds are needed.
 * Returns GA_NO_ERROR on success; on failure sets a Python RuntimeError
 * and returns the gpuarray error code. */
int col2im3d(GpuArray *data_col, const size_t channels,
             const size_t height, const size_t width, const size_t depth,
             const size_t patch_h, const size_t patch_w, const size_t patch_d,
             const size_t dilation_h, const size_t dilation_w, const size_t dilation_d,
             const size_t pad_h, const size_t pad_w, const size_t pad_d,
             const size_t stride_h, const size_t stride_w, const size_t stride_d,
             GpuArray *data_im, const size_t data_im_offset) {
  /* Effective ("dilated") patch extents and the col-buffer grid sizes. */
  const size_t dil_patch_h = (patch_h - 1) * dilation_h + 1;
  const size_t dil_patch_w = (patch_w - 1) * dilation_w + 1;
  const size_t dil_patch_d = (patch_d - 1) * dilation_d + 1;
  const size_t height_col = (height + 2 * pad_h - dil_patch_h) / stride_h + 1;
  const size_t width_col = (width + 2 * pad_w - dil_patch_w) / stride_w + 1;
  const size_t depth_col = (depth + 2 * pad_d - dil_patch_d) / stride_d + 1;
  /* One work item per element of the destination image. */
  size_t num_kernels = channels * height * width * depth;
  const int dilated =
      (dilation_h != 1 || dilation_w != 1 || dilation_d != 1);
  int status;
  if (dilated) {
    status = dilated_col2im3d_kernel_scall(
        1, &num_kernels, 0,
        num_kernels, data_col->data, data_col->offset,
        height, width, depth, channels, patch_h, patch_w,
        patch_d, dilation_h, dilation_w, dilation_d, pad_h, pad_w, pad_d,
        stride_h, stride_w, stride_d, height_col, width_col, depth_col,
        data_im->data, data_im->offset, data_im_offset);
    if (status != GA_NO_ERROR)
      PyErr_Format(PyExc_RuntimeError,
                   "gpuarray error: dilated_col2im3d_kernel: %s.",
                   GpuKernel_error(&k_dilated_col2im3d_kernel, status));
  } else {
    /* Specialized kernel without the dilation divisibility checks. */
    status = col2im3d_kernel_scall(
        1, &num_kernels, 0,
        num_kernels, data_col->data, data_col->offset,
        height, width, depth, channels, patch_h, patch_w,
        patch_d, pad_h, pad_w, pad_d, stride_h, stride_w, stride_d,
        height_col, width_col, depth_col,
        data_im->data, data_im->offset, data_im_offset);
    if (status != GA_NO_ERROR)
      PyErr_Format(PyExc_RuntimeError,
                   "gpuarray error: col2im3d_kernel: %s.",
                   GpuKernel_error(&k_col2im3d_kernel, status));
  }
  return status;
}
// Aesara op code
// Authors: Arjun Jain, Frederic Bastien, Jan Schluter
// Reference code: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
// and https://github.com/torch/cunn/blob/master/SpatialConvolutionMM.cu
// Adaptation for 3d
/* 3-D correlation implemented as im2col/col2im plus GEMM (Caffe-style).
 *
 * direction: 0 = forward            (bottom, weight -> top)
 *            1 = gradient wrt. weights (bottom, top -> weight)
 *            2 = gradient wrt. inputs  (top, weight -> bottom)
 * All three arrays must be C-contiguous and 5-D; the destination array
 * for the chosen direction must be pre-allocated with the correct shape.
 * The returned pointer aliases that destination (no refcount change here;
 * see the note at the bottom).  Returns NULL with a Python exception set
 * on error.
 */
PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
                           PyGpuArrayObject *const weight,
                           PyGpuArrayObject *const top,
                           const size_t direction,
                           const size_t dH = 1,
                           const size_t dW = 1,
                           const size_t dD = 1,
                           const size_t dilH = 1,
                           const size_t dilW = 1,
                           const size_t dilD = 1,
                           const size_t padH = 0,
                           const size_t padW = 0,
                           const size_t padD = 0,
                           const size_t numgroups = 1)
{
  if (PyGpuArray_NDIM(bottom) != 5)
  {
    PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires bottom of 5D");
    return NULL;
  }
  if (!GpuArray_IS_C_CONTIGUOUS(&bottom->ga))
  {
    PyErr_Format(PyExc_ValueError,
                 "GpuCorr3dMM requires bottom to be C-contiguous, "
                 "but strides are: %ld %ld %ld %ld %ld\n",
                 PyGpuArray_STRIDES(bottom)[0],
                 PyGpuArray_STRIDES(bottom)[1],
                 PyGpuArray_STRIDES(bottom)[2],
                 PyGpuArray_STRIDES(bottom)[3],
                 PyGpuArray_STRIDES(bottom)[4]);
    return NULL;
  }
  if (PyGpuArray_NDIM(weight) != 5)
  {
    PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires weight of 5D");
    return NULL;
  }
  if (!GpuArray_IS_C_CONTIGUOUS(&weight->ga))
  {
    PyErr_Format(PyExc_ValueError,
                 "GpuCorr3dMM requires weight to be C-contiguous, "
                 "but strides are: %ld %ld %ld %ld %ld\n",
                 PyGpuArray_STRIDES(weight)[0],
                 PyGpuArray_STRIDES(weight)[1],
                 PyGpuArray_STRIDES(weight)[2],
                 PyGpuArray_STRIDES(weight)[3],
                 PyGpuArray_STRIDES(weight)[4]);
    return NULL;
  }
  if (PyGpuArray_NDIM(top) != 5)
  {
    PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires top of 5D");
    return NULL;
  }
  if (!GpuArray_IS_C_CONTIGUOUS(&top->ga))
  {
    PyErr_Format(PyExc_ValueError,
                 "GpuCorr3dMM requires top to be C-contiguous, "
                 "but strides are: %ld %ld %ld %ld %ld\n",
                 PyGpuArray_STRIDES(top)[0],
                 PyGpuArray_STRIDES(top)[1],
                 PyGpuArray_STRIDES(top)[2],
                 PyGpuArray_STRIDES(top)[3],
                 PyGpuArray_STRIDES(top)[4]);
    return NULL;
  }
  // Extract some shape information for later and check shape consistency
  // bottom: (batchSize, nChannels, bottomHeight, bottomWidth, bottomDepth)
  const size_t batchSize = PyGpuArray_DIMS(bottom)[0];
  const size_t nChannels = PyGpuArray_DIMS(bottom)[1];
  const size_t bottomHeight = PyGpuArray_DIMS(bottom)[2];
  const size_t bottomWidth = PyGpuArray_DIMS(bottom)[3];
  const size_t bottomDepth = PyGpuArray_DIMS(bottom)[4];
  // weights: (nFilters, nChannels, rows, columns, slices)
  const size_t nFilters = PyGpuArray_DIMS(weight)[0];
  const size_t kH = PyGpuArray_DIMS(weight)[2];
  const size_t kW = PyGpuArray_DIMS(weight)[3];
  const size_t kD = PyGpuArray_DIMS(weight)[4];
  if (nChannels != PyGpuArray_DIMS(weight)[1] * numgroups) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuCorr3dMM images and kernel must have the same stack size\n");
    return NULL;
  }
  if ((nFilters % numgroups) != 0) {
    PyErr_SetString(PyExc_ValueError,
                    "CorrMM the number of filters must be divisible by the number of groups\n");
    return NULL;
  }
  // implicit dilated filter
  const size_t dil_kH = (kH - 1) * dilH + 1;
  const size_t dil_kW = (kW - 1) * dilW + 1;
  const size_t dil_kD = (kD - 1) * dilD + 1;
  // top: (batchSize, nFilters, topHeight, topWidth, topDepth)
  const size_t topHeightNoDH = (bottomHeight + 2*padH - dil_kH);
  const size_t topWidthNoDW = (bottomWidth + 2*padW - dil_kW);
  const size_t topDepthNoDD = (bottomDepth + 2*padD - dil_kD);
  // the above values might be negative so we need to use Python-like
  // flooring integer division to be compatible with get_conv_output.
  // note: this macro implements Python's // for negative x only
  // NOTE(review): the operands are size_t (unsigned), so the x < 0 branch
  // can never trigger as written; an "overly negative" extent wraps to a
  // huge value and is caught by the top-shape consistency check below.
#define _CONV_FLOORDIV_X(x,y) ((x < 0) ? (- ((-x) / y) - (((-x) % y) == 0 ? 0 : 1)) : (x / y))
  const size_t topHeight = _CONV_FLOORDIV_X(topHeightNoDH, dH) + 1;
  const size_t topWidth = _CONV_FLOORDIV_X(topWidthNoDW, dW) + 1;
  const size_t topDepth = _CONV_FLOORDIV_X(topDepthNoDD, dD) + 1;
  // BUG FIX: the original #undef'd the non-existent name _CONV_FLOORDIV,
  // leaving _CONV_FLOORDIV_X defined past this function.
#undef _CONV_FLOORDIV_X
  if (batchSize != PyGpuArray_DIMS(top)[0] ||
      nFilters != PyGpuArray_DIMS(top)[1] ||
      topHeight != PyGpuArray_DIMS(top)[2] ||
      topWidth != PyGpuArray_DIMS(top)[3] ||
      topDepth != PyGpuArray_DIMS(top)[4]) {
    PyErr_Format(PyExc_ValueError,
                 "GpuCorr3dMM shape inconsistency:\n"
                 "  bottom shape: %ld %ld %ld %ld %ld\n"
                 "  weight shape: %ld %ld %ld %ld %ld\n"
                 "  top shape: %ld %ld %ld %ld %ld (expected %ld %ld %ld %ld %ld)\n",
                 batchSize, nChannels, bottomHeight, bottomWidth, bottomDepth,
                 nFilters, nChannels / numgroups, kH, kW, kD,
                 PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
                 PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3], PyGpuArray_DIMS(top)[4],
                 batchSize, nFilters, topHeight, topWidth, topDepth);
    return NULL;
  }
  int err = gpublas_setup(bottom->context->ctx);
  if (err != GA_NO_ERROR) {
    PyErr_SetString(PyExc_RuntimeError, "Can't setup blas");
    return NULL;
  }
  // Create temporary columns
  size_t col_dim[2];
  col_dim[0] = nChannels * kW * kH * kD;
  col_dim[1] = topHeight * topWidth * topDepth;
  PyGpuArrayObject* col = (PyGpuArrayObject*)pygpu_empty(2, col_dim,
                                                         bottom->ga.typecode,
                                                         GA_C_ORDER,
                                                         bottom->context,
                                                         Py_None);
  if (NULL == col)
  {
    PyErr_Format(PyExc_RuntimeError,
                 "GpuCorr3dMM failed to allocate working memory of %ld x %ld\n",
                 col_dim[0], col_dim[1]);
    return NULL;
  }
  // Define some useful variables
  const size_t batch_bottom_stride = PyGpuArray_STRIDES(bottom)[0] / gpuarray_get_elsize(bottom->ga.typecode);
  const size_t batch_top_stride = PyGpuArray_STRIDES(top)[0] / gpuarray_get_elsize(top->ga.typecode);
  const size_t group_bottom_stride = (PyGpuArray_STRIDES(bottom)[1] * nChannels / numgroups) / gpuarray_get_elsize(bottom->ga.typecode);
  const size_t group_top_stride = (PyGpuArray_STRIDES(top)[1] * nFilters / numgroups) / gpuarray_get_elsize(top->ga.typecode);
  const size_t group_weight_stride = (PyGpuArray_STRIDES(weight)[0] * nFilters / numgroups) / gpuarray_get_elsize(weight->ga.typecode);
  const size_t K_ = col_dim[0] / numgroups;
  const size_t N_ = col_dim[1];
  const size_t group_col_stride = (K_ * N_);
  const size_t M_ = nFilters / numgroups;
  PyGpuArrayObject *output;
  if (direction == 0) {  // forward pass
    output = top;
    if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
      // Degenerate problem: the output is all zeros.
      err = GpuArray_memset(&output->ga, 0);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuCorr3dMM could not fill the output with zeros: %d", err);
        Py_DECREF(col);
        return NULL;
      }
      Py_DECREF(col);
      return output;
    }
    // valid correlation: im3d2col, then gemm
    // Iterate over batch
    for (size_t n = 0; n < batchSize; n++) {
      // First, im3d2col
      err = im3d2col(
          &bottom->ga, n * batch_bottom_stride, nChannels, bottomHeight,
          bottomWidth, bottomDepth, kH, kW, kD, dilH, dilW, dilD,
          padH, padW, padD, dH, dW, dD, &col->ga);
      if (err != GA_NO_ERROR) {
        Py_DECREF(col);
        return NULL;
      }
      for (size_t g = 0; g < numgroups; ++g) {
        // Second, gemm
        err = rgemm(cb_fortran, cb_no_trans, cb_no_trans,
                    N_, M_, K_, 1,
                    &col->ga, g * group_col_stride, N_,
                    &weight->ga, g * group_weight_stride, K_,
                    0,
                    &top->ga, n * batch_top_stride + g * group_top_stride, N_);
        // BUG FIX: check every group's status; the original tested err
        // only after the loop, so a failure in any group but the last
        // was silently overwritten.
        if (err != GA_NO_ERROR) {
          PyErr_Format(PyExc_RuntimeError,
                       "GpuCorr3dMM forward encountered an error running gemm.");
          Py_DECREF(col);
          return NULL;
        }
      }
    }
  }
  else if (direction == 1) {  // backprop wrt. weights
    output = weight;
    if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
      err = GpuArray_memset(&output->ga, 0);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuCorr3dMM grad wrt. weights could not fill the output with zeros: %d", err);
        Py_DECREF(col);
        return NULL;
      }
      Py_DECREF(col);
      return output;
    }
    // valid convolution: im3col, then gemm
    // Iterate over batch
    for (size_t n = 0; n < batchSize; n++) {
      // First, im3d2col
      err = im3d2col(
          &bottom->ga, n * batch_bottom_stride, nChannels, bottomHeight,
          bottomWidth, bottomDepth, kH, kW, kD, dilH, dilW, dilD,
          padH, padW, padD, dH, dW, dD, &col->ga);
      if (err != GA_NO_ERROR) {
        Py_DECREF(col);
        return NULL;
      }
      // Second, gemm
      // Note that we accumulate into weight. We do so by setting beta = 0
      // for the first iteration and beta = 1 for subsequent ones. (This
      // is faster than setting weight to all zeros before the loop.)
      for (size_t g = 0; g < numgroups; ++g) {
        err = rgemm(cb_fortran, cb_trans, cb_no_trans,
                    K_, M_, N_, 1,
                    &col->ga, g * group_col_stride, N_,
                    &top->ga, n * batch_top_stride + g * group_top_stride, N_,
                    (n == 0) ? 0 : 1,
                    &weight->ga, g * group_weight_stride, K_);
        // BUG FIX: per-group status check (see forward pass).
        if (err != GA_NO_ERROR) {
          PyErr_Format(PyExc_RuntimeError,
                       "GpuCorr3dMM grad weights encountered an error running gemm.");
          Py_DECREF(col);
          return NULL;
        }
      }
    }
    // (The original had an unreachable `if (batchSize == 0)` memset here;
    // the degenerate-shape early return above already covers that case.)
  }
  else if (direction == 2) {  // backprop wrt. inputs
    output = bottom;
    if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
      err = GpuArray_memset(&output->ga, 0);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuCorr3dMM grad wrt. inputs could not fill the output with zeros: %d", err);
        Py_DECREF(col);
        return NULL;
      }
      Py_DECREF(col);
      return output;
    }
    // full convolution: gemm, then col2im3d
    // Iterate over batch
    for (size_t n = 0; n < batchSize; n++) {
      // gemm into columns
      for (size_t g = 0; g < numgroups; ++g) {
        err = rgemm(cb_fortran, cb_no_trans, cb_trans,
                    N_, K_, M_, 1,
                    &top->ga, n * batch_top_stride + g * group_top_stride, N_,
                    &weight->ga, g * group_weight_stride, K_,
                    0,
                    &col->ga, g * group_col_stride, N_);
        // BUG FIX: per-group status check (see forward pass).
        if (err != GA_NO_ERROR) {
          PyErr_Format(PyExc_RuntimeError,
                       "GpuCorr3dMM grad inputs encountered an error running gemm.");
          Py_DECREF(col);
          return NULL;
        }
      }
      // col2im3d back to the data
      err = col2im3d(&col->ga, nChannels,
                     bottomHeight, bottomWidth, bottomDepth,
                     kH, kW, kD, dilH, dilW, dilD, padH, padW, padD,
                     dH, dW, dD, &bottom->ga, n * batch_bottom_stride);
      if (err != GA_NO_ERROR) {
        Py_DECREF(col);
        return NULL;
      }
    }
  }
  else {
    // BUG FIX: the original fell through with `output` uninitialized for
    // an unknown direction and returned a garbage pointer.
    PyErr_Format(PyExc_ValueError,
                 "GpuCorr3dMM: invalid direction %lu",
                 (unsigned long)direction);
    Py_DECREF(col);
    return NULL;
  }
  // Free temporary columns
  Py_DECREF(col);
  // Note that we don't change the refcount of the output matrix here. Output
  // (re)allocation and refcounting is done in BaseGpuCorr3dMM.c_code_helper();
  // in here output is just aliased to one of bottom, weights, or top.
  return output;
}
#section kernels
#kernel dilated_im2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
// TODO check kernel flags
// This uses a lot of code from Caffe (http://caffe.berkeleyvision.org/);
// sources are clearly marked. Below we reproduce the original license of
// the Caffe software.
/*
Copyright (c) 2014, The Regents of the University of California (Regents)
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/util/im2col.cu)
// Kernels for fast unfold + copy
// GPU kernel for the case of dilation
/* im2col for 2-D inputs with dilation: one grid-stride work item per
 * element of the (channels, height_col, width_col) grid; each item copies
 * all kernel taps for its output position into data_col, zero-filling
 * taps that land in the padding. */
KERNEL void dilated_im2col_kernel(const ga_size n,
                                  GLOBAL_MEM const DTYPE_INPUT_0 * data_im,
                                  const ga_size offset_im,
                                  const ga_size data_im_offset,
                                  // offset_im is the pointer offset for data_im.
                                  // data_im_offset is an offset of elements in the array
                                  const ga_size height, const ga_size width,
                                  const ga_size kernel_h, const ga_size kernel_w,
                                  const ga_size dilation_h, const ga_size dilation_w,
                                  const ga_size pad_hl, const ga_size pad_wl,
                                  const ga_size stride_h, const ga_size stride_w,
                                  const ga_size height_col, const ga_size width_col,
                                  GLOBAL_MEM DTYPE_INPUT_0 * data_col,
                                  const ga_size offset_col) {
  /* Apply the raw byte offsets to both buffer pointers. */
  data_im = (GLOBAL_MEM DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)data_im) + offset_im);
  data_col = (GLOBAL_MEM DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)data_col) + offset_col);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < (n); index += LDIM_0 * GDIM_0) {
    /* index = (c_im * height_col + h_col) * width_col + w_col */
    const ga_size h_index = index / width_col;
    const ga_size h_col = h_index % height_col;
    const ga_size w_col = index % width_col;
    const ga_size c_im = h_index / height_col;
    /* First col-row written for this channel's kernel taps. */
    const ga_size c_col = c_im * kernel_h * kernel_w;
    /* Top-left corner of the receptive field (may fall in padding). */
    const ga_size h_offset = h_col * stride_h - pad_hl;
    const ga_size w_offset = w_col * stride_w - pad_wl;
    GLOBAL_MEM DTYPE_INPUT_0 * data_col_ptr = data_col;
    data_col_ptr += (c_col * height_col + h_col) * width_col + w_col;
    GLOBAL_MEM const DTYPE_INPUT_0 * data_im_ptr = data_im + data_im_offset;
    data_im_ptr += (c_im * height + h_offset) * width + w_offset;
    /* Copy kernel taps; taps in the padding write 0.
     * NOTE(review): ga_size looks unsigned, making the `>= 0` tests
     * vacuous; out-of-range-left is presumably caught by wraparound
     * failing the `<` tests — confirm against the gpuarray typedef. */
    for (ga_size i = 0; i < kernel_h; ++i) {
      for (ga_size j = 0; j < kernel_w; ++j) {
        ga_size h_im = h_offset + i * dilation_h;
        ga_size w_im = w_offset + j * dilation_w;
        *data_col_ptr =
          (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ?
          data_im_ptr[i * dilation_h * width + j * dilation_w] : 0;
        /* Successive taps are one full col-plane apart. */
        data_col_ptr += height_col * width_col;
      }
    }
  }
}
#kernel im2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
// im2col without dilation: same layout as dilated_im2col_kernel but with
// unit tap spacing (dilation_h == dilation_w == 1 is assumed by the caller).
KERNEL void im2col_kernel(const ga_size n,
    GLOBAL_MEM const DTYPE_INPUT_0 * data_im,
    const ga_size offset_im,
    const ga_size data_im_offset,
    // offset_im is the pointer offset for data_im.
    // data_im_offset is an offset of elements in the array
    const ga_size height, const ga_size width,
    const ga_size kernel_h, const ga_size kernel_w,
    const ga_size pad_hl, const ga_size pad_wl,
    const ga_size stride_h, const ga_size stride_w,
    const ga_size height_col, const ga_size width_col,
    GLOBAL_MEM DTYPE_INPUT_0 * data_col,
    const ga_size offset_col) {
  // Apply the raw byte offsets of the two buffers before element indexing.
  data_im = (GLOBAL_MEM DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)data_im) + offset_im);
  data_col = (GLOBAL_MEM DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)data_col) + offset_col);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < (n); index += LDIM_0 * GDIM_0) {
    // Decompose the flat index into (channel, output row, output column).
    const ga_size h_index = index / width_col;
    const ga_size h_col = h_index % height_col;
    const ga_size w_col = index % width_col;
    const ga_size c_im = h_index / height_col;
    const ga_size c_col = c_im * kernel_h * kernel_w;
    // Top-left input coordinate of this patch (may land in the padding).
    const ga_size h_offset = h_col * stride_h - pad_hl;
    const ga_size w_offset = w_col * stride_w - pad_wl;
    GLOBAL_MEM DTYPE_INPUT_0 * data_col_ptr = data_col;
    data_col_ptr += (c_col * height_col + h_col) * width_col + w_col;
    GLOBAL_MEM const DTYPE_INPUT_0 * data_im_ptr = data_im + data_im_offset;
    data_im_ptr += (c_im * height + h_offset) * width + w_offset;
    for (ga_size i = 0; i < kernel_h; ++i) {
      for (ga_size j = 0; j < kernel_w; ++j) {
        ga_size h_im = h_offset + i ;
        ga_size w_im = w_offset + j ;
        // NOTE(review): ga_size appears to be unsigned, so "h_im >= 0" is
        // always true; padding taps wrap to huge values and are rejected by
        // the "< height"/"< width" comparisons instead.
        *data_col_ptr =
          (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ?
            data_im_ptr[i * width + j] : 0;
        // Successive kernel taps go to successive rows of the column matrix.
        data_col_ptr += height_col * width_col;
      }
    }
  }
}
// GPU kernel for the case of dilation
#kernel dilated_col2im_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size :
#include "cluda.h"
// col2im with dilation: one work-item per *input* element accumulates every
// column entry that was sampled from it, so no atomics are needed.
KERNEL void dilated_col2im_kernel(const ga_size n,
    GLOBAL_MEM const DTYPE_INPUT_0 * data_col, const ga_size offset_col,
    const ga_size height, const ga_size width, const ga_size channels,
    const ga_size kernel_h, const ga_size kernel_w,
    const ga_size dilation_h, const ga_size dilation_w,
    const ga_size pad_hl, const ga_size pad_wl,
    const ga_size stride_h, const ga_size stride_w,
    const ga_size height_col, const ga_size width_col,
    GLOBAL_MEM DTYPE_INPUT_0 * data_im,
    const ga_size offset_im,
    const ga_size data_im_offset) {
  // offset_im is the pointer offset for data_im.
  // data_im_offset is an offset of elements in the array
  data_col = (GLOBAL_MEM DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)data_col) + offset_col);
  data_im = (GLOBAL_MEM DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)data_im) + offset_im);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < (n); index += LDIM_0 * GDIM_0) {
    DTYPE_INPUT_0 val = 0;
    // Padded input coordinates of this element.
    const ga_size w_im = index % width + pad_wl;
    const ga_size h_im = (index / width) % height + pad_hl;
    const ga_size c_im = index / (width * height);
    ga_size kernel_extent_w = (kernel_w - 1) * dilation_w + 1;
    ga_size kernel_extent_h = (kernel_h - 1) * dilation_h + 1;
    // compute the start and end of the output
    const ga_size w_col_start =
      (w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1;
    const ga_size w_col_end = min(w_im / stride_w + 1, width_col);
    const ga_size h_col_start =
      (h_im < kernel_extent_h) ? 0 : (h_im - kernel_extent_h) / stride_h + 1;
    const ga_size h_col_end = min(h_im / stride_h + 1, height_col);
    // TODO: use LCM of stride and dilation to avoid unnecessary loops
    for (ga_size h_col = h_col_start; h_col < h_col_end; h_col += 1) {
      for (ga_size w_col = w_col_start; w_col < w_col_end; w_col += 1) {
        // Offset inside the patch; only taps on the dilation grid contribute.
        ga_size h_k = (h_im - h_col * stride_h);
        ga_size w_k = (w_im - w_col * stride_w);
        if (h_k % dilation_h == 0 && w_k % dilation_w == 0) {
          h_k /= dilation_h;
          w_k /= dilation_w;
          ga_size data_col_index = (((c_im * kernel_h + h_k) * kernel_w + w_k) *
                                    height_col + h_col) * width_col + w_col;
          val += data_col[data_col_index];
        }
      }
    }
    data_im[data_im_offset + index] = val;
  }
}
#kernel col2im_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size :
#include "cluda.h"
// col2im without dilation: one work-item per *input* element accumulates
// every column entry sampled from it (gather instead of scatter, no atomics).
KERNEL void col2im_kernel(const ga_size n,
    GLOBAL_MEM const DTYPE_INPUT_0 * data_col, const ga_size offset_col,
    const ga_size height, const ga_size width, const ga_size channels,
    const ga_size kernel_h, const ga_size kernel_w,
    const ga_size pad_hl, const ga_size pad_wl,
    const ga_size stride_h, const ga_size stride_w,
    const ga_size height_col, const ga_size width_col,
    GLOBAL_MEM DTYPE_INPUT_0 * data_im,
    const ga_size offset_im,
    const ga_size data_im_offset) {
  // offset_im is the pointer offset for data_im.
  // data_im_offset is an offset of elements in the array
  data_col = (GLOBAL_MEM DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)data_col) + offset_col);
  data_im = (GLOBAL_MEM DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)data_im) + offset_im);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < (n); index += LDIM_0 * GDIM_0) {
    DTYPE_INPUT_0 val = 0;
    // Padded input coordinates of this element.
    const ga_size w_im = index % width + pad_wl;
    const ga_size h_im = (index / width) % height + pad_hl;
    const ga_size c_im = index / (width * height);
    // compute the start and end of the output
    const ga_size w_col_start =
      (w_im < kernel_w) ? 0 : (w_im - kernel_w) / stride_w + 1;
    const ga_size w_col_end = min(w_im / stride_w + 1, width_col);
    const ga_size h_col_start =
      (h_im < kernel_h) ? 0 : (h_im - kernel_h) / stride_h + 1;
    const ga_size h_col_end = min(h_im / stride_h + 1, height_col);
    // equivalent implementation, no dilation
    // Closed-form indexing: walking (h_col, w_col) with these coefficients
    // visits the same data_col entries as the explicit tap arithmetic above.
    ga_size offset =
      (c_im * kernel_h * kernel_w + h_im * kernel_w + w_im) * height_col * width_col;
    ga_size coeff_h_col = (1 - stride_h * kernel_w * height_col) * width_col;
    ga_size coeff_w_col = (1 - stride_w * height_col * width_col);
    for (ga_size h_col = h_col_start; h_col < h_col_end; ++h_col) {
      for (ga_size w_col = w_col_start; w_col < w_col_end; ++w_col) {
        val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
      }
    }
    data_im[data_im_offset + index] = val;
  }
}
#section support_code
// Dispatch a GEMM (C = alpha * op(A) * op(B) + beta * C) to the gpublas
// routine matching the element type of A. The GpuArray byte offsets are
// converted to element offsets by dividing by the element size (4 for
// float32, 8 for float64, 2 for float16) before adding the caller-supplied
// element offsets. Returns a GA_* status code; GA_UNSUPPORTED_ERROR for any
// other typecode.
int rgemm(cb_order o, cb_transpose tA, cb_transpose tB,
          size_t M, size_t N, size_t K, double alpha,
          GpuArray *A, size_t offA, size_t lda,
          GpuArray *B, size_t offB, size_t ldb,
          double beta, GpuArray *C, size_t offC, size_t ldc) {
  if (A->typecode == GA_FLOAT)
    return gpublas_sgemm(o, tA, tB,
                         M, N, K, alpha,
                         A->data, (A->offset / 4) + offA, lda,
                         B->data, (B->offset / 4) + offB, ldb,
                         beta,
                         C->data, (C->offset / 4) + offC, ldc);
  if (A->typecode == GA_DOUBLE)
    return gpublas_dgemm(o, tA, tB,
                         M, N, K, alpha,
                         A->data, (A->offset / 8) + offA, lda,
                         B->data, (B->offset / 8) + offB, ldb,
                         beta,
                         C->data, (C->offset / 8) + offC, ldc);
  if (A->typecode == GA_HALF)
    return gpublas_hgemm(o, tA, tB,
                         M, N, K, alpha,
                         A->data, (A->offset / 2) + offA, lda,
                         B->data, (B->offset / 2) + offB, ldb,
                         beta,
                         C->data, (C->offset / 2) + offC, ldc);
  return GA_UNSUPPORTED_ERROR;
}
#section support_code_struct
// Host-side launcher for the im2col kernels: unfolds one image of
// `data_im` (starting at element data_im_offset) into the column matrix
// `data_col`. Picks the dilated kernel only when a dilation factor differs
// from 1. Returns the GA_* kernel status; on failure a Python RuntimeError
// is also set.
int im2col(GpuArray *data_im, const size_t data_im_offset, const size_t channels,
           const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w,
           const size_t dilation_h, const size_t dilation_w,
           const size_t pad_hl, const size_t pad_hr,
           const size_t pad_wl, const size_t pad_wr,
           const size_t stride_h, const size_t stride_w,
           GpuArray *data_col) {
  // Effective (dilated) kernel extent and the resulting output resolution.
  const size_t dil_kernel_h = (kernel_h - 1) * dilation_h + 1;
  const size_t dil_kernel_w = (kernel_w - 1) * dilation_w + 1;
  const size_t height_col = (height + pad_hl + pad_hr - dil_kernel_h) / stride_h + 1;
  const size_t width_col = (width + pad_wl + pad_wr - dil_kernel_w) / stride_w + 1;
  // One work-item per (channel, output row, output column) triple.
  size_t num_kernels = channels * height_col * width_col;
  int err;
  if (dilation_h == 1 && dilation_w == 1) {
    err = im2col_kernel_scall(
      1, &num_kernels, 0,
      num_kernels, data_im->data, data_im->offset, data_im_offset,
      height, width, kernel_h, kernel_w,
      pad_hl, pad_wl, stride_h, stride_w, height_col,
      width_col, data_col->data, data_col->offset);
    if (err != GA_NO_ERROR)
      PyErr_Format(PyExc_RuntimeError,
                   "gpuarray error: im2col_kernel: %s.",
                   GpuKernel_error(&k_im2col_kernel, err));
  } else {
    err = dilated_im2col_kernel_scall(
      1, &num_kernels, 0,
      num_kernels, data_im->data, data_im->offset, data_im_offset,
      height, width, kernel_h, kernel_w,
      dilation_h, dilation_w, pad_hl, pad_wl, stride_h, stride_w, height_col,
      width_col, data_col->data, data_col->offset);
    if (err != GA_NO_ERROR)
      PyErr_Format(PyExc_RuntimeError,
                   "gpuarray error: dilated_im2col_kernel: %s.",
                   GpuKernel_error(&k_dilated_im2col_kernel, err));
  }
  return err;
}
// Host-side launcher for the col2im kernels: folds the column matrix
// `data_col` back into one image of `data_im` (starting at element
// data_im_offset), summing overlapping contributions. One work-item is
// launched per input element, so the kernels gather rather than scatter and
// no atomic operations are needed. Returns the GA_* kernel status; on
// failure a Python RuntimeError is also set.
int col2im(GpuArray *data_col, const size_t channels,
           const size_t height, const size_t width, const size_t patch_h, const size_t patch_w,
           const size_t dilation_h, const size_t dilation_w,
           const size_t pad_hl, const size_t pad_hr, const size_t pad_wl, const size_t pad_wr,
           const size_t stride_h, const size_t stride_w, GpuArray *data_im, const size_t data_im_offset) {
  // Effective (dilated) patch extent and the column-matrix resolution.
  const size_t dil_patch_h = (patch_h - 1) * dilation_h + 1;
  const size_t dil_patch_w = (patch_w - 1) * dilation_w + 1;
  const size_t height_col = (height + pad_hl + pad_hr - dil_patch_h) / stride_h + 1;
  const size_t width_col = (width + pad_wl + pad_wr - dil_patch_w) / stride_w + 1;
  size_t num_kernels = channels * height * width;
  int err;
  if (dilation_h == 1 && dilation_w == 1) {
    err = col2im_kernel_scall(
      1, &num_kernels, 0,
      num_kernels, data_col->data, data_col->offset,
      height, width, channels, patch_h, patch_w,
      pad_hl, pad_wl, stride_h, stride_w,
      height_col, width_col, data_im->data, data_im->offset, data_im_offset);
    if (err != GA_NO_ERROR)
      PyErr_Format(PyExc_RuntimeError,
                   "gpuarray error: col2im_kernel: %s.",
                   GpuKernel_error(&k_col2im_kernel, err));
  } else {
    err = dilated_col2im_kernel_scall(
      1, &num_kernels, 0,
      num_kernels, data_col->data, data_col->offset,
      height, width, channels, patch_h, patch_w,
      dilation_h, dilation_w, pad_hl, pad_wl, stride_h, stride_w,
      height_col, width_col, data_im->data, data_im->offset, data_im_offset);
    if (err != GA_NO_ERROR)
      PyErr_Format(PyExc_RuntimeError,
                   "gpuarray error: dilated_col2im_kernel: %s.",
                   GpuKernel_error(&k_dilated_col2im_kernel, err));
  }
  return err;
}
// Aesara op code
// Authors: Arjun Jain, Frederic Bastien, Jan Schluter
// Reference code: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
// and https://github.com/torch/cunn/blob/master/SpatialConvolutionMM.cu
// GEMM-based 2d correlation (the GpuCorrMM family of Ops), following the
// Caffe/torch approach of im2col + gemm (+ col2im for the input gradient).
//
// direction selects what is computed and which argument is written to:
//   0: forward pass           -> writes into `top`
//   1: gradient wrt. weights  -> writes into `weight`
//   2: gradient wrt. inputs   -> writes into `bottom`
// dH/dW are the strides, dilH/dilW the dilation factors, padH_l/padH_r/
// padW_l/padW_r the asymmetric paddings, numgroups the number of grouped
// convolutions and unshared selects locally-connected (6d) weights.
//
// Returns the output array (an alias of bottom, weight or top — no refcount
// change, see the note at the end) or NULL with a Python exception set.
PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
                         PyGpuArrayObject *const weight,
                         PyGpuArrayObject *const top,
                         const size_t direction,
                         const size_t dH = 1,
                         const size_t dW = 1,
                         const size_t dilH = 1,
                         const size_t dilW = 1,
                         const size_t padH_l = 0,
                         const size_t padH_r = 0,
                         const size_t padW_l = 0,
                         const size_t padW_r = 0,
                         const size_t numgroups = 1,
                         const size_t unshared = 0)
{
    // All three arrays must be 4d (weights 6d when unshared) and
    // C-contiguous, since im2col/col2im and rgemm index them directly.
    if (PyGpuArray_NDIM(bottom) != 4)
    {
        PyErr_SetString(PyExc_ValueError, "GpuCorrMM requires bottom of 4D");
        return NULL;
    }
    if (!GpuArray_IS_C_CONTIGUOUS(&bottom->ga))
    {
        PyErr_Format(PyExc_ValueError,
                     "GpuCorrMM requires bottom to be C-contiguous, "
                     "but strides are: %ld %ld %ld %ld\n",
                     PyGpuArray_STRIDES(bottom)[0],
                     PyGpuArray_STRIDES(bottom)[1],
                     PyGpuArray_STRIDES(bottom)[2],
                     PyGpuArray_STRIDES(bottom)[3]);
        return NULL;
    }
    if (PyGpuArray_NDIM(weight) != (unshared ? 6 : 4))
    {
        PyErr_Format(PyExc_ValueError, "GpuCorrMM requires weight of %dD", unshared ? 6 : 4);
        return NULL;
    }
    if (!GpuArray_IS_C_CONTIGUOUS(&weight->ga))
    {
        if (unshared) {
            PyErr_Format(PyExc_ValueError,
                         "GpuCorrMM requires weight to be C-contiguous, "
                         "but strides are: %ld %ld %ld %ld %ld %ld\n",
                         PyGpuArray_STRIDES(weight)[0],
                         PyGpuArray_STRIDES(weight)[1],
                         PyGpuArray_STRIDES(weight)[2],
                         PyGpuArray_STRIDES(weight)[3],
                         PyGpuArray_STRIDES(weight)[4],
                         PyGpuArray_STRIDES(weight)[5]);
            return NULL;
        }
        else {
            PyErr_Format(PyExc_ValueError,
                         "GpuCorrMM requires weight to be C-contiguous, "
                         "but strides are: %ld %ld %ld %ld\n",
                         PyGpuArray_STRIDES(weight)[0],
                         PyGpuArray_STRIDES(weight)[1],
                         PyGpuArray_STRIDES(weight)[2],
                         PyGpuArray_STRIDES(weight)[3]);
            return NULL;
        }
    }
    if (PyGpuArray_NDIM(top) != 4)
    {
        PyErr_SetString(PyExc_ValueError, "GpuCorrMM requires top of 4D");
        return NULL;
    }
    if (!GpuArray_IS_C_CONTIGUOUS(&top->ga))
    {
        PyErr_Format(PyExc_ValueError,
                     "GpuCorrMM requires top to be C-contiguous, "
                     "but strides are: %ld %ld %ld %ld\n",
                     PyGpuArray_STRIDES(top)[0],
                     PyGpuArray_STRIDES(top)[1],
                     PyGpuArray_STRIDES(top)[2],
                     PyGpuArray_STRIDES(top)[3]);
        return NULL;
    }
    // Extract some shape information for later and check shape consistency
    // bottom: (batchSize, nChannels, bottomHeight, bottomWidth)
    const size_t batchSize = PyGpuArray_DIMS(bottom)[0];
    const size_t nChannels = PyGpuArray_DIMS(bottom)[1];
    const size_t bottomHeight = PyGpuArray_DIMS(bottom)[2];
    const size_t bottomWidth = PyGpuArray_DIMS(bottom)[3];
    // weights: (nFilters, nChannels, rows, columns)
    // or (nFilters, out_rows, out_columns, nChannels, rows, columns) -> for unshared
    const size_t nFilters = PyGpuArray_DIMS(weight)[0];
    const size_t kH = PyGpuArray_DIMS(weight)[unshared ? 4 : 2];
    const size_t kW = PyGpuArray_DIMS(weight)[unshared ? 5 : 3];
    if (nChannels != PyGpuArray_DIMS(weight)[unshared ? 3 : 1] * numgroups) {
        PyErr_SetString(PyExc_ValueError,
                        "GpuCorrMM images and kernel must have the same stack size\n");
        return NULL;
    }
    if ((nFilters % numgroups) != 0) {
        PyErr_SetString(PyExc_ValueError,
                        "GPUCorrMM the number of filters must be divisible by the number of groups\n");
        return NULL;
    }
    // implicit dilated filter
    const size_t dil_kH = (kH - 1) * dilH + 1;
    const size_t dil_kW = (kW - 1) * dilW + 1;
    // top: (batchSize, nFilters, topHeight, topWidth)
    // These intermediate values can be negative (when the dilated kernel is
    // larger than the padded input), so they must be computed in a *signed*
    // type: with a size_t declaration the (x < 0) branch of the
    // floor-division macro below is dead code and a negative value wraps
    // around to a huge unsigned number instead of flooring like Python's //.
    const ssize_t topHeightNoDH = (ssize_t)(bottomHeight + padH_l + padH_r) - (ssize_t)dil_kH;
    const ssize_t topWidthNoDW = (ssize_t)(bottomWidth + padW_l + padW_r) - (ssize_t)dil_kW;
    // the above values might be negative so we need to use Python-like
    // flooring integer division to be compatible with get_conv_output.
    // note: this macro implements Python's // for negative x only
#define _CONV_FLOORDIV_X(x,y) ((x < 0) ? (- ((-x) / y) - (((-x) % y) == 0 ? 0 : 1)) : (x / y))
    // The divisors are cast to ssize_t so the division stays signed.
    const size_t topHeight = _CONV_FLOORDIV_X(topHeightNoDH, (ssize_t)dH) + 1;
    const size_t topWidth = _CONV_FLOORDIV_X(topWidthNoDW, (ssize_t)dW) + 1;
#undef _CONV_FLOORDIV_X
    if (unshared) {
        // Unshared weights carry the output resolution in dims 1 and 2.
        if (topHeight != PyGpuArray_DIMS(weight)[1] ||
                topWidth != PyGpuArray_DIMS(weight)[2]) {
            PyErr_Format(PyExc_ValueError,
                         "GpuCorrMM regions in kernel must match output regions:\n"
                         "  bottom shape: %ld %ld %ld %ld\n"
                         "  weight shape: %ld %ld %ld %ld %ld %ld"
                         " (expected %ld %ld %ld %ld %ld %ld)\n"
                         "  top shape(calculated): %ld %ld %ld %ld\n",
                         batchSize, nChannels, bottomHeight, bottomWidth,
                         nFilters, PyGpuArray_DIMS(weight)[1],
                         PyGpuArray_DIMS(weight)[2], nChannels / numgroups, kH, kW,
                         nFilters, topHeight, topWidth, nChannels / numgroups, kH, kW,
                         batchSize, nFilters, topHeight, topWidth);
            return NULL;
        }
        if (batchSize != PyGpuArray_DIMS(top)[0] ||
                nFilters != PyGpuArray_DIMS(top)[1] ||
                topHeight != PyGpuArray_DIMS(top)[2] ||
                topWidth != PyGpuArray_DIMS(top)[3]) {
            PyErr_Format(PyExc_ValueError,
                         "GpuCorrMM shape inconsistency:\n"
                         "  bottom shape: %ld %ld %ld %ld\n"
                         "  weight shape: %ld %ld %ld %ld %ld %ld\n"
                         "  top shape: %ld %ld %ld %ld (expected %ld %ld %ld %ld)\n",
                         batchSize, nChannels, bottomHeight, bottomWidth,
                         nFilters, topHeight, topWidth, nChannels / numgroups, kH, kW,
                         PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
                         PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3],
                         batchSize, nFilters, topHeight, topWidth);
            return NULL;
        }
    }
    else{
        if (batchSize != PyGpuArray_DIMS(top)[0] ||
                nFilters != PyGpuArray_DIMS(top)[1] ||
                topHeight != PyGpuArray_DIMS(top)[2] ||
                topWidth != PyGpuArray_DIMS(top)[3]) {
            PyErr_Format(PyExc_ValueError,
                         "GpuCorrMM shape inconsistency:\n"
                         "  bottom shape: %ld %ld %ld %ld\n"
                         "  weight shape: %ld %ld %ld %ld\n"
                         "  top shape: %ld %ld %ld %ld (expected %ld %ld %ld %ld)\n",
                         batchSize, nChannels, bottomHeight, bottomWidth,
                         nFilters, nChannels / numgroups, kH, kW,
                         PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
                         PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3],
                         batchSize, nFilters, topHeight, topWidth);
            return NULL;
        }
    }
    int err = gpublas_setup(bottom->context->ctx);
    if (err != GA_NO_ERROR) {
        PyErr_SetString(PyExc_RuntimeError, "Can't setup blas");
        return NULL;
    }
    // Create temporary columns: one unfolded image of shape
    // (nChannels * kW * kH, topHeight * topWidth), reused for every batch item.
    size_t col_dim[2];
    col_dim[0] = nChannels * kW * kH;
    col_dim[1] = topHeight * topWidth;
    PyGpuArrayObject* col = (PyGpuArrayObject*)pygpu_empty(2, col_dim,
                                                           bottom->ga.typecode,
                                                           GA_C_ORDER,
                                                           bottom->context,
                                                           Py_None);
    if (NULL == col) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuCorrMM failed to allocate working memory of %ld x %ld\n",
                     col_dim[0], col_dim[1]);
        return NULL;
    }
    // Define some useful variables (byte strides converted to element strides).
    const size_t batch_bottom_stride = PyGpuArray_STRIDES(bottom)[0] / gpuarray_get_elsize(bottom->ga.typecode);
    const size_t batch_top_stride = PyGpuArray_STRIDES(top)[0] / gpuarray_get_elsize(top->ga.typecode);
    const size_t group_bottom_stride = (PyGpuArray_STRIDES(bottom)[1] * nChannels / numgroups) / gpuarray_get_elsize(bottom->ga.typecode);
    const size_t group_top_stride = (PyGpuArray_STRIDES(top)[1] * nFilters / numgroups) / gpuarray_get_elsize(top->ga.typecode);
    const size_t group_weight_stride = (PyGpuArray_STRIDES(weight)[0] * nFilters / numgroups) / gpuarray_get_elsize(weight->ga.typecode);
    // GEMM sizes per group: M_ filters, K_ taps, N_ output positions.
    const size_t K_ = col_dim[0] / numgroups;
    const size_t N_ = col_dim[1];
    const size_t group_col_stride = (K_ * N_);
    const size_t M_ = nFilters / numgroups;
    PyGpuArrayObject *output;
    if (direction == 0) {  // forward pass
        output = top;
        // Degenerate shapes produce an all-zero output.
        if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
            err = GpuArray_memset(&output->ga, 0);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorrMM could not fill the output with zeros: %d", err);
                Py_DECREF(col);
                return NULL;
            }
            Py_DECREF(col);
            return output;
        }
        // valid correlation: im2col, then gemm
        // Iterate over batch
        for (size_t n = 0; n < batchSize; n++) {
            // First, im2col
            err = im2col(&bottom->ga, n * batch_bottom_stride,
                         nChannels, bottomHeight,
                         bottomWidth, kH, kW, dilH, dilW,
                         padH_l, padH_r, padW_l, padW_r, dH, dW, &col->ga);
            if (err != GA_NO_ERROR) {
                Py_DECREF(col);
                return NULL;
            }
            // Second, gemm
            if (unshared) {
                // Unshared: one 1 x M_ x K_ gemm per output position, each
                // with its own weight slice.
                for (size_t g = 0; g < numgroups; ++g) {
                    for (size_t reg = 0; reg < N_; ++reg){
                        err = rgemm(cb_fortran, cb_no_trans, cb_no_trans,
                                    1, M_, K_, 1,
                                    &col->ga, g * group_col_stride + reg, N_,
                                    &weight->ga, g * group_weight_stride + reg * K_, K_ * N_,
                                    0,
                                    &top->ga, n * batch_top_stride + g * group_top_stride + reg, N_);
                        if (err != GA_NO_ERROR) {
                            PyErr_Format(PyExc_RuntimeError, "GpuCorrMM forward encountered an error running gemm: %d", err);
                            Py_DECREF(col);
                            return NULL;
                        }
                    }
                }
            }
            else {
                for (size_t g = 0; g < numgroups; ++g){
                    err = rgemm(cb_fortran, cb_no_trans, cb_no_trans,
                                N_, M_, K_, 1,
                                &col->ga, g * group_col_stride, N_,
                                &weight->ga, g * group_weight_stride, K_,
                                0,
                                &top->ga, n * batch_top_stride + g * group_top_stride, N_);
                    if (err != GA_NO_ERROR) {
                        PyErr_Format(PyExc_RuntimeError, "GpuCorrMM forward encountered an error running gemm: %d", err);
                        Py_DECREF(col);
                        return NULL;
                    }
                }
            }
        }
    }
    else if (direction == 1) {  // backprop wrt. weights
        output = weight;
        if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
            err = GpuArray_memset(&output->ga, 0);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorrMM grad wrt. weights could not fill the output with zeros: %d", err);
                Py_DECREF(col);
                return NULL;
            }
            Py_DECREF(col);
            return output;
        }
        // valid convolution: im2col, then gemm
        // Iterate over batch
        for (size_t n = 0; n < batchSize; n++) {
            // First, im2col
            err = im2col(&bottom->ga, n * batch_bottom_stride,
                         nChannels, bottomHeight,
                         bottomWidth, kH, kW, dilH, dilW,
                         padH_l, padH_r, padW_l, padW_r, dH, dW, &col->ga);
            if (err != GA_NO_ERROR) {
                Py_DECREF(col);
                return NULL;
            }
            // Second, gemm
            // Note that we accumulate into weight. We do so by setting beta = 0
            // for the first iteration and beta = 1 for subsequent ones. (This
            // is faster than setting weight to all zeros before the loop.)
            if (unshared) {
                for (size_t g = 0; g < numgroups; ++g) {
                    for (size_t reg = 0; reg < N_; ++reg){
                        err = rgemm(cb_fortran, cb_trans, cb_no_trans,
                                    K_, M_, 1, 1,
                                    &col->ga, g * group_col_stride + reg, N_,
                                    &top->ga, n * batch_top_stride + g * group_top_stride + reg, N_,
                                    (n == 0) ? 0 : 1,
                                    &weight->ga, g * group_weight_stride + reg * K_, K_ * N_);
                        if (err != GA_NO_ERROR) {
                            PyErr_Format(PyExc_RuntimeError, "GpuCorrMM grad weights encountered an error running gemm: %d", err);
                            Py_DECREF(col);
                            return NULL;
                        }
                    }
                }
            }
            else{
                for(size_t g = 0; g < numgroups; g++){
                    err = rgemm(cb_fortran, cb_trans, cb_no_trans,
                                K_, M_, N_, 1,
                                &col->ga, g * group_col_stride, N_,
                                &top->ga, n * batch_top_stride + g * group_top_stride, N_,
                                (n == 0) ? 0 : 1,
                                &weight->ga, g * group_weight_stride, K_);
                    if (err != GA_NO_ERROR) {
                        PyErr_Format(PyExc_RuntimeError, "GpuCorrMM grad weights encountered an error running gemm: %d", err);
                        Py_DECREF(col);
                        return NULL;
                    }
                }
            }
        }
    }
    else if (direction == 2) {  // backprop wrt. inputs
        output = bottom;
        if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
            err = GpuArray_memset(&output->ga, 0);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorrMM grad wrt. inputs could not fill the output with zeros: %d", err);
                Py_DECREF(col);
                return NULL;
            }
            Py_DECREF(col);
            return output;
        }
        // full convolution: gemm, then col2im
        // Iterate over batch
        for (size_t n = 0; n < batchSize; n++) {
            // gemm into columns
            if (unshared) {
                for (size_t g = 0; g < numgroups; ++g){
                    for (size_t reg = 0; reg < N_; ++reg) {
                        err = rgemm(cb_fortran, cb_no_trans, cb_trans,
                                    1, K_, M_, 1,
                                    &top->ga, n * batch_top_stride + g * group_top_stride + reg, N_,
                                    &weight->ga, g * group_weight_stride + reg * K_, K_ * N_,
                                    0,
                                    &col->ga, g * group_col_stride + reg, N_);
                        if (err != GA_NO_ERROR) {
                            PyErr_Format(PyExc_RuntimeError, "GpuCorrMM grad inputs encountered an error running gemm: %d", err);
                            Py_DECREF(col);
                            return NULL;
                        }
                    }
                }
            }
            else {
                for (size_t g = 0; g < numgroups; ++g){
                    err = rgemm(cb_fortran, cb_no_trans, cb_trans,
                                N_, K_, M_, 1,
                                &top->ga, n * batch_top_stride + g * group_top_stride, N_,
                                &weight->ga, g * group_weight_stride, K_,
                                0,
                                &col->ga, g * group_col_stride, N_);
                    if (err != GA_NO_ERROR) {
                        PyErr_Format(PyExc_RuntimeError, "GpuCorrMM grad inputs encountered an error running gemm: %d", err);
                        Py_DECREF(col);
                        return NULL;
                    }
                }
            }
            // col2im back to the data
            err = col2im(&col->ga, nChannels, bottomHeight, bottomWidth,
                         kH, kW, dilH, dilW, padH_l, padH_r, padW_l, padW_r,
                         dH, dW, &bottom->ga, n * batch_bottom_stride);
            if (err != GA_NO_ERROR) {
                Py_DECREF(col);
                return NULL;
            }
        }
    }
    else {
        // Defensive: previously an unknown direction fell through and
        // returned an *uninitialized* output pointer (undefined behavior).
        PyErr_Format(PyExc_ValueError,
                     "GpuCorrMM: direction must be 0, 1 or 2, got %ld",
                     (long)direction);
        Py_DECREF(col);
        return NULL;
    }
    // Free temporary columns
    Py_DECREF(col);
    // Note that we don't change the refcount of the output matrix here. Output
    // (re)allocation and refcounting is done in BaseGpuCorrMM.c_code_helper();
    // in here output is just aliased to one of bottom, weights, or top.
    return output;
}
#section init_code
// Initialize the CUDA-specific pygpu extension API before any of the code
// below runs (presumably resolves the cuda_* / extension entry points used
// by this module — confirm against pygpu's gpuarray_api headers).
setup_ext_cuda();
#section support_code
// Per-call state for the warp-ctc (GPU) loss computation. All pointers are
// owned by the context and released in ctc_context_destroy().
typedef struct ctc_context {
    struct ctcOptions options;  // warp-ctc options; loc/stream set in ctc_context_init()
    gpudata * workspace;        // device scratch buffer sized by get_workspace_size()
    int * input_lengths;        // host copy of per-sample input lengths (malloc'd)
    int * flat_labels;          // host labels flattened with negative padding removed
    int * label_lengths;        // host per-sample label counts (parallel to flat_labels)
} ctc_context_t;
// Zero-initialize a ctc_context_t and configure warp-ctc to run on the GPU
// using the CUDA stream of the given pygpu context.
void ctc_context_init(ctc_context_t * context, PyGpuContextObject * gpu_context)
{
    memset(&(context->options), 0, sizeof(struct ctcOptions));
    context->options.loc = CTC_GPU;
    // Get CUDA function pointer to obtain stream
    // NOTE(review): the extension lookup result is not checked for NULL; if
    // "cuda_get_stream" were ever missing this would crash — confirm the
    // extension is guaranteed present when this op is compiled.
    CUstream (*getstream_func_ptr)(void *) = (CUstream (*)(void *)) gpuarray_get_extension( "cuda_get_stream" );
    context->options.stream = getstream_func_ptr(gpu_context->ctx);
    context->workspace = NULL;
    context->input_lengths = NULL;
    context->flat_labels = NULL;
    context->label_lengths = NULL;
}
// Release every resource held by the context. Safe to call on a context
// that was only partially populated (all members start as NULL in
// ctc_context_init and this is invoked from several early error paths).
void ctc_context_destroy(ctc_context_t * context)
{
    // free() is NULL-safe, but gpudata_release() is not documented as such,
    // so guard the workspace explicitly (it is still NULL whenever destroy
    // runs before the workspace allocation succeeds).
    if ( NULL != context->workspace )
        gpudata_release( context->workspace );
    free( context->input_lengths );
    free( context->flat_labels );
    free( context->label_lengths );
}
// Translate a warp-ctc status code into a Python RuntimeError.
// Returns 0 on success; on failure sets the exception (prefixed with `msg`
// and the library's own status string) and returns 1.
int ctc_check_result(ctcStatus_t retcode, const char * msg)
{
    if ( retcode == CTC_STATUS_SUCCESS )
        return 0;
    // Get error message from underlying library
    PyErr_Format( PyExc_RuntimeError,
                  "GpuConnectionistTemporalClassification: %s CTC error: %s",
                  msg,
                  ctcGetStatusString( retcode ) );
    return 1;
}
// Copy the 1d input-lengths array into a freshly malloc'd contiguous int
// buffer (*input_lengths), as required by warp-ctc. On allocation failure
// *input_lengths is left NULL for the caller to detect.
// Assumes the array's element type is npy_int (int32) — TODO confirm the
// op's make_node enforces this dtype.
void create_contiguous_input_lengths( PyArrayObject * input_lengths_arr,
                                      int ** input_lengths )
{
    // Use npy_intp for dimensions and indices: PyArray_DIMS holds npy_intp
    // values, which the previous npy_int declaration truncated on LP64.
    const npy_intp num_elements = PyArray_DIMS( input_lengths_arr )[0];
    *input_lengths = (int *) malloc( num_elements * sizeof(int) );
    if ( NULL == (*input_lengths) )
        return;
    for ( npy_intp elem_idx = 0; elem_idx < num_elements; ++elem_idx )
    {
        (*input_lengths)[elem_idx] = *( (npy_int *) PyArray_GETPTR1( input_lengths_arr, elem_idx ) );
    }
}
// Flatten a (rows x cols) padded label matrix into the layout warp-ctc
// expects: *flat_labels receives all non-negative labels concatenated
// row-by-row, and *label_lengths receives the per-row count of kept labels.
// Negative entries are treated as padding and skipped. On allocation
// failure both outputs are left NULL for the caller to detect.
// Assumes the matrix's element type is npy_int (int32) — TODO confirm.
void create_flat_labels( PyArrayObject * label_matrix, int ** flat_labels,
                         int ** label_lengths )
{
    // Use npy_intp for dimensions and indices: PyArray_DIMS holds npy_intp
    // values, which the previous npy_int declaration truncated on LP64.
    const npy_intp rows = PyArray_DIMS( label_matrix )[0];
    const npy_intp cols = PyArray_DIMS( label_matrix )[1];
    *flat_labels = (int *) calloc( rows * cols, sizeof(int) );
    if ( NULL == (*flat_labels) )
        return;
    *label_lengths = (int *) calloc( rows, sizeof(int) );
    if ( NULL == (*label_lengths) )
    {
        free( *flat_labels );
        *flat_labels = NULL;
        return;
    }
    npy_intp label_index = 0;
    for ( npy_intp row_idx = 0; row_idx < rows; ++row_idx )
    {
        int label_length = 0;
        for ( npy_intp col_idx = 0; col_idx < cols; ++col_idx )
        {
            npy_int label = *( (npy_int *) PyArray_GETPTR2( label_matrix, row_idx, col_idx ) );
            if ( label >= 0 )  // negative values are assumed to be padding
            {
                (*flat_labels)[ label_index++ ] = label;
                ++label_length;
            }
        }
        (*label_lengths)[ row_idx ] = label_length;
    }
}
#section support_code_apply
// Compute the CTC loss (and optionally its gradient) on the GPU via warp-ctc.
//
// in_activations: (time, batch, alphabet) float32 activations on the GPU
//                 (only GA_FLOAT is accepted, see the switch below).
// in_labels:      padded label matrix (negative entries are padding).
// in_input_lengths: per-sample number of valid time steps.
// out_costs:      allocated here, one cost per batch element.
// out_gradients:  allocated here with the activations' shape, or NULL when
//                 gradient computation is disabled.
// Returns 0 on success, 1 with a Python exception set on failure. Every
// exit path tears down the local ctc_context and leaves the CUDA context
// (cuda_enter/cuda_exit are strictly paired).
int APPLY_SPECIFIC(ctc_cost_gpu)(PyGpuArrayObject   *  in_activations,
                                 PyArrayObject      *  in_labels,
                                 PyArrayObject      *  in_input_lengths,
                                 PyGpuArrayObject   ** out_costs,
                                 PyGpuArrayObject   ** out_gradients,
                                 PyGpuContextObject *  gpu_context)
{
    // Stack-allocated context; destroyed on every exit path below.
    ctc_context_t ctc_object;
    ctc_context_t * context = &ctc_object;
    size_t gpu_workspace_size;
    int ctc_error = 0;
    const size_t num_activations = PyGpuArray_DIMS( in_activations )[0];
    const size_t minibatch_size = PyGpuArray_DIMS( in_activations )[1];
    const size_t alphabet_size = PyGpuArray_DIMS( in_activations )[2];
    const size_t cost_size = minibatch_size;
    const size_t grad_dims[3] = { num_activations, minibatch_size, alphabet_size };
    float * costs = NULL,
          * activations = NULL,
          * gradients = NULL;
    cuda_enter( gpu_context->ctx );
    ctc_context_init( context, gpu_context );
    // warp-ctc only supports float32 activations here.
    switch (in_activations->ga.typecode)
    {
    case GA_FLOAT:
        activations = (float *) PyGpuArray_DEV_DATA( in_activations );
        break;
    default:
        ctc_context_destroy( context );
        cuda_exit( gpu_context->ctx );
        PyErr_SetString( PyExc_TypeError,
            "GpuConnectionistTemporalClassification: Unsupported type for activations." );
        return 1;
    }
    // Host-side copy of the input lengths in the int layout warp-ctc wants.
    create_contiguous_input_lengths( in_input_lengths, &(context->input_lengths) );
    if ( NULL == context->input_lengths )
    {
        // Destroy previous CTC context before returning exception
        ctc_context_destroy( context );
        cuda_exit( gpu_context->ctx );
        PyErr_Format( PyExc_MemoryError,
            "GpuConnectionistTemporalClassification: Could not allocate memory for input lengths." );
        return 1;
    }
    // flatten labels to conform with library memory layout
    create_flat_labels( in_labels, &(context->flat_labels), &(context->label_lengths) );
    if ( ( NULL == context->label_lengths ) || ( NULL == context->flat_labels ) )
    {
        // Destroy previous CTC context before returning exception
        ctc_context_destroy( context );
        cuda_exit( gpu_context->ctx );
        PyErr_Format( PyExc_MemoryError,
            "GpuConnectionistTemporalClassification: Could not allocate memory for labels and their lengths." );
        return 1;
    }
    // Allocate (or reuse) the costs output and zero it.
    if ( aesara_prep_output( out_costs, 1, &cost_size, in_activations->ga.typecode,
                             GA_C_ORDER, gpu_context ) != 0 )
    {
        ctc_context_destroy( context );
        cuda_exit( gpu_context->ctx );
        return 1;
    }
    GpuArray_memset( &((*out_costs)->ga), 0 );
    costs = (float *) PyGpuArray_DEV_DATA( *out_costs );
    if ( NULL != out_gradients )  // if gradient computation is not disabled
    {
        if ( aesara_prep_output( out_gradients, 3, grad_dims, in_activations->ga.typecode,
                                 GA_C_ORDER, gpu_context ) != 0 )
        {
            ctc_context_destroy( context );
            cuda_exit( gpu_context->ctx );
            return 1;
        }
        GpuArray_memset( &((*out_gradients)->ga), 0 );
        gradients = (float *) PyGpuArray_DEV_DATA( *out_gradients );
    }
    // Query the scratch size warp-ctc needs for these shapes, then allocate
    // it on the device.
    ctc_error = ctc_check_result( get_workspace_size( context->label_lengths,
        context->input_lengths, alphabet_size, minibatch_size, context->options,
        &gpu_workspace_size ),
        "Failed to obtain CTC workspace size." );
    if ( ctc_error )  // Exception is set by ctc_check_result, return error here
    {
        // Destroy previous CTC context before returning exception
        ctc_context_destroy( context );
        cuda_exit( gpu_context->ctx );
        return 1;
    }
    context->workspace = gpudata_alloc( gpu_context->ctx, gpu_workspace_size, NULL, 0, NULL );
    if ( NULL == context->workspace )
    {
        ctc_context_destroy( context );
        cuda_exit( gpu_context->ctx );
        PyErr_Format( PyExc_MemoryError,
            "GpuConnectionistTemporalClassification: Failed to allocate memory for CTC workspace." );
        return 1;
    }
    // Synchronize with pygpu's stream bookkeeping around the library call
    // (wait before, record after, on every buffer warp-ctc touches).
    cuda_wait( in_activations->ga.data, GPUARRAY_CUDA_WAIT_READ );
    cuda_wait( (*out_costs)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
    if ( out_gradients != NULL )
        cuda_wait( (*out_gradients)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
    ctc_error = ctc_check_result( compute_ctc_loss( activations, gradients,
        context->flat_labels, context->label_lengths, context->input_lengths,
        alphabet_size, minibatch_size, costs, *(void **)context->workspace,
        context->options ), "Failed to compute CTC loss function." );
    cuda_record( in_activations->ga.data, GPUARRAY_CUDA_WAIT_READ );
    cuda_record( (*out_costs)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
    if ( out_gradients != NULL )
        cuda_record( (*out_gradients)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
    if ( ctc_error )  // Exception is set by ctc_check_result, return error here
    {
        ctc_context_destroy( context );
        cuda_exit( gpu_context->ctx );
        return 1;
    }
    ctc_context_destroy( context );
    cuda_exit( gpu_context->ctx );
    return 0;
}
// Compatibility shims so the cuDNN-dependent code compiles against a range
// of cuDNN versions (and degrades gracefully when version macros are absent).
#ifndef CUDNN_HELPER_H
#define CUDNN_HELPER_H
#include <cudnn.h>
#ifndef CUDNN_VERSION
// Very old headers define no version macro: report -1 from both the macro
// and the runtime query so version checks elsewhere take the "old" path.
#define CUDNN_VERSION -1
static inline int cudnnGetVersion() {
  return -1;
}
#endif
#if CUDNN_MAJOR < 7
// cudnnMathType_t (tensor-core selection) only exists from cuDNN 7 on;
// provide the two values so code can mention them unconditionally.
enum cudnnMathType_t { CUDNN_DEFAULT_MATH=0, CUDNN_TENSOR_OP_MATH = 1 };
#endif
/* a common struct for all 3 CUDNN enums */
// Cached algorithm selection: the chosen algo, its workspace requirement,
// and the math mode it was benchmarked with.
struct AlgoRec {
  int algo;
  size_t wsSize;
  cudnnMathType_t mathType;
};
#endif
#section support_code_apply
// GpuDimShuffle: permute the input's dimensions according to
// params->transposition, then reshape so that -1 entries in
// params->_new_order become new broadcastable (length-1) axes.
// Writes the result into *out (previous contents dropped); when
// params->inplace is false the result is a fresh copy instead of a view.
// Returns 0 on success, 1 with a Python exception set on failure.
int APPLY_SPECIFIC(gpu_dimshuffle)(PyGpuArrayObject* input, PyGpuArrayObject** out, PARAMS_TYPE* params) {
    PyGpuArrayObject *tmp = NULL;
    npy_intp nd_in = PyArray_SIZE(params->input_broadcastable);
    npy_intp nd_out = PyArray_SIZE(params->_new_order);
    npy_int64* new_order = NULL;
    unsigned int* transposition = NULL;
    size_t* sh = NULL;
    int e;
    if (input->ga.nd != nd_in) {
        PyErr_SetString(PyExc_TypeError, "input nd");
        return 1;
    }
    if (!PyArray_IS_C_CONTIGUOUS(params->_new_order)) {
        // Message made consistent with the "GpuDimShuffle:" prefix used for
        // the transposition check below (was "DimShuffle:").
        PyErr_SetString(PyExc_RuntimeError, "GpuDimShuffle: param _new_order must be C-contiguous.");
        return 1;
    }
    if (!PyArray_IS_C_CONTIGUOUS(params->transposition)) {
        PyErr_SetString(PyExc_RuntimeError, "GpuDimShuffle: param transposition must be C-contiguous.");
        return 1;
    }
    Py_XDECREF(*out);
    /** Do shuffle. **/
    new_order = (npy_int64*) PyArray_DATA(params->_new_order);
    /* Type of params->transposition (npy_uint32) should be an alias of unsigned int
     * on platforms supported by Aesara. */
    transposition = (unsigned int*) PyArray_DATA(params->transposition);
    sh = (size_t*) malloc(nd_out * sizeof(size_t));
    if (sh == NULL) {
        PyErr_NoMemory();
        return 1;
    }
    // First permute the existing axes ...
    tmp = pygpu_transpose(input, transposition);
    if (!tmp) {
        free(sh);
        return 1;
    }
    // ... then build the output shape: -1 in new_order inserts a length-1
    // axis, any other entry consumes the next transposed dimension.
    e = 0;
    for (npy_intp i = 0; i < nd_out; ++i) {
        if (new_order[i] == -1) {
            sh[i] = 1;
        } else {
            sh[i] = tmp->ga.dimensions[e];
            ++e;
        }
    }
    *out = pygpu_reshape(tmp, nd_out, sh, GA_ANY_ORDER, 1, -1);
    Py_DECREF(tmp);
    free(sh);
    if (*out == NULL) {
        return 1;
    }
    /** End shuffle. **/
    if (!params->inplace) {
        // Replace the view with an independent copy.
        tmp = pygpu_copy(*out, GA_ANY_ORDER);
        Py_DECREF(*out);
        if (!tmp) {
            *out = NULL;
            return 1;
        }
        *out = tmp;
    }
    return 0;
}
#section support_code
/* Fill an existing cuDNN tensor descriptor `desc` from `var` for use in a
   (possibly grouped) convolution.  Strides are converted from bytes to
   elements, with a C-contiguous fallback stride for size-1 axes; tensors
   with fewer than 3 dims are padded to 3 as cuDNN requires; the channel
   dimension (dim 1) is divided by `groups`.
   Returns 0 on success, -1 on error with a Python exception set. */
static int
c_set_tensor_for_conv(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc, size_t groups) {
  cudnnDataType_t dt;
  size_t ds;
  switch (var->ga.typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_tensorNd");
    return -1;
  }
  ds = gpuarray_get_elsize(var->ga.typecode);
  int strs[8], dims[8], default_stride = 1;
  unsigned int nd = PyGpuArray_NDIM(var);
  if (nd > 8) {
    PyErr_SetString(PyExc_TypeError, "Tensor of more than 8d");
    return -1;
  }
  /* Walk dims from last to first so default_stride accumulates the
     C-contiguous stride for axes whose real stride is unusable (size 1). */
  for (unsigned int _i = nd; _i > 0; _i--) {
    unsigned int i = _i - 1;
    strs[i] = (PyGpuArray_DIM(var, i) != 1 && PyGpuArray_STRIDE(var, i)) ?
      PyGpuArray_STRIDE(var, i)/ds : default_stride;
    default_stride *= PyGpuArray_DIM(var, i);
    dims[i] = PyGpuArray_DIM(var, i);
  }
  /* Tensors can't be smaller than 3d for cudnn so we pad the
   * descriptor if they are */
  for (unsigned int i = nd; i < 3; i++) {
    strs[i] = 1;
    dims[i] = 1;
  }
  //only for grouped convolution i.e when groups > 1
  dims[1] = dims[1] / groups;
  cudnnStatus_t err = cudnnSetTensorNdDescriptor(desc, dt, nd < 3 ? 3 : nd,
                                                 dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set tensorNd descriptor: %s",
                 cudnnGetErrorString(err));
    return -1;
  }
  return 0;
}
/* Convenience wrapper: set a tensor descriptor for the non-grouped case
   (groups == 1).  Same return convention as c_set_tensor_for_conv. */
static int
c_set_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) {
  return c_set_tensor_for_conv(var, desc, 1);
}
/* Create a cuDNN tensor descriptor and initialize it from `var`.
   On success *desc holds a live descriptor the caller must destroy.
   Returns 0 on success, -1 on error with a Python exception set. */
static int c_make_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t *desc) {
  cudnnStatus_t err;
  err = cudnnCreateTensorDescriptor(desc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create tensor descriptor: %s",
                 cudnnGetErrorString(err));
    return -1;
  }
  /* Don't leak the descriptor if filling it fails. */
  if (c_set_tensorNd(var, *desc) != 0) {
    cudnnDestroyTensorDescriptor(*desc);
    return -1;
  }
  return 0;
}
/* Fill an existing cuDNN filter descriptor `desc` from `var`.  Filters
   must be C-contiguous (cuDNN requirement); fewer than 3 dims are padded
   to 3; the output-channel dimension (dim 0) is divided by `groups`.
   Returns 0 on success, -1 on error with a Python exception set. */
static int
c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc, size_t groups) {
  cudnnDataType_t dt;
  cudnnStatus_t err;
  if (!GpuArray_IS_C_CONTIGUOUS(&var->ga)) {
    PyErr_SetString(PyExc_ValueError,
                    "Only contiguous filters (kernels) are supported.");
    return -1;
  }
  switch (var->ga.typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_filter");
    return -1;
  }
  int dims[8];
  unsigned int nd = PyGpuArray_NDIM(var);
  if (nd > 8) {
    PyErr_SetString(PyExc_TypeError, "Tensor of more than 8d");
    return -1;
  }
  for (unsigned int _i = nd; _i > 0; _i--) {
    unsigned int i = _i - 1;
    dims[i] = PyGpuArray_DIM(var, i);
  }
  /* Filters can't be less than 3d so we pad */
  for (unsigned int i = nd; i < 3; i++)
    dims[i] = 1;
  /* Grouped convolution: each group sees 1/groups of the output channels. */
  dims[0] = dims[0] / groups;
  if (nd < 3)
    nd = 3;
  err = cudnnSetFilterNdDescriptor(desc, dt, CUDNN_TENSOR_NCHW, nd, dims);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set filter descriptor: %s.",
                 cudnnGetErrorString(err));
    return -1;
  }
  return 0;
}
/* Create a cuDNN filter descriptor and initialize it from `var`
   (non-grouped case: group count 1).  On success *desc holds a live
   descriptor the caller must destroy.
   Returns 0 on success, -1 on error with a Python exception set. */
static int c_make_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t *desc) {
  cudnnStatus_t err;
  err = cudnnCreateFilterDescriptor(desc);
  if (err != CUDNN_STATUS_SUCCESS) {
    /* Fixed: the message previously said "tensor descriptor" although a
       filter descriptor failed here, which made the error misleading. */
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create filter descriptor: %s",
                 cudnnGetErrorString(err));
    return -1;
  }
  /* Don't leak the descriptor if filling it fails. */
  if (c_set_filter(var, *desc, 1) != 0) {
    cudnnDestroyFilterDescriptor(*desc);
    return -1;
  }
  return 0;
}
#section init_code
/* Runs once at module initialization; helper defined by the host framework
   (presumably registers the CUDA extension hooks — confirm against the
   COp harness). */
setup_ext_cuda();
#section support_code_struct
/* cuDNN batch normalization, training mode (forward pass).
   Computes *outp = batchnorm(inp; scale, bias) and also returns the batch
   mean (*x_mean) and inverse std (*x_invstd) needed by the backward pass.
   When running averages are supplied they are updated with
   `running_average_factor` (optionally in place, per params).
   Returns 0 on success, 1 on error with a Python exception set. */
int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
                     PyGpuArrayObject *bias, npy_float64 epsilon,
                     npy_float64 running_average_factor,
                     PyGpuArrayObject *in_running_mean, // may be NULL
                     PyGpuArrayObject *in_running_var, // may be NULL
                     PyGpuArrayObject **outp,
                     PyGpuArrayObject **x_mean,
                     PyGpuArrayObject **x_invstd,
                     PyGpuArrayObject **out_running_mean, // may be NULL
                     PyGpuArrayObject **out_running_var, // may be NULL
                     PARAMS_TYPE* params) {
  /* Note: based on Python code, in_running_mean, in_running_var, out_running_mean and out_running_var
     are together NULL (or not NULL) at same time, so we just need to check only one of them. */
  bool running_averages = (in_running_mean != NULL);
  PyGpuContextObject *c = inp->context;
  if (c_set_tensorNd(inp, bn_input) != 0)
    return 1;
  if (c_set_tensorNd(scale, bn_params) != 0)
    return 1;
  /* cuDNN rejects epsilon below CUDNN_BN_MIN_EPSILON (1e-5). */
  if (epsilon < 1e-5) {
    PyErr_Format(PyExc_ValueError, "epsilon must be at least 1e-5, got %f", epsilon);
    return 1;
  }
  if (params->inplace_output) {
    Py_XDECREF(*outp);
    *outp = inp;
    Py_INCREF(*outp);
  } else if (aesara_prep_output(outp, inp->ga.nd, inp->ga.dimensions, inp->ga.typecode, GA_C_ORDER, c) != 0) {
    return 1;
  }
  /* Saved mean / inverse std have the same shape as the scale parameter. */
  if (aesara_prep_output(x_mean, scale->ga.nd, scale->ga.dimensions, scale->ga.typecode, GA_C_ORDER, c) != 0)
    return 1;
  if (aesara_prep_output(x_invstd, scale->ga.nd, scale->ga.dimensions, scale->ga.typecode, GA_C_ORDER, c) != 0)
    return 1;
  if (c_set_tensorNd(*outp, bn_output) != 0)
    return 1;
  PyGpuArrayObject *running_mean = NULL;
  PyGpuArrayObject *running_var = NULL;
  if (running_averages) {
    /* In place: reuse the input buffers; otherwise copy the inputs into
       (possibly recycled) output buffers before cuDNN updates them. */
    if (params->inplace_running_mean) {
      Py_XDECREF(*out_running_mean);
      running_mean = in_running_mean;
      Py_INCREF(running_mean);
    } else {
      running_mean = *out_running_mean;
      running_mean = aesara_try_copy(running_mean, in_running_mean);
      if (running_mean == NULL) {
        return 1;
      }
    }
    if (params->inplace_running_var) {
      Py_XDECREF(*out_running_var);
      running_var = in_running_var;
      Py_INCREF(running_var);
    } else {
      running_var = *out_running_var;
      running_var = aesara_try_copy(running_var, in_running_var);
      if (running_var == NULL) {
        return 1;
      }
    }
  }
  {
    /* cuDNN takes alpha/beta scaling factors typed to match the data:
       double for float64 tensors, float otherwise (incl. half). */
    const float falpha = 1.;
    const float fbeta = 0.;
    const double dalpha = 1.;
    const double dbeta = 0.;
    void *alpha;
    void *beta;
    if (inp->ga.typecode == GA_DOUBLE) {
      alpha = (void *)&dalpha;
      beta = (void *)&dbeta;
    } else {
      alpha = (void *)&falpha;
      beta = (void *)&fbeta;
    }
    cudnnStatus_t err = cudnnBatchNormalizationForwardTraining(
      params->handle,
      params->mode,
      alpha,
      beta,
      bn_input,
      PyGpuArray_DEV_DATA(inp),
      bn_output,
      PyGpuArray_DEV_DATA(*outp),
      bn_params,
      PyGpuArray_DEV_DATA(scale),
      PyGpuArray_DEV_DATA(bias),
      running_averages ? running_average_factor : 0,
      running_averages ? PyGpuArray_DEV_DATA(running_mean) : NULL,
      running_averages ? PyGpuArray_DEV_DATA(running_var): NULL,
      epsilon,
      PyGpuArray_DEV_DATA(*x_mean),
      PyGpuArray_DEV_DATA(*x_invstd)
    );
    if (err != CUDNN_STATUS_SUCCESS) {
      /* NOTE(review): running_mean/running_var references are not released
         on this path — confirm whether that leak is acceptable upstream. */
      PyErr_Format(PyExc_RuntimeError, "Error during batchnorm: %s\n",
                   cudnnGetErrorString(err));
      return 1;
    }
    if (running_averages) {
      *out_running_mean = running_mean;
      *out_running_var = running_var;
    }
  }
  return 0;
}
#section init_code_struct
/* Per-op-instance setup: create the three tensor descriptors used by the
   batchnorm calls.  FAIL is the COp error macro. */
{
  cudnnStatus_t err;
  bn_input = NULL;
  bn_params = NULL;
  bn_output = NULL;
  if ((err = cudnnCreateTensorDescriptor(&bn_input)) != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
                 "(bn_input): %s", cudnnGetErrorString(err));
    FAIL;
  }
  if ((err = cudnnCreateTensorDescriptor(&bn_params)) != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
                 "(bn_params): %s", cudnnGetErrorString(err));
    FAIL;
  }
  if ((err = cudnnCreateTensorDescriptor(&bn_output)) != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
                 "(bn_output): %s", cudnnGetErrorString(err));
    FAIL;
  }
}
#section cleanup_code_struct
/* Mirror of init: destroy whatever descriptors were created. */
if (bn_input != NULL)
  cudnnDestroyTensorDescriptor(bn_input);
if (bn_params != NULL)
  cudnnDestroyTensorDescriptor(bn_params);
if (bn_output != NULL)
  cudnnDestroyTensorDescriptor(bn_output);
#section support_code_struct
/* Struct members backing the descriptors above. */
cudnnTensorDescriptor_t bn_input;
cudnnTensorDescriptor_t bn_params;
cudnnTensorDescriptor_t bn_output;
#section init_code_struct
/* Extra descriptor for the gradient-of-output tensor (backward pass). */
{
  cudnnStatus_t err;
  bn_doutput = NULL;
  if ((err = cudnnCreateTensorDescriptor(&bn_doutput)) != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
                 "(bn_doutput): %s", cudnnGetErrorString(err));
    FAIL;
  }
}
#section cleanup_code_struct
if (bn_doutput != NULL)
  cudnnDestroyTensorDescriptor(bn_doutput);
#section support_code_struct
cudnnTensorDescriptor_t bn_doutput;
/* cuDNN batch normalization, backward pass.  Given the forward inputs,
   the gradient w.r.t. the output (`doutp`) and the saved batch statistics
   (`x_mean`, `x_invstd`), computes gradients w.r.t. the input (*dinp),
   scale (*dscale) and bias (*dbias).
   Returns 0 on success, 1 on error with a Python exception set. */
int dnn_batchnorm_grad(PyGpuArrayObject *inp, PyGpuArrayObject *doutp,
                       PyGpuArrayObject *scale, PyGpuArrayObject *x_mean,
                       PyGpuArrayObject *x_invstd, npy_float64 epsilon,
                       PyGpuArrayObject **dinp, PyGpuArrayObject **dscale,
                       PyGpuArrayObject **dbias, PARAMS_TYPE* params) {
  PyGpuContextObject *c = inp->context;
  if (c_set_tensorNd(inp, bn_input) != 0)
    return 1;
  if (c_set_tensorNd(doutp, bn_doutput) != 0)
    return 1;
  if (c_set_tensorNd(scale, bn_params) != 0)
    return 1;
  /* cuDNN rejects epsilon below CUDNN_BN_MIN_EPSILON (1e-5). */
  if (epsilon < 1e-5) {
    PyErr_Format(PyExc_ValueError, "epsilon must be at least 1e-5, got %f", epsilon);
    return 1;
  }
  if (aesara_prep_output(dinp, inp->ga.nd, inp->ga.dimensions, inp->ga.typecode, GA_C_ORDER, c) != 0)
    return 1;
  /* Scale/bias gradients share the scale parameter's shape. */
  if (aesara_prep_output(dscale, scale->ga.nd, scale->ga.dimensions, scale->ga.typecode, GA_C_ORDER, c) != 0)
    return 1;
  if (aesara_prep_output(dbias, scale->ga.nd, scale->ga.dimensions, scale->ga.typecode, GA_C_ORDER, c) != 0)
    return 1;
  /* bn_output descriptor is reused here for the input-gradient tensor. */
  if (c_set_tensorNd(*dinp, bn_output) != 0)
    return 1;
  {
    /* Separate alpha/beta pairs for the data gradient and the parameter
       gradients, typed to match the tensor dtype. */
    const float falpha = 1.;
    const float fbeta = 0.;
    const double dalpha = 1.;
    const double dbeta = 0.;
    void *alphaData;
    void *betaData;
    void *alphaParam;
    void *betaParam;
    if (inp->ga.typecode == GA_DOUBLE) {
      alphaData = (void *)&dalpha;
      betaData = (void *)&dbeta;
      alphaParam = (void *)&dalpha;
      betaParam = (void *)&dbeta;
    } else {
      alphaData = (void *)&falpha;
      betaData = (void *)&fbeta;
      alphaParam = (void *)&falpha;
      betaParam = (void *)&fbeta;
    }
    cudnnStatus_t err = cudnnBatchNormalizationBackward(
      params->handle,
      params->mode,
      alphaData,
      betaData,
      alphaParam,
      betaParam,
      bn_input,
      PyGpuArray_DEV_DATA(inp),
      bn_doutput,
      PyGpuArray_DEV_DATA(doutp),
      bn_output,
      PyGpuArray_DEV_DATA(*dinp),
      bn_params,
      PyGpuArray_DEV_DATA(scale),
      PyGpuArray_DEV_DATA(*dscale),
      PyGpuArray_DEV_DATA(*dbias),
      epsilon,
      PyGpuArray_DEV_DATA(x_mean),
      PyGpuArray_DEV_DATA(x_invstd)
    );
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError, "Error during batchnorm: %s\n",
                   cudnnGetErrorString(err));
      return 1;
    }
  }
  return 0;
}
#section support_code_struct
/* cuDNN batch normalization, inference mode.  Normalizes `inp` with the
   precomputed estimated statistics (`est_mean`, `est_var`) instead of
   batch statistics; optionally operates in place on the input.
   Returns 0 on success, 1 on error with a Python exception set. */
int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
                     PyGpuArrayObject *bias, PyGpuArrayObject *est_mean,
                     PyGpuArrayObject *est_var, npy_float64 epsilon,
                     PyGpuArrayObject **outp, PARAMS_TYPE* params) {
  PyGpuContextObject *c = inp->context;
  if (c_set_tensorNd(inp, bn_input) != 0)
    return 1;
  if (c_set_tensorNd(scale, bn_params) != 0)
    return 1;
  /* cuDNN rejects epsilon below CUDNN_BN_MIN_EPSILON (1e-5). */
  if (epsilon < 1e-5) {
    PyErr_Format(PyExc_ValueError, "epsilon must be at least 1e-5, got %f", epsilon);
    return 1;
  }
  if (params->inplace) {
    Py_XDECREF(*outp);
    *outp = inp;
    Py_INCREF(*outp);
  } else {
    if (aesara_prep_output(outp, inp->ga.nd, inp->ga.dimensions, inp->ga.typecode, GA_C_ORDER, c) != 0)
      return 1;
  }
  if (c_set_tensorNd(*outp, bn_output) != 0)
    return 1;
  {
    /* alpha/beta scaling factors typed to match the tensor dtype. */
    const float falpha = 1.;
    const float fbeta = 0.;
    const double dalpha = 1.;
    const double dbeta = 0.;
    void *alpha;
    void *beta;
    if (inp->ga.typecode == GA_DOUBLE) {
      alpha = (void *)&dalpha;
      beta = (void *)&dbeta;
    } else {
      alpha = (void *)&falpha;
      beta = (void *)&fbeta;
    }
    cudnnStatus_t err = cudnnBatchNormalizationForwardInference(
      params->handle,
      params->mode,
      alpha,
      beta,
      bn_input,
      PyGpuArray_DEV_DATA(inp),
      bn_output,
      PyGpuArray_DEV_DATA(*outp),
      bn_params,
      PyGpuArray_DEV_DATA(scale),
      PyGpuArray_DEV_DATA(bias),
      PyGpuArray_DEV_DATA(est_mean),
      PyGpuArray_DEV_DATA(est_var),
      epsilon
    );
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError, "Error during batchnorm: %s\n",
                   cudnnGetErrorString(err));
      return 1;
    }
  }
  return 0;
}
#section support_code_struct
/* Per-apply convolution descriptors (input/output tensors and kernel
   filter), created in init_code_struct below. */
cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
cudnnFilterDescriptor_t APPLY_SPECIFIC(kerns);
/* Reconcile the requested group count with the convolution descriptor.
   With cuDNN >= 7 grouping is handled by the descriptor itself: verify the
   descriptor already carries `groups` and return 1, so callers perform a
   single (natively grouped) convolution.  With older cuDNN, return `groups`
   unchanged so callers loop over the groups manually.
   Returns -1 on error with a Python exception set. */
static int c_get_groups_for_conv(cudnnConvolutionDescriptor_t desc, int groups) {
#if CUDNN_MAJOR >= 7
  int desc_groups;
  if (groups > 1) {
    cudnnStatus_t err = cudnnGetConvolutionGroupCount(desc, &desc_groups);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error getting groups for convolution : %s",
                   cudnnGetErrorString(err));
      return -1;
    }
    if (groups != desc_groups) {
      PyErr_SetString(PyExc_MemoryError,
                      "groups specified different from convolution descriptor");
      return -1;
    }
  }
  return 1;
#else
  return groups;
#endif
}
/* Set the math type (e.g. tensor-core ops) on a convolution descriptor.
   No-op on cuDNN < 7, which has no math-type API.
   Returns 0 on success, -1 on error with a Python exception set. */
static int c_set_math_type_for_conv(cudnnConvolutionDescriptor_t desc, cudnnMathType_t mathtype) {
#if CUDNN_MAJOR >= 7
  // CUDNN7: need to set math type
  cudnnStatus_t err = cudnnSetConvolutionMathType(desc, mathtype);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "error setting math type for convolution : %s",
                 cudnnGetErrorString(err));
    return -1;
  }
#endif
  return 0;
}
#section init_code_struct
/* Create the per-apply convolution descriptors declared above. */
cudnnStatus_t APPLY_SPECIFIC(err);
APPLY_SPECIFIC(input) = NULL;
APPLY_SPECIFIC(output) = NULL;
APPLY_SPECIFIC(kerns) = NULL;
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
               "(inp): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
  FAIL;
}
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
               "(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
  FAIL;
}
if ((APPLY_SPECIFIC(err) = cudnnCreateFilterDescriptor(&APPLY_SPECIFIC(kerns))) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_MemoryError, "could not allocate filter descriptor: %s",
               cudnnGetErrorString(APPLY_SPECIFIC(err)));
  FAIL;
}
#section cleanup_code_struct
/* Mirror of init: destroy whatever descriptors were created. */
if (APPLY_SPECIFIC(input) != NULL)
  cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input));
if (APPLY_SPECIFIC(output) != NULL)
  cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output));
if (APPLY_SPECIFIC(kerns) != NULL)
  cudnnDestroyFilterDescriptor(APPLY_SPECIFIC(kerns));
#section support_code
#include <sstream>
#include <string>
/* Pre-C++11 toolchains (except Apple's) only ship unordered_map in TR1. */
#if __cplusplus < 201103L && !defined(__APPLE__)
#include <tr1/unordered_map>
typedef std::tr1::unordered_map<std::string, AlgoRec> AlgoCache;
#else
#include <unordered_map>
typedef std::unordered_map<std::string, AlgoRec> AlgoCache;
#endif
#include "pthread.h"
#line 87 "dnn_conv_base.c"
#ifdef DEBUG
#if __cplusplus < 201103L
const char* const _cppver = "No timing available: C++11 or later is required.";
#else
#define DEBUG_TIMING
#include <chrono>
const char* const _cppver = NULL;
/* Simple wall-clock stopwatch used for DEBUG timing of algorithm selection. */
struct AesaraTimer {
  double milliseconds;  // elapsed time of the last start()/end() pair
  std::chrono::steady_clock::time_point base;
  void start() {base = std::chrono::steady_clock::now();}
  void end() {
    milliseconds =
      std::chrono::duration_cast<std::chrono::nanoseconds>(
        std::chrono::steady_clock::now() - base
      ).count() / 1000000.0;
  }
};
#endif
#endif
/* Process-wide cache of previously selected convolution algorithms, keyed
   by a textual shape/config hash; all access goes through algoMutex. */
pthread_mutex_t algoMutex;
AlgoCache algoCache;
/* Report a cuDNN failure as a Python RuntimeError prefixed with `msg`.
   The status is returned unchanged so calls can be chained. */
static cudnnStatus_t checkCudnnStatus(cudnnStatus_t err, const char* msg)
{
  if (err == CUDNN_STATUS_SUCCESS)
    return err;
  PyErr_Format(PyExc_RuntimeError, "CUDNN Error: %s: %s",
               msg, cudnnGetErrorString(err));
  return err;
}
/* Query the size in bytes of the largest free GPU memory block, used to
   bound workspace allocations.  On query failure a Python exception is set
   but a 4 MiB fallback is still returned — callers are expected to check
   PyErr_Occurred() afterwards. */
static size_t
c_get_largest_free_block_size(PyGpuContextObject *c)
{
  size_t maxfree = 0;
  int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &maxfree);
  if (err2 != GA_NO_ERROR) {
    PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
                 "memory information on the GPU");
  }
  // Guess 4Mb if the info is not available
  if (maxfree == 0) maxfree = 4 * 1024 * 1024;
  return maxfree;
}
/** Check if convolution output tensor has expected dimensions
    depending on given inputs and number of groups.
    return 0 if everything is ok, non-0 on error.
**/
/* NOTE(review): the %ld format specifiers below are applied to size_t
   values from PyGpuArray_DIMS; %zu would be the portable choice on LLP64
   platforms — confirm against supported targets before changing. */
static int dnn_check_convolution_output(cudnnConvolutionDescriptor_t convDesc,
                                        cudnnTensorDescriptor_t inputDesc,
                                        cudnnFilterDescriptor_t filterDesc,
                                        size_t tensorNdim,
                                        PyGpuArrayObject* output,
                                        int groups) {
  int expected_output_dims[5] = {0};
  cudnnStatus_t err = cudnnGetConvolutionNdForwardOutputDim(convDesc, inputDesc, filterDesc,
                                                            tensorNdim, expected_output_dims);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error computing convolution output dim: %s",
                 cudnnGetErrorString(err));
    return 1;
  }
  /* cuDNN reports per-group channel counts, so the channel dim of the real
     output is compared after dividing by `groups`. */
  if (tensorNdim == 4) {
    if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
        (PyGpuArray_DIMS(output)[1] / groups != expected_output_dims[1]) ||
        (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
        (PyGpuArray_DIMS(output)[3] != expected_output_dims[3])) {
      PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %dx%dx%dx%d"
                   " but received %ldx%ldx%ldx%ld",
                   expected_output_dims[0], expected_output_dims[1] * groups,
                   expected_output_dims[2], expected_output_dims[3],
                   PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
                   PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3]);
      return 1;
    }
  } else if (tensorNdim == 5) {
    if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
        (PyGpuArray_DIMS(output)[1] / groups != expected_output_dims[1]) ||
        (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
        (PyGpuArray_DIMS(output)[3] != expected_output_dims[3]) ||
        (PyGpuArray_DIMS(output)[4] != expected_output_dims[4])) {
      PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %dx%dx%dx%dx%d"
                   " but received %ldx%ldx%ldx%ldx%ld",
                   expected_output_dims[0], expected_output_dims[1] * groups,
                   expected_output_dims[2], expected_output_dims[3],
                   expected_output_dims[4],
                   PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
                   PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3],
                   PyGpuArray_DIMS(output)[4]);
      return 1;
    }
  }
  return 0;
}
/* Render `size` ints from `res` as a comma-separated string ("1,2,3");
   an empty string when size <= 0. */
static std::string shape(int* res, int size)
{
  std::ostringstream out;
  for (int i = 0; i < size; ++i) {
    if (i)
      out << ',';
    out << res[i];
  }
  return out.str();
}
/* Serialize a tensor descriptor as "dims,strides" (both comma-joined) for
   use in the algorithm-cache key.  Returns "" (with a Python exception
   set) on failure. */
static std::string shape(cudnnTensorDescriptor_t t)
{
  // cuDNN can handle up to CUDNN_DIM_MAX dimensions.
  int res[CUDNN_DIM_MAX];
  int stride[CUDNN_DIM_MAX];
  int nbDims;
  cudnnDataType_t type;
  checkCudnnStatus(cudnnGetTensorNdDescriptor(t, CUDNN_DIM_MAX, &type, &nbDims, res, stride),
                   "error getting tensor description");
  if (PyErr_Occurred()) return "";
  return shape(res, nbDims) + "," + shape(stride, nbDims);
};
/* Serialize a filter descriptor's dimensions for the cache key and report
   its data type through `type`.  Returns "" (with a Python exception set)
   on failure. */
static std::string shape(cudnnFilterDescriptor_t t, cudnnDataType_t* type)
{
  cudnnTensorFormat_t format;
  int res[CUDNN_DIM_MAX];
  int outDims;
  checkCudnnStatus(cudnnGetFilterNdDescriptor(t, CUDNN_DIM_MAX, type, &format, &outDims, res),
                   "error getting filter description");
  if (PyErr_Occurred()) return "";
  return shape(res, outDims);
};
/* Serialize a convolution descriptor (mode, padding, strides, dilation and
   a " -<dtype><precision>" suffix) for the cache key.  `dataTypecode` is
   the gpuarray typecode of the data tensors.  Returns "" (with a Python
   exception set) on failure. */
static std::string shape(cudnnConvolutionDescriptor_t convDesc, int dataTypecode)
{
  int nDim;
  cudnnConvolutionMode_t mode;
  cudnnDataType_t computeType;
  int padA[5];
  int strideA[5];
  int dilationA[5];
  /* Data type configuration. Format: " -<dtype><precision>" with dtype and precision in {h, f, d},
   * h for half (float16), f for float (float32), d for double (float64). */
  char data_type_configuration[5];
  checkCudnnStatus(
    cudnnGetConvolutionNdDescriptor( convDesc, 5,
                                     &nDim,
                                     &padA[0],
                                     &strideA[0],
                                     &dilationA[0],
                                     &mode,
                                     &computeType ),
    "error getting convolution description");
  if (PyErr_Occurred()) return "";
  /* Build data type configuration string. */
  data_type_configuration[0] = ' ';
  data_type_configuration[1] = '-';
  switch (dataTypecode) {
  case GA_HALF: data_type_configuration[2] = 'h'; break;
  case GA_FLOAT: data_type_configuration[2] = 'f'; break;
  case GA_DOUBLE: data_type_configuration[2] = 'd'; break;
  default:
    PyErr_SetString(PyExc_TypeError, "Unsupported data type in convolution.");
    return "";
  }
  switch (computeType) {
  case CUDNN_DATA_HALF: data_type_configuration[3] = 'h'; break;
  case CUDNN_DATA_FLOAT: data_type_configuration[3] = 'f'; break;
  case CUDNN_DATA_DOUBLE: data_type_configuration[3] = 'd'; break;
  default:
    PyErr_SetString(PyExc_TypeError, "Unsupported precision in convolution.");
    return "";
  }
  data_type_configuration[4] = '\0';
  return (std::string("-mode ") +
          ((mode == CUDNN_CONVOLUTION) ? "conv" : "cross") +
          " -pad " +
          shape(padA, nDim) +
          " -subsample " +
          shape(strideA, nDim) +
          " -dilation " +
          shape(dilationA, nDim) +
          data_type_configuration);
}
/* True when the input, output and filter device pointers are all 16-byte
   aligned; alignment affects which cuDNN algorithms are usable, so cache
   entries must distinguish the two cases.  `type` is currently unused but
   kept for interface stability. */
static bool all_aligned(cudnnDataType_t type, void* in, void* out, void* filter)
{
  const size_t alignMask = 0xF;
  // there have to be entries for both aligned and not
  size_t combined = (size_t)in;
  combined |= (size_t)out;
  combined |= (size_t)filter;
  return (combined & alignMask) == 0;
}
/* Build the full textual cache key for a convolution configuration:
   groups, input dims/strides, filter dims, conv descriptor settings and an
   "[unaligned]" marker when device pointers are not 16-byte aligned.
   Validates the output shape first.  Returns "" (with a Python exception
   set) on failure. */
static std::string dnn_conv_shape(cudnnTensorDescriptor_t inputDesc, PyGpuArrayObject* input,
                                  cudnnFilterDescriptor_t filterDesc, PyGpuArrayObject* filter,
                                  cudnnConvolutionDescriptor_t convDesc,
                                  PyGpuArrayObject* output, int groups)
{
  cudnnDataType_t dType;
  std::ostringstream s;
  int expected_output_dims[5] = {0};  /* unused here; dims are checked by the helper below */
  if (dnn_check_convolution_output(convDesc, inputDesc, filterDesc, PyGpuArray_NDIM(filter), output, groups) != 0)
    return "";
  std::string shapeInput = shape(inputDesc);
  std::string shapeFilter = shape(filterDesc, &dType);
  std::string shapeConvDesc = shape(convDesc, input->ga.typecode);
  if (shapeInput.empty() || shapeFilter.empty() || shapeConvDesc.empty())
    return "";
  s << "-g " << groups << " -dim " << shapeInput << " -filt " <<
    shapeFilter << " " << shapeConvDesc;
  // there have to be entries for both aligned and not.
  if (!all_aligned(dType, PyGpuArray_DEV_DATA(input), PyGpuArray_DEV_DATA(output), PyGpuArray_DEV_DATA(filter)))
  {
    s << " [unaligned]";
  }
  return s.str();
}
/* Insert or overwrite the cached algorithm choice for `hash`,
   serialized by algoMutex. */
static void dnn_conv_update_cache(const std::string& hash, const AlgoRec& rec)
{
  pthread_mutex_lock(&algoMutex);
  algoCache[hash] = rec;
  pthread_mutex_unlock(&algoMutex);
}
/* Look up a cached algorithm choice; returns NULL when absent.  The lookup
   itself is serialized by algoMutex, but the returned pointer references
   the cache entry directly — entries are never erased, only overwritten. */
static const AlgoRec* dnn_conv_check_cache(const std::string& hash)
{
  pthread_mutex_lock(&algoMutex);
  const AlgoRec* ret = 0;
  AlgoCache::iterator hit = algoCache.find(hash);
  if (hit != algoCache.end())
    ret = &hit->second;
  pthread_mutex_unlock(&algoMutex);
  return ret;
}
#section support_code
/* Create a cuDNN dropout descriptor with probability `dropout` seeded by
   `seed`, plus the GPU state buffer it requires.  On success *odesc owns
   the descriptor and *ostates a new GA_UBYTE array holding the RNG state.
   Returns 0 on success, -1 on error with a Python exception set. */
int dnn_dropout_desc(float dropout, unsigned long long seed,
                     PyGpuContextObject *c,
                     cudnnDropoutDescriptor_t *odesc,
                     PyGpuArrayObject **ostates,
                     cudnnHandle_t _handle) {
  PyGpuArrayObject *states;
  cudnnDropoutDescriptor_t desc;
  size_t states_sz;
  cudnnStatus_t err;
  cuda_enter(c->ctx);
  err = cudnnCreateDropoutDescriptor(&desc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_SetString(PyExc_RuntimeError, "Can't create dropout descriptor");
    cuda_exit(c->ctx);
    return -1;
  }
  /* Can't fail according to docs */
  cudnnDropoutGetStatesSize(_handle, &states_sz);
  states = pygpu_empty(1, &states_sz, GA_UBYTE, GA_C_ORDER, c, Py_None);
  if (states == NULL) {
    cudnnDestroyDropoutDescriptor(desc);
    cuda_exit(c->ctx);
    return -1;
  }
  /* Seeds the RNG state buffer on the GPU. */
  err = cudnnSetDropoutDescriptor(desc, _handle, dropout,
                                  PyGpuArray_DEV_DATA(states),
                                  states_sz, seed);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_SetString(PyExc_RuntimeError, "Can't set dropout descriptor");
    Py_DECREF((PyObject *)states);
    cudnnDestroyDropoutDescriptor(desc);
    cuda_exit(c->ctx);
    return -1;
  }
  cuda_exit(c->ctx);
  /* Transfer ownership to the caller only on full success. */
  *odesc = desc;
  *ostates = states;
  return 0;
}
#section support_code
/* Apply cuDNN dropout forward to `x`: allocates *y for the output, passes
   the RNG `state` array through to *ostate (new reference) and returns the
   reserve-space buffer required by the backward pass in *reserve (owned by
   the caller).  Returns 0 on success, -1 on error with a Python exception
   set.
   Fixes vs. previous revision:
     - local context declared as PyGpuContextObject (the type used
       everywhere else in this file); PyGpuArrayContext does not exist;
     - `y` is an output pointer and must be dereferenced before being
       passed to c_make_tensorNd / PyGpuArray_DEV_DATA;
     - cudnnDropoutGetReserveSpaceSize takes the *input tensor* descriptor
       (xdesc), not the dropout descriptor;
     - typo `res_zs` -> `res_sz` (did not compile);
     - missing `return -1` after a failed gpudata_alloc let execution
       continue with res == NULL;
     - the dropout descriptor parameter is a pointer here, so it is
       dereferenced when handed to cudnnDropoutForward. */
int dnn_dropout_fwd(PyGpuArrayObject *x,
                    cudnnDropoutDescriptor_t *desc,
                    PyGpuArrayObject *state,
                    PyGpuArrayObject **y,
                    PyGpuArrayObject **ostate,
                    gpudata **reserve,
                    cudnnHandle_t _handle) {
  PyGpuContextObject *c = x->context;
  cudnnTensorDescriptor_t xdesc;
  cudnnTensorDescriptor_t ydesc;
  gpudata *res;
  size_t res_sz;
  cudnnStatus_t err;
  if (c_make_tensorNd(x, &xdesc))
    return -1;
  if (aesara_prep_output(y, x->ga.nd, x->ga.dimensions, x->ga.typecode,
                         GA_C_ORDER, c)) {
    cudnnDestroyTensorDescriptor(xdesc);
    return -1;
  }
  if (c_make_tensorNd(*y, &ydesc)) {
    cudnnDestroyTensorDescriptor(xdesc);
    return -1;
  }
  *ostate = state;
  Py_INCREF((PyObject *)state);
  /* This can't fail according to the docs */
  err = cudnnDropoutGetReserveSpaceSize(xdesc, &res_sz);
  res = gpudata_alloc(c->ctx, res_sz, NULL, 0, NULL);
  if (res == NULL) {
    cudnnDestroyTensorDescriptor(xdesc);
    cudnnDestroyTensorDescriptor(ydesc);
    PyErr_SetString(PyExc_RuntimeError, "Could not allocate reserve for dropout");
    return -1;
  }
  *reserve = res;
  cuda_enter(c->ctx);
  err = cudnnDropoutForward(_handle, *desc, xdesc, PyGpuArray_DEV_DATA(x),
                            ydesc, PyGpuArray_DEV_DATA(*y), *(void **)res,
                            res_sz);
  cudnnDestroyTensorDescriptor(xdesc);
  cudnnDestroyTensorDescriptor(ydesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not run dropout: %s",
                 cudnnGetErrorString(err));
    cuda_exit(c->ctx);
    return -1;
  }
  cuda_exit(c->ctx);
  return 0;
}
#section init_code_struct
/* Seed the per-instance algorithm state: start from the configured algo
   with default math, nothing reused yet. */
prev_algo.algo = PARAMS->conv_algo;
prev_algo.mathType = CUDNN_DEFAULT_MATH;
reuse_algo = 0;
hash_prefix = std::string("FWD|GPU#");
#ifdef DEBUG_TIMING
total_computation_time = 0;
total_selection_time = 0;
n_computations = 0;
n_selections = 0;
if (PARAMS->choose_algo) {
  if (PARAMS->choose_time) {
    selection_name = "fastest";
  } else {
    selection_name = "best suited";
  }
};
#endif
#section support_code_struct
#line 22 "dnn_fwd.c"
/* Per-instance state for forward-convolution algorithm selection. */
int reuse_algo;           /* nonzero once an algo has been picked and can be reused */
AlgoRec prev_algo;        /* last selected algorithm + workspace + math type */
std::string hash_prefix;  /* cache-key prefix identifying the direction/GPU */
/* These members contain C++ objects, so the struct must not be memset. */
#define AESARA_DONT_MEMSET_STRUCT
#ifdef DEBUG
char algorithm_name[128];
#endif
#ifdef DEBUG_TIMING
double total_computation_time;
double total_selection_time;
size_t n_computations;
size_t n_selections;
const char* selection_name;
#endif
/** Check given algorithm against inputs and convolution descriptor,
    change algorithm inplace to a fallback algorithm if checkings fail.
    Return 0 on success, non-0 on error. **/
int dnn_conv_fwd_fallback(cudnnConvolutionFwdAlgo_t* _algo,
                          const PyGpuArrayObject* input,
                          const PyGpuArrayObject* kerns,
                          cudnnConvolutionDescriptor_t desc) {
  cudnnConvolutionFwdAlgo_t algo = *_algo;
  /* Only these algos are supported for 3d conv with cuDNN >= V5.1. */
  if (PyGpuArray_NDIM(input) == 5 &&
      !(algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM ||
        algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM ||
        algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING))
  {
#ifdef DEBUG
    if (0 != aesara_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name))
      return 1;
    fprintf(stderr, "(%s unsupported for 3D: fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)\n", algorithm_name);
#endif
    algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
  }
  // Algo `small` does not work for a batch size > 2^16, with cuDNN >= V5.1.
  // Issue should be resolved for cuDNN > V6.0.
  // NB: In cuDNN V7, issue is resolved for 2D convolutionss only.
  if ((cudnnGetVersion() < 6100 || PyGpuArray_NDIM(input) == 5) &&
      algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM &&
      PyGpuArray_DIM(input, 0) > 65536)
  {
#ifdef DEBUG
    fprintf(stderr, "(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM "
            "will fail with batch size > 2^16, fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)\n");
#endif
    algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
  }
  // The FFT implementation does not support strides, 1x1 filters or inputs
  // with a spatial dimension larger than 1024. The tiled-FFT implementation
  // does not support strides.
  // If the chosen implementation is FFT or tiled-FFT, validate that it can
  // be used on the current data and default to a safe implementation if it
  // can't.
  // The following code is 2d-specific but it is fine as FFT and tiled-FFT are
  // defined only for 2d filters
  /* NB:
     TODO: These checkings seems outdated for FFT algorithms with cuDNN >= 5.1.
     New conditions apply and may depend on number of dimensions (2D or 3D)
     e.g. for FFT_TILING.
     TODO: More globally, how to handle CUDNN_STATUS_NOT_SUPPORTED with unsupported algorithms?
  */
  if ((algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT ||
       algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) && PyGpuArray_NDIM(input) == 4) {
    // Extract the properties of the convolution descriptor
    int nd;
    int pad[2];
    int stride[2];
    int dilation[2];
    cudnnConvolutionMode_t mode;
    cudnnDataType_t data_type;
    cudnnStatus_t err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride, dilation, &mode, &data_type);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError, "error getting convolution properties: %s",
                   cudnnGetErrorString(err));
      return 1;
    }
    if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT) {
      if (stride[0] != 1 || stride[1] != 1 ||
          PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
          (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1))
      {
        algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
#ifdef DEBUG
        fprintf(stderr, "(replacing fwd algo fft with none)\n");
#endif
      }
    } else {
      // algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
      if (stride[0] != 1 || stride[1] != 1) {
        algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
#ifdef DEBUG
        fprintf(stderr, "(replacing fwd algo fft_tiling with none)\n");
#endif
      }
    }
  }
  /* Write the (possibly replaced) algorithm back to the caller. */
  *_algo = algo;
  return 0;
}
int
APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
PyGpuArrayObject *om,
cudnnConvolutionDescriptor_t desc,
double alpha, double beta,
PyGpuArrayObject **output,
PARAMS_TYPE* params) {
PyGpuContextObject *c = input->context;
void *alpha_p;
void *beta_p;
float af = alpha, bf = beta;
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
bool use_cached = 0;
#ifdef DEBUG
if (_cppver) fprintf(stderr, "%s\n", _cppver);
#endif
#ifdef DEBUG_TIMING
AesaraTimer timer;
#endif
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1] * params->num_groups) {
PyErr_SetString(PyExc_ValueError,
"images and kernel must have the same stack size");
return 1;
}
if ((PyGpuArray_DIMS(kerns)[0] % params->num_groups) != 0) {
PyErr_SetString(PyExc_ValueError,
"Number of filters must be divisible by number of groups");
return 1;
}
switch (input->ga.typecode) {
case GA_DOUBLE:
alpha_p = (void *)&alpha;
beta_p = (void *)&beta;
break;
case GA_FLOAT:
case GA_HALF:
alpha_p = (void *)&af;
beta_p = (void *)&bf;
break;
default:
PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution");
return 1;
}
if (params->inplace) {
Py_XDECREF(*output);
*output = om;
Py_INCREF(*output);
} else {
if (aesara_prep_output(output, PyGpuArray_NDIM(om), PyGpuArray_DIMS(om),
om->ga.typecode, GA_C_ORDER, c) != 0)
return 1;
if (beta != 0.0 && pygpu_move(*output, om))
return 1;
}
if (PyGpuArray_DIMS(input)[0] == 0 || PyGpuArray_DIMS(kerns)[0] == 0 || PyGpuArray_DIMS(kerns)[1] == 0) {
int err2 = GpuArray_memset(&(*output)->ga, 0);
if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConv could not fill the output with zeros: %d", err2);
return 1;
}
return 0;
}
int groups = c_get_groups_for_conv(desc, params->num_groups);
if (groups == -1)
return 1;
if (c_set_tensor_for_conv(input, APPLY_SPECIFIC(input), groups) == -1)
return 1;
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns), groups) == -1)
return 1;
if (c_set_tensor_for_conv(*output, APPLY_SPECIFIC(output), groups) == -1)
return 1;
size_t input_offset = PyGpuArray_STRIDE(input, 0) / groups;
size_t kern_offset = PyGpuArray_STRIDE(kerns, 0) * PyGpuArray_DIM(kerns, 0) / groups;
size_t output_offset = PyGpuArray_STRIDE(*output, 0) / groups;
cudnnConvolutionFwdAlgo_t algo = params->conv_algo;
size_t worksize = 0;
cudnnMathType_t mathtype = CUDNN_DEFAULT_MATH;
std::string hashkey;
cuda_enter(c->ctx);
size_t maxfree = c_get_largest_free_block_size(c);
if (PyErr_Occurred()) {
cuda_exit(c->ctx);
return 1;
}
if (params->choose_algo) {
if (!reuse_algo) {
char pci_id[16];
gpucontext_property(c->ctx, GA_CTX_PROP_UNIQUE_ID, pci_id);
// check out cache
hashkey = dnn_conv_shape(APPLY_SPECIFIC(input), input, APPLY_SPECIFIC(kerns), kerns, desc, *output, groups);
if (hashkey.empty()) {
cuda_exit(c->ctx);
return 1;
}
hashkey = hash_prefix + pci_id + (params->choose_time ? " -t " : " ") + hashkey;
const AlgoRec* cached = dnn_conv_check_cache(hashkey);
if (cached) {
prev_algo = *cached;
use_cached = 1;
}
}
if (reuse_algo || use_cached) {
algo = (cudnnConvolutionFwdAlgo_t)prev_algo.algo;
worksize = prev_algo.wsSize;
mathtype = prev_algo.mathType;
} else {
if (params->choose_time) {
int count;
cudnnConvolutionFwdAlgoPerf_t choice;
gpudata *tmpmem;
tmpmem = gpudata_alloc(c->ctx, maxfree, NULL, 0, NULL);
if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate GPU memory for FindEx");
cuda_exit(c->ctx);
return -1;
}
// set the 'tensor math ok' flag
if (input->ga.typecode == GA_HALF)
c_set_math_type_for_conv(desc, CUDNN_TENSOR_OP_MATH);
/* cudnnFindConvolutionForwardAlgorithmEx() may write to output.
We don't want that if output is used in computation (ie. if beta != 0). */
PyGpuArrayObject* o = *output;
if (beta != 0) {
o = pygpu_empty(PyGpuArray_NDIM(*output), PyGpuArray_DIMS(*output), (*output)->ga.typecode, GA_C_ORDER, c, Py_None);
}
#ifdef DEBUG_TIMING
timer.start();
#endif
// We don't sync the buffer as we don't care about the values.
err = cudnnFindConvolutionForwardAlgorithmEx(
params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(o),
1, &count, &choice, *(void **)tmpmem,
maxfree);
#ifdef DEBUG_TIMING
timer.end();
#endif
gpudata_release(tmpmem);
if (beta != 0) {
Py_XDECREF(o);
}
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
#ifdef DEBUG
if (count == 0) {
PyErr_SetString(PyExc_RuntimeError, "No best-timed conv fwd algorithm found");
cuda_exit(c->ctx);
return 1;
} else if (choice.status != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error getting best-timed FWD algo: %s",
cudnnGetErrorString(choice.status));
cuda_exit(c->ctx);
return 1;
} // Else, count is necessarly 1 for current implementation.
#endif
algo = choice.algo;
worksize = choice.memory;
#if CUDNN_MAJOR >= 7
if (input->ga.typecode == GA_HALF)
mathtype = choice.mathType;
#endif
} else {
#ifdef DEBUG_TIMING
timer.start();
#endif
err = cudnnGetConvolutionForwardAlgorithm(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
desc, APPLY_SPECIFIC(output),
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, maxfree, &algo);
#ifdef DEBUG_TIMING
timer.end();
#endif
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
}
#ifdef DEBUG_TIMING
total_selection_time += timer.milliseconds;
++n_selections;
#endif
}
}
if (c_set_math_type_for_conv(desc, mathtype) == -1 ||
dnn_conv_fwd_fallback(&algo, input, kerns, desc) != 0) {
cuda_exit(c->ctx);
return 1;
}
// if FindEx was used (choose_time), workspace size is set.
if (!(reuse_algo || use_cached || params->choose_time))
{
err = cudnnGetConvolutionForwardWorkspaceSize(params->handle,
APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns),
desc,
APPLY_SPECIFIC(output),
algo,
&worksize);
if (err == CUDNN_STATUS_NOT_SUPPORTED) {
// Fallback to none algo if not supported
#ifdef DEBUG
if (0 != aesara_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name)) {
cuda_exit(c->ctx);
return 1;
}
fprintf(stderr, "(error getting worksize for %s: failing back to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)\n",
algorithm_name);
#endif
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
err = cudnnGetConvolutionForwardWorkspaceSize(params->handle,
APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns),
desc,
APPLY_SPECIFIC(output),
algo,
&worksize);
}
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
}
if (params->choose_algo) {
#ifdef DEBUG
if (0 != aesara_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name)) {
cuda_exit(c->ctx);
return 1;
}
fprintf(stderr, "(using %s%s %s%s%s, ws:%ld, hash:%s)\n",
algorithm_name,
mathtype == CUDNN_TENSOR_OP_MATH ? "(tensor_op)" : "",
params->choose_time ? "(timed)": "" ,
reuse_algo ? "(reused)" : "",
use_cached ? "(cache)": "",
worksize,
hashkey.c_str()
);
#endif
#ifdef DEBUG_TIMING
if (!(reuse_algo || use_cached)) {
// We have selected an algorithm at runtime.
// `timer` still contains timing about selection step.
fprintf(stderr, "\t(selected %s fwd algo in %g milliseconds)\n", selection_name, timer.milliseconds);
if (n_selections > 1) {
fprintf(stderr, "\t(selected %lu fwd algos in %g milliseconds (average: %g milliseconds per selection))\n",
n_selections, total_selection_time, total_selection_time / n_selections);
}
}
#endif
if (!reuse_algo) {
// save for next time/cache
prev_algo.algo = algo;
prev_algo.wsSize = worksize;
prev_algo.mathType = mathtype;
// Add to the cache if we choose on shape change, or first time if
// we choose once.
if (!use_cached)
dnn_conv_update_cache(hashkey, prev_algo);
if (params->choose_once)
reuse_algo = 1;
}
} // params->choose_algo
{
gpudata *workspace = 0;
if (worksize != 0) {
workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
if (workspace == NULL) {
PyErr_SetString(PyExc_RuntimeError, "Could not allocate working memory");
cuda_exit(c->ctx);
return 1;
}
}
if (worksize != 0)
cuda_wait(workspace, GPUARRAY_CUDA_WAIT_WRITE);
cuda_wait(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
#ifdef DEBUG_TIMING
GpuArray_sync(&(*output)->ga);
timer.start();
#endif
for ( int g = 0; g < groups; g++) {
err = cudnnConvolutionForward(
params->handle,
alpha_p,
APPLY_SPECIFIC(input), ((char *)PyGpuArray_DEV_DATA(input)) + input_offset * g,
APPLY_SPECIFIC(kerns), ((char *)PyGpuArray_DEV_DATA(kerns)) + kern_offset * g,
desc, algo,
worksize == 0 ? NULL : *(void **)workspace, worksize,
beta_p,
APPLY_SPECIFIC(output), ((char *)PyGpuArray_DEV_DATA(*output)) + output_offset * g);
}
if (worksize != 0) {
cuda_record(workspace, GPUARRAY_CUDA_WAIT_WRITE);
gpudata_release(workspace);
}
cuda_record(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_record((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
}
#ifdef DEBUG_TIMING
GpuArray_sync(&(*output)->ga);
timer.end();
total_computation_time += timer.milliseconds;
++n_computations;
#endif
cuda_exit(c->ctx);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error doing cuDNN conv FWD operation: %s",
cudnnGetErrorString(err));
return 1;
}
#ifdef DEBUG_TIMING
fprintf(stderr, "\t(ran fwd algo in %g milliseconds)\n", timer.milliseconds);
if (n_computations > 1) {
fprintf(stderr, "\t(ran %lu fwd computations in %g milliseconds (average: %g milliseconds per call))\n",
n_computations, total_computation_time, total_computation_time / n_computations);
}
#endif
return 0;
}
#section init_code_struct
// Per-Op-instance initialization: seed the saved algorithm record with the
// statically requested algorithm and reset the selection/caching state.
prev_algo.algo = PARAMS->conv_algo;
prev_algo.mathType = CUDNN_DEFAULT_MATH;
reuse_algo = 0;
// Prefix for the algorithm-cache hash key; the device PCI id and the shape
// signature are appended at run time.
hash_prefix = std::string("GI|GPU#");
#ifdef DEBUG_TIMING
// Zero the cumulative timing counters used by the DEBUG_TIMING reports.
total_computation_time = 0;
total_selection_time = 0;
n_computations = 0;
n_selections = 0;
if (PARAMS->choose_algo) {
  if (PARAMS->choose_time) {
    selection_name = "fastest";
  } else {
    selection_name = "best suited";
  }
};
#endif
#section support_code_struct
#line 22 "dnn_gi.c"
// Algorithm-selection state kept across calls on the same Op instance.
int reuse_algo;        // non-zero once an algorithm has been locked in (choose_once)
AlgoRec prev_algo;     // last selected algorithm, workspace size and math type
std::string hash_prefix;  // cache-key prefix, set in init_code_struct
// These members are initialized explicitly above, so skip the struct memset.
#define AESARA_DONT_MEMSET_STRUCT
#ifdef DEBUG
char algorithm_name[128];  // scratch buffer for human-readable algo names
#endif
#ifdef DEBUG_TIMING
// Cumulative statistics printed by the DEBUG_TIMING instrumentation.
double total_computation_time;
double total_selection_time;
size_t n_computations;
size_t n_selections;
const char* selection_name;
#endif
/** Validate the chosen backward-data algorithm against the current inputs
    and convolution descriptor, replacing it in place with the safe default
    (CUDNN_CONVOLUTION_BWD_DATA_ALGO_0) when the FFT-based variants cannot
    handle them.
    The FFT implementation does not support strides, 1x1 filters or inputs
    with a spatial dimension larger than 1024; the tiled-FFT implementation
    does not support strides. Both exist only for 2d filters, hence the
    4-dim check below.
    Return 0 on success, non-0 on error. **/
int dnn_conv_gi_fallback(cudnnConvolutionBwdDataAlgo_t* _algo,
                         const PyGpuArrayObject* input,
                         const PyGpuArrayObject* kerns,
                         cudnnConvolutionDescriptor_t desc) {
  cudnnConvolutionBwdDataAlgo_t chosen = *_algo;
  const bool is_fft = (chosen == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT);
  const bool is_fft_tiling = (chosen == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING);
  if ((is_fft || is_fft_tiling) && PyGpuArray_NDIM(kerns) == 4) {
    // Extract the properties of the convolution descriptor.
    int nd;
    int pad[2];
    int stride[2];
    int upscale[2];
    cudnnConvolutionMode_t mode;
    cudnnDataType_t data_type;
    cudnnStatus_t err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride, upscale, &mode, &data_type);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError, "error getting convolution properties: %s",
                   cudnnGetErrorString(err));
      return 1;
    }
    const bool strided = (stride[0] != 1 || stride[1] != 1);
    if (is_fft) {
      const bool big_input = (PyGpuArray_DIM(input, 2) > 1024 ||
                              PyGpuArray_DIM(input, 3) > 1024);
      const bool unit_filter = (PyGpuArray_DIM(kerns, 2) == 1 &&
                                PyGpuArray_DIM(kerns, 3) == 1);
      if (strided || big_input || unit_filter) {
        chosen = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
#ifdef DEBUG
        fprintf(stderr, "(replacing gradinput algo fft with none)\n");
#endif
      }
    } else if (strided) {
      // chosen == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING
      chosen = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
#ifdef DEBUG
      fprintf(stderr, "(replacing gradinput algo fft_tiling with none)\n");
#endif
    }
  }
  *_algo = chosen;
  return 0;
}
/** Compute the gradient of a cuDNN convolution with respect to its inputs
    (cudnnConvolutionBackwardData): *input = alpha * grad + beta * im.

    kerns  : filters of the forward convolution.
    output : gradient w.r.t. the forward convolution's output.
    im     : array whose shape/dtype define the result; accumulation source
             when beta != 0, and the in-place target when params->inplace.
    desc   : pre-configured cuDNN convolution descriptor.
    input  : receives the result buffer.

    Returns 0 on success, non-zero after setting a Python exception. **/
int
APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
                        PyGpuArrayObject *im,
                        cudnnConvolutionDescriptor_t desc,
                        double alpha, double beta, PyGpuArrayObject **input,
                        PARAMS_TYPE* params) {
  PyGpuContextObject *c = kerns->context;
  void *alpha_p;
  void *beta_p;
  float af = alpha, bf = beta;
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
  bool use_cached = 0;
#ifdef DEBUG
  if (_cppver) fprintf(stderr, "%s\n", _cppver);
#endif
#ifdef DEBUG_TIMING
  AesaraTimer timer;
#endif
  // Shape checks for grouped convolution.
  if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1] * params->num_groups) {
    PyErr_SetString(PyExc_ValueError, "images and kernel must have the same "
                    "stack size");
    return 1;
  }
  if ((PyGpuArray_DIMS(kerns)[0] % params->num_groups) != 0) {
    PyErr_SetString(PyExc_ValueError,
                    "Number of filters must be divisible by number of groups");
    return 1;
  }
  // cuDNN expects double scaling factors for double data, float otherwise.
  switch (im->ga.typecode) {
  case GA_DOUBLE:
    alpha_p = (void *)&alpha;
    beta_p = (void *)&beta;
    break;
  case GA_FLOAT:
  case GA_HALF:
    alpha_p = (void *)&af;
    beta_p = (void *)&bf;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution");
    return 1;
  }
  // Prepare the result buffer: reuse `im` in-place, or allocate a fresh
  // array (copying `im` in when beta contributes to the result).
  if (params->inplace) {
    Py_XDECREF(*input);
    *input = im;
    Py_INCREF(*input);
  } else {
    if (aesara_prep_output(input, PyGpuArray_NDIM(im), PyGpuArray_DIMS(im),
                           im->ga.typecode, GA_C_ORDER, c) != 0)
      return 1;
    if (beta != 0.0 && pygpu_move(*input, im))
      return 1;
  }
  // Degenerate sizes (empty batch or empty filters): the gradient is all
  // zeros, no need to call cuDNN.
  if (PyGpuArray_DIMS(im)[0] == 0 || PyGpuArray_DIMS(kerns)[0] == 0 || PyGpuArray_DIMS(kerns)[1] == 0) {
    int err2 = GpuArray_memset(&(*input)->ga, 0);
    if (err2 != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError,
                   "GpuDnnConv grad wrt. inputs could not fill the output with zeros: %d", err2);
      return 1;
    }
    return 0;
  }
  // Configure the cuDNN tensor/filter descriptors for grouped convolution.
  int groups = c_get_groups_for_conv(desc, params->num_groups);
  if (groups == -1)
    return 1;
  if (c_set_tensor_for_conv(output, APPLY_SPECIFIC(output), groups) == -1)
    return 1;
  if (c_set_filter(kerns, APPLY_SPECIFIC(kerns), groups) == -1)
    return 1;
  if (c_set_tensor_for_conv(*input, APPLY_SPECIFIC(input), groups) == -1)
    return 1;
  if (0 != dnn_check_convolution_output(desc, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
                                        PyGpuArray_NDIM(kerns), output, groups))
    return 1;
  // Byte offsets between consecutive groups within each array.
  size_t input_offset = PyGpuArray_STRIDE(*input, 0) / groups;
  size_t kern_offset = PyGpuArray_STRIDE(kerns, 0) * PyGpuArray_DIM(kerns, 0) / groups;
  size_t output_offset = PyGpuArray_STRIDE(output, 0) / groups;
  cudnnConvolutionBwdDataAlgo_t algo = params->conv_algo;
  size_t worksize = 0;
  cudnnMathType_t mathtype = CUDNN_DEFAULT_MATH;
  std::string hashkey;
  cuda_enter(c->ctx);
  size_t maxfree = c_get_largest_free_block_size(c);
  if (PyErr_Occurred()) {
    cuda_exit(c->ctx);
    return 1;
  }
  // Algorithm selection: try the shape cache first, then either benchmark
  // (choose_time -> FindEx) or query cuDNN's heuristic.
  if (params->choose_algo) {
    if (!reuse_algo) {
      char pci_id[16];
      gpucontext_property(c->ctx, GA_CTX_PROP_UNIQUE_ID, pci_id);
      // check out cache
      hashkey = dnn_conv_shape(APPLY_SPECIFIC(input), *input, APPLY_SPECIFIC(kerns), kerns, desc, output, groups);
      if (hashkey.empty()) {
        cuda_exit(c->ctx);
        return 1;
      }
      hashkey = hash_prefix + pci_id + (params->choose_time ? " -t " : " ") + hashkey;
      const AlgoRec* cached = dnn_conv_check_cache(hashkey);
      if (cached) {
        prev_algo = *cached;
        use_cached = 1;
      }
    }
    if (reuse_algo || use_cached) {
      algo = (cudnnConvolutionBwdDataAlgo_t)prev_algo.algo;
      worksize = prev_algo.wsSize;
      mathtype = prev_algo.mathType;
    } else {
      if (params->choose_time) {
        int count;
        cudnnConvolutionBwdDataAlgoPerf_t choice;
        gpudata *tmpmem;
        // set the 'tensor math ok' flag
        if (im->ga.typecode == GA_HALF)
          c_set_math_type_for_conv(desc, CUDNN_TENSOR_OP_MATH);
        tmpmem = gpudata_alloc(c->ctx, maxfree, NULL, 0, NULL);
        if (tmpmem == NULL) {
          PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
          cuda_exit(c->ctx);
          // NOTE(review): this path returns -1 while every other failure
          // path in this function returns 1; callers appear to test for
          // non-zero — confirm before normalizing.
          return -1;
        }
        /* cudnnFindConvolutionBackwardDataAlgorithmEx() may write to output (input).
           We don't want that if output is used in computation (ie. if beta != 0). */
        PyGpuArrayObject* ip = *input;
        if (beta != 0) {
          ip = pygpu_empty(PyGpuArray_NDIM(*input), PyGpuArray_DIMS(*input), (*input)->ga.typecode, GA_C_ORDER, c, Py_None);
        }
#ifdef DEBUG_TIMING
        timer.start();
#endif
        err = cudnnFindConvolutionBackwardDataAlgorithmEx(
          params->handle, APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
          APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
          APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(ip),
          1, &count, &choice, *(void **)tmpmem, maxfree);
#ifdef DEBUG_TIMING
        timer.end();
#endif
        gpudata_release(tmpmem);
        if (beta != 0) {
          Py_XDECREF(ip);
        }
        if (err != CUDNN_STATUS_SUCCESS) {
          PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
                       cudnnGetErrorString(err));
          cuda_exit(c->ctx);
          return 1;
        }
#ifdef DEBUG
        if (count == 0) {
          PyErr_SetString(PyExc_RuntimeError, "No best-timed conv gradinput algorithm found");
          cuda_exit(c->ctx);
          return 1;
        } else if (choice.status != CUDNN_STATUS_SUCCESS) {
          PyErr_Format(PyExc_RuntimeError, "error getting best-timed gradinput algo: %s",
                       cudnnGetErrorString(choice.status));
          cuda_exit(c->ctx);
          return 1;
        } // Else, count is necessarly 1 for current implementation.
#endif
        algo = choice.algo;
        worksize = choice.memory;
#if CUDNN_MAJOR >= 7
        if (im->ga.typecode == GA_HALF)
          mathtype = choice.mathType;
#endif
      } else {
#ifdef DEBUG_TIMING
        timer.start();
#endif
        err = cudnnGetConvolutionBackwardDataAlgorithm(
          params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
          desc, APPLY_SPECIFIC(input),
          CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, maxfree, &algo);
#ifdef DEBUG_TIMING
        timer.end();
#endif
        if (err != CUDNN_STATUS_SUCCESS) {
          PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
                       cudnnGetErrorString(err));
          cuda_exit(c->ctx);
          return 1;
        }
      }
#ifdef DEBUG_TIMING
      total_selection_time += timer.milliseconds;
      ++n_selections;
#endif
    }
  }
  // Apply the chosen math type and fall back to a safe algorithm if the
  // selected one cannot handle the current shapes/strides.
  if (c_set_math_type_for_conv(desc, mathtype) == -1 ||
      dnn_conv_gi_fallback(&algo, *input, kerns, desc) != 0) {
    cuda_exit(c->ctx);
    return 1;
  }
  // if FindEx was used (choose_time), workspace size is set.
  if (!(reuse_algo || use_cached || params->choose_time))
  {
    err = cudnnGetConvolutionBackwardDataWorkspaceSize(
      params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
      APPLY_SPECIFIC(input), algo, &worksize);
    if (err == CUDNN_STATUS_NOT_SUPPORTED) {
      // Fallback to none algo if not supported
#ifdef DEBUG
      if (0 != aesara_enum_to_string_cudnnConvolutionBwdDataAlgo_t(algo, algorithm_name)) {
        cuda_exit(c->ctx);
        return 1;
      }
      fprintf(stderr, "(error getting worksize for %s: failing back to CUDNN_CONVOLUTION_BWD_DATA_ALGO_0)\n",
              algorithm_name);
#endif
      algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
      err = cudnnGetConvolutionBackwardDataWorkspaceSize(
        params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
        APPLY_SPECIFIC(input), algo, &worksize);
    }
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
                   cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }
  } // !(reuse_algo || use_cached || params->choose_time)
  // Record the selection for reuse and/or the shape cache.
  if (params->choose_algo) {
#ifdef DEBUG
    if (0 != aesara_enum_to_string_cudnnConvolutionBwdDataAlgo_t(algo, algorithm_name)) {
      cuda_exit(c->ctx);
      return 1;
    }
    fprintf(stderr, "(using %s%s %s%s%s, ws:%ld, hash:%s)\n",
            algorithm_name,
            mathtype == CUDNN_TENSOR_OP_MATH ? "(tensor_op)" : "",
            params->choose_time ? "(timed)": "" ,
            reuse_algo ? "(reused)" : "",
            use_cached ? "(cache)": "",
            worksize,
            hashkey.c_str()
    );
#endif
#ifdef DEBUG_TIMING
    if (!(reuse_algo || use_cached)) {
      // We have selected an algorithm at runtime.
      // `timer` still contains timing about selection step.
      fprintf(stderr, "\t(selected %s gradinput algo in %g milliseconds)\n", selection_name, timer.milliseconds);
      if (n_selections > 1) {
        fprintf(stderr, "\t(selected %lu gradinput algos in %g milliseconds (average: %g milliseconds per selection))\n",
                n_selections, total_selection_time, total_selection_time / n_selections);
      }
    }
#endif
    if (!reuse_algo) {
      // save for next time/cache
      prev_algo.algo = algo;
      prev_algo.wsSize = worksize;
      prev_algo.mathType = mathtype;
      // Add to the cache
      if (!use_cached)
        dnn_conv_update_cache(hashkey, prev_algo);
      if (params->choose_once)
        reuse_algo = 1;
    }
  } // params->choose_algo
  // Allocate the workspace, order the stream, and run one backward-data
  // call per group at the proper byte offsets.
  gpudata *workspace = 0;
  if (worksize != 0) {
    workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
    if (workspace == NULL) {
      PyErr_SetString(PyExc_RuntimeError, "Could not allocate working memory");
      cuda_exit(c->ctx);
      return 1;
    }
  }
  if (worksize != 0)
    cuda_wait(workspace, GPUARRAY_CUDA_WAIT_WRITE);
  cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_wait((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
#ifdef DEBUG_TIMING
  GpuArray_sync(&(*input)->ga);
  timer.start();
#endif
  for ( int g = 0; g < groups; g++) {
    err = cudnnConvolutionBackwardData(
      params->handle,
      alpha_p,
      APPLY_SPECIFIC(kerns), ((char *)PyGpuArray_DEV_DATA(kerns)) + kern_offset * g,
      APPLY_SPECIFIC(output), ((char *)PyGpuArray_DEV_DATA(output)) + output_offset * g,
      desc, algo, worksize == 0 ? NULL : *(void **)workspace, worksize,
      beta_p,
      APPLY_SPECIFIC(input), ((char *)PyGpuArray_DEV_DATA(*input)) + input_offset * g);
  }
  if (worksize != 0) {
    cuda_record(workspace, GPUARRAY_CUDA_WAIT_WRITE);
    gpudata_release(workspace);
  }
  cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_record(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_record((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
#ifdef DEBUG_TIMING
  GpuArray_sync(&(*input)->ga);
  timer.end();
  total_computation_time += timer.milliseconds;
  ++n_computations;
#endif
  cuda_exit(c->ctx);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error doing cuDNN conv gradinput operation: %s",
                 cudnnGetErrorString(err));
    return 1;
  }
#ifdef DEBUG_TIMING
  fprintf(stderr, "\t(ran gradinput algo in %g milliseconds)\n", timer.milliseconds);
  if (n_computations > 1) {
    fprintf(stderr, "\t(ran %lu gradinput computations in %g milliseconds (average: %g milliseconds per call))\n",
            n_computations, total_computation_time, total_computation_time / n_computations);
  }
#endif
  return 0;
}
#section init_code_struct
// Per-Op-instance initialization: seed the saved algorithm record with the
// statically requested algorithm and reset the selection/caching state.
prev_algo.algo = PARAMS->conv_algo;
prev_algo.mathType = CUDNN_DEFAULT_MATH;
reuse_algo = 0;
// Prefix for the algorithm-cache hash key; the device PCI id and the shape
// signature are appended at run time.
hash_prefix = std::string("GW|GPU#");
#ifdef DEBUG_TIMING
// Zero the cumulative timing counters used by the DEBUG_TIMING reports.
total_computation_time = 0;
total_selection_time = 0;
n_computations = 0;
n_selections = 0;
if (PARAMS->choose_algo) {
  if (PARAMS->choose_time) {
    selection_name = "fastest";
  } else {
    selection_name = "best suited";
  }
};
#endif
#section support_code_struct
#line 22 "dnn_gw.c"
// Algorithm-selection state kept across calls on the same Op instance.
int reuse_algo;        // non-zero once an algorithm has been locked in (choose_once)
AlgoRec prev_algo;     // last selected algorithm, workspace size and math type
std::string hash_prefix;  // cache-key prefix, set in init_code_struct
// These members are initialized explicitly above, so skip the struct memset.
#define AESARA_DONT_MEMSET_STRUCT
#ifdef DEBUG
char algorithm_name[128];  // scratch buffer for human-readable algo names
#endif
#ifdef DEBUG_TIMING
// Cumulative statistics printed by the DEBUG_TIMING instrumentation.
double total_computation_time;
double total_selection_time;
size_t n_computations;
size_t n_selections;
const char* selection_name;
#endif
/** Validate the chosen backward-filter algorithm against the current inputs
    and convolution descriptor, replacing it in place with the safe default
    (CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0) when the FFT variant cannot handle
    them.
    The FFT implementation does not support strides, 1x1 filters or inputs
    with a spatial dimension larger than 1024, and exists only for 2d
    filters, hence the 4-dim check below.
    Return 0 on success, non-0 on error. **/
int dnn_conv_gw_fallback(cudnnConvolutionBwdFilterAlgo_t* _algo,
                         const PyGpuArrayObject* input,
                         const PyGpuArrayObject* kerns,
                         cudnnConvolutionDescriptor_t desc) {
  // Only the 2d FFT algorithm needs validation; anything else passes through.
  if (*_algo != CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT ||
      PyGpuArray_NDIM(input) != 4)
    return 0;
  // Extract the properties of the convolution descriptor.
  int nd;
  int pad[2];
  int stride[2];
  int upscale[2];
  cudnnConvolutionMode_t mode;
  cudnnDataType_t data_type;
  cudnnStatus_t err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride, upscale, &mode, &data_type);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error getting convolution properties: %s",
                 cudnnGetErrorString(err));
    return 1;
  }
  const bool unsupported =
      stride[0] != 1 || stride[1] != 1 ||
      PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
      (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1);
  if (unsupported) {
    *_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
#ifdef DEBUG
    fprintf(stderr, "(replacing gradweight algo fft with none)\n");
#endif
  }
  return 0;
}
/** Compute the gradient of a cuDNN convolution with respect to its filters
    (cudnnConvolutionBackwardFilter): *kerns = alpha * grad + beta * km.

    input  : images fed to the forward convolution.
    output : gradient w.r.t. the forward convolution's output.
    km     : array whose shape/dtype define the result; accumulation source
             when beta != 0, and the in-place target when params->inplace.
    desc   : pre-configured cuDNN convolution descriptor.
    kerns  : receives the result buffer.

    Returns 0 on success, non-zero after setting a Python exception. **/
int
APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
                        PyGpuArrayObject *km,
                        cudnnConvolutionDescriptor_t desc,
                        double alpha, double beta, PyGpuArrayObject **kerns,
                        PARAMS_TYPE* params) {
  PyGpuContextObject *c = input->context;
  void *alpha_p;
  void *beta_p;
  float af = alpha, bf = beta;
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
  bool use_cached = 0;
#ifdef DEBUG
  if (_cppver) fprintf(stderr, "%s\n", _cppver);
#endif
#ifdef DEBUG_TIMING
  AesaraTimer timer;
#endif
  // Shape checks for grouped convolution.
  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1] * params->num_groups) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuDnnConv images and kernel must have the same stack size");
    return 1;
  }
  if ((PyGpuArray_DIMS(output)[1] % params->num_groups) != 0) {
    PyErr_SetString(PyExc_ValueError,
                    "Number of output channels must be divisible by number of groups");
    return 1;
  }
  // cuDNN expects double scaling factors for double data, float otherwise.
  switch (input->ga.typecode) {
  case GA_DOUBLE:
    alpha_p = (void *)&alpha;
    beta_p = (void *)&beta;
    break;
  case GA_FLOAT:
  case GA_HALF:
    alpha_p = (void *)&af;
    beta_p = (void *)&bf;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution");
    return 1;
  }
  // Prepare the result buffer: reuse `km` in-place, or allocate a fresh
  // array (copying `km` in when beta contributes to the result).
  if (params->inplace) {
    Py_XDECREF(*kerns);
    *kerns = km;
    Py_INCREF(*kerns);
  } else {
    if (aesara_prep_output(kerns, PyGpuArray_NDIM(km), PyGpuArray_DIMS(km),
                           km->ga.typecode, GA_C_ORDER, c) != 0)
      return 1;
    if (beta != 0.0 && pygpu_move(*kerns, km))
      return 1;
  }
  // Degenerate sizes (empty batch or empty filters): the gradient is all
  // zeros, no need to call cuDNN.
  if (PyGpuArray_DIMS(input)[0] == 0 || PyGpuArray_DIMS(km)[0] == 0 || PyGpuArray_DIMS(km)[1] == 0) {
    int err2 = GpuArray_memset(&(*kerns)->ga, 0);
    if (err2 != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError,
                   "GpuDnnConv grad wrt. weights could not fill the output with zeros: %d", err2);
      return 1;
    }
    return 0;
  }
  // Configure the cuDNN tensor/filter descriptors for grouped convolution.
  int groups = c_get_groups_for_conv(desc, params->num_groups);
  if (groups == -1)
    return 1;
  if (c_set_tensor_for_conv(input, APPLY_SPECIFIC(input), groups) == -1)
    return 1;
  if (c_set_tensor_for_conv(output, APPLY_SPECIFIC(output), groups) == -1)
    return 1;
  if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns), groups) == -1)
    return 1;
  if (0 != dnn_check_convolution_output(desc, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
                                        PyGpuArray_NDIM(*kerns), output, groups))
    return 1;
  // Byte offsets between consecutive groups within each array.
  size_t input_offset = PyGpuArray_STRIDE(input, 0) / groups;
  size_t kern_offset = PyGpuArray_STRIDE(*kerns, 0) * PyGpuArray_DIM(*kerns, 0) / groups;
  size_t output_offset = PyGpuArray_STRIDE(output, 0) / groups;
  cudnnConvolutionBwdFilterAlgo_t algo = params->conv_algo;
  size_t worksize = 0;
  cudnnMathType_t mathtype = CUDNN_DEFAULT_MATH;
  std::string hashkey ;
  cuda_enter(c->ctx);
  size_t maxfree = c_get_largest_free_block_size(c);
  if (PyErr_Occurred()) {
    cuda_exit(c->ctx);
    return 1;
  }
  // Algorithm selection: try the shape cache first, then either benchmark
  // (choose_time -> FindEx) or query cuDNN's heuristic.
  if (params->choose_algo) {
    if (!reuse_algo) {
      char pci_id[16];
      gpucontext_property(c->ctx, GA_CTX_PROP_UNIQUE_ID, pci_id);
      // check out cache
      hashkey = dnn_conv_shape(APPLY_SPECIFIC(input), input, APPLY_SPECIFIC(kerns), *kerns, desc, output, groups);
      if (hashkey.empty()) {
        cuda_exit(c->ctx);
        return 1;
      }
      hashkey = hash_prefix + pci_id + (params->choose_time ? " -t " : " ") + hashkey;
      const AlgoRec* cached = dnn_conv_check_cache(hashkey);
      if (cached) {
        prev_algo = *cached;
        use_cached = 1;
      }
    }
    if (reuse_algo || use_cached) {
      algo = (cudnnConvolutionBwdFilterAlgo_t)prev_algo.algo;
      worksize = prev_algo.wsSize;
      mathtype = prev_algo.mathType;
    } else {
      if (params->choose_time) {
        int count;
        cudnnConvolutionBwdFilterAlgoPerf_t choice;
        gpudata *tmpmem;
        // set the 'tensor math ok' flag
        if (input->ga.typecode == GA_HALF)
          c_set_math_type_for_conv(desc, CUDNN_TENSOR_OP_MATH);
        tmpmem = gpudata_alloc(c->ctx, maxfree, NULL, 0, NULL);
        if (tmpmem == NULL) {
          PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
          cuda_exit(c->ctx);
          // NOTE(review): this path returns -1 while every other failure
          // path in this function returns 1; callers appear to test for
          // non-zero — confirm before normalizing.
          return -1;
        }
        /* cudnnFindConvolutionBackwardFilterAlgorithmEx() may write to kernels output (kerns).
           We don't want that if output is used in computation (ie. if beta != 0). */
        PyGpuArrayObject* k = *kerns;
        if (beta != 0) {
          k = pygpu_empty(PyGpuArray_NDIM(*kerns), PyGpuArray_DIMS(*kerns), (*kerns)->ga.typecode, GA_C_ORDER, c, Py_None);
        }
#ifdef DEBUG_TIMING
        timer.start();
#endif
        err = cudnnFindConvolutionBackwardFilterAlgorithmEx(
          params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
          APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
          APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(k),
          1, &count, &choice, *(void **)tmpmem, maxfree);
#ifdef DEBUG_TIMING
        timer.end();
#endif
        gpudata_release(tmpmem);
        if (beta != 0) {
          Py_XDECREF(k);
        }
        if (err != CUDNN_STATUS_SUCCESS) {
          PyErr_Format(PyExc_RuntimeError,
                       "error selecting convolution algo: %s",
                       cudnnGetErrorString(err));
          cuda_exit(c->ctx);
          return 1;
        }
#ifdef DEBUG
        if (count == 0) {
          PyErr_SetString(PyExc_RuntimeError, "No best-timed conv gradweight algorithm found");
          cuda_exit(c->ctx);
          return 1;
        } else if (choice.status != CUDNN_STATUS_SUCCESS) {
          PyErr_Format(PyExc_RuntimeError,
                       "error getting best-timed gradweight algo: %s",
                       cudnnGetErrorString(choice.status));
          cuda_exit(c->ctx);
          return 1;
        } // Else, count is necessarly 1 for current implementation.
#endif
        algo = choice.algo;
        worksize = choice.memory;
#if CUDNN_MAJOR >= 7
        if (input->ga.typecode == GA_HALF)
          mathtype = choice.mathType;
#endif
      } else {
#ifdef DEBUG_TIMING
        timer.start();
#endif
        err = cudnnGetConvolutionBackwardFilterAlgorithm(
          params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
          desc, APPLY_SPECIFIC(kerns),
          CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, maxfree, &algo);
#ifdef DEBUG_TIMING
        timer.end();
#endif
        if (err != CUDNN_STATUS_SUCCESS) {
          PyErr_Format(PyExc_RuntimeError,
                       "error selecting convolution algo: %s",
                       cudnnGetErrorString(err));
          cuda_exit(c->ctx);
          return 1;
        }
      }
#ifdef DEBUG_TIMING
      total_selection_time += timer.milliseconds;
      ++n_selections;
#endif
    }
  } /* choose_algo */
  // Apply the chosen math type and fall back to a safe algorithm if the
  // selected one cannot handle the current shapes/strides.
  if (c_set_math_type_for_conv(desc, mathtype) == -1 ||
      dnn_conv_gw_fallback(&algo, input, *kerns, desc) != 0) {
    cuda_exit(c->ctx);
    return 1;
  }
  // if FindEx was used (choose_time), workspace size is set.
  if (!(reuse_algo || use_cached || params->choose_time))
  {
    err = cudnnGetConvolutionBackwardFilterWorkspaceSize(
      params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
      APPLY_SPECIFIC(kerns), algo, &worksize);
    if (err == CUDNN_STATUS_NOT_SUPPORTED) {
      // Fallback to none algo if not supported
#ifdef DEBUG
      if (0 != aesara_enum_to_string_cudnnConvolutionBwdFilterAlgo_t(algo, algorithm_name)) {
        cuda_exit(c->ctx);
        return 1;
      }
      fprintf(stderr, "(error getting worksize for %s: falling back to CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0)\n",
              algorithm_name);
#endif
      algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
      err = cudnnGetConvolutionBackwardFilterWorkspaceSize(
        params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
        APPLY_SPECIFIC(kerns), algo, &worksize);
    }
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
                   cudnnGetErrorString(err));
      cuda_exit(c->ctx);
      return 1;
    }
  }
  // Record the selection for reuse and/or the shape cache.
  if (params->choose_algo) {
#ifdef DEBUG
    if (0 != aesara_enum_to_string_cudnnConvolutionBwdFilterAlgo_t(algo, algorithm_name)) {
      cuda_exit(c->ctx);
      return 1;
    }
    fprintf(stderr, "(using %s%s %s%s%s, ws:%ld, hash:%s)\n",
            algorithm_name,
            mathtype == CUDNN_TENSOR_OP_MATH ? "(tensor_op)" : "",
            params->choose_time ? "(timed)": "" ,
            reuse_algo ? "(reused)" : "",
            use_cached ? "(cache)": "",
            worksize,
            hashkey.c_str()
    );
#endif
#ifdef DEBUG_TIMING
    if (!(reuse_algo || use_cached)) {
      // We have selected an algorithm at runtime.
      // `timer` still contains timing about selection step.
      fprintf(stderr, "\t(selected %s gradweight algo in %g milliseconds)\n", selection_name, timer.milliseconds);
      if (n_selections > 1) {
        fprintf(stderr, "\t(selected %lu gradweight algos in %g milliseconds (average: %g milliseconds per selection))\n",
                n_selections, total_selection_time, total_selection_time / n_selections);
      }
    }
#endif
    if (!reuse_algo) {
      // save for next time/cache
      prev_algo.algo = algo;
      prev_algo.wsSize = worksize;
      prev_algo.mathType = mathtype;
      // Add to the cache if we choose on shape change, or first time if
      // we choose once.
      if (!use_cached)
        dnn_conv_update_cache(hashkey, prev_algo);
      if (params->choose_once)
        reuse_algo = 1;
    }
  } // params->choose_algo
  // Allocate the workspace, order the stream, and run one backward-filter
  // call per group at the proper byte offsets.
  gpudata *workspace = 0;
  if (worksize != 0) {
    workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
    if (workspace == NULL) {
      PyErr_SetString(PyExc_RuntimeError, "Could not allocate working memory");
      cuda_exit(c->ctx);
      return 1;
    }
  }
  if (worksize != 0)
    cuda_wait(workspace, GPUARRAY_CUDA_WAIT_WRITE);
  cuda_wait(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_wait((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
#ifdef DEBUG_TIMING
  GpuArray_sync(&(*kerns)->ga);
  timer.start();
#endif
  for ( int g = 0; g < groups; g++) {
    err = cudnnConvolutionBackwardFilter(
      params->handle,
      alpha_p,
      APPLY_SPECIFIC(input), ((char *)PyGpuArray_DEV_DATA(input)) + input_offset * g ,
      APPLY_SPECIFIC(output), ((char *)PyGpuArray_DEV_DATA(output)) + output_offset * g,
      desc, algo, worksize == 0 ? NULL : *(void **)workspace, worksize,
      beta_p,
      APPLY_SPECIFIC(kerns), ((char *)PyGpuArray_DEV_DATA(*kerns)) + kern_offset * g);
  }
  if (worksize != 0) {
    cuda_record(workspace, GPUARRAY_CUDA_WAIT_WRITE);
    gpudata_release(workspace);
  }
  cuda_record(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_record(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_record((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
#ifdef DEBUG_TIMING
  GpuArray_sync(&(*kerns)->ga);
  timer.end();
  total_computation_time += timer.milliseconds;
  ++n_computations;
#endif
  cuda_exit(c->ctx);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error doing cuDNN conv gradweight operation: %s",
                 cudnnGetErrorString(err));
    return 1;
  }
#ifdef DEBUG_TIMING
  fprintf(stderr, "\t(ran gradweight algo in %g milliseconds)\n", timer.milliseconds);
  if (n_computations > 1) {
    fprintf(stderr, "\t(ran %lu gradweight computations in %g milliseconds (average: %g milliseconds per call))\n",
            n_computations, total_computation_time, total_computation_time / n_computations);
  }
#endif
  return 0;
}
#section support_code_struct
// Per-apply cuDNN descriptors used by the pooling-forward op (dnn_pool below).
cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
cudnnPoolingDescriptor_t APPLY_SPECIFIC(pool);
#section init_code_struct
cudnnStatus_t APPLY_SPECIFIC(err);
// NULL-initialize first so cleanup_code_struct only destroys descriptors
// that were actually created (init may FAIL part-way through).
APPLY_SPECIFIC(input) = NULL;
APPLY_SPECIFIC(output) = NULL;
APPLY_SPECIFIC(pool) = NULL;
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
               "(inp): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
  FAIL;
}
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
               "(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
  FAIL;
}
if ((APPLY_SPECIFIC(err) = cudnnCreatePoolingDescriptor(&APPLY_SPECIFIC(pool))) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_MemoryError, "could not allocate pooling descriptor"
               "(pool): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
  FAIL;
}
#section cleanup_code_struct
// Destroy only what init created; NULL checks make cleanup safe after
// a partial init failure.
if (APPLY_SPECIFIC(input) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input)); }
if (APPLY_SPECIFIC(output) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output)); }
if (APPLY_SPECIFIC(pool) != NULL) { cudnnDestroyPoolingDescriptor(APPLY_SPECIFIC(pool)); }
#section support_code_struct
/**
 * Forward pooling through cudnnPoolingForward.
 *
 * img    : contiguous GPU input (leading batch and channel dims, then
 *          ndims spatial dims; ndims is taken from len(ws), 2 or 3).
 * ws     : window sizes per spatial dim (npy_intp vector).
 * stride : strides per spatial dim.
 * pad    : paddings per spatial dim.
 * out    : output, (re)allocated via aesara_prep_output.
 * params : op params (cuDNN handle and pooling mode).
 *
 * Returns 0 on success, 1 on failure with a Python exception set.
 *
 * Fix vs. previous revision: removed the unused local
 * `cudnnPoolingMode_t mode;` — the pooling mode actually used is
 * params->mode, passed directly to cudnnSetPoolingNdDescriptor.
 */
int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
                             PyArrayObject *ws,
                             PyArrayObject *stride,
                             PyArrayObject *pad,
                             PyGpuArrayObject **out,
                             PARAMS_TYPE* params) {
  PyGpuContextObject *c = img->context;
  size_t dims[5];
  cudnnStatus_t err;

  if (!GpuArray_IS_C_CONTIGUOUS(&img->ga)) {
    PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
    return 1;
  }

  int w[3];
  int p[3];
  int s[3];
  // Number of pooled (spatial) dimensions comes from the window vector.
  int ndims = PyArray_DIM(ws, 0);//PyGpuArray_NDIM(img) - 2;
  for(int i = 0; i < ndims; i++) {
    w[i] = *((npy_intp*)PyArray_GETPTR1(ws, i));
  }
  for(int i = 0; i < ndims; i++) {
    p[i] = *((npy_intp*)PyArray_GETPTR1(pad, i));
  }
  for(int i = 0; i < ndims; i++) {
    s[i] = *((npy_intp*)PyArray_GETPTR1(stride, i));
  }

  // Standard pooling output size: (in + 2*pad - window) / stride + 1.
  dims[0] = PyGpuArray_DIM(img, 0);
  dims[1] = PyGpuArray_DIM(img, 1);
  dims[2] = (PyGpuArray_DIM(img, 2) + (p[0]*2) - w[0]) / s[0] + 1;
  dims[3] = (PyGpuArray_DIM(img, 3) + (p[1]*2) - w[1]) / s[1] + 1;
  if (ndims == 3)
    dims[4] = (PyGpuArray_DIM(img, 4) + (p[2]*2) - w[2]) / s[2] + 1;

  if (aesara_prep_output(out, ndims+2, dims, img->ga.typecode,
                         GA_C_ORDER, c) != 0)
    return 1;

  // if input batch is empty, we return the empty output without calling cuDNN
  // (which will fail on zero batch size).
  if (PyGpuArray_DIM(*out, 0) == 0)
    return 0;

  if (c_set_tensorNd(img, APPLY_SPECIFIC(input)) != 0)
    return 1;
  if (c_set_tensorNd(*out, APPLY_SPECIFIC(output)) != 0)
    return 1;

  err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), params->mode, CUDNN_PROPAGATE_NAN, ndims, w, p, s);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "could not set op descriptor %s", cudnnGetErrorString(err));
    return 1;
  }

  {
    // cuDNN scaling factors must match the data type: float for
    // GA_FLOAT/GA_HALF, double for GA_DOUBLE.
    const float alphaf = 1;
    const float betaf = 0;
    const double alphad = 1;
    const double betad = 0;
    void *alpha, *beta;
    switch (img->ga.typecode) {
    case GA_DOUBLE:
      alpha = (void *)&alphad;
      beta = (void *)&betad;
      break;
    case GA_FLOAT:
    case GA_HALF:
      alpha = (void *)&alphaf;
      beta = (void *)&betaf;
      break;
    default:
      PyErr_SetString(PyExc_TypeError, "Unsupported type in pooling");
      return 1;
    }

    cuda_enter(c->ctx);
    cuda_wait(img->ga.data, GPUARRAY_CUDA_WAIT_READ);
    cuda_wait((*out)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);

    err = cudnnPoolingForward(
      params->handle, APPLY_SPECIFIC(pool),
      alpha,
      APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(img),
      beta,
      APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*out));

    cuda_record(img->ga.data, GPUARRAY_CUDA_WAIT_READ);
    cuda_record((*out)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
    cuda_exit(c->ctx);
  }
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "GpuDnnPool: error doing cudnnPoolingForward operation: %s",
                 cudnnGetErrorString(err));
    return 1;
  }
  return 0;
}
#section support_code_struct
// Per-apply cuDNN descriptors used by the pooling-gradient op
// (dnn_pool_grad below).
cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
cudnnTensorDescriptor_t APPLY_SPECIFIC(input_grad);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output_grad);
cudnnPoolingDescriptor_t APPLY_SPECIFIC(pool);
#section init_code_struct
// NULL-initialize first so the cleanup section only destroys descriptors
// that were actually created.
APPLY_SPECIFIC(input) = NULL;
APPLY_SPECIFIC(input_grad) = NULL;
APPLY_SPECIFIC(output) = NULL;
APPLY_SPECIFIC(output_grad) = NULL;
APPLY_SPECIFIC(pool) = NULL;
{
  cudnnStatus_t err;
  if ((err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_MemoryError,
                 "could not allocate tensor descriptor (input): %s",
                 cudnnGetErrorString(err));
    FAIL;
  }
  if ((err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input_grad))) != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_MemoryError,
                 "could not allocate tensor descriptor (input_grad): %s",
                 cudnnGetErrorString(err));
    FAIL;
  }
  if ((err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_MemoryError,
                 "could not allocate tensor descriptor (output): %s",
                 cudnnGetErrorString(err));
    FAIL;
  }
  if ((err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output_grad))) != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_MemoryError,
                 "could not allocate tensor descriptor (output_grad): %s",
                 cudnnGetErrorString(err));
    FAIL;
  }
  if ((err = cudnnCreatePoolingDescriptor(&APPLY_SPECIFIC(pool))) != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_MemoryError, "could not allocate pooling descriptor"
                 "(pool): %s", cudnnGetErrorString(err));
    FAIL;
  }
}
#section cleanup_code_struct
if (APPLY_SPECIFIC(input) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input)); }
if (APPLY_SPECIFIC(input_grad) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input_grad)); }
if (APPLY_SPECIFIC(output) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output)); }
if (APPLY_SPECIFIC(output_grad) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output_grad)); }
if (APPLY_SPECIFIC(pool) != NULL) { cudnnDestroyPoolingDescriptor(APPLY_SPECIFIC(pool)); }
#section support_code_struct
/**
 * Backward pooling through cudnnPoolingBackward.
 *
 * inp      : contiguous GPU input of the forward pass.
 * out      : contiguous forward-pass output.
 * out_grad : contiguous gradient w.r.t. the forward output.
 * ws/stride/pad : pooling geometry (same meaning as in dnn_pool).
 * inp_grad : gradient w.r.t. inp, (re)allocated via aesara_prep_output
 *            with the same shape/dtype as inp.
 * params   : op params (cuDNN handle and pooling mode).
 *
 * Returns 0 on success, 1 on failure with a Python exception set.
 *
 * Fix vs. previous revision: a failed cudnnSetPoolingNdDescriptor set a
 * Python exception but fell through and still called cudnnPoolingBackward
 * with an unconfigured descriptor; it now returns 1 immediately, matching
 * the error handling in dnn_pool.
 */
int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
                                  PyGpuArrayObject *out,
                                  PyGpuArrayObject *out_grad,
                                  PyArrayObject *ws,
                                  PyArrayObject *stride,
                                  PyArrayObject *pad,
                                  PyGpuArrayObject **inp_grad,
                                  PARAMS_TYPE* params) {
  PyGpuContextObject *c = inp->context;
  cudnnStatus_t err;

  if (!GpuArray_IS_C_CONTIGUOUS(&inp->ga)) {
    PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
    return 1;
  }
  if (!GpuArray_IS_C_CONTIGUOUS(&out_grad->ga)) {
    PyErr_SetString(PyExc_ValueError, "Only contiguous output gradients are supported.");
    return 1;
  }
  if (!GpuArray_IS_C_CONTIGUOUS(&out->ga)) {
    PyErr_SetString(PyExc_ValueError, "Only contiguous outputs are supported.");
    return 1;
  }

  // Gradient w.r.t. the input has the input's shape and dtype.
  if (aesara_prep_output(inp_grad, PyGpuArray_NDIM(inp),
                         PyGpuArray_DIMS(inp), inp->ga.typecode,
                         GA_C_ORDER, c) != 0) {
    return 1;
  }

  // if input batch is empty, we return the empty output without calling cuDNN
  // (which will fail on zero batch size).
  if (PyGpuArray_DIM(*inp_grad, 0) == 0)
    return 0;

  if (c_set_tensorNd(inp, APPLY_SPECIFIC(input)) != 0)
    return 1;
  if (c_set_tensorNd(out_grad, APPLY_SPECIFIC(output_grad)) != 0)
    return 1;
  if (c_set_tensorNd(out, APPLY_SPECIFIC(output)) != 0)
    return 1;

  int w[3];
  int p[3];
  int s[3];
  // Number of pooled (spatial) dimensions comes from the window vector.
  int ndims = PyArray_DIM(ws, 0);//PyGpuArray_NDIM(img) - 2;
  for(int i = 0; i < ndims; i++) {
    w[i] = *((npy_intp*)PyArray_GETPTR1(ws, i));
  }
  for(int i = 0; i < ndims; i++) {
    p[i] = *((npy_intp*)PyArray_GETPTR1(pad, i));
  }
  for(int i = 0; i < ndims; i++) {
    s[i] = *((npy_intp*)PyArray_GETPTR1(stride, i));
  }

  err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), params->mode, CUDNN_PROPAGATE_NAN, ndims, w, p, s);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "could not set op descriptor %s", cudnnGetErrorString(err));
    // BUGFIX: previously fell through and called cudnnPoolingBackward anyway.
    return 1;
  }

  if (c_set_tensorNd(*inp_grad, APPLY_SPECIFIC(input_grad)) != 0)
    return 1;

  {
    // cuDNN scaling factors must match the data type: float for
    // GA_FLOAT/GA_HALF, double for GA_DOUBLE.
    const float alphaf = 1;
    const float betaf = 0;
    const double alphad = 1;
    const double betad = 0;
    void *alpha, *beta;
    switch (inp->ga.typecode) {
    case GA_DOUBLE:
      alpha = (void *)&alphad;
      beta = (void *)&betad;
      break;
    case GA_FLOAT:
    case GA_HALF:
      alpha = (void *)&alphaf;
      beta = (void *)&betaf;
      break;
    default:
      PyErr_SetString(PyExc_TypeError, "Unsupported type in pooling gradient");
      return 1;
    }

    cuda_enter(c->ctx);
    cuda_wait(out->ga.data, GPUARRAY_CUDA_WAIT_READ);
    cuda_wait(out_grad->ga.data, GPUARRAY_CUDA_WAIT_READ);
    cuda_wait(inp->ga.data, GPUARRAY_CUDA_WAIT_READ);
    cuda_wait((*inp_grad)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);

    err = cudnnPoolingBackward(
      params->handle, APPLY_SPECIFIC(pool),
      alpha,
      APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(out),
      APPLY_SPECIFIC(output_grad), PyGpuArray_DEV_DATA(out_grad),
      APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(inp),
      beta,
      APPLY_SPECIFIC(input_grad), PyGpuArray_DEV_DATA(*inp_grad)
    );

    cuda_record(out->ga.data, GPUARRAY_CUDA_WAIT_READ);
    cuda_record(out_grad->ga.data, GPUARRAY_CUDA_WAIT_READ);
    cuda_record(inp->ga.data, GPUARRAY_CUDA_WAIT_READ);
    cuda_record((*inp_grad)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
    cuda_exit(c->ctx);
  }
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error doing operation: %s.",
                 cudnnGetErrorString(err));
    return 1;
  }
  return 0;
}
#section support_code_struct
// Per-apply state for the reduction op (dnn_redux below): cuDNN tensor and
// reduction descriptors, plus a lazily-created GpuElemwise used to apply
// abs() on the shortcut path for AMAX/NORM1/NORM2 reductions.
cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
cudnnReduceTensorDescriptor_t APPLY_SPECIFIC(red);
GpuElemwise* elemwise;
gpuelemwise_arg arg;
#section init_code_struct
cudnnStatus_t APPLY_SPECIFIC(err);
// NULL-initialize first so the cleanup section only destroys what was created.
APPLY_SPECIFIC(input) = NULL;
APPLY_SPECIFIC(output) = NULL;
APPLY_SPECIFIC(red) = NULL;
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
               "(inp): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
  FAIL;
}
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
               "(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
  FAIL;
}
if ((APPLY_SPECIFIC(err) = cudnnCreateReduceTensorDescriptor(&APPLY_SPECIFIC(red))) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_MemoryError, "could not allocate reduction descriptor"
               "(red): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
  FAIL;
}
// Created on demand in dnn_redux; freed in cleanup.
elemwise = NULL;
#section cleanup_code_struct
if (APPLY_SPECIFIC(input) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input)); }
if (APPLY_SPECIFIC(output) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output)); }
if (APPLY_SPECIFIC(red) != NULL) { cudnnDestroyReduceTensorDescriptor(APPLY_SPECIFIC(red)); }
if (elemwise) {
  GpuElemwise_free(elemwise);
  elemwise = NULL;
}
#section support_code_struct
/**
 * Tensor reduction through cudnnReduceTensor.
 *
 * input   : contiguous GPU input.
 * output  : reduced result; shape is input's shape with the axes selected
 *           by the params->c_axis bitmask removed.
 * indices : optional (may be NULL); when given, receives flattened
 *           32-bit argmin/argmax-style indices as GA_UINT.
 * params  : op params (handle, c_axis bitmask, red_op, acc_dtype).
 *
 * Returns 0 on success, 1 on failure with a Python exception set.
 *
 * Two paths:
 *  - Shortcut: when no axis is reduced or every reduced axis has size 1,
 *    the result is just a reshaped copy of the input (cuDNN up to 7004
 *    cannot reduce over size-1-only areas).  For AMAX/NORM1/NORM2 an
 *    elementwise abs() is applied to match cuDNN semantics.
 *  - General: temporarily rewrites the output's nd/dims/strides so cuDNN
 *    sees an output with the same rank as the input (reduced dims = 1),
 *    then restores the output object.
 */
int APPLY_SPECIFIC(dnn_redux)(PyGpuArrayObject *input,
                              PyGpuArrayObject **output,
                              PyGpuArrayObject **indices,
                              PARAMS_TYPE* params) {
  PyGpuContextObject *c = input->context;
  gpudata *workspace = NULL;
  size_t worksize = 0;
  size_t indsize = 0;
  size_t *tdims;
  ssize_t *tstrs;
  size_t dims[8];
  ssize_t strs[8];
  size_t rsz;
  void *alpha;
  void *beta;
  cudnnStatus_t err;
  unsigned int p;
  int e;
  // Static scaling factors: cuDNN expects float scalars for float/half
  // data and double scalars for double data.
  static float falpha = 1.0f;
  static double dalpha = 1.0;
  static float fbeta = 0.0f;
  static double dbeta = 0.0;

  if (!GpuArray_IS_C_CONTIGUOUS(&input->ga)) {
    PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
    return 1;
  }

  if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) != 0)
    return 1;

  // p = number of kept dims; rsz = total size of the reduced area.
  p = 0;
  rsz = 1;
  for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
    if (!(params->c_axis & (1U << i))) {
      dims[p] = PyGpuArray_DIM(input, i);
      p++;
    } else {
      rsz *= PyGpuArray_DIM(input, i);
    }
  }

  if (indices != NULL) {
    if (aesara_prep_output(indices, p, dims, GA_UINT, GA_C_ORDER, c) != 0)
      return 1;
    // 4 bytes per index: only CUDNN_32BIT_INDICES is used below.
    indsize = PyGpuArray_SIZE(*indices) * 4;
  }

  if (p == input->ga.nd || rsz == 1) {
    // Shortcut: nothing (or only size-1 axes) to reduce; the output is a
    // reshaped copy of the input.
    int err;
    Py_XDECREF(*output);
    *output = pygpu_copy(input, GA_C_ORDER);
    if (*output == NULL)
      return 1;
    err = GpuArray_reshape_inplace(&(*output)->ga, p, dims, GA_C_ORDER);
    if (err != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError, "GpuArray_reshape_inplace: %s", GpuArray_error(&(*output)->ga, err));
      return 1;
    }
    if (rsz == 1) {
      /* We must reduce some dimensions which have all size 1.
       * cuDNN (up to 7004) does not support this case. Let's use GpuElemwise. */
      switch (params->red_op) {
      // Nothing to do for following cases.
      case CUDNN_REDUCE_TENSOR_ADD: break;
      case CUDNN_REDUCE_TENSOR_MUL: break;
      case CUDNN_REDUCE_TENSOR_MIN: break;
      case CUDNN_REDUCE_TENSOR_MAX: break;
      case CUDNN_REDUCE_TENSOR_AVG: break;
      /* Work to do for following cases.
         AMAX (maximum on absolute values) => apply abs(output)
         NORM1 (addition of absolute values) => apply abs(output)
         NORM2 (square root of sum of squares) => sqroot(output^2) => abs(output)
         So, we must apply abs(output) for all following cases.
      */
      case CUDNN_REDUCE_TENSOR_AMAX:
      case CUDNN_REDUCE_TENSOR_NORM1:
      case CUDNN_REDUCE_TENSOR_NORM2:
      {
        // Lazily build the abs() kernel once per apply; reused afterwards.
        if (elemwise == NULL) {
          arg.name = "out";
          arg.typecode = (*output)->ga.typecode;
          arg.flags = GE_READ | GE_WRITE;
          elemwise = GpuElemwise_new(c->ctx, "", "out = (out < 0 ? -out : out)", 1, &arg, p, GE_CONVERT_F16);
          if (!elemwise) {
            PyErr_SetString(PyExc_RuntimeError, "Unable to create GpuElemwise for output.");
            return 1;
          }
        }
        void* args[1] = { (void*)&(*output)->ga };
        int err = GpuElemwise_call(elemwise, args, 0);
        if (err != GA_NO_ERROR) {
          PyErr_SetString(PyExc_RuntimeError, "Unable to call GpuElemwise on output.");
          return 1;
        };
      }
      break;
      default: break;
      }
    }
    if (indices != NULL) {
      // All indices will be 0 since the size of the reduced area is 1.
      err = GpuArray_memset(&(*indices)->ga, 0);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError, "GpuArray_memset: %s", GpuArray_error(&(*indices)->ga, err));
        return 1;
      }
    }
    // This is a shortcut path.
    return 0;
  }

  if (aesara_prep_output(output, p, dims, input->ga.typecode,
                         GA_C_ORDER, c) != 0)
    return 1;

  // cuDNN expect that the output has the same number of dimension as
  // the input, but the dimensions to reduce are of size 1 in the output.
  // We have to do some trickery to be able to pass it what it need.
  p = 0;
  for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
    if (params->c_axis & (1U << i)) {
      dims[i] = 1;
      strs[i] = 0;
    } else {
      dims[i] = PyGpuArray_DIM(input, i);
      strs[i] = PyGpuArray_STRIDE(*output, p);
      p++;
    }
  }

  // Perform horrible surgery to be able to reuse c_set_tensorNd()
  tdims = (*output)->ga.dimensions;
  tstrs = (*output)->ga.strides;
  (*output)->ga.dimensions = dims;
  (*output)->ga.strides = strs;
  (*output)->ga.nd = input->ga.nd;
  // Delay error checking to avoid exposing a broken object
  e = c_set_tensorNd(*output, APPLY_SPECIFIC(output));
  // Undo our horrible surgery
  (*output)->ga.nd = p;
  (*output)->ga.dimensions = tdims;
  (*output)->ga.strides = tstrs;
  if (e != 0)
    return 1;

  // Back to normal, no more horrible things
  // Note that only CUDNN_32BIT_INDICES is implemented
  err = cudnnSetReduceTensorDescriptor(
    APPLY_SPECIFIC(red), params->red_op,
    params->acc_dtype, CUDNN_PROPAGATE_NAN,
    indices == NULL ? CUDNN_REDUCE_TENSOR_NO_INDICES : CUDNN_REDUCE_TENSOR_FLATTENED_INDICES,
    CUDNN_32BIT_INDICES);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "could not set reduce descriptor: %s",
                 cudnnGetErrorString(err));
    return 1;
  }

  switch (input->ga.typecode) {
  case GA_FLOAT:
  case GA_HALF:
    alpha = &falpha;
    beta = &fbeta;
    break;
  case GA_DOUBLE:
    alpha = &dalpha;
    beta = &dbeta;
    break;
  default:
    PyErr_SetString(PyExc_RuntimeError, "Unsupported dtype in dnn reduce");
    return 1;
  }

  err = cudnnGetReductionWorkspaceSize(params->handle,
                                       APPLY_SPECIFIC(red),
                                       APPLY_SPECIFIC(input),
                                       APPLY_SPECIFIC(output),
                                       &worksize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "could not get reduce workspace size: %s",
                 cudnnGetErrorString(err));
    return 1;
  }

  if (worksize != 0) {
    workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, &e);
    if (workspace == NULL) {
      PyErr_Format(PyExc_RuntimeError, "gpudata_alloc: %s",
                   gpucontext_error(c->ctx, e));
      return 1;
    }
  }

  err = cudnnReduceTensor(params->handle, APPLY_SPECIFIC(red),
                          indices ? PyGpuArray_DEV_DATA(*indices) : NULL, indsize,
                          worksize ? *((void **)workspace) : NULL, worksize,
                          alpha,
                          APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
                          beta,
                          APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output));
  if (workspace != NULL)
    gpudata_release(workspace);

  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "could not run reduction: %s",
                 cudnnGetErrorString(err));
    return 1;
  }
  return 0;
}
#section support_code
/**
 * Create and configure a cuDNN RNN descriptor.
 *
 * hidden_size/num_layers : RNN geometry.
 * ddesc                  : a pre-built dropout descriptor.
 * input_mode/direction_mode/rnn_mode : int values cast to the matching
 *                          cuDNN enums.
 * dtype                  : gpuarray typecode (GA_FLOAT/GA_DOUBLE/GA_HALF).
 * odesc                  : out; receives the new descriptor on success.
 * _handle                : cuDNN handle (only used by the v7+ API).
 *
 * Returns 0 on success, -1 on failure with a Python exception set.  The
 * descriptor is destroyed on the failure path, so the caller owns it only
 * on success.
 */
int dnn_rnn_desc(int hidden_size, int num_layers,
                 cudnnDropoutDescriptor_t ddesc,
                 int input_mode, int direction_mode, int rnn_mode,
                 int dtype, cudnnRNNDescriptor_t *odesc,
                 cudnnHandle_t _handle) {
  cudnnRNNDescriptor_t desc;
  cudnnDataType_t data_type;
  cudnnStatus_t err;

  switch (dtype) {
  case GA_FLOAT:
    data_type = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    data_type = CUDNN_DATA_DOUBLE;
    break;
  case GA_HALF:
    data_type = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString(PyExc_ValueError, "Unsupported data type");
    return -1;
  }

  err = cudnnCreateRNNDescriptor(&desc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_SetString(PyExc_RuntimeError, "Can't create RNN descriptor");
    return -1;
  }

// cuDNN 7 changed cudnnSetRNNDescriptor: it takes the handle and an
// explicit algorithm selection (standard algo used here).
#if CUDNN_MAJOR < 7
  err = cudnnSetRNNDescriptor(desc, hidden_size, num_layers, ddesc,
                              (cudnnRNNInputMode_t)input_mode,
                              (cudnnDirectionMode_t)direction_mode,
                              (cudnnRNNMode_t)rnn_mode, data_type);
#else
  err = cudnnSetRNNDescriptor(_handle, desc, hidden_size, num_layers, ddesc,
                              (cudnnRNNInputMode_t)input_mode,
                              (cudnnDirectionMode_t)direction_mode,
                              (cudnnRNNMode_t)rnn_mode, CUDNN_RNN_ALGO_STANDARD, data_type);
#endif
  if (err != CUDNN_STATUS_SUCCESS) {
    cudnnDestroyRNNDescriptor(desc);
    PyErr_SetString(PyExc_RuntimeError, "Can't set RNN descriptor");
    return -1;
  }
  *odesc = desc;
  return 0;
}
#section support_code
/**
 * RNN forward training pass through cudnnRNNForwardTraining.
 *
 * desc     : configured RNN descriptor (see dnn_rnn_desc).
 * numDirs  : 1 or 2 (bidirectional) — scales the output feature size.
 * w        : flat weights; x : input (seqLength, miniBatch, inputSize);
 * hx/cx    : initial hidden/cell state (cx may be NULL for non-LSTM).
 * reserve  : out; training reserve buffer needed by the backward passes.
 * y/hy/cy  : outputs, (re)allocated via aesara_prep_output.
 * _handle  : cuDNN handle.
 *
 * Returns 0 on success, -1 on failure with a Python exception set.  All
 * temporary descriptors and the workspace are released on both paths via
 * the fail label.
 * NOTE(review): on failure after *reserve was allocated, the reserve
 * buffer is not released here — presumably the caller cleans it up;
 * verify against callers.
 */
int dnn_rnn_fwd(cudnnRNNDescriptor_t desc, uint32_t numDirs,
                PyGpuArrayObject *w, PyGpuArrayObject *x,
                PyGpuArrayObject *hx, PyGpuArrayObject *cx,
                gpudata **reserve, PyGpuArrayObject **y,
                PyGpuArrayObject **hy, PyGpuArrayObject **cy,
                cudnnHandle_t _handle) {
  PyGpuContextObject *c = x->context;
  cudnnTensorDescriptor_t xdesc = NULL;
  cudnnTensorDescriptor_t hxdesc = NULL;
  cudnnTensorDescriptor_t cxdesc = NULL;
  cudnnTensorDescriptor_t ydesc = NULL;
  cudnnTensorDescriptor_t hydesc = NULL;
  cudnnTensorDescriptor_t cydesc = NULL;
  cudnnFilterDescriptor_t wdesc = NULL;
  cudnnTensorDescriptor_t *xl = NULL;
  cudnnTensorDescriptor_t *yl = NULL;
  gpudata *workspace = NULL;
  size_t worksize, ressize;
  size_t seqLength = PyGpuArray_DIM(x, 0);
  size_t miniBatch = PyGpuArray_DIM(x, 1);
  size_t inputSize = PyGpuArray_DIM(x, 2);  // NOTE(review): unused below
  size_t hiddenSize = PyGpuArray_DIM(hx, 2);
  size_t shape[3];
  int strs[3], dims[3];
  cudnnStatus_t err;
  cudnnDataType_t dt;
  int res = -1;

  switch (x->ga.typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Unsupported data type for x");
    return -1;
  }

  // This is early to match the exit() in the fail label.
  cuda_enter(c->ctx);

  err = cudnnCreateTensorDescriptor(&xdesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create xdesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  // Per-timestep descriptor: (miniBatch, inputSize, 1) packed C-order.
  dims[0] = PyGpuArray_DIM(x, 1);
  dims[1] = PyGpuArray_DIM(x, 2);
  dims[2] = 1;

  strs[0] = dims[1] * dims[2];
  strs[1] = dims[2];
  strs[2] = 1;

  err = cudnnSetTensorNdDescriptor(xdesc, dt, 3, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set xdesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  if (c_make_tensorNd(hx, &hxdesc) != 0)
    goto fail;

  if (cx != NULL)
    if (c_make_tensorNd(cx, &cxdesc) != 0)
      goto fail;

  if (c_make_filter(w, &wdesc) != 0)
    goto fail;

  // Output: (seqLength, miniBatch, hiddenSize * numDirs).
  shape[0] = seqLength;
  shape[1] = miniBatch;
  shape[2] = hiddenSize * numDirs;
  if (aesara_prep_output(y, 3, shape, x->ga.typecode, GA_C_ORDER, c) != 0)
    goto fail;

  err = cudnnCreateTensorDescriptor(&ydesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create ydesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  dims[0] = shape[1];
  dims[1] = shape[2];
  dims[2] = 1;

  strs[0] = dims[2] * dims[1];
  strs[1] = dims[2];
  strs[2] = 1;

  err = cudnnSetTensorNdDescriptor(ydesc, dt, 3, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set ydesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  if (aesara_prep_output(hy, 3, PyGpuArray_DIMS(hx),
                         hx->ga.typecode, GA_C_ORDER, c) != 0)
    goto fail;

  if (c_make_tensorNd(*hy, &hydesc) != 0)
    goto fail;

  if (cy != NULL) {
    if (aesara_prep_output(cy, 3, PyGpuArray_DIMS(cx),
                           cx->ga.typecode, GA_C_ORDER, c) != 0)
      goto fail;

    if (c_make_tensorNd(*cy, &cydesc) != 0)
      goto fail;
  }

  // cuDNN's pre-packed RNN API wants one descriptor pointer per timestep;
  // all timesteps share the same descriptor here.
  xl = (cudnnTensorDescriptor_t *)calloc(sizeof(cudnnTensorDescriptor_t), seqLength);
  if (xl == NULL) {
    PyErr_NoMemory();
    goto fail;
  }

  for (size_t i = 0; i < seqLength; i++)
    xl[i] = xdesc;

  yl = (cudnnTensorDescriptor_t *)calloc(sizeof(cudnnTensorDescriptor_t), seqLength);
  if (yl == NULL) {
    PyErr_NoMemory();
    goto fail;
  }

  for (size_t i = 0; i < seqLength; i++)
    yl[i] = ydesc;

  err = cudnnGetRNNWorkspaceSize(_handle, desc, (int)seqLength,
                                 xl, &worksize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not get worksize: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
  if (workspace == NULL) {
    PyErr_Format(PyExc_RuntimeError, "Could not allocate workspace");
    goto fail;
  }

  err = cudnnGetRNNTrainingReserveSize(_handle, desc, (int)seqLength,
                                       xl, &ressize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not get reserve size: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  *reserve = gpudata_alloc(c->ctx, ressize, NULL, 0, NULL);
  if (*reserve == NULL) {
    PyErr_Format(PyExc_RuntimeError, "Could not allocate reserve");
    goto fail;
  }

  err = cudnnRNNForwardTraining(_handle, desc, (int)seqLength,
                                xl, PyGpuArray_DEV_DATA(x),
                                hxdesc, PyGpuArray_DEV_DATA(hx),
                                cxdesc, cx ? PyGpuArray_DEV_DATA(cx) : NULL,
                                wdesc, PyGpuArray_DEV_DATA(w),
                                yl, PyGpuArray_DEV_DATA(*y),
                                hydesc, PyGpuArray_DEV_DATA(*hy),
                                cydesc, cy ? PyGpuArray_DEV_DATA(*cy) : NULL,
                                *(void **)workspace, worksize,
                                *(void **)(*reserve), ressize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could run RNN: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  res = 0;
 fail:
  // Shared cleanup for both success and failure paths.
  if (xdesc != NULL)
    cudnnDestroyTensorDescriptor(xdesc);
  if (hxdesc != NULL)
    cudnnDestroyTensorDescriptor(hxdesc);
  if (cxdesc != NULL)
    cudnnDestroyTensorDescriptor(cxdesc);
  if (wdesc != NULL)
    cudnnDestroyFilterDescriptor(wdesc);
  if (ydesc != NULL)
    cudnnDestroyTensorDescriptor(ydesc);
  if (hydesc != NULL)
    cudnnDestroyTensorDescriptor(hydesc);
  if (cydesc != NULL)
    cudnnDestroyTensorDescriptor(cydesc);
  free(xl);
  free(yl);
  if (workspace != NULL)
    gpudata_release(workspace);
  cuda_exit(c->ctx);
  return res;
}
#section support_code
/**
 * RNN gradient w.r.t. inputs through cudnnRNNBackwardData.
 *
 * desc     : configured RNN descriptor.
 * xshp     : input feature size of the forward pass (last dim of x).
 * y/dy     : forward output and its gradient (same shape).
 * w/hx/cx  : forward weights and initial states (cx may be NULL).
 * reserve  : reserve buffer from dnn_rnn_fwd (copied, not consumed).
 * dhy/dcy  : optional gradients on the final states (may be NULL).
 * oreserve : out; a copy of the reserve, updated by the backward pass,
 *            for later use by dnn_rnn_gw.
 * dx/dhx/dcx : output gradients, (re)allocated via aesara_prep_output.
 *
 * Returns 0 on success, -1 on failure with a Python exception set.
 */
int dnn_rnn_gi(cudnnRNNDescriptor_t desc, npy_uint64 xshp,
               PyGpuArrayObject *y, PyGpuArrayObject *dy,
               PyGpuArrayObject *w, PyGpuArrayObject *hx,
               gpudata *reserve, PyGpuArrayObject *cx,
               PyGpuArrayObject *dhy, PyGpuArrayObject *dcy,
               gpudata **oreserve, PyGpuArrayObject **dx,
               PyGpuArrayObject **dhx, PyGpuArrayObject **dcx,
               cudnnHandle_t _handle) {
  PyGpuContextObject *c = y->context;
  cudnnTensorDescriptor_t ydesc = NULL;
  cudnnTensorDescriptor_t dhydesc = NULL;
  cudnnTensorDescriptor_t dcydesc = NULL;
  cudnnFilterDescriptor_t wdesc = NULL;
  cudnnTensorDescriptor_t hxdesc = NULL;
  cudnnTensorDescriptor_t cxdesc = NULL;
  cudnnTensorDescriptor_t dxdesc = NULL;
  cudnnTensorDescriptor_t dhxdesc = NULL;
  cudnnTensorDescriptor_t dcxdesc = NULL;
  cudnnTensorDescriptor_t *yl = NULL;
  cudnnTensorDescriptor_t *dxl = NULL;
  gpudata *workspace = NULL;
  size_t worksize, ressize;
  size_t seqLength = PyGpuArray_DIM(y, 0);
  size_t miniBatch = PyGpuArray_DIM(y, 1);
  size_t inputSize = xshp;
  size_t shape[3];
  int dims[3], strs[3];
  cudnnStatus_t err;
  cudnnDataType_t dt;
  int res = -1;

  switch (y->ga.typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Unsupported data type for y");
    return -1;
  }

  // Entered early to match the cuda_exit() in the fail label.
  cuda_enter(c->ctx);

  err = cudnnCreateTensorDescriptor(&ydesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create ydesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  /* We need to use the last two dimensions for this, this is not a typo */
  dims[0] = PyGpuArray_DIM(y, 1);
  dims[1] = PyGpuArray_DIM(y, 2);
  dims[2] = 1;

  strs[0] = dims[2] * dims[1];
  strs[1] = dims[2];
  strs[2] = 1;

  err = cudnnSetTensorNdDescriptor(ydesc, dt, 3, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set ydesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  if (dhy != NULL)
    if (c_make_tensorNd(dhy, &dhydesc) != 0)
      goto fail;

  if (dcy != NULL)
    if (c_make_tensorNd(dcy, &dcydesc) != 0)
      goto fail;

  if (c_make_filter(w, &wdesc) != 0)
    goto fail;

  if (c_make_tensorNd(hx, &hxdesc) != 0)
    goto fail;

  if (cx != NULL)
    if (c_make_tensorNd(cx, &cxdesc) != 0)
      goto fail;

  // dx has the forward input's shape: (seqLength, miniBatch, inputSize).
  shape[0] = seqLength;
  shape[1] = miniBatch;
  shape[2] = inputSize;
  if (aesara_prep_output(dx, 3, shape, y->ga.typecode, GA_C_ORDER, c) != 0)
    goto fail;

  err = cudnnCreateTensorDescriptor(&dxdesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create dxdesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  /* Again not a typo, we need to use the last two dimensions */
  dims[0] = shape[1];
  dims[1] = shape[2];
  dims[2] = 1;

  strs[0] = dims[2] * dims[1];
  strs[1] = dims[2];
  strs[2] = 1;

  err = cudnnSetTensorNdDescriptor(dxdesc, dt, 3, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set dxdesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  if (aesara_prep_output(dhx, 3, PyGpuArray_DIMS(hx), hx->ga.typecode,
                         GA_C_ORDER, c) != 0)
    goto fail;

  if (c_make_tensorNd(*dhx, &dhxdesc) != 0)
    goto fail;

  // dcx is only prepared when a cell state exists (LSTM).
  // NOTE(review): the cudnnRNNBackwardData call below guards the dcx data
  // pointer on `dcx` (the out-param pointer) rather than `cx` — confirm
  // callers always pass dcx == NULL when cx == NULL.
  if (cx != NULL) {
    if (aesara_prep_output(dcx, 3, PyGpuArray_DIMS(cx), cx->ga.typecode,
                           GA_C_ORDER, c) != 0)
      goto fail;

    if (c_make_tensorNd(*dcx, &dcxdesc) != 0)
      goto fail;
  }

  // One descriptor pointer per timestep; all share the same descriptor.
  yl = (cudnnTensorDescriptor_t *)calloc(sizeof(cudnnTensorDescriptor_t), seqLength);
  if (yl == NULL) {
    PyErr_NoMemory();
    goto fail;
  }

  for (size_t i = 0; i < seqLength; i++)
    yl[i] = ydesc;

  dxl = (cudnnTensorDescriptor_t *)calloc(sizeof(cudnnTensorDescriptor_t), seqLength);
  if (dxl == NULL) {
    PyErr_NoMemory();
    goto fail;
  }

  for (size_t i = 0; i < seqLength; i++)
    dxl[i] = dxdesc;

  err = cudnnGetRNNWorkspaceSize(_handle, desc, (int)seqLength, dxl, &worksize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not get worksize: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
  if (workspace == NULL) {
    PyErr_Format(PyExc_RuntimeError, "Could not allocate workspace");
    goto fail;
  }

  err = cudnnGetRNNTrainingReserveSize(_handle, desc, (int)seqLength,
                                       dxl, &ressize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not get reserve size: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  // Work on a copy of the reserve so the original stays valid for other
  // gradient computations.
  *oreserve = gpudata_alloc(c->ctx, ressize, NULL, 0, NULL);
  if (*oreserve == NULL) {
    PyErr_Format(PyExc_RuntimeError, "Could not allocate reserve");
    goto fail;
  }

  if (gpudata_move(*oreserve, 0, reserve, 0, ressize) != GA_NO_ERROR) {
    PyErr_SetString(PyExc_RuntimeError, "could not copy reserve");
    goto fail;
  }

  err = cudnnRNNBackwardData(_handle, desc, (int)seqLength,
                             yl, PyGpuArray_DEV_DATA(y),
                             /* y and dy are the same shape */
                             yl, PyGpuArray_DEV_DATA(dy),
                             dhydesc, dhy ? PyGpuArray_DEV_DATA(dhy) : NULL,
                             dcydesc, dcy ? PyGpuArray_DEV_DATA(dcy) : NULL,
                             wdesc, PyGpuArray_DEV_DATA(w),
                             hxdesc, PyGpuArray_DEV_DATA(hx),
                             cxdesc, cx ? PyGpuArray_DEV_DATA(cx) : NULL,
                             dxl, PyGpuArray_DEV_DATA(*dx),
                             dhxdesc, PyGpuArray_DEV_DATA(*dhx),
                             dcxdesc, dcx ? PyGpuArray_DEV_DATA(*dcx) : NULL,
                             *(void **)workspace, worksize,
                             *(void **)(*oreserve), ressize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could run RNN grad inputs: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  res = 0;
 fail:
  // Shared cleanup for both success and failure paths.
  if (ydesc != NULL)
    cudnnDestroyTensorDescriptor(ydesc);
  if (dhydesc != NULL)
    cudnnDestroyTensorDescriptor(dhydesc);
  if (dcydesc != NULL)
    cudnnDestroyTensorDescriptor(dcydesc);
  if (wdesc != NULL)
    cudnnDestroyFilterDescriptor(wdesc);
  if (hxdesc != NULL)
    cudnnDestroyTensorDescriptor(hxdesc);
  if (cxdesc != NULL)
    cudnnDestroyTensorDescriptor(cxdesc);
  if (dxdesc != NULL)
    cudnnDestroyTensorDescriptor(dxdesc);
  if (dhxdesc != NULL)
    cudnnDestroyTensorDescriptor(dhxdesc);
  if (dcxdesc != NULL)
    cudnnDestroyTensorDescriptor(dcxdesc);
  free(yl);
  free(dxl);
  if (workspace != NULL)
    gpudata_release(workspace);
  cuda_exit(c->ctx);
  return res;
}
#section support_code
/**
 * RNN gradient w.r.t. weights through cudnnRNNBackwardWeights.
 *
 * desc    : configured RNN descriptor.
 * _wsize  : flat weight buffer size (element count for dw).
 * x/hx/y  : forward input, initial hidden state, forward output.
 * reserve : reserve buffer updated by dnn_rnn_gi.
 * dw      : out; gradient w.r.t. the weights, zero-filled first because
 *           cudnnRNNBackwardWeights accumulates into it.
 * _handle : cuDNN handle.
 *
 * Returns 0 on success, -1 on failure with a Python exception set.
 */
int dnn_rnn_gw(cudnnRNNDescriptor_t desc, npy_uint64 _wsize,
               PyGpuArrayObject *x, PyGpuArrayObject *hx,
               PyGpuArrayObject *y, gpudata *reserve,
               PyGpuArrayObject **dw, cudnnHandle_t _handle) {
  PyGpuContextObject *c = x->context;
  cudnnTensorDescriptor_t xdesc = NULL;
  cudnnTensorDescriptor_t hxdesc = NULL;
  cudnnTensorDescriptor_t ydesc = NULL;
  cudnnFilterDescriptor_t dwdesc = NULL;
  cudnnTensorDescriptor_t *xl = NULL;
  cudnnTensorDescriptor_t *yl = NULL;
  gpudata *workspace = NULL;
  size_t worksize, ressize;
  size_t iters = PyGpuArray_DIM(x, 0);
  size_t wsize = _wsize;
  int dims[3], strs[3];
  cudnnStatus_t err;
  cudnnDataType_t dt;
  int res = -1;

  switch (x->ga.typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Unsupported data type for x");
    return -1;
  }

  // This is early to match the exit() in the fail label.
  cuda_enter(c->ctx);

  err = cudnnCreateTensorDescriptor(&xdesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create xdesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  /* We need to use the last two dimensions for this, this is not a typo */
  dims[0] = PyGpuArray_DIM(x, 1);
  dims[1] = PyGpuArray_DIM(x, 2);
  dims[2] = 1;

  strs[0] = dims[2] * dims[1];
  strs[1] = dims[2];
  strs[2] = 1;

  err = cudnnSetTensorNdDescriptor(xdesc, dt, 3, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set xdesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  if (c_make_tensorNd(hx, &hxdesc) != 0)
    goto fail;

  err = cudnnCreateTensorDescriptor(&ydesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create ydesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  /* Again not a typo, we need to use the last two dimensions */
  dims[0] = PyGpuArray_DIM(y, 1);
  dims[1] = PyGpuArray_DIM(y, 2);
  dims[2] = 1;

  strs[0] = dims[2] * dims[1];
  strs[1] = dims[2];
  strs[2] = 1;

  err = cudnnSetTensorNdDescriptor(ydesc, dt, 3, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set ydesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  if (aesara_prep_output(dw, 1, &wsize, x->ga.typecode, GA_C_ORDER, c) != 0)
    goto fail;

  // Zero dw before the call: cudnnRNNBackwardWeights accumulates.
  // NOTE(review): the memset return value is not checked here.
  GpuArray_memset(&(*dw)->ga, 0);

  if (c_make_filter(*dw, &dwdesc) != 0)
    goto fail;

  // One descriptor pointer per timestep; all share the same descriptor.
  xl = (cudnnTensorDescriptor_t *)calloc(sizeof(cudnnTensorDescriptor_t), iters);
  if (xl == NULL) {
    PyErr_NoMemory();
    goto fail;
  }

  for (size_t i = 0; i < iters; i++)
    xl[i] = xdesc;

  yl = (cudnnTensorDescriptor_t *)calloc(sizeof(cudnnTensorDescriptor_t), iters);
  if (yl == NULL) {
    PyErr_NoMemory();
    goto fail;
  }

  for (size_t i = 0; i < iters; i++)
    yl[i] = ydesc;

  err = cudnnGetRNNWorkspaceSize(_handle, desc, (int)iters,
                                 xl, &worksize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not get worksize: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
  if (workspace == NULL) {
    PyErr_Format(PyExc_RuntimeError, "Could not allocate workspace");
    goto fail;
  }

  err = cudnnGetRNNTrainingReserveSize(_handle, desc, (int)iters,
                                       xl, &ressize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not get reserve size: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  err = cudnnRNNBackwardWeights(_handle, desc, (int)iters,
                                xl, PyGpuArray_DEV_DATA(x),
                                hxdesc, PyGpuArray_DEV_DATA(hx),
                                yl, PyGpuArray_DEV_DATA(y),
                                *(void **)workspace, worksize,
                                dwdesc, PyGpuArray_DEV_DATA(*dw),
                                *(void **)reserve, ressize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could run RNN grad weights: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  res = 0;
 fail:
  // Shared cleanup for both success and failure paths.
  if (xdesc != NULL)
    cudnnDestroyTensorDescriptor(xdesc);
  if (hxdesc != NULL)
    cudnnDestroyTensorDescriptor(hxdesc);
  if (ydesc != NULL)
    cudnnDestroyTensorDescriptor(ydesc);
  if (dwdesc != NULL)
    cudnnDestroyFilterDescriptor(dwdesc);
  free(xl);
  free(yl);
  if (workspace != NULL)
    gpudata_release(workspace);
  cuda_exit(c->ctx);
  return res;
}
#section support_code

/* Compute the size in bytes of the parameter (weights + biases) buffer
 * that cuDNN requires for the RNN described by `desc`.
 *
 * isize: 1-D array of length 2 read as npy_uint64 values
 *        (batch size, input size); only used to build a dummy input
 *        descriptor for the size query.
 *        NOTE(review): values are truncated into C ints for the
 *        descriptor -- assumes they fit; confirm against the caller.
 * typecode: gpuarray typecode of the data (float/double/half).
 * oparam_size: out-parameter receiving the required size in bytes.
 *
 * Returns 0 on success, -1 on error with a Python exception set.
 */
int dnn_rnn_paramsize(cudnnRNNDescriptor_t desc,
                      PyArrayObject *isize,
                      npy_int32 typecode,
                      npy_uint64 *oparam_size,
                      cudnnHandle_t _handle) {
  cudnnTensorDescriptor_t xdesc;
  size_t param_size;
  cudnnStatus_t err;
  cudnnDataType_t dt;
  int shape[3];
  int strides[3];

  if (PyArray_DIM(isize, 0) != 2) {
    PyErr_SetString(PyExc_ValueError, "input_size should be of length two");
    return -1;
  }
  switch (typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString(PyExc_ValueError, "Unsupported data type");
    return -1;
  }
  err = cudnnCreateTensorDescriptor(&xdesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_SetString(PyExc_RuntimeError, "Could not create tensor descriptor");
    return -1;
  }
  /* Minimal fully-packed (batch, input, 1) descriptor; cuDNN only needs
     its shape and data type to answer the size query. */
  shape[0] = *(npy_uint64 *)PyArray_GETPTR1(isize, 0);
  shape[1] = *(npy_uint64 *)PyArray_GETPTR1(isize, 1);
  shape[2] = 1;
  strides[0] = shape[2] * shape[1];
  strides[1] = shape[2];
  strides[2] = 1;
  err = cudnnSetTensorNdDescriptor(xdesc, dt, 3, shape, strides);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "Could not set tensor descriptor: %s",
                 cudnnGetErrorString(err));
    /* Fix: the descriptor used to leak on this error path. */
    cudnnDestroyTensorDescriptor(xdesc);
    return -1;
  }
  err = cudnnGetRNNParamsSize(_handle, desc, xdesc, &param_size, dt);
  if (err != CUDNN_STATUS_SUCCESS) {
    /* Include the cuDNN error string, consistent with the other error
       reports in this file (was a bare PyErr_SetString). */
    PyErr_Format(PyExc_RuntimeError, "Could not get parameter size: %s",
                 cudnnGetErrorString(err));
    /* Fix: the descriptor used to leak on this error path. */
    cudnnDestroyTensorDescriptor(xdesc);
    return -1;
  }
  cudnnDestroyTensorDescriptor(xdesc);
  *oparam_size = param_size;
  return 0;
}
#section support_code_struct
/* Per-apply cuDNN tensor descriptors reused by the softmax forward op. */
cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
#section init_code_struct
/* Start at NULL so the cleanup section can tell what was created. */
APPLY_SPECIFIC(input) = NULL;
APPLY_SPECIFIC(output) = NULL;
{
cudnnStatus_t err;
err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input));
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor: %s",
cudnnGetErrorString(err));
FAIL;
}
err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output));
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor: %s",
cudnnGetErrorString(err));
FAIL;
}
}
#section cleanup_code_struct
/* Destroy only the descriptors that were successfully created. */
if (APPLY_SPECIFIC(input) != NULL)
cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input));
if (APPLY_SPECIFIC(output) != NULL)
cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output));
/* Forward softmax on the GPU via cudnnSoftmaxForward.
 * x: input array; *out: output, (re)allocated to x's shape and dtype.
 * wrapper supplies the cuDNN handle, algorithm and mode.
 * Returns 0 on success, 1 on failure with a Python exception set. */
int APPLY_SPECIFIC(softmax)(PyGpuArrayObject *x,
PyGpuArrayObject **out,
PARAMS_TYPE* wrapper) {
PyGpuContextObject *c = x->context;
cudnnStatus_t err;
if (aesara_prep_output(out, PyGpuArray_NDIM(x),
PyGpuArray_DIMS(x), x->ga.typecode,
GA_C_ORDER, c) != 0)
return 1;
// Directly return the output if any of the dimensions is 0.
// (cuDNN does not support zero-length dimensions.)
if (PyGpuArray_SIZE(*out) == 0)
return 0;
if (c_set_tensorNd(x, APPLY_SPECIFIC(input)) != 0)
return 1;
if (c_set_tensorNd(*out, APPLY_SPECIFIC(output)) != 0)
return 1;
{
// cuDNN scaling factors are host pointers whose C type must match the
// data type; half-precision data uses float factors.
const float alphaf = 1;
const float betaf = 0;
const double alphad = 1;
const double betad = 0;
void *alpha, *beta;
switch (x->ga.typecode) {
case GA_DOUBLE:
alpha = (void *)&alphad;
beta = (void *)&betad;
break;
case GA_FLOAT:
case GA_HALF:
alpha = (void *)&alphaf;
beta = (void *)&betaf;
break;
default:
PyErr_SetString(PyExc_TypeError, "Unsupported type in softmax");
return 1;
}
cuda_enter(c->ctx);
// Synchronize: wait on x for reading and *out for writing before launch.
cuda_wait(x->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait((*out)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnSoftmaxForward(
wrapper->handle,
wrapper->algo,
wrapper->mode,
alpha,
APPLY_SPECIFIC(input),
PyGpuArray_DEV_DATA(x),
beta,
APPLY_SPECIFIC(output),
PyGpuArray_DEV_DATA(*out)
);
cuda_record(x->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_record((*out)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
cuda_exit(c->ctx);
}
// err was assigned inside the block above; the unsupported-type case
// returned early, so it is always initialized here.
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error during operation: %s",
cudnnGetErrorString(err));
return 1;
}
return 0;
}
#section support_code_struct
/* Per-apply cuDNN tensor descriptors for the softmax gradient op:
   dy = incoming gradient, sm = softmax output, dx = computed gradient. */
cudnnTensorDescriptor_t APPLY_SPECIFIC(dy);
cudnnTensorDescriptor_t APPLY_SPECIFIC(sm);
cudnnTensorDescriptor_t APPLY_SPECIFIC(dx);
#section init_code_struct
/* Start at NULL so the cleanup section can tell what was created. */
APPLY_SPECIFIC(dy) = NULL;
APPLY_SPECIFIC(sm) = NULL;
APPLY_SPECIFIC(dx) = NULL;
{
cudnnStatus_t err;
err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(dy));
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor: %s",
cudnnGetErrorString(err));
FAIL;
}
err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(sm));
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor: %s",
cudnnGetErrorString(err));
FAIL;
}
err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(dx));
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor: %s",
cudnnGetErrorString(err));
FAIL;
}
}
#section cleanup_code_struct
/* Destroy only the descriptors that were successfully created. */
if (APPLY_SPECIFIC(dy) != NULL)
cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(dy));
if (APPLY_SPECIFIC(sm) != NULL)
cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(sm));
if (APPLY_SPECIFIC(dx) != NULL)
cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(dx));
/* Gradient of softmax via cudnnSoftmaxBackward.
 * dy: gradient w.r.t. the softmax output; sm: the softmax output itself;
 * *dx: output gradient w.r.t. the softmax input, (re)allocated to dy's
 * shape and dtype. Returns 0 on success, 1 on failure. */
int APPLY_SPECIFIC(softmax_grad)(PyGpuArrayObject *dy,
PyGpuArrayObject *sm,
PyGpuArrayObject **dx,
PARAMS_TYPE* wrapper) {
PyGpuContextObject *c = dy->context;
cudnnStatus_t err;
if (aesara_prep_output(dx, PyGpuArray_NDIM(dy),
PyGpuArray_DIMS(dy), dy->ga.typecode,
GA_C_ORDER, c) != 0)
return 1;
// Directly return the output if any of the dimensions is 0.
// (cuDNN does not support zero-length dimensions.)
if (PyGpuArray_SIZE(*dx) == 0)
return 0;
if (c_set_tensorNd(dy, APPLY_SPECIFIC(dy)) != 0)
return 1;
if (c_set_tensorNd(sm, APPLY_SPECIFIC(sm)) != 0)
return 1;
if (c_set_tensorNd(*dx, APPLY_SPECIFIC(dx)) != 0)
return 1;
{
// cuDNN scaling factors are host pointers whose C type must match the
// data type; half-precision data uses float factors.
const float alphaf = 1;
const float betaf = 0;
const double alphad = 1;
const double betad = 0;
void *alpha, *beta;
switch (sm->ga.typecode) {
case GA_DOUBLE:
alpha = (void *)&alphad;
beta = (void *)&betad;
break;
case GA_FLOAT:
case GA_HALF:
alpha = (void *)&alphaf;
beta = (void *)&betaf;
break;
default:
PyErr_SetString(PyExc_TypeError, "Unsupported type in softmax gradient");
return 1;
}
cuda_enter(c->ctx);
// Wait on both inputs for reading and the output for writing.
cuda_wait(sm->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait(dy->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait((*dx)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnSoftmaxBackward(
wrapper->handle,
wrapper->algo,
wrapper->mode,
alpha,
APPLY_SPECIFIC(sm),
PyGpuArray_DEV_DATA(sm),
APPLY_SPECIFIC(dy),
PyGpuArray_DEV_DATA(dy),
beta,
APPLY_SPECIFIC(dx),
PyGpuArray_DEV_DATA(*dx)
);
cuda_record(sm->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_record(dy->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_record((*dx)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
cuda_exit(c->ctx);
}
// err was assigned inside the block above; the unsupported-type case
// returned early, so it is always initialized here.
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error during operation: %s",
cudnnGetErrorString(err));
return 1;
}
return 0;
}
#section support_code_struct
/* Per-apply descriptors for GpuDnnTransformerGradI: the spatial
   transformer descriptor plus tensor descriptors for the input (xdesc),
   the input gradient (dxdesc) and the incoming gradient (dydesc). */
cudnnSpatialTransformerDescriptor_t APPLY_SPECIFIC(sptf);
cudnnTensorDescriptor_t APPLY_SPECIFIC(xdesc);
cudnnTensorDescriptor_t APPLY_SPECIFIC(dxdesc);
cudnnTensorDescriptor_t APPLY_SPECIFIC(dydesc);
#section init_code_struct
/* Start at NULL so the cleanup section can tell what was created. */
APPLY_SPECIFIC(sptf) = NULL;
APPLY_SPECIFIC(xdesc) = NULL;
APPLY_SPECIFIC(dxdesc) = NULL;
APPLY_SPECIFIC(dydesc) = NULL;
{
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
err = cudnnCreateSpatialTransformerDescriptor(&APPLY_SPECIFIC(sptf));
if (err != CUDNN_STATUS_SUCCESS)
{
PyErr_Format(PyExc_MemoryError,
"GpuDnnTransformerGradI: could not allocate spatial transformer descriptor (sptf): %s",
cudnnGetErrorString(err));
FAIL;
}
err = cudnnCreateTensorDescriptor( &APPLY_SPECIFIC(xdesc) );
if ( err != CUDNN_STATUS_SUCCESS )
{
PyErr_Format( PyExc_MemoryError,
"GpuDnnTransformerGradI: failed to allocate cuDNN tensor descriptor xdesc: %s",
cudnnGetErrorString( err ) );
FAIL;
}
err = cudnnCreateTensorDescriptor( &APPLY_SPECIFIC(dxdesc) );
if ( err != CUDNN_STATUS_SUCCESS )
{
PyErr_Format( PyExc_MemoryError,
"GpuDnnTransformerGradI: failed to allocate cuDNN tensor descriptor dxdesc: %s",
cudnnGetErrorString( err ) );
FAIL;
}
err = cudnnCreateTensorDescriptor( &APPLY_SPECIFIC(dydesc) );
if ( err != CUDNN_STATUS_SUCCESS )
{
PyErr_Format( PyExc_MemoryError,
"GpuDnnTransformerGradI: failed to allocate cuDNN tensor descriptor dydesc: %s",
cudnnGetErrorString( err ) );
FAIL;
}
}
#section cleanup_code_struct
/* Destroy only the descriptors that were successfully created. */
if (APPLY_SPECIFIC(sptf) != NULL)
cudnnDestroySpatialTransformerDescriptor( APPLY_SPECIFIC(sptf) );
if ( APPLY_SPECIFIC(xdesc) != NULL )
cudnnDestroyTensorDescriptor( APPLY_SPECIFIC(xdesc) );
if ( APPLY_SPECIFIC(dxdesc) != NULL )
cudnnDestroyTensorDescriptor( APPLY_SPECIFIC(dxdesc) );
if ( APPLY_SPECIFIC(dydesc) != NULL )
cudnnDestroyTensorDescriptor( APPLY_SPECIFIC(dydesc) );
#section support_code_struct
/* Gradients of the spatial transformer sampler with respect to the
 * input image and the sampling grid (cudnnSpatialTfSamplerBackward).
 *
 * input: image batch that was sampled; grid: sampling coordinates;
 * dy: gradient w.r.t. the sampler output.
 * *input_grad / *grid_grad: outputs, (re)allocated to the shapes of
 * input and grid respectively.
 *
 * Returns 0 on success, 1 on failure with a Python exception set.
 */
int
APPLY_SPECIFIC(dnn_sptf_gi)(PyGpuArrayObject * input,
                            PyGpuArrayObject * grid,
                            PyGpuArrayObject * dy,
                            PyGpuArrayObject ** input_grad,
                            PyGpuArrayObject ** grid_grad,
                            cudnnHandle_t _handle)
{
  PyGpuContextObject * gpu_ctx = input->context;
  void * alpha_p;
  void * beta_p;
  double alpha = 1.0, beta = 0.0;
  float af = alpha, bf = beta;
  int out_dims[4];
  cudnnDataType_t dt;
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;

  /* Pick host scaling-factor pointers and the cuDNN data type from the
     input's typecode (half-precision data uses float factors). */
  switch (input->ga.typecode)
  {
  case GA_DOUBLE:
    alpha_p = (void *)&alpha;
    beta_p = (void *)&beta;
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_FLOAT:
    alpha_p = (void *)&af;
    beta_p = (void *)&bf;
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_HALF:
    alpha_p = (void *)&af;
    beta_p = (void *)&bf;
    dt = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString( PyExc_TypeError,
                     "GpuDnnTransformerGradI: unsupported type for input in spatial transformer gradients" );
    return 1;
  }

  if ( grid->ga.typecode != GA_FLOAT &&
       grid->ga.typecode != GA_DOUBLE &&
       grid->ga.typecode != GA_HALF )
  {
    PyErr_SetString( PyExc_TypeError,
                     "GpuDnnTransformerGradI: unsupported data type for grid in spatial transformer gradients." );
    return 1;
  }

  /* Gradients have the same shapes as the corresponding inputs. */
  if ( aesara_prep_output( input_grad, PyGpuArray_NDIM( input ),
                           PyGpuArray_DIMS( input ), input->ga.typecode,
                           GA_C_ORDER, gpu_ctx ) != 0 )
    return 1;

  if ( aesara_prep_output( grid_grad, PyGpuArray_NDIM( grid ),
                           PyGpuArray_DIMS( grid ), grid->ga.typecode,
                           GA_C_ORDER, gpu_ctx ) != 0 )
    return 1;

  // Directly return the outputs if any of the dimensions is 0.
  // (cuDNN does not support zero-length dimensions.)
  // Fix: this check used to run *after* the descriptor setup below, but
  // cuDNN rejects zero-length dimensions in the descriptors themselves,
  // so the intended graceful early return was unreachable. The check is
  // now done first, matching the softmax ops above.
  if ( PyGpuArray_SIZE( *input_grad ) == 0 || PyGpuArray_SIZE( *grid_grad ) == 0 )
    return 0;

  // Obtain output dimensions to setup descriptor
  out_dims[0] = (int) PyGpuArray_DIM(input, 0); // num_images
  out_dims[1] = (int) PyGpuArray_DIM(input, 1); // num_channels
  out_dims[2] = (int) PyGpuArray_DIM(grid, 1); // grid height
  out_dims[3] = (int) PyGpuArray_DIM(grid, 2); // grid width

  // Currently, only the bilinear sampler is supported by cuDNN,
  // so the sampler method is currently not available as a parameter
  err = cudnnSetSpatialTransformerNdDescriptor(APPLY_SPECIFIC(sptf), CUDNN_SAMPLER_BILINEAR,
                                               dt, 4, out_dims );
  if ( CUDNN_STATUS_SUCCESS != err )
  {
    PyErr_Format( PyExc_MemoryError,
                  "GpuDnnTransformerGradI: could not initialize descriptor (sptf): %s",
                  cudnnGetErrorString( err ) );
    return 1;
  }

  if ( c_set_tensorNd( input, APPLY_SPECIFIC(xdesc) ) != 0 )
    return 1;
  if ( c_set_tensorNd( dy, APPLY_SPECIFIC(dydesc) ) != 0 )
    return 1;
  if ( c_set_tensorNd( *input_grad, APPLY_SPECIFIC(dxdesc) ) != 0 )
    return 1;

  cuda_enter( gpu_ctx->ctx );

  cuda_wait( input->ga.data, GPUARRAY_CUDA_WAIT_READ );
  cuda_wait( grid->ga.data, GPUARRAY_CUDA_WAIT_READ );
  cuda_wait( dy->ga.data, GPUARRAY_CUDA_WAIT_READ );
  cuda_wait( (*input_grad)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
  cuda_wait( (*grid_grad)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );

  /* Computes both gradients in one call; the grid gradient has no tensor
     descriptor -- cuDNN takes a raw device pointer for it. */
  err = cudnnSpatialTfSamplerBackward( _handle, APPLY_SPECIFIC(sptf), alpha_p,
                                       APPLY_SPECIFIC(xdesc), PyGpuArray_DEV_DATA( input ), beta_p,
                                       APPLY_SPECIFIC(dxdesc), PyGpuArray_DEV_DATA( *input_grad ), alpha_p,
                                       APPLY_SPECIFIC(dydesc), PyGpuArray_DEV_DATA( dy ), PyGpuArray_DEV_DATA( grid ),
                                       beta_p, PyGpuArray_DEV_DATA( *grid_grad ) );

  cuda_record( input->ga.data, GPUARRAY_CUDA_WAIT_READ );
  cuda_record( grid->ga.data, GPUARRAY_CUDA_WAIT_READ );
  cuda_record( dy->ga.data, GPUARRAY_CUDA_WAIT_READ );
  cuda_record( (*input_grad)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
  cuda_record( (*grid_grad)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );

  cuda_exit( gpu_ctx->ctx );

  if ( CUDNN_STATUS_SUCCESS != err )
  {
    PyErr_Format( PyExc_RuntimeError,
                  "GpuDnnTransformerGradI: failed to compute gradients of the inputs: %s",
                  cudnnGetErrorString( err ) );
    return 1;
  }

  return 0;
}
#section support_code_struct
/* Per-apply spatial transformer descriptor for GpuDnnTransformerGrid. */
cudnnSpatialTransformerDescriptor_t APPLY_SPECIFIC(sptf);
#section init_code_struct
/* Start at NULL so the cleanup section can tell whether it was created. */
APPLY_SPECIFIC(sptf) = NULL;
{
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if ((err = cudnnCreateSpatialTransformerDescriptor(&APPLY_SPECIFIC(sptf))) != CUDNN_STATUS_SUCCESS)
{
PyErr_Format(PyExc_MemoryError,
"GpuDnnTransformerGrid: could not allocate spatial transformer descriptor (sptf): %s",
cudnnGetErrorString(err));
FAIL;
}
}
#section cleanup_code_struct
/* Destroy the descriptor only if it was successfully created. */
if (APPLY_SPECIFIC(sptf) != NULL) { cudnnDestroySpatialTransformerDescriptor(APPLY_SPECIFIC(sptf)); }
#section support_code_struct

/* Generate the sampling grid of a spatial transformer from the affine
 * parameters theta (cudnnSpatialTfGridGeneratorForward).
 *
 * theta: (num_images, 2, 3) affine transformation parameters.
 * out_dims: 1-D array of 4 elements (num_images, num_channels, height,
 *           width) describing the transformed output shape.
 *           NOTE(review): elements are read as npy_int64 -- assumes the
 *           caller always passes an int64 array; verify in the Python op.
 * *grid: output, (num_images, height, width, 2) sampling coordinates.
 *
 * Returns 0 on success, 1 on failure with a Python exception set.
 */
int
APPLY_SPECIFIC(dnn_sptf_grid)(PyGpuArrayObject * theta,
                              PyArrayObject * out_dims,
                              PyGpuArrayObject ** grid,
                              cudnnHandle_t _handle)
{
  PyGpuContextObject * gpu_ctx = theta->context;
  size_t grid_dims[4];
  int num_images, num_channels, height, width;
  int desc_dims[4];
  cudnnDataType_t dt;
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;

  switch(theta->ga.typecode)
  {
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString( PyExc_TypeError,
                     "GpuDnnTransformerGrid: unsupported data type for theta in spatial transformer." );
    return 1;
  }

  if ( PyArray_NDIM( out_dims ) != 1 || PyArray_SIZE( out_dims ) != 4 )
  {
    PyErr_SetString( PyExc_MemoryError,
                     "GpuDnnTransformerGrid: out_dims must have 4 elements." );
    return 1;
  }

  // Obtain output dimensions
  num_images = (int) *( (npy_int64 *) PyArray_GETPTR1( out_dims, 0 ) );
  num_channels = (int) *( (npy_int64 *) PyArray_GETPTR1( out_dims, 1 ) );
  height = (int) *( (npy_int64 *) PyArray_GETPTR1( out_dims, 2 ) );
  width = (int) *( (npy_int64 *) PyArray_GETPTR1( out_dims, 3 ) );

  if ( PyGpuArray_DIM( theta, 0 ) != num_images ||
       PyGpuArray_DIM( theta, 1 ) != 2 || PyGpuArray_DIM( theta, 2 ) != 3 )
  {
    // Fix: the dimension values are size_t; passing them to %d is
    // undefined behavior on LP64 platforms, so cast them to int.
    PyErr_Format( PyExc_RuntimeError,
                  "GpuDnnTransformerGrid: incorrect dimensions for theta, expected (%d, %d, %d), got (%d, %d, %d)",
                  num_images, 2, 3, (int) PyGpuArray_DIMS( theta )[0],
                  (int) PyGpuArray_DIMS( theta )[1], (int) PyGpuArray_DIMS( theta )[2] );
    return 1;
  }

  // Set transformed output dimensions to setup the descriptor
  desc_dims[0] = num_images;
  desc_dims[1] = num_channels;
  desc_dims[2] = height;
  desc_dims[3] = width;

  // Set sampling grid dimensions: (N, H, W, 2), last axis holds (x, y)
  grid_dims[0] = num_images;
  grid_dims[1] = height;
  grid_dims[2] = width;
  grid_dims[3] = 2;

  // Currently, only the bilinear sampler is supported by cuDNN,
  // so the sampler method is currently not available as a parameter
  err = cudnnSetSpatialTransformerNdDescriptor(APPLY_SPECIFIC(sptf), CUDNN_SAMPLER_BILINEAR,
                                               dt, 4, desc_dims );
  if ( CUDNN_STATUS_SUCCESS != err )
  {
    PyErr_Format( PyExc_MemoryError,
                  "GpuDnnTransformerGrid: could not initialize descriptor (sptf): %s",
                  cudnnGetErrorString( err ) );
    return 1;
  }

  if ( aesara_prep_output( grid, 4, grid_dims, theta->ga.typecode,
                           GA_C_ORDER, gpu_ctx ) != 0 )
  {
    PyErr_SetString( PyExc_RuntimeError,
                     "GpuDnnTransformerGrid: could not allocate memory for grid of coordinates" );
    return 1;
  }

  cuda_enter( gpu_ctx->ctx );
  cuda_wait( theta->ga.data, GPUARRAY_CUDA_WAIT_READ );
  cuda_wait( (*grid)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );

  err = cudnnSpatialTfGridGeneratorForward( _handle, APPLY_SPECIFIC(sptf),
                                            PyGpuArray_DEV_DATA( theta ), PyGpuArray_DEV_DATA( *grid ) );

  cuda_record( theta->ga.data, GPUARRAY_CUDA_WAIT_READ );
  cuda_record( (*grid)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
  cuda_exit( gpu_ctx->ctx );

  if ( CUDNN_STATUS_SUCCESS != err )
  {
    PyErr_Format( PyExc_RuntimeError,
                  "GpuDnnTransformerGrid: could not create grid of coordinates: %s",
                  cudnnGetErrorString( err ) );
    return 1;
  }

  return 0;
}
#section support_code_struct
/* Per-apply spatial transformer descriptor for GpuDnnTransformerGradT. */
cudnnSpatialTransformerDescriptor_t APPLY_SPECIFIC(sptf);
#section init_code_struct
/* Start at NULL so the cleanup section can tell whether it was created. */
APPLY_SPECIFIC(sptf) = NULL;
{
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if ((err = cudnnCreateSpatialTransformerDescriptor(&APPLY_SPECIFIC(sptf))) != CUDNN_STATUS_SUCCESS)
{
PyErr_Format(PyExc_MemoryError,
"GpuDnnTransformerGradT: could not allocate spatial transformer descriptor (sptf): %s",
cudnnGetErrorString(err));
FAIL;
}
}
#section cleanup_code_struct
/* Destroy the descriptor only if it was successfully created. */
if (APPLY_SPECIFIC(sptf) != NULL)
cudnnDestroySpatialTransformerDescriptor(APPLY_SPECIFIC(sptf));
#section support_code_struct

/* Gradient of the spatial transformer with respect to the affine
 * parameters theta (cudnnSpatialTfGridGeneratorBackward).
 *
 * dgrid: (num_images, height, width, 2) gradient w.r.t. the sampling grid.
 * *dtheta: output, (num_images, 2, 3) gradient w.r.t. theta.
 *
 * Returns 0 on success, 1 on failure with a Python exception set.
 */
int
APPLY_SPECIFIC(dnn_sptf_gt)(PyGpuArrayObject * dgrid,
                            PyGpuArrayObject ** dtheta,
                            cudnnHandle_t _handle)
{
  PyGpuContextObject * gpu_ctx = dgrid->context;
  int num_images, height, width;
  int desc_dims[4];
  size_t dtheta_dims[3];
  cudnnDataType_t dt;
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;

  switch(dgrid->ga.typecode)
  {
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString( PyExc_TypeError,
                     "GpuDnnTransformerGradT: unsupported data type for dgrid in spatial transformer." );
    return 1;
  }

  num_images = (int) PyGpuArray_DIM( dgrid, 0 );
  height = (int) PyGpuArray_DIM( dgrid, 1 );
  width = (int) PyGpuArray_DIM( dgrid, 2 );

  /* theta gradients always have shape (num_images, 2, 3). */
  dtheta_dims[0] = num_images;
  dtheta_dims[1] = 2;
  dtheta_dims[2] = 3;

  if ( aesara_prep_output( dtheta, 3, dtheta_dims, dgrid->ga.typecode,
                           GA_C_ORDER, gpu_ctx ) != 0 )
    return 1;

  desc_dims[0] = num_images;
  // Assume number of channels is 1, because the information is not
  // available or relevant here
  desc_dims[1] = 1;
  desc_dims[2] = height;
  desc_dims[3] = width;

  // Currently, only the bilinear sampler is supported by cuDNN,
  // so the sampler method is currently not available as a parameter
  err = cudnnSetSpatialTransformerNdDescriptor(APPLY_SPECIFIC(sptf), CUDNN_SAMPLER_BILINEAR,
                                               dt, 4, desc_dims );
  if ( CUDNN_STATUS_SUCCESS != err )
  {
    // Fix: the error message said "GpuDnnTransformerGrid" -- a copy-paste
    // from the grid op; this is the GradT op.
    PyErr_Format( PyExc_MemoryError,
                  "GpuDnnTransformerGradT: could not initialize descriptor (sptf): %s",
                  cudnnGetErrorString( err ) );
    return 1;
  }

  cuda_enter( gpu_ctx->ctx );
  cuda_wait( dgrid->ga.data, GPUARRAY_CUDA_WAIT_READ );
  cuda_wait( (*dtheta)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );

  err = cudnnSpatialTfGridGeneratorBackward( _handle, APPLY_SPECIFIC(sptf),
                                             PyGpuArray_DEV_DATA( dgrid ), PyGpuArray_DEV_DATA( *dtheta ) );

  cuda_record( dgrid->ga.data, GPUARRAY_CUDA_WAIT_READ );
  cuda_record( (*dtheta)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
  cuda_exit( gpu_ctx->ctx );

  if ( err != CUDNN_STATUS_SUCCESS )
  {
    PyErr_Format( PyExc_RuntimeError,
                  "GpuDnnTransformerGradT: could not compute gradients of the affine transformation: %s",
                  cudnnGetErrorString( err ) );
    return 1;
  }

  return 0;
}
#section support_code_struct
/* Per-apply descriptors for GpuDnnTransformerSampler: the spatial
   transformer descriptor plus input (xdesc) and output (ydesc) tensors. */
cudnnSpatialTransformerDescriptor_t APPLY_SPECIFIC(sptf);
cudnnTensorDescriptor_t APPLY_SPECIFIC(xdesc);
cudnnTensorDescriptor_t APPLY_SPECIFIC(ydesc);
#section init_code_struct
/* Start at NULL so the cleanup section can tell what was created. */
APPLY_SPECIFIC(sptf) = NULL;
APPLY_SPECIFIC(xdesc) = NULL;
APPLY_SPECIFIC(ydesc) = NULL;
{
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
err = cudnnCreateSpatialTransformerDescriptor(&APPLY_SPECIFIC(sptf));
if (err != CUDNN_STATUS_SUCCESS)
{
PyErr_Format(PyExc_MemoryError,
"GpuDnnTransformerSampler: could not allocate spatial transformer descriptor (sptf): %s",
cudnnGetErrorString( err ));
FAIL;
}
err = cudnnCreateTensorDescriptor( &APPLY_SPECIFIC(xdesc) );
if ( err != CUDNN_STATUS_SUCCESS )
{
PyErr_Format( PyExc_MemoryError,
"GpuDnnTransformerSampler: failed to allocate cuDNN tensor descriptor xdesc: %s",
cudnnGetErrorString( err ) );
FAIL;
}
err = cudnnCreateTensorDescriptor( &APPLY_SPECIFIC(ydesc) );
if ( err != CUDNN_STATUS_SUCCESS )
{
PyErr_Format( PyExc_MemoryError,
"GpuDnnTransformerSampler: failed to allocate cuDNN tensor descriptor ydesc: %s",
cudnnGetErrorString( err ) );
FAIL;
}
}
#section cleanup_code_struct
/* Destroy only the descriptors that were successfully created. */
if (APPLY_SPECIFIC(sptf) != NULL)
cudnnDestroySpatialTransformerDescriptor(APPLY_SPECIFIC(sptf));
if ( APPLY_SPECIFIC(xdesc) != NULL )
cudnnDestroyTensorDescriptor( APPLY_SPECIFIC(xdesc) );
if ( APPLY_SPECIFIC(ydesc) != NULL )
cudnnDestroyTensorDescriptor( APPLY_SPECIFIC(ydesc) );
#section support_code_struct
/* Forward pass of the spatial transformer sampler
 * (cudnnSpatialTfSamplerForward): sample `input` at the coordinates in
 * `grid`, writing a (num_images, num_channels, grid_h, grid_w) output.
 * Returns 0 on success, 1 on failure with a Python exception set. */
int
APPLY_SPECIFIC(dnn_sptf_sampler)(PyGpuArrayObject * input,
PyGpuArrayObject * grid,
PyGpuArrayObject ** output,
cudnnHandle_t _handle)
{
PyGpuContextObject * gpu_ctx = input->context;
void * alpha_p;
void * beta_p;
double alpha = 1.0, beta = 0.0;
float af = alpha, bf = beta;
size_t out_dims[4];
int desc_dims[4];
cudnnDataType_t dt;
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
/* Pick host scaling-factor pointers and the cuDNN data type from the
   input's typecode (half-precision data uses float factors). */
switch (input->ga.typecode)
{
case GA_DOUBLE:
alpha_p = (void *)&alpha;
beta_p = (void *)&beta;
dt = CUDNN_DATA_DOUBLE;
break;
case GA_FLOAT:
alpha_p = (void *)&af;
beta_p = (void *)&bf;
dt = CUDNN_DATA_FLOAT;
break;
case GA_HALF:
alpha_p = (void *)&af;
beta_p = (void *)&bf;
dt = CUDNN_DATA_HALF;
break;
default:
PyErr_SetString( PyExc_TypeError,
"GpuDnnTransformer: unsupported type for input in spatial transformer." );
return 1;
}
/* Output shape: batch/channels from the input, spatial size from the grid. */
out_dims[0] = (size_t) PyGpuArray_DIM(input, 0); // num_images
out_dims[1] = (size_t) PyGpuArray_DIM(input, 1); // num_channels
out_dims[2] = (size_t) PyGpuArray_DIM(grid, 1); // grid height
out_dims[3] = (size_t) PyGpuArray_DIM(grid, 2); // grid width
// Set output dimensions for the descriptor setup
desc_dims[0] = (int) out_dims[0];
desc_dims[1] = (int) out_dims[1];
desc_dims[2] = (int) out_dims[2];
desc_dims[3] = (int) out_dims[3];
// Zero-length dimensions are rejected here (not silently returned as in
// the softmax ops) because cuDNN cannot describe them.
if ( out_dims[0] == 0 || out_dims[1] == 0 || out_dims[2] == 0 || out_dims[3] == 0 )
{
PyErr_SetString( PyExc_RuntimeError,
"GpuDnnTransformerSampler: one of the sampler dimensions is zero" );
return 1;
}
if ( aesara_prep_output( output, 4, out_dims, input->ga.typecode,
GA_C_ORDER, gpu_ctx ) != 0 )
{
PyErr_SetString( PyExc_MemoryError,
"GpuDnnTransformerSampler: could not allocate memory for grid sampler" );
return 1;
}
// Currently, only the bilinear sampler is supported by cuDNN,
// so the sampler method is currently not available as a parameter
err = cudnnSetSpatialTransformerNdDescriptor(APPLY_SPECIFIC(sptf), CUDNN_SAMPLER_BILINEAR,
dt, 4, desc_dims );
if ( CUDNN_STATUS_SUCCESS != err )
{
PyErr_Format( PyExc_MemoryError,
"GpuDnnTransformerSampler: could not initialize descriptor: %s",
cudnnGetErrorString( err ) );
return 1;
}
if ( c_set_tensorNd( input, APPLY_SPECIFIC(xdesc) ) != 0 )
return 1;
if ( c_set_tensorNd( *output, APPLY_SPECIFIC(ydesc) ) != 0 )
return 1;
cuda_enter( gpu_ctx->ctx );
// Wait on both inputs for reading and the output for writing.
cuda_wait( input->ga.data, GPUARRAY_CUDA_WAIT_READ );
cuda_wait( grid->ga.data, GPUARRAY_CUDA_WAIT_READ );
cuda_wait( (*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
// The grid has no tensor descriptor; cuDNN takes its raw device pointer.
err = cudnnSpatialTfSamplerForward( _handle, APPLY_SPECIFIC(sptf), alpha_p,
APPLY_SPECIFIC(xdesc), PyGpuArray_DEV_DATA( input ), PyGpuArray_DEV_DATA( grid ),
beta_p, APPLY_SPECIFIC(ydesc), PyGpuArray_DEV_DATA( *output ) );
cuda_record( input->ga.data, GPUARRAY_CUDA_WAIT_READ );
cuda_record( grid->ga.data, GPUARRAY_CUDA_WAIT_READ );
cuda_record( (*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE );
cuda_exit( gpu_ctx->ctx );
if ( CUDNN_STATUS_SUCCESS != err )
{
PyErr_Format( PyExc_RuntimeError,
"GpuDnnTransformerSampler: could not create grid sampler: %s",
cudnnGetErrorString( err ) );
return 1;
}
return 0;
}
#ifndef AESARA_GPUARRAY_HELPER
#define AESARA_GPUARRAY_HELPER
#include <string.h>
#include <gpuarray_api.h>
#include <numpy_compat.h>
#include <gpuarray/util.h>

/* Return nonzero iff `a` already has the requested rank, typecode and
 * exact shape. */
static int aesara_size_check(PyGpuArrayObject *a, unsigned int nd,
                             const size_t *dims, int typecode) {
  if (a->ga.nd != nd)
    return 0;
  if (a->ga.typecode != typecode)
    return 0;
  return memcmp(a->ga.dimensions, dims, nd * sizeof(size_t)) == 0;
}

/* Ensure *out is a C-ordered array of the given shape/typecode, reusing
 * the existing array when it already matches and allocating a fresh one
 * otherwise. Returns 0 on success, 1 on allocation failure. */
static int aesara_prep_output(PyGpuArrayObject **out, unsigned int nd,
                              const size_t *dims, int typecode, ga_order ord,
                              PyGpuContextObject *c) {
  if (*out != NULL && aesara_size_check(*out, nd, dims, typecode))
    return 0;
  Py_XDECREF(*out);
  *out = pygpu_empty(nd, dims, typecode, ord, c, Py_None);
  if (*out == NULL)
    return 1;
  return 0;
}

/* Copy V into `out` in place when `out` is a matching C-contiguous
 * array; otherwise drop `out` and return a fresh C-ordered copy of V.
 * Returns NULL on failure (with the old `out` released). */
static PyGpuArrayObject *aesara_try_copy(PyGpuArrayObject *out,
                                         PyGpuArrayObject *V) {
  int reuse = (out != NULL) &&
              GpuArray_CHKFLAGS(&out->ga, GA_CARRAY) &&
              aesara_size_check(out, PyGpuArray_NDIM(V),
                                PyGpuArray_DIMS(V),
                                V->ga.typecode);
  if (!reuse) {
    Py_XDECREF(out);
    return pygpu_copy(V, GA_C_ORDER);
  }
  if (pygpu_move(out, V)) {
    Py_XDECREF(out);
    return NULL;
  }
  return out;
}

static inline void *PyGpuArray_DEV_DATA(PyGpuArrayObject *a) {
  /* This is guaranteed to work and return the raw CUDA/OpenCL object on
   * all recent (as of June 2015) version of libgpuarray. This is also
   * promised to keep working in future versions. */
  char *base = *(char **)a->ga.data;
  /* This only works on cuda since we have a real pointer. */
  return (void *)(base + a->ga.offset);
}
#endif
#section kernels
#kernel tril_kernel : size, size, size, *:
#include "cluda.h"
/* Zero the strictly upper-triangular part of a row-major ncols x ncols
 * matrix `a` (byte offset a_off), leaving the lower triangle intact. */
KERNEL void tril_kernel(const ga_size nthreads, const ga_size ncols,
                        const ga_size a_off, GLOBAL_MEM DTYPE_INPUT_0 *a) {
  a = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)a) + a_off);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0; index < nthreads;
       index += LDIM_0 * GDIM_0) {
    // Fix: keep row/column indices in ga_size -- the former unsigned int
    // would truncate index / ncols once nthreads exceeds 2**32.
    ga_size ix = index / ncols;
    ga_size iy = index % ncols;
    if (ix < iy) {
      a[index] = 0.0;
    }
  }
}
#kernel triu_kernel : size, size, size, *:
#include "cluda.h"
/* Zero the strictly lower-triangular part of a row-major ncols x ncols
 * matrix `a` (byte offset a_off), leaving the upper triangle intact. */
KERNEL void triu_kernel(const ga_size nthreads, const ga_size ncols,
                        const ga_size a_off, GLOBAL_MEM DTYPE_INPUT_0 *a) {
  a = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)a) + a_off);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0; index < nthreads;
       index += LDIM_0 * GDIM_0) {
    // Fix: keep row/column indices in ga_size -- the former unsigned int
    // would truncate index / ncols once nthreads exceeds 2**32.
    ga_size ix = index / ncols;
    ga_size iy = index % ncols;
    if (ix > iy) {
      a[index] = 0.0;
    }
  }
}
#section init_code
setup_ext_cuda();
#section support_code_struct
/* Cholesky factorization of a square float32 C-contiguous matrix using
 * MAGMA's spotrf. When params->inplace, A is overwritten and returned as
 * *L; otherwise *L receives a copy. After the factorization the unused
 * triangle is zeroed with the tril/triu kernels.
 * Returns 0 on success, -1 on failure with a Python exception set. */
int APPLY_SPECIFIC(magma_cholesky)(PyGpuArrayObject *A, PyGpuArrayObject **L,
PARAMS_TYPE *params) {
const size_t *dims;
size_t N, n2;
magma_uplo_t ul;
int res = -1, info;
if (A->ga.typecode != GA_FLOAT) {
PyErr_SetString(PyExc_TypeError,
"GpuMagmaCholesky: unsupported data type");
return -1;
}
// This is early to match the exit() in the fail label.
cuda_enter(params->context->ctx);
if (!GpuArray_IS_C_CONTIGUOUS(&A->ga)) {
PyErr_SetString(PyExc_ValueError,
"GpuMagmaCholesky: requires data to be C-contiguous");
goto fail;
}
if (PyGpuArray_NDIM(A) != 2) {
PyErr_SetString(PyExc_ValueError, "GpuMagmaCholesky: matrix rank error");
goto fail;
}
dims = PyGpuArray_DIMS(A);
if (dims[0] != dims[1]) {
PyErr_SetString(PyExc_ValueError, "GpuMagmaCholesky: matrix is not square");
goto fail;
}
if (params->inplace) {
// Output aliases the input; take a new reference for *L.
Py_XDECREF(*L);
*L = A;
Py_INCREF(*L);
} else {
*L = aesara_try_copy(*L, A);
if (*L == NULL) {
PyErr_SetString(
PyExc_RuntimeError,
"GpuMagmaCholesky: failed to allocate memory for the output");
goto fail;
}
}
// magma matrix cholesky
N = dims[0];
n2 = N * N;
// Magma requires column-major order for the matrix A. Instead of changing
// matrix order which requires copying data, we can compute cholesky
// decomposition where we change parameters lower to upper and upper to
// lower.
if (params->lower) {
ul = MagmaUpper;
}
else {
ul = MagmaLower;
}
magma_spotrf_gpu(ul, N, (float *)PyGpuArray_DEV_DATA(*L), N, &info);
if (info > 0) {
PyErr_Format(PyExc_RuntimeError, "GpuMagmaCholesky: the leading minor of "
"order %d is not positive definite",
info);
goto fail;
} else if (info < 0) {
PyErr_Format(
PyExc_RuntimeError,
"GpuMagmaCholesky: magma_spotrf_gpu argument %d has an illegal value",
-info);
goto fail;
}
// Zero out the triangle that spotrf left untouched so the result is a
// clean triangular factor.
if (params->lower) {
res = tril_kernel_scall(1, &n2, 0, n2, N, (*L)->ga.offset, (*L)->ga.data);
if (res != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "GpuMagmaCholesky: tril_kernel %s.",
GpuKernel_error(&k_tril_kernel, res));
goto fail;
}
} else {
res = triu_kernel_scall(1, &n2, 0, n2, N, (*L)->ga.offset, (*L)->ga.data);
if (res != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "GpuMagmaCholesky: triu_kernel %s.",
GpuKernel_error(&k_triu_kernel, res));
goto fail;
}
}
res = 0;
fail:
cuda_exit(params->context->ctx);
return res;
}
#section init_code
setup_ext_cuda();
#section support_code_struct

/* Eigendecomposition of a symmetric float32 matrix via MAGMA's ssyevd.
 *
 * A_: (n, n) float32 C-contiguous input (copied to column-major order).
 * *D: output eigenvalues, length n.
 * *V: output eigenvectors; only written when params->compute_v.
 * params->lower selects which triangle of A_ is read.
 *
 * Returns 0 on success, -1 on failure with a Python exception set.
 */
int APPLY_SPECIFIC(magma_eigh)(PyGpuArrayObject *A_,
                               PyGpuArrayObject **D,
                               PyGpuArrayObject **V, // may be NULL
                               PARAMS_TYPE *params) {
  PyGpuArrayObject *A = NULL;
  magma_int_t N, liwork, *iwork_data = NULL;
  size_t d_dims[1];
  magma_uplo_t uplo;
  magma_vec_t jobz;
  float *w_data = NULL, *wA_data = NULL, *work_data = NULL, lwork;
  int res = -1, info;

  if (A_->ga.typecode != GA_FLOAT) {
    PyErr_SetString(PyExc_TypeError,
                    "GpuMagmaEigh: Unsupported data type");
    return -1;
  }
  // This is early to match the exit() in the fail label.
  cuda_enter(params->context->ctx);
  if (!GpuArray_IS_C_CONTIGUOUS(&A_->ga)) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaEigh: requires data to be C-contiguous");
    goto fail;
  }
  if (PyGpuArray_NDIM(A_) != 2) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaEigh: matrix rank error");
    goto fail;
  }
  if (PyGpuArray_DIM(A_, 0) != PyGpuArray_DIM(A_, 1)) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaEigh: matrix is not square");
    goto fail;
  }
  /* MAGMA expects column-major data; work on a Fortran-ordered copy. */
  A = pygpu_copy(A_, GA_F_ORDER);
  if (A == NULL) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to change to column-major order");
    // Fix: this used to `return -1` directly, skipping cuda_exit() and
    // leaving the context entered; route through the common cleanup.
    goto fail;
  }
  // magma matrix eigen decomposition of a symmetric matrix
  N = PyGpuArray_DIM(A, 0);
  if (params->lower) {
    uplo = MagmaLower;
  } else {
    uplo = MagmaUpper;
  }
  if (params->compute_v) {
    jobz = MagmaVec;
  } else {
    jobz = MagmaNoVec;
  }
  if (MAGMA_SUCCESS != magma_smalloc_pinned(&w_data, N)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to allocate working memory");
    goto fail;
  }
  if (MAGMA_SUCCESS != magma_smalloc_pinned(&wA_data, N * N)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to allocate working memory");
    goto fail;
  }
  // query for workspace size (lwork/liwork are returned by the -1 call)
  magma_ssyevd_gpu(jobz, uplo, N, NULL, N, NULL, NULL, N, &lwork,
                   -1, &liwork, -1, &info);

  if (MAGMA_SUCCESS != magma_smalloc_pinned(&work_data, (size_t)lwork)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to allocate working memory");
    goto fail;
  }
  if (MAGMA_SUCCESS != magma_imalloc_cpu(&iwork_data, liwork)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to allocate working memory");
    goto fail;
  }
  magma_ssyevd_gpu(jobz, uplo, N, (float *)PyGpuArray_DEV_DATA(A), N, w_data,
                   wA_data, N, work_data, (size_t)lwork, iwork_data, liwork,
                   &info);
  if (info > 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaEigh: %d off-diagonal elements of an didn't converge to zero",
        info);
    goto fail;
  } else if (info < 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaEigh: magma_ssyevd_gpu argument %d has an illegal value", -info);
    goto fail;
  }
  d_dims[0] = N;
  if (aesara_prep_output(D, 1, d_dims, A->ga.typecode, GA_C_ORDER, params->context) != 0){
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to allocate memory for the output");
    goto fail;
  }
  // NOTE(review): w_data is pinned *host* memory, yet the copy kind is
  // cudaMemcpyDeviceToDevice -- this relies on unified addressing; confirm.
  cudaMemcpy(PyGpuArray_DEV_DATA(*D), w_data, N * sizeof(float),
             cudaMemcpyDeviceToDevice);
  if (params->compute_v) {
    /* ssyevd overwrote A with the eigenvectors; hand them out as *V. */
    *V = aesara_try_copy(*V, A);
    if (*V == NULL) {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMagmaEigh: failed to allocate memory for the output");
      goto fail;
    }
  }
  res = 0;
fail:
  if (w_data != NULL)
    magma_free_pinned(w_data);
  if (wA_data != NULL)
    magma_free_pinned(wA_data);
  if (work_data != NULL)
    magma_free_pinned(work_data);
  if (iwork_data != NULL)
    magma_free_cpu(iwork_data);
  Py_XDECREF(A);
  cuda_exit(params->context->ctx);
  return res;
}
#section init_code
setup_ext_cuda();
#section support_code_struct

/* Matrix inverse of a square float32 C-contiguous matrix via MAGMA's
 * LU factorization (sgetrf) followed by sgetri. When params->inplace,
 * A is overwritten; otherwise *A_inv receives a copy that is inverted.
 * Returns 0 on success, -1 on failure with a Python exception set. */
int APPLY_SPECIFIC(magma_inv)(PyGpuArrayObject *A, PyGpuArrayObject **A_inv,
                              PARAMS_TYPE *params) {
  const size_t *dims;
  magma_int_t N, ldwork, info;
  magma_int_t *piv = NULL;
  gpudata *dwork = NULL;
  int res = -1;

  if (A->ga.typecode != GA_FLOAT) {
    PyErr_SetString(PyExc_TypeError,
                    "GpuMagmaMatrixInverse: Unsupported data type");
    return -1;
  }
  // This is early to match the exit() in the fail label.
  cuda_enter(params->context->ctx);
  if (!GpuArray_IS_C_CONTIGUOUS(&A->ga)) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaMatrixInverse: requires data to be C-contiguous");
    goto fail;
  }
  if (PyGpuArray_NDIM(A) != 2) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaMatrixInverse: matrix rank error");
    goto fail;
  }
  dims = PyGpuArray_DIMS(A);
  if (dims[0] != dims[1]) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaMatrixInverse: matrix is not square");
    goto fail;
  }
  if (params->inplace) {
    // Output aliases the input; take a new reference for *A_inv.
    Py_XDECREF(*A_inv);
    *A_inv = A;
    Py_INCREF(*A_inv);
  } else {
    *A_inv = aesara_try_copy(*A_inv, A);
    if (*A_inv == NULL) {
      PyErr_SetString(
          PyExc_RuntimeError,
          "GpuMagmaMatrixInverse: failed to allocate memory for the output");
      goto fail;
    }
  }
  // magma matrix inverse
  N = dims[0];
  /* Device workspace for sgetri, sized per MAGMA's blocking factor. */
  ldwork = N * magma_get_sgetri_nb(N);
  dwork = gpudata_alloc(params->context->ctx, ldwork * sizeof(float), NULL, 0, NULL);
  if (dwork == NULL) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaMatrixInverse: failed to allocate working memory");
    goto fail;
  }
  if (magma_imalloc_cpu(&piv, N)) {
    PyErr_SetString(
        PyExc_RuntimeError,
        "GpuMagmaMatrixInverse: failed to allocate memory for the pivot array");
    goto fail;
  }
  magma_sgetrf_gpu(N, N, (float *)PyGpuArray_DEV_DATA(*A_inv), N, piv, &info);
  if (info != 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaMatrixInverse: magma_sgetrf_gpu returned error %d: %s.", info,
        magma_strerror(info));
    goto fail;
  }
  magma_sgetri_gpu(N, (float *)PyGpuArray_DEV_DATA(*A_inv), N, piv,
                   *(float **)dwork, ldwork, &info);
  if (info != 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaMatrixInverse: magma_sgetri_gpu returned error %d: %s.", info,
        magma_strerror(info));
    goto fail;
  }
  res = 0;
fail:
  if (piv != NULL)
    // Fix: piv was allocated with magma_imalloc_cpu, so it must be
    // released with magma_free_cpu (magma_free is for device memory);
    // matches how iwork_data is handled in magma_eigh above.
    magma_free_cpu(piv);
  if (dwork != NULL)
    gpudata_release(dwork);
  cuda_exit(params->context->ctx);
  return res;
}
#section kernels
#kernel triu_kernel : size, size, size, *:
#include "cluda.h"
// Zero the strictly lower-triangular part of a row-major matrix stored as a
// flat buffer of `nthreads` elements with `ncols` columns. `a_off` is the
// byte offset of the data within the GPU allocation `a`.
KERNEL void triu_kernel(const ga_size nthreads, const ga_size ncols,
                        const ga_size a_off, GLOBAL_MEM DTYPE_INPUT_0 *a) {
  a = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)a) + a_off);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0; index < nthreads;
       index += LDIM_0 * GDIM_0) {
    const ga_size ix = index / ncols;  // row
    const ga_size iy = index % ncols;  // column
    if (ix > iy) {  // strictly below the diagonal -> clear
      a[index] = 0.0;
    }
  }
}
#section init_code
setup_ext_cuda();
#section support_code
// Return a C-contiguous copy of `src` with dimension `dim` shrunk to `size`
// (keeping the leading `size` entries along that axis). Works by taking a
// view, shrinking the view's dimension in place, refreshing the flags, and
// copying. NOTE(review): the view's dimension is mutated without restoring
// it, but the view is a fresh object local to this call, so `src` itself is
// not affected.
static PyGpuArrayObject *pygpu_narrow(PyGpuArrayObject *src, size_t dim,
                                      size_t size) {
  PyGpuArrayObject *src_view = pygpu_view(src, Py_None);
  src_view->ga.dimensions[dim] = size;
  GpuArray_fix_flags(&src_view->ga);
  return pygpu_copy(src_view, GA_C_ORDER);
}
#section support_code_struct
/* GpuMagmaQR: QR decomposition of a float32 matrix using MAGMA.
 *
 * A_    -- input matrix; must be GA_FLOAT and C-contiguous. It is copied
 *          to column-major order before calling MAGMA, so A_ is unchanged.
 * R     -- output, the K x N upper-triangular factor (K = min(M, N)).
 * Q     -- output, the M x K orthonormal factor; only computed (and only
 *          valid) when params->complete is set.
 * params -- op parameters (GPU context, complete flag).
 *
 * Returns 0 on success, -1 on failure with a Python exception set.
 */
int APPLY_SPECIFIC(magma_qr)(PyGpuArrayObject *A_,
                             PyGpuArrayObject **R,
                             PyGpuArrayObject **Q, // may be NULL
                             PARAMS_TYPE* params) {
  PyGpuArrayObject *A = NULL;
  magma_int_t M, N, K, nb, ldwork;
  size_t n2;
  float *tau_data = NULL;      /* pinned host buffer for Householder scalars */
  gpudata *work_data = NULL;   /* device workspace for geqrf/orgqr */
  int res = -1, info;
  A = A_;
  if (A->ga.typecode != GA_FLOAT) {
    PyErr_SetString(PyExc_TypeError,
                    "GpuMagmaQR: Unsupported data type");
    return -1;
  }
  // This is early to match the exit() in the fail label.
  cuda_enter(params->context->ctx);
  if (!GpuArray_IS_C_CONTIGUOUS(&A->ga)) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaQR: requires data to be C-contiguous");
    goto fail;
  }
  if (PyGpuArray_NDIM(A) != 2) {
    PyErr_SetString(PyExc_ValueError, "GpuMagmaQR: matrix rank error");
    goto fail;
  }
  // MAGMA expects column-major storage.
  A = pygpu_copy(A_, GA_F_ORDER);
  if (A == NULL) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaQR: failed to change to column-major order");
    goto fail;
  }
  // magma matrix qr
  M = PyGpuArray_DIM(A, 0);
  N = PyGpuArray_DIM(A, 1);
  K = M < N ? M : N;
  // NOTE(review): only K scalars are needed for tau; N * N matches the
  // historical allocation and is kept for safety.
  if (MAGMA_SUCCESS != magma_smalloc_pinned(&tau_data, N * N)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaQR: failed to allocate working memory");
    goto fail;
  }
  nb = magma_get_sgeqrf_nb(M, N);
  ldwork = (2 * K + magma_roundup(N, 32)) * nb;
  work_data = gpudata_alloc(params->context->ctx, ldwork * sizeof(float), NULL, 0, NULL);
  if (work_data == NULL) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaQR: failed to allocate working memory");
    goto fail;
  }
  // compute R
  magma_sgeqrf2_gpu(M, N, (float *)PyGpuArray_DEV_DATA(A), M, tau_data, &info);
  if (info != 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaQR: magma_sgeqrf2_gpu argument %d has an illegal value", -info);
    goto fail;
  }
  // R is the upper triangle of the first K rows of the factored matrix.
  *R = pygpu_narrow(A, 0, K);
  if (*R == NULL) {
    PyErr_SetString(PyExc_RuntimeError, "GpuMagmaQR: failed to narrow array");
    goto fail;
  }
  n2 = K * N;
  res = triu_kernel_scall(1, &n2, 0, n2, N, (*R)->ga.offset, (*R)->ga.data);
  if (res != GA_NO_ERROR) {
    PyErr_Format(PyExc_RuntimeError, "GpuMagmaQR: triu_kernel %s.",
                 GpuKernel_error(&k_triu_kernel, res));
    goto fail;
  }
  if (params->complete) {
    // compute Q: refactor a fresh column-major copy, then build the
    // orthonormal factor with sorgqr.
    Py_XDECREF(A);
    A = pygpu_copy(A_, GA_F_ORDER);
    if (A == NULL) {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMagmaQR: failed to change to column-major order");
      // BUGFIX: was `return -1;`, which leaked tau_data/work_data and
      // skipped cuda_exit(); route through the common cleanup instead.
      goto fail;
    }
    magma_sgeqrf_gpu(M, N, (float *)PyGpuArray_DEV_DATA(A), M, tau_data,
                     *(float **)work_data, &info);
    if (info != 0) {
      PyErr_Format(
          PyExc_RuntimeError,
          "GpuMagmaQR: magma_sgeqrf_gpu argument %d has an illegal value", -info);
      goto fail;
    }
    magma_sorgqr_gpu(M, K, K, (float *)PyGpuArray_DEV_DATA(A), M, tau_data,
                     *(float **)work_data, nb, &info);
    if (info != 0) {
      PyErr_Format(
          PyExc_RuntimeError,
          "GpuMagmaQR: magma_sorgqr_gpu argument %d has an illegal value", -info);
      goto fail;
    }
    // Q is the first K columns of the orthonormal matrix.
    *Q = pygpu_narrow(A, 1, K);
    if (*Q == NULL) {
      PyErr_SetString(PyExc_RuntimeError, "GpuMagmaQR: failed to narrow array");
      goto fail;
    }
  }
  res = 0;
fail:
  if (tau_data != NULL)
    magma_free_pinned(tau_data);
  if (work_data != NULL)
    gpudata_release(work_data);
  Py_XDECREF(A);
  cuda_exit(params->context->ctx);
  return res;
}
#section init_code
setup_ext_cuda();
#section support_code_struct
/* GpuMagmaSVD: singular value decomposition of a float32 matrix using
 * MAGMA's divide-and-conquer driver magma_sgesdd.
 *
 * A      -- input matrix; must be GA_FLOAT and C-contiguous.
 * S      -- output, the K = min(M, N) singular values.
 * U, VT  -- outputs for the singular vectors; both NULL when only the
 *           singular values are wanted.
 * params -- op parameters (GPU context, full_matrices flag).
 *
 * MAGMA works on column-major data, so the row-major input is treated as
 * its transpose: M/N are swapped below, and the roles of the computed U
 * and VT buffers are exchanged when writing the outputs (see the comments
 * near the end) to match numpy.linalg.svd.
 *
 * Returns 0 on success, -1 on failure with a Python exception set.
 */
int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A,
                           PyGpuArrayObject **S,
                           PyGpuArrayObject **U, // may be NULL
                           PyGpuArrayObject **VT, // may be NULL
                           PARAMS_TYPE *params) {
  bool compute_uv = (U != NULL);
  magma_int_t *iwork = NULL, iunused[1];
  magma_int_t M, N, K, ldu, ldv, M_U, N_VT, info;
  magma_vec_t jobz;
  size_t s_dims[1], u_dims[2], vt_dims[2];
  // All working buffers are pinned host memory.
  float *a_data = NULL, *s_data = NULL, *u_data = NULL, *vt_data = NULL,
        *work = NULL;
  float dummy[1];  // receives the optimal workspace size from the query call
  int res = -1, lwork;
  if (A->ga.typecode != GA_FLOAT) {
    PyErr_SetString(PyExc_TypeError,
                    "GpuMagmaSVD: Unsupported data type");
    return -1;
  }
  // This is early to match the exit() in the fail label.
  cuda_enter(params->context->ctx);
  if (!GpuArray_IS_C_CONTIGUOUS(&A->ga)) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaSVD: requires data to be C-contiguous");
    goto fail;
  }
  if (PyGpuArray_NDIM(A) != 2) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaSVD: matrix rank error");
    goto fail;
  }
  // magma matrix svd
  // reverse dimensions because MAGMA expects column-major matrices:
  M = PyGpuArray_DIM(A, 1);
  N = PyGpuArray_DIM(A, 0);
  K = M < N ? M : N;
  if (MAGMA_SUCCESS != magma_smalloc_pinned(&a_data, M * N)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate memory");
    goto fail;
  }
  // NOTE(review): a_data is pinned host memory; this relies on unified
  // addressing for cudaMemcpyDeviceToDevice to resolve the host pointer —
  // confirm, or consider cudaMemcpyDefault.
  cudaMemcpy(a_data, PyGpuArray_DEV_DATA(A), M * N * sizeof(float),
             cudaMemcpyDeviceToDevice);
  if (MAGMA_SUCCESS != magma_smalloc_pinned(&s_data, K)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate memory");
    goto fail;
  }
  if (compute_uv) {
    if (params->full_matrices) {
      jobz = MagmaAllVec;
    } else {
      jobz = MagmaSomeVec;
    }
    // Sizes of the vector outputs depend on full vs. reduced SVD.
    M_U  = (jobz == MagmaAllVec ? M : K);
    N_VT = (jobz == MagmaAllVec ? N : K);
    ldu = M;
    ldv = N_VT;
    if (MAGMA_SUCCESS != magma_smalloc_pinned(&u_data, M_U * M)) {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMagmaSVD: failed to allocate memory");
      goto fail;
    }
    if (MAGMA_SUCCESS != magma_smalloc_pinned(&vt_data, N * N_VT)) {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMagmaSVD: failed to allocate memory");
      goto fail;
    }
  } else {
    jobz = MagmaNoVec;
    ldu = M;
    ldv = N;
  }
  // query for workspace size (lwork = -1 convention)
  magma_sgesdd(jobz, M, N, NULL, M, NULL, NULL, ldu, NULL, ldv,
               dummy, -1, iunused, &info);
  lwork = (magma_int_t) MAGMA_S_REAL(dummy[0]);
  if (MAGMA_SUCCESS != magma_smalloc_pinned(&work, lwork)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate working memory");
    goto fail;
  }
  // 8*K integer workspace, as required by the sgesdd documentation.
  if (MAGMA_SUCCESS != magma_imalloc_cpu(&iwork, 8*K)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate working memory");
    goto fail;
  }
  // compute svd
  magma_sgesdd(jobz, M, N, a_data, M, s_data,
               u_data, ldu, vt_data, ldv, work, lwork, iwork, &info);
  if (info > 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaSVD: the updating process of SBDSDC did not converge (error: %d)",
        info);
    goto fail;
  } else if (info < 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaSVD: magma_sgesdd_gpu argument %d has an illegal value", -info);
    goto fail;
  }
  s_dims[0] = K;
  if (aesara_prep_output(S, 1, s_dims, A->ga.typecode, GA_C_ORDER, params->context) != 0){
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate memory");
    goto fail;
  }
  cudaMemcpy(PyGpuArray_DEV_DATA(*S), s_data, K * sizeof(float),
             cudaMemcpyDeviceToDevice);
  if (compute_uv) {
    u_dims[0] = N; u_dims[1] = N_VT;
    if (aesara_prep_output(U, 2, u_dims, A->ga.typecode, GA_C_ORDER, params->context) != 0){
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMagmaSVD: failed to allocate memory");
      goto fail;
    }
    // magma expects column-major matrices. Exchange u_data -> VT and vt_data -> U
    // to match numpy.linalg.svd output
    cudaMemcpy(PyGpuArray_DEV_DATA(*U), vt_data, N * N_VT * sizeof(float),
               cudaMemcpyDeviceToDevice);
    vt_dims[0] = M_U; vt_dims[1] = M;
    if (aesara_prep_output(VT, 2, vt_dims, A->ga.typecode, GA_C_ORDER, params->context) != 0){
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMagmaSVD: failed to allocate memory");
      goto fail;
    }
    // magma expects column-major matrices. Exchange u_data -> VT and vt_data -> U
    // to match numpy.linalg.svd output
    cudaMemcpy(PyGpuArray_DEV_DATA(*VT), u_data, M_U * M * sizeof(float),
               cudaMemcpyDeviceToDevice);
  }
  res = 0;
fail:
  if (a_data != NULL)
    magma_free_pinned(a_data);
  if (s_data != NULL)
    magma_free_pinned(s_data);
  if (u_data != NULL)
    magma_free_pinned(u_data);
  if (vt_data != NULL)
    magma_free_pinned(vt_data);
  if (work != NULL)
    magma_free_pinned(work);
  if (iwork != NULL)
    magma_free_cpu(iwork);
  cuda_exit(params->context->ctx);
  return res;
}
#section kernels
#kernel max_pool2d_kernel : size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
// 2D max pooling forward pass over a (num, channels, height, width) input
// `x`, writing the pooled (num, channels, pooled_height, pooled_width)
// output `z`. One loop iteration handles one output element.
KERNEL void max_pool2d_kernel(const ga_size nthreads,
   const ga_size num, const ga_size channels, const ga_size pooled_height,
   const ga_size pooled_width, const ga_size height, const ga_size width,
   GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, const ga_size kernel_h, const ga_size kernel_w,
   const ga_size stride_h, const ga_size stride_w, const ga_size pad_h, const ga_size pad_w,
   GLOBAL_MEM DTYPE_OUTPUT_0 *z, const ga_size z_off)
{
  // Apply the byte offsets of the buffers within their GPU allocations.
  x = (GLOBAL_MEM DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)x) + x_off);
  z = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((GLOBAL_MEM char *)z) + z_off);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < nthreads;
       index += LDIM_0 * GDIM_0) {
    // Decompose the flat output index into (n, c, ph, pw).
    const ga_size pw = index % pooled_width;
    const ga_size ph = (index / pooled_width) % pooled_height;
    const ga_size c = (index / pooled_width / pooled_height) % channels;
    const ga_size n = (index / pooled_width / pooled_height / channels);
    // Window start may be conceptually negative (padding); hstart + kernel_h
    // still yields the right value under unsigned modular arithmetic.
    ga_int hstart = (ga_int)(ph*stride_h) - (ga_int)(pad_h);
    const ga_size hend = min(hstart + kernel_h, height);
    ga_int wstart = (ga_int)(pw*stride_w) - (ga_int)(pad_w);
    const ga_size wend = min(wstart + kernel_w, width);
    hstart = max(hstart, 0);
    wstart = max(wstart, 0);
    const ga_size offset = (n*channels + c) * height * width;
    GLOBAL_MEM const DTYPE_INPUT_0* x_slice = x + offset;
    DTYPE_OUTPUT_0 maxval = x_slice[hstart*width + wstart];
    for (ga_size h=hstart; h < hend; ++h) {
      for (ga_size w=wstart; w < wend; ++w) {
        // maximum in the region
        if (x_slice[h*width + w] > maxval) {
          maxval = x_slice[h*width + w];
        }
      }
    }
    z[index] = maxval;
  }
}
#kernel max_pool3d_kernel : size, size, size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
// 3D max pooling forward pass over a (num, channels, depth, height, width)
// input `x`; one loop iteration computes one pooled output element of `z`.
KERNEL void max_pool3d_kernel(const ga_size nthreads,
   const ga_size num, const ga_size channels, const ga_size pooled_depth,
   const ga_size pooled_height, const ga_size pooled_width,
   const ga_size depth, const ga_size height, const ga_size width,
   GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, const ga_size kernel_d, const ga_size kernel_h,
   const ga_size kernel_w, const ga_size stride_d, const ga_size stride_h,
   const ga_size stride_w, const ga_size pad_d, const ga_size pad_h, const ga_size pad_w,
   GLOBAL_MEM DTYPE_OUTPUT_0 *z, const ga_size z_off)
{
  // Apply the byte offsets of the buffers within their GPU allocations.
  x = (GLOBAL_MEM DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)x) + x_off);
  z = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((GLOBAL_MEM char *)z) + z_off);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < nthreads;
       index += LDIM_0 * GDIM_0) {
    // Decompose the flat output index into (n, c, pd, ph, pw).
    const ga_size pw = index % pooled_width;
    const ga_size ph = (index / pooled_width) % pooled_height;
    const ga_size pd = (index / pooled_width / pooled_height) % pooled_depth;
    const ga_size c = (index / pooled_width / pooled_height / pooled_depth) % channels;
    const ga_size n = (index / pooled_width / pooled_height / pooled_depth / channels);
    // Window bounds; negative starts (padding) still produce correct ends
    // through unsigned modular arithmetic, then get clamped to 0 below.
    ga_int dstart = (ga_int)(pd*stride_d) - (ga_int)(pad_d);
    const ga_size dend = min(dstart + kernel_d, depth);
    ga_int hstart = (ga_int)(ph*stride_h) - (ga_int)(pad_h);
    const ga_size hend = min(hstart + kernel_h, height);
    ga_int wstart = (ga_int)(pw*stride_w) - (ga_int)(pad_w);
    const ga_size wend = min(wstart + kernel_w, width);
    dstart = max(dstart, 0);
    hstart = max(hstart, 0);
    wstart = max(wstart, 0);
    const ga_size offset = (n*channels + c) * depth * height * width;
    GLOBAL_MEM const DTYPE_INPUT_0* x_slice = x + offset;
    DTYPE_OUTPUT_0 maxval = x_slice[(dstart*height + hstart)*width + wstart];
    for (ga_size d=dstart; d < dend; ++d) {
      for (ga_size h=hstart; h < hend; ++h) {
        for (ga_size w=wstart; w < wend; ++w) {
          // maximum in the region
          if (x_slice[(d*height + h)*width + w] > maxval) {
            maxval = x_slice[(d*height + h)*width + w];
          }
        }
      }
    }
    z[index] = maxval;
  }
}
#kernel ave_pool2d_kernel : size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, bool, bool, *, size:
#include "cluda.h"
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
// 2D average/sum pooling forward pass. With sum_mode the window sum is
// written; otherwise the sum is divided by the window size, which counts
// padded positions when inc_pad is set and only in-bounds positions
// otherwise.
KERNEL void ave_pool2d_kernel(const ga_size nthreads,
   const ga_size num, const ga_size channels, const ga_size pooled_height,
   const ga_size pooled_width, const ga_size height, const ga_size width,
   GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, const ga_size kernel_h, const ga_size kernel_w,
   const ga_size stride_h, const ga_size stride_w, const ga_size pad_h, const ga_size pad_w,
   const ga_bool inc_pad, const ga_bool sum_mode,
   GLOBAL_MEM DTYPE_OUTPUT_0 *z, const ga_size z_off)
{
  // Apply the byte offsets of the buffers within their GPU allocations.
  x = (GLOBAL_MEM DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)x) + x_off);
  z = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((GLOBAL_MEM char *)z) + z_off);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < nthreads;
       index += LDIM_0 * GDIM_0) {
    // Decompose the flat output index into (n, c, ph, pw).
    const ga_size pw = index % pooled_width;
    const ga_size ph = (index / pooled_width) % pooled_height;
    const ga_size c = (index / pooled_width / pooled_height) % channels;
    const ga_size n = (index / pooled_width / pooled_height / channels);
    ga_int hstart = (ga_int)(ph*stride_h) - (ga_int)(pad_h);
    ga_size hend = min(hstart + kernel_h, height + pad_h);
    ga_int wstart = (ga_int)(pw*stride_w) - (ga_int)(pad_w);
    ga_size wend = min(wstart + kernel_w, width + pad_w);
    ga_size pool_size;
    // Window size including the padding...
    if (inc_pad) {
      pool_size = (hend - hstart) * (wend - wstart);
    }
    hstart = max(hstart, 0);
    wstart = max(wstart, 0);
    hend = min(hend, height);
    wend = min(wend, width);
    // ...or only the in-bounds part, after clipping to the image.
    if (!inc_pad) {
      pool_size = (hend - hstart) * (wend - wstart);
    }
    const ga_size offset = (n*channels + c) * height * width;
    GLOBAL_MEM const DTYPE_INPUT_0* x_slice = x + offset;
    DTYPE_OUTPUT_0 collector = 0;
    for (ga_size h=hstart; h < hend; ++h) {
      for (ga_size w=wstart; w < wend; ++w) {
        collector += x_slice[h * width + w];
      }
    }
    if (sum_mode) {
      z[index] = collector;
    }
    else {
      z[index] = collector / pool_size;
    }
  }
}
#kernel ave_pool3d_kernel : size, size, size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, size, size, size, bool, bool, *, size :
#include "cluda.h"
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
// 3D average/sum pooling forward pass; the 3D analogue of
// ave_pool2d_kernel above (same inc_pad/sum_mode semantics).
KERNEL void ave_pool3d_kernel(const ga_size nthreads,
   const ga_size num, const ga_size channels, const ga_size pooled_depth,
   const ga_size pooled_height, const ga_size pooled_width,
   const ga_size depth, const ga_size height, const ga_size width,
   GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, const ga_size kernel_d, const ga_size kernel_h,
   const ga_size kernel_w, const ga_size stride_d, const ga_size stride_h,
   const ga_size stride_w, const ga_size pad_d, const ga_size pad_h, const ga_size pad_w,
   const ga_bool inc_pad, const ga_bool sum_mode,
   GLOBAL_MEM DTYPE_OUTPUT_0 *z, const ga_size z_off)
{
  // grid stride looping
  // Apply the byte offsets of the buffers within their GPU allocations.
  x = (GLOBAL_MEM DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)x) + x_off);
  z = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((GLOBAL_MEM char *)z) + z_off);
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < nthreads;
       index += LDIM_0 * GDIM_0) {
    // Decompose the flat output index into (n, c, pd, ph, pw).
    const ga_size pw = index % pooled_width;
    const ga_size ph = (index / pooled_width) % pooled_height;
    const ga_size pd = (index / pooled_width / pooled_height) % pooled_depth;
    const ga_size c = (index / pooled_width / pooled_height / pooled_depth) % channels;
    const ga_size n = (index / pooled_width / pooled_height / pooled_depth / channels);
    ga_int dstart = (ga_int)(pd*stride_d) - (ga_int)(pad_d);
    ga_size dend = min(dstart + kernel_d, depth + pad_d);
    ga_int hstart = (ga_int)(ph*stride_h) - (ga_int)(pad_h);
    ga_size hend = min(hstart + kernel_h, height + pad_h);
    ga_int wstart = (ga_int)(pw*stride_w) - (ga_int)(pad_w);
    ga_size wend = min(wstart + kernel_w, width + pad_w);
    ga_size pool_size;
    // Window volume including the padding...
    if (inc_pad) {
      pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
    }
    dstart = max(dstart, 0);
    hstart = max(hstart, 0);
    wstart = max(wstart, 0);
    dend = min(dend, depth);
    hend = min(hend, height);
    wend = min(wend, width);
    // ...or only the in-bounds part, after clipping to the volume.
    if (!inc_pad) {
      pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
    }
    const ga_size offset = (n*channels + c) * depth * height * width;
    GLOBAL_MEM const DTYPE_INPUT_0* x_slice = x + offset;
    DTYPE_OUTPUT_0 collector = 0;
    for (ga_size d=dstart; d < dend; ++d) {
      for (ga_size h=hstart; h < hend; ++h) {
        for (ga_size w=wstart; w < wend; ++w) {
          collector += x_slice[(d * height + h) * width + w];
        }
      }
    }
    if (sum_mode) {
      z[index] = collector;
    }
    else {
      z[index] = collector / pool_size;
    }
  }
}
#section support_code
// output shape for a given input padded shape, window shape and stride
// We use ssize_t in the max since this is done to avoid negative results.
// With ignore_border the last (partial) window is dropped; otherwise every
// position that starts inside the input produces an output element.
#define OUTPUT_DIMS(in_dim, ws, st, ignore_border) \
  (ignore_border ? (in_dim - ws)/st + 1 :          \
                   (st > ws ? (in_dim - 1)/st + 1 :      \
                    std::max<ssize_t>(0, (in_dim - 1 - ws + st)/st) + 1))
#section support_code_struct
/* GpuPool: forward pooling (max / average / sum) over the trailing 2 or 3
 * dimensions of a (batch, channels, ...) input.
 *
 * x      -- input array; must be C-contiguous, rank = len(ws) + 2.
 * ws     -- int64 vector of window sizes per pooled dimension.
 * stride -- int64 vector of strides per pooled dimension.
 * pad    -- int64 vector of paddings per pooled dimension (requires
 *           params->ignore_border when nonzero).
 * z      -- output array, allocated here.
 * params -- op parameters (mode, ignore_border, context).
 *
 * Returns 0 on success, 1 on failure with a Python exception set.
 */
int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x,
                         PyArrayObject *ws,
                         PyArrayObject *stride,
                         PyArrayObject *pad,
                         PyGpuArrayObject **z,
                         PARAMS_TYPE* params) {
  bool max_pool = (params->mode == POOLING_MAX);
  bool inc_pad = (params->mode != POOLING_AVERAGE_COUNT_EXCLUDE_PADDING);
  bool sum_mode = (params->mode == POOLING_SUM);
  if (!GpuArray_IS_C_CONTIGUOUS(&x->ga))
    {
      PyErr_Format(PyExc_ValueError,
                   "GpuPool: requires data to be C-contiguous");
      return 1;
    }
  size_t ndims = PyArray_DIM(ws, 0);
  if (PyGpuArray_NDIM(x) != ndims + 2)
    {
      PyErr_SetString(PyExc_ValueError, "GpuPool: rank error");
      return 1;
    }
  // prepare output
  const size_t* x_dims = PyGpuArray_DIMS(x);
  size_t z_dims[5]; // avoid warning if use 2 + nd
  size_t w[3];
  size_t s[3];
  size_t p[3]; z_dims[0] = x_dims[0]; z_dims[1] = x_dims[1];
  int nonzero_padding = 0;
  for (int i = 0; i < ndims; i++) {
    w[i] = *((npy_int64*)PyArray_GETPTR1(ws, i));
    s[i] = *((npy_int64*)PyArray_GETPTR1(stride, i));
    p[i] = *((npy_int64*)PyArray_GETPTR1(pad, i));
    z_dims[2 + i] = OUTPUT_DIMS(x_dims[2 + i] + 2*p[i], w[i], s[i], params->ignore_border);
    if (p[i] > 0) {
      nonzero_padding = 1;
    }
  }
  if (!params->ignore_border && nonzero_padding) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuPool: padding works only with ignore_border=True");
    return 1;
  }
  if (aesara_prep_output(z, PyGpuArray_NDIM(x), z_dims,
                         x->ga.typecode, GA_C_ORDER, params->context) != 0)
    {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuPool: failed to allocate memory");
      return 1;
    }
  {
    // scope for running kernel
    int err;
    if (ndims == 2) {
      // One kernel thread (logical) per output element.
      size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3];
      if (max_pool) {
        err = max_pool2d_kernel_scall(1, &num_kernels, 0, num_kernels,
                                      z_dims[0], z_dims[1], z_dims[2], z_dims[3],
                                      x_dims[2], x_dims[3],
                                      x->ga.data, x->ga.offset, w[0], w[1], s[0], s[1], p[0], p[1],
                                      (*z)->ga.data, (*z)->ga.offset);
        if (err != GA_NO_ERROR) {
          PyErr_Format(PyExc_RuntimeError,
                       "GpuPool: max_pool2d_kernel %s.",
                       GpuKernel_error(&k_max_pool2d_kernel, err));
          return 1;
        }
      } else {
        err = ave_pool2d_kernel_scall(1, &num_kernels, 0, num_kernels,
                                      z_dims[0], z_dims[1], z_dims[2], z_dims[3],
                                      x_dims[2], x_dims[3],
                                      x->ga.data, x->ga.offset,
                                      w[0], w[1], s[0], s[1], p[0], p[1],
                                      inc_pad, sum_mode,
                                      (*z)->ga.data, (*z)->ga.offset);
        if (err != GA_NO_ERROR) {
          PyErr_Format(PyExc_RuntimeError,
                       "GpuPool: ave_pool2d_kernel %s.",
                       GpuKernel_error(&k_ave_pool2d_kernel, err));
          return 1;
        }
      }
    }
    else if (ndims == 3) {
      size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3] * z_dims[4];
      if (max_pool) {
        err = max_pool3d_kernel_scall(1, &num_kernels, 0, num_kernels,
                                      z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4],
                                      x_dims[2], x_dims[3], x_dims[4],
                                      x->ga.data, x->ga.offset, w[0], w[1], w[2], s[0], s[1], s[2],
                                      p[0], p[1], p[2], (*z)->ga.data, (*z)->ga.offset);
        if (err != GA_NO_ERROR) {
          // BUGFIX: the error string must come from the 3d kernel handle;
          // this previously queried k_max_pool2d_kernel (copy-paste).
          PyErr_Format(PyExc_RuntimeError,
                       "GpuPool: max_pool3d_kernel %s.",
                       GpuKernel_error(&k_max_pool3d_kernel, err));
          return 1;
        }
      } else {
        err = ave_pool3d_kernel_scall(1, &num_kernels, 0, num_kernels,
                                      z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4],
                                      x_dims[2], x_dims[3], x_dims[4],
                                      x->ga.data, x->ga.offset,
                                      w[0], w[1], w[2], s[0], s[1], s[2],
                                      p[0], p[1], p[2],
                                      inc_pad, sum_mode,
                                      (*z)->ga.data, (*z)->ga.offset);
        if (err != GA_NO_ERROR) {
          PyErr_Format(PyExc_RuntimeError,
                       "GpuPool: ave_pool3d_kernel %s.",
                       GpuKernel_error(&k_ave_pool3d_kernel, err));
          return 1;
        }
      }
    }
  }
  return 0;
}
#section kernels
#kernel ave_pool2d_grad_kernel : size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, bool, bool, *, size :
#include "cluda.h"
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
// Gradient of 2D average/sum pooling: each input position accumulates the
// output gradients `gz` of every pooling window that covered it (divided by
// the window size unless sum_mode). `x` and inc_pad are part of the call
// signature but unused in the computation.
KERNEL void ave_pool2d_grad_kernel(const ga_size nthreads,
   const ga_size num, const ga_size channels, const ga_size height,
   const ga_size width, const ga_size pooled_height, const ga_size pooled_width,
   GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, GLOBAL_MEM const DTYPE_INPUT_1 *gz, const ga_size gz_off,
   const ga_size kernel_h, const ga_size kernel_w, const ga_size stride_h, const ga_size stride_w,
   const ga_size pad_h, const ga_size pad_w, const ga_bool inc_pad, const ga_bool sum_mode,
   GLOBAL_MEM DTYPE_OUTPUT_0 *gx, const ga_size gx_off)
{
  // Apply the byte offsets of the buffers within their GPU allocations.
  x = (GLOBAL_MEM const DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)x) + x_off);
  gz = (GLOBAL_MEM const DTYPE_INPUT_1 *)(((GLOBAL_MEM char *)gz) + gz_off);
  gx = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((GLOBAL_MEM char *)gx) + gx_off);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < nthreads; index += LDIM_0 * GDIM_0) {
    // Decompose the flat input index into (n, c, h, w).
    const ga_size w = index % width;
    const ga_size h = (index / width) % height;
    const ga_size c = (index / width / height) % channels;
    const ga_size n = (index / width / height / channels);
    // Range of pooled positions whose windows include (h, w).
    const ga_size phstart = (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1;
    const ga_size phend = min((h + pad_h) / stride_h + 1, pooled_height);
    const ga_size pwstart = (w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1;
    const ga_size pwend = min((w + pad_w) / stride_w + 1, pooled_width);
    const ga_size offset = (n*channels + c) * pooled_height * pooled_width;
    GLOBAL_MEM const DTYPE_INPUT_1* gz_slice = gz + offset;
    DTYPE_OUTPUT_0 collector = 0;
    for (ga_size ph=phstart; ph < phend; ++ph) {
      for (ga_size pw=pwstart; pw < pwend; ++pw) {
        if (sum_mode) {
          // BUGFIX: must read from gz_slice (the (n, c) channel slice),
          // not from gz, matching the non-sum branch below.
          collector += gz_slice[ph*pooled_width + pw];
        } else {
          // figure out the pooling size
          const ga_size hstart = ph * stride_h - pad_h;
          const ga_size wstart = pw * stride_w - pad_w;
          const ga_size hend = min(hstart + kernel_h, height + pad_h);
          const ga_size wend = min(wstart + kernel_w, width + pad_w);
          const ga_size pool_size = (hend - hstart) * (wend - wstart);
          collector += gz_slice[ph*pooled_width + pw] / pool_size;
        }
      }
    }
    gx[index] = collector;
  }
}
#kernel ave_pool3d_grad_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, size, size, size, bool, bool, *, size :
#include "cluda.h"
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
// Gradient of 3D average/sum pooling; 3D analogue of
// ave_pool2d_grad_kernel. `x` and inc_pad are part of the call signature
// but unused in the computation.
KERNEL void ave_pool3d_grad_kernel(const ga_size nthreads,
   const ga_size num, const ga_size channels, const ga_size depth,
   const ga_size height, const ga_size width, const ga_size pooled_depth,
   const ga_size pooled_height, const ga_size pooled_width,
   GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, GLOBAL_MEM const DTYPE_INPUT_1 *gz, const ga_size gz_off,
   const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w,
   const ga_size stride_d, const ga_size stride_h, const ga_size stride_w,
   const ga_size pad_d, const ga_size pad_h, const ga_size pad_w,
   const ga_bool inc_pad, const ga_bool sum_mode, GLOBAL_MEM DTYPE_OUTPUT_0 *gx, const ga_size gx_off)
{
  // Apply the byte offsets of the buffers within their GPU allocations.
  x = (GLOBAL_MEM const DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)x) + x_off);
  gz = (GLOBAL_MEM const DTYPE_INPUT_1 *)(((GLOBAL_MEM char *)gz) + gz_off);
  gx = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((GLOBAL_MEM char *)gx) + gx_off);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < nthreads; index += LDIM_0 * GDIM_0) {
    // Decompose the flat input index into (n, c, d, h, w).
    const ga_size w = index % width;
    const ga_size h = (index / width) % height;
    const ga_size d = (index / width / height) % depth;
    const ga_size c = (index / width / height / depth) % channels;
    const ga_size n = (index / width / height / depth / channels);
    // Range of pooled positions whose windows include (d, h, w).
    const ga_size pdstart = (d + pad_d < kernel_d) ? 0 : (d + pad_d - kernel_d) / stride_d + 1;
    const ga_size pdend = min((d + pad_d) / stride_d + 1, pooled_depth);
    const ga_size phstart = (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1;
    const ga_size phend = min((h + pad_h) / stride_h + 1, pooled_height);
    const ga_size pwstart = (w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1;
    const ga_size pwend = min((w + pad_w) / stride_w + 1, pooled_width);
    const ga_size offset = (n*channels + c) * pooled_depth * pooled_height * pooled_width;
    GLOBAL_MEM const DTYPE_INPUT_1* gz_slice = gz + offset;
    DTYPE_OUTPUT_0 collector = 0;
    for (ga_size pd=pdstart; pd < pdend; ++pd) {
      for (ga_size ph=phstart; ph < phend; ++ph) {
        for (ga_size pw=pwstart; pw < pwend; ++pw) {
          if (sum_mode) {
            // BUGFIX: read from the (n, c) slice gz_slice with the full 3D
            // index (the depth term pd was missing, and the base pointer
            // lacked the channel offset).
            collector += gz_slice[(pd*pooled_height + ph)*pooled_width + pw];
          } else {
            // figure out the pooling size
            const ga_size dstart = pd * stride_d - pad_d;
            const ga_size hstart = ph * stride_h - pad_h;
            const ga_size wstart = pw * stride_w - pad_w;
            const ga_size dend = min(dstart + kernel_d, depth + pad_d);
            const ga_size hend = min(hstart + kernel_h, height + pad_h);
            const ga_size wend = min(wstart + kernel_w, width + pad_w);
            const ga_size pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
            // BUGFIX: same indexing fix as the sum branch (was
            // gz[ph*pooled_width + pw]).
            collector += gz_slice[(pd*pooled_height + ph)*pooled_width + pw] / pool_size;
          }
        }
      }
    }
    gx[index] = collector;
  }
}
#section support_code_struct
/* GpuAveragePoolGrad: gradient of average/sum pooling with respect to the
 * input, for 2 or 3 pooled dimensions.
 *
 * x      -- forward input; determines the output (gradient) shape.
 * gz     -- gradient of the pooled output.
 * ws, stride, pad -- int64 vectors describing the pooling geometry.
 * gx     -- output gradient w.r.t. x, allocated here.
 * params -- op parameters (mode, context).
 *
 * Returns 0 on success, 1 on failure with a Python exception set.
 */
int APPLY_SPECIFIC(ave_pool_grad)(PyGpuArrayObject *x,
                                  PyGpuArrayObject *gz,
                                  PyArrayObject *ws,
                                  PyArrayObject *stride,
                                  PyArrayObject *pad,
                                  PyGpuArrayObject **gx,
                                  PARAMS_TYPE* params) {
  bool inc_pad = (params->mode == POOLING_AVERAGE_COUNT_INCLUDE_PADDING);
  bool sum_mode = (params->mode == POOLING_SUM);
  if (!GpuArray_IS_C_CONTIGUOUS(&x->ga)
      || !GpuArray_IS_C_CONTIGUOUS(&gz->ga))
    {
      // BUGFIX: error messages previously said "GpuMaxPoolGrad" (copied
      // from the max-pool gradient), inconsistent with the
      // "GpuAveragePoolGrad" messages in the kernel error paths below.
      PyErr_Format(PyExc_ValueError,
                   "GpuAveragePoolGrad: requires data to be C-contiguous");
      return 1;
    }
  size_t ndims = PyArray_DIM(ws, 0);
  if (PyGpuArray_NDIM(x) != ndims + 2
      || PyGpuArray_NDIM(gz) != ndims + 2)
    {
      PyErr_SetString(PyExc_ValueError, "GpuAveragePoolGrad: rank error");
      return 1;
    }
  // The gradient has the same shape as the forward input.
  if (aesara_prep_output(gx, PyGpuArray_NDIM(x), PyGpuArray_DIMS(x),
                         x->ga.typecode, GA_C_ORDER, params->context) != 0)
    {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuAveragePoolGrad: failed to allocate memory");
      return 1;
    }
  {
    // scope for running kernel
    size_t w[3];
    size_t s[3];
    size_t p[3];
    for(int i = 0; i < ndims; i++) {
      w[i] = *((npy_int64*)PyArray_GETPTR1(ws, i));
      s[i] = *((npy_int64*)PyArray_GETPTR1(stride, i));
      p[i] = *((npy_int64*)PyArray_GETPTR1(pad, i));
    }
    int err;
    const size_t* z_dims = PyGpuArray_DIMS(gz);
    const size_t* x_dims = PyGpuArray_DIMS(x);
    if (ndims == 2) {
      // One kernel thread (logical) per input element.
      size_t num_kernels = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3];
      err = ave_pool2d_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
                                         x_dims[0], x_dims[1], x_dims[2], x_dims[3],
                                         z_dims[2], z_dims[3],
                                         x->ga.data, x->ga.offset,
                                         gz->ga.data, gz->ga.offset,
                                         w[0], w[1], s[0], s[1], p[0], p[1],
                                         inc_pad, sum_mode,
                                         (*gx)->ga.data, (*gx)->ga.offset);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuAveragePoolGrad: ave_pool2d_grad_kernel %s.",
                     GpuKernel_error(&k_ave_pool2d_grad_kernel, err));
        return 1;
      }
    } else if (ndims == 3) {
      size_t num_kernels = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3] * x_dims[4];
      err = ave_pool3d_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
                                         x_dims[0], x_dims[1], x_dims[2], x_dims[3], x_dims[4],
                                         z_dims[2], z_dims[3], z_dims[4],
                                         x->ga.data, x->ga.offset,
                                         gz->ga.data, gz->ga.offset,
                                         w[0], w[1], w[2], s[0], s[1], s[2],
                                         p[0], p[1], p[2], inc_pad, sum_mode,
                                         (*gx)->ga.data, (*gx)->ga.offset);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuAveragePoolGrad: ave_pool3d_grad_kernel %s.",
                     GpuKernel_error(&k_ave_pool3d_grad_kernel, err));
        return 1;
      }
    }
  }
  return 0;
}
#section kernels
#kernel max_pool2d_grad_grad_kernel : size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
// Second-order gradient of 2D max pooling. For each pooled position, sum
// the entries of `gx` at input positions whose value equals the pooled
// maximum `z` (all ties contribute, matching the forward max selection).
// x  -- forward input, z -- forward pooled output, gx -- gradient w.r.t.
// the first-order gradient; result written to gz (one value per pooled
// position).
KERNEL void max_pool2d_grad_grad_kernel(const ga_size nthreads,
   const ga_size num, const ga_size channels, const ga_size pooled_height,
   const ga_size pooled_width, const ga_size height, const ga_size width,
   GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, GLOBAL_MEM const DTYPE_INPUT_1 *z, const ga_size z_off, GLOBAL_MEM const DTYPE_INPUT_2 *gx, const ga_size gx_off,
   const ga_size kernel_h, const ga_size kernel_w, const ga_size stride_h, const ga_size stride_w,
   const ga_size pad_h, const ga_size pad_w,
   GLOBAL_MEM DTYPE_OUTPUT_0 *gz, const ga_size gz_off)
{
  // Apply the byte offsets of the buffers within their GPU allocations.
  x = (GLOBAL_MEM DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)x) + x_off);
  z = (GLOBAL_MEM DTYPE_INPUT_1 *)(((GLOBAL_MEM char *)z) + z_off);
  gx = (GLOBAL_MEM DTYPE_INPUT_2 *)(((GLOBAL_MEM char *)gx) + gx_off);
  gz = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((GLOBAL_MEM char *)gz) + gz_off);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < nthreads; index += LDIM_0 * GDIM_0) {
    // Decompose the flat pooled index into (n, c, ph, pw).
    const ga_size pw = index % pooled_width;
    const ga_size ph = (index / pooled_width) % pooled_height;
    const ga_size c = (index / pooled_width / pooled_height) % channels;
    const ga_size n = (index / pooled_width / pooled_height / channels);
    ga_int hstart = (ga_int)(ph*stride_h) - (ga_int)(pad_h);
    const ga_size hend = min(hstart + kernel_h, height);
    ga_int wstart = (ga_int)(pw*stride_w) - (ga_int)(pad_w);
    const ga_size wend = min(wstart + kernel_w, width);
    hstart = max(hstart, 0);
    wstart = max(wstart, 0);
    const ga_size offset = (n*channels + c) * height * width;
    GLOBAL_MEM const DTYPE_INPUT_0* x_slice = x + offset;
    GLOBAL_MEM const DTYPE_INPUT_2* gx_slice = gx + offset;
    DTYPE_OUTPUT_0 gradient = 0;
    for (ga_size h=hstart; h < hend; ++h) {
      for (ga_size w=wstart; w < wend; ++w) {
        // maximum in the region
        if (z[index] == x_slice[h * width + w]) {
          gradient += gx_slice[h * width + w];
        }
      }
    }
    gz[index] = gradient;
  }
}
#kernel max_pool3d_grad_grad_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
// 3D variant of max_pool2d_grad_grad_kernel: one thread per pooled output
// element; sums gx over every in-window input position whose value equals
// the stored maximum z[index] and writes the sum to gz[index].
KERNEL void max_pool3d_grad_grad_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size pooled_depth,
const ga_size pooled_height, const ga_size pooled_width,
const ga_size depth, const ga_size height, const ga_size width,
GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, GLOBAL_MEM const DTYPE_INPUT_1 *z, const ga_size z_off, GLOBAL_MEM const DTYPE_INPUT_2 *gx, const ga_size gx_off,
const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w,
const ga_size stride_d, const ga_size stride_h, const ga_size stride_w,
const ga_size pad_d, const ga_size pad_h, const ga_size pad_w,
GLOBAL_MEM DTYPE_OUTPUT_0 *gz, const ga_size gz_off)
{
// Apply the raw byte offsets to the array pointers.
x = (GLOBAL_MEM DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)x) + x_off);
z = (GLOBAL_MEM DTYPE_INPUT_1 *)(((GLOBAL_MEM char *)z) + z_off);
gx = (GLOBAL_MEM DTYPE_INPUT_2 *)(((GLOBAL_MEM char *)gx) + gx_off);
gz = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((GLOBAL_MEM char *)gz) + gz_off);
// grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < nthreads; index += LDIM_0 * GDIM_0) {
// Decompose the flat index into (n, c, pd, ph, pw) of the pooled output.
const ga_size pw = index % pooled_width;
const ga_size ph = (index / pooled_width) % pooled_height;
const ga_size pd = (index / pooled_width / pooled_height) % pooled_depth;
const ga_size c = (index / pooled_width / pooled_height / pooled_depth) % channels;
const ga_size n = (index / pooled_width / pooled_height / pooled_depth / channels);
// Window bounds per axis; starts may be negative before clamping (padding).
ga_int dstart = (ga_int)(pd*stride_d) - (ga_int)(pad_d);
const ga_size dend = min(dstart + kernel_d, depth);
ga_int hstart = (ga_int)(ph*stride_h) - (ga_int)(pad_h);
const ga_size hend = min(hstart + kernel_h, height);
ga_int wstart = (ga_int)(pw*stride_w) - (ga_int)(pad_w);
const ga_size wend = min(wstart + kernel_w, width);
dstart = max(dstart, 0);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
// Offset of this (n, c) volume in the flat x / gx buffers.
const ga_size offset = (n*channels + c) * depth * height * width;
GLOBAL_MEM const DTYPE_INPUT_0* x_slice = x + offset;
GLOBAL_MEM const DTYPE_INPUT_2* gx_slice = gx + offset;
DTYPE_OUTPUT_0 gradient = 0;
for (ga_size d=dstart; d < dend; ++d) {
for (ga_size h=hstart; h < hend; ++h) {
for (ga_size w=wstart; w < wend; ++w) {
// maximum in the region
if (z[index] == x_slice[(d * height + h) * width + w]) {
gradient += gx_slice[(d * height + h)* width + w];
}
}
}
}
gz[index] = gradient;
}
}
#section support_code_struct
// Driver for GpuPoolingGradGrad: validates layouts and ranks, allocates gz
// with the shape/dtype of the pooled output z, then dispatches the 2D or 3D
// max-pool grad-grad kernel.  Returns 0 on success, 1 with a Python
// exception set on failure.
int APPLY_SPECIFIC(pool_grad_grad)(PyGpuArrayObject *x,
PyGpuArrayObject *z,
PyGpuArrayObject *gx,
PyArrayObject *ws,
PyArrayObject *stride,
PyArrayObject *pad,
PyGpuArrayObject **gz,
PyGpuContextObject *ctx) {
// The kernels index with flat C-order arithmetic, so inputs must be
// C-contiguous.
if (!GpuArray_IS_C_CONTIGUOUS(&x->ga)
|| !GpuArray_IS_C_CONTIGUOUS(&z->ga)
|| !GpuArray_IS_C_CONTIGUOUS(&gx->ga))
{
PyErr_Format(PyExc_ValueError,
"GpuPoolingGradGrad: requires data to be C-contiguous");
return 1;
}
// ndims = number of pooled dimensions (2 or 3); arrays carry two leading
// (batch, channel) dimensions on top of that.
size_t ndims = PyArray_DIM(ws, 0);
if (PyGpuArray_NDIM(x) != ndims + 2
|| PyGpuArray_NDIM(z) != ndims + 2
|| PyGpuArray_NDIM(gx) != ndims + 2)
{
PyErr_SetString(PyExc_ValueError, "GpuPoolingGradGrad: rank error");
return 1;
}
if (aesara_prep_output(gz, PyGpuArray_NDIM(z), PyGpuArray_DIMS(z),
z->ga.typecode, GA_C_ORDER, ctx) != 0)
{
PyErr_SetString(PyExc_RuntimeError,
"GpuPoolingGradGrad: failed to allocate memory");
return 1;
}
{
// scope for running kernel
// Per-dimension window shape, stride and padding (stored as npy_int64
// in the host-side arrays).
size_t w[3];
size_t s[3];
size_t p[3];
for(int i = 0; i < ndims; i++) {
w[i] = *((npy_int64*)PyArray_GETPTR1(ws, i));
s[i] = *((npy_int64*)PyArray_GETPTR1(stride, i));
p[i] = *((npy_int64*)PyArray_GETPTR1(pad, i));
}
int err;
const size_t* z_dims = PyGpuArray_DIMS(z);
const size_t* x_dims = PyGpuArray_DIMS(x);
if (ndims == 2) {
// One thread per element of z (the pooled output).
size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3];
err = max_pool2d_grad_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
z_dims[0], z_dims[1], z_dims[2], z_dims[3],
x_dims[2], x_dims[3],
x->ga.data, x->ga.offset,
z->ga.data, z->ga.offset,
gx->ga.data, gx->ga.offset,
w[0], w[1], s[0], s[1], p[0], p[1],
(*gz)->ga.data, (*gz)->ga.offset);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuPoolingGradGrad: max_pool2d_grad_grad_kernel %s.",
GpuKernel_error(&k_max_pool2d_grad_grad_kernel, err));
return 1;
}
}
else if (ndims == 3) {
size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3] * z_dims[4];
err = max_pool3d_grad_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4],
x_dims[2], x_dims[3], x_dims[4],
x->ga.data, x->ga.offset,
z->ga.data, z->ga.offset,
gx->ga.data, gx->ga.offset,
w[0], w[1], w[2], s[0], s[1], s[2], p[0], p[1], p[2],
(*gz)->ga.data, (*gz)->ga.offset);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuPoolingGradGrad: max_pool3d_grad_grad_kernel %s.",
GpuKernel_error(&k_max_pool3d_grad_grad_kernel, err));
return 1;
}
}
}
return 0;
}
#section kernels
#kernel max_pool2d_grad_kernel : size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
// Gradient of 2D max pooling.  One thread per INPUT element: find all
// pooled outputs whose window covers this input position and accumulate
// their gz wherever this input equals the recorded maximum z.
KERNEL void max_pool2d_grad_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size height,
const ga_size width, const ga_size pooled_height, const ga_size pooled_width,
GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, GLOBAL_MEM const DTYPE_INPUT_1 *z, const ga_size z_off, GLOBAL_MEM const DTYPE_INPUT_2 *gz, const ga_size gz_off,
const ga_size kernel_h, const ga_size kernel_w, const ga_size stride_h, const ga_size stride_w,
const ga_size pad_h, const ga_size pad_w, GLOBAL_MEM DTYPE_OUTPUT_0 *gx, const ga_size gx_off)
{
// Apply the raw byte offsets to the array pointers.
x = (GLOBAL_MEM const DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)x) + x_off);
z = (GLOBAL_MEM const DTYPE_INPUT_1 *)(((GLOBAL_MEM char *)z) + z_off);
gz = (GLOBAL_MEM const DTYPE_INPUT_2 *)(((GLOBAL_MEM char *)gz) + gz_off);
gx = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((GLOBAL_MEM char *)gx) + gx_off);
// grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < nthreads; index += LDIM_0 * GDIM_0) {
// Decompose the flat index into (n, c, h, w) of the input.
const ga_size w = index % width;
const ga_size h = (index / width) % height;
const ga_size c = (index / width / height) % channels;
const ga_size n = (index / width / height / channels);
// Range of pooled outputs whose windows contain (h, w).
const ga_size phstart = (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1;
const ga_size phend = min((h + pad_h) / stride_h + 1, pooled_height);
const ga_size pwstart = (w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1;
const ga_size pwend = min((w + pad_w) / stride_w + 1, pooled_width);
// Offset of this (n, c) plane in the flat z / gz buffers.
const ga_size offset = (n*channels + c) * pooled_height * pooled_width;
GLOBAL_MEM const DTYPE_INPUT_1* z_slice = z + offset;
GLOBAL_MEM const DTYPE_INPUT_2* gz_slice = gz + offset;
DTYPE_OUTPUT_0 gradient = 0;
for (ga_size ph=phstart; ph < phend; ++ph) {
for (ga_size pw=pwstart; pw < pwend; ++pw) {
if (x[index] == z_slice[ph * pooled_width + pw]) {
gradient += gz_slice[ph * pooled_width + pw];
}
}
}
gx[index] = gradient;
}
}
#kernel max_pool3d_grad_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
// 3D variant of max_pool2d_grad_kernel: one thread per INPUT element;
// accumulates gz over all pooled outputs whose window covers this input
// position and whose recorded maximum z equals this input value.
KERNEL void max_pool3d_grad_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size depth,
const ga_size height, const ga_size width, const ga_size pooled_depth,
const ga_size pooled_height, const ga_size pooled_width,
GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, GLOBAL_MEM const DTYPE_INPUT_1 *z, const ga_size z_off, GLOBAL_MEM const DTYPE_INPUT_2 *gz, const ga_size gz_off,
const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w,
const ga_size stride_d, const ga_size stride_h, const ga_size stride_w,
const ga_size pad_d, const ga_size pad_h, const ga_size pad_w,
GLOBAL_MEM DTYPE_OUTPUT_0 *gx, const ga_size gx_off)
{
// Apply the raw byte offsets to the array pointers.
x = (GLOBAL_MEM const DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)x) + x_off);
z = (GLOBAL_MEM const DTYPE_INPUT_1 *)(((GLOBAL_MEM char *)z) + z_off);
gz = (GLOBAL_MEM const DTYPE_INPUT_2 *)(((GLOBAL_MEM char *)gz) + gz_off);
gx = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((GLOBAL_MEM char *)gx) + gx_off);
// grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < nthreads; index += LDIM_0 * GDIM_0) {
// Decompose the flat index into (n, c, d, h, w) of the input.
const ga_size w = index % width;
const ga_size h = (index / width) % height;
const ga_size d = (index / width / height) % depth;
const ga_size c = (index / width / height / depth) % channels;
const ga_size n = (index / width / height / depth / channels);
// Range of pooled outputs whose windows contain (d, h, w).
const ga_size pdstart = (d + pad_d < kernel_d) ? 0 : (d + pad_d - kernel_d) / stride_d + 1;
const ga_size pdend = min((d + pad_d) / stride_d + 1, pooled_depth);
const ga_size phstart = (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1;
const ga_size phend = min((h + pad_h) / stride_h + 1, pooled_height);
const ga_size pwstart = (w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1;
const ga_size pwend = min((w + pad_w) / stride_w + 1, pooled_width);
// Offset of this (n, c) volume in the flat z / gz buffers.
const ga_size offset = (n*channels + c) * pooled_depth * pooled_height * pooled_width;
GLOBAL_MEM const DTYPE_INPUT_1* z_slice = z + offset;
GLOBAL_MEM const DTYPE_INPUT_2* gz_slice = gz + offset;
DTYPE_OUTPUT_0 gradient = 0;
for (ga_size pd=pdstart; pd < pdend; ++pd) {
for (ga_size ph=phstart; ph < phend; ++ph) {
for (ga_size pw=pwstart; pw < pwend; ++pw) {
if (x[index] == z_slice[(pd * pooled_height + ph) * pooled_width + pw]) {
gradient += gz_slice[(pd * pooled_height + ph) * pooled_width + pw];
}
}
}
}
gx[index] = gradient;
}
}
#section support_code_struct
// Driver for GpuMaxPoolGrad: validates layouts and ranks, allocates gx with
// the shape/dtype of the input x, then dispatches the 2D or 3D max-pool
// gradient kernel.  Returns 0 on success, 1 with a Python exception set.
int APPLY_SPECIFIC(max_pool_grad)(PyGpuArrayObject *x,
PyGpuArrayObject *z,
PyGpuArrayObject *gz,
PyArrayObject *ws,
PyArrayObject *stride,
PyArrayObject *pad,
PyGpuArrayObject **gx,
PyGpuContextObject *ctx) {
// The kernels index with flat C-order arithmetic, so inputs must be
// C-contiguous.
if (!GpuArray_IS_C_CONTIGUOUS(&x->ga)
|| !GpuArray_IS_C_CONTIGUOUS(&z->ga)
|| !GpuArray_IS_C_CONTIGUOUS(&gz->ga))
{
PyErr_Format(PyExc_ValueError,
"GpuMaxPoolGrad: requires data to be C-contiguous");
return 1;
}
// ndims = number of pooled dimensions (2 or 3); arrays carry two leading
// (batch, channel) dimensions on top of that.
size_t ndims = PyArray_DIM(ws, 0);
if (PyGpuArray_NDIM(x) != ndims + 2
|| PyGpuArray_NDIM(z) != ndims + 2
|| PyGpuArray_NDIM(gz) != ndims + 2)
{
PyErr_SetString(PyExc_ValueError, "GpuMaxPoolGrad: rank error");
return 1;
}
if (aesara_prep_output(gx, PyGpuArray_NDIM(x), PyGpuArray_DIMS(x),
x->ga.typecode, GA_C_ORDER, ctx) != 0)
{
PyErr_SetString(PyExc_RuntimeError,
"GpuMaxPoolGrad: failed to allocate memory");
return 1;
}
{
// scope for running kernel
// Per-dimension window shape, stride and padding (stored as npy_int64
// in the host-side arrays).
size_t w[3];
size_t s[3];
size_t p[3];
for(int i = 0; i < ndims; i++) {
w[i] = *((npy_int64*)PyArray_GETPTR1(ws, i));
s[i] = *((npy_int64*)PyArray_GETPTR1(stride, i));
p[i] = *((npy_int64*)PyArray_GETPTR1(pad, i));
}
int err;
const size_t* z_dims = PyGpuArray_DIMS(z);
const size_t* x_dims = PyGpuArray_DIMS(x);
if (ndims == 2) {
// One thread per element of x (the un-pooled input).
size_t num_kernels = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3];
err = max_pool2d_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
x_dims[0], x_dims[1], x_dims[2], x_dims[3],
z_dims[2], z_dims[3],
x->ga.data, x->ga.offset,
z->ga.data, z->ga.offset,
gz->ga.data, gz->ga.offset,
w[0], w[1], s[0], s[1], p[0], p[1],
(*gx)->ga.data, (*gx)->ga.offset);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuMaxPoolGrad: max_pool2d_grad_kernel %s.",
GpuKernel_error(&k_max_pool2d_grad_kernel, err));
return 1;
}
} else if (ndims == 3) {
size_t num_kernels = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3] * x_dims[4];
err = max_pool3d_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
x_dims[0], x_dims[1], x_dims[2], x_dims[3], x_dims[4],
z_dims[2], z_dims[3], z_dims[4],
x->ga.data, x->ga.offset,
z->ga.data, z->ga.offset,
gz->ga.data, gz->ga.offset,
w[0], w[1], w[2], s[0], s[1], s[2],
p[0], p[1], p[2], (*gx)->ga.data, (*gx)->ga.offset);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuMaxPoolGrad: max_pool3d_grad_kernel %s.",
GpuKernel_error(&k_max_pool3d_grad_kernel, err));
return 1;
}
}
}
return 0;
}
#section kernels
#kernel max_pool2d_rop_kernel : size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
// R-op of 2D max pooling.  One thread per pooled output element: track the
// running maximum of x over the pooling window and carry along the matching
// element of ex (the evaluation point); the carried value is written to z.
KERNEL void max_pool2d_rop_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size pooled_height,
const ga_size pooled_width, const ga_size height, const ga_size width,
GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, GLOBAL_MEM const DTYPE_INPUT_1 *ex, const ga_size ex_off,
const ga_size kernel_h, const ga_size kernel_w,
const ga_size stride_h, const ga_size stride_w,
const ga_size pad_h, const ga_size pad_w,
GLOBAL_MEM DTYPE_OUTPUT_0 *z, const ga_size z_off)
{
// Apply the raw byte offsets.  Fixed: the casts previously contained a
// stray identifier -- "(GLOBAL_MEM DTYPE_INPUT_0 *x)" -- which is not a
// valid cast and would not compile.
x = (GLOBAL_MEM const DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)x) + x_off);
ex = (GLOBAL_MEM const DTYPE_INPUT_1 *)(((GLOBAL_MEM char *)ex) + ex_off);
z = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((GLOBAL_MEM char *)z) + z_off);
// grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < nthreads;
index += LDIM_0 * GDIM_0) {
// Decompose the flat index into (n, c, ph, pw) of the pooled output.
const ga_size pw = index % pooled_width;
const ga_size ph = (index / pooled_width) % pooled_height;
const ga_size c = (index / pooled_width / pooled_height) % channels;
const ga_size n = (index / pooled_width / pooled_height / channels);
// C-style casts replace static_cast for consistency with the other
// kernels in this file (cluda kernel sources are not guaranteed C++).
ga_int hstart = (ga_int)(ph*stride_h) - (ga_int)(pad_h);
const ga_size hend = min(hstart + kernel_h, height);
ga_int wstart = (ga_int)(pw*stride_w) - (ga_int)(pad_w);
const ga_size wend = min(wstart + kernel_w, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
const ga_size offset = (n*channels + c) * height * width;
GLOBAL_MEM const DTYPE_INPUT_0* x_slice = x + offset;
GLOBAL_MEM const DTYPE_INPUT_1* ex_slice = ex + offset;
// Seed with the first in-window element, then scan the rest.
DTYPE_OUTPUT_0 maxval = x_slice[hstart*width + wstart];
DTYPE_OUTPUT_0 collector = ex_slice[hstart*width + wstart];
for (ga_size h=hstart; h < hend; ++h) {
for (ga_size w=wstart; w < wend; ++w) {
// maximum in the region
if (x_slice[h*width + w] > maxval) {
maxval = x_slice[h*width + w];
collector = ex_slice[h*width + w];
}
}
}
z[index] = collector;
}
}
#kernel max_pool3d_rop_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, size, size, size, *, size :
#include "cluda.h"
// (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
// 3D variant of max_pool2d_rop_kernel: one thread per pooled output element;
// carries along the ex value of the in-window maximum of x into z.
KERNEL void max_pool3d_rop_kernel(const ga_size nthreads,
const ga_size num, const ga_size channels, const ga_size pooled_depth,
const ga_size pooled_height, const ga_size pooled_width,
const ga_size depth, const ga_size height, const ga_size width,
GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, GLOBAL_MEM const DTYPE_INPUT_1 *ex, const ga_size ex_off,
const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w,
const ga_size stride_d, const ga_size stride_h, const ga_size stride_w,
const ga_size pad_d, const ga_size pad_h, const ga_size pad_w,
GLOBAL_MEM DTYPE_OUTPUT_0 *z, const ga_size z_off)
{
// Fixed: the last parameter was declared "x_off" (a duplicate of the x
// offset), leaving the "z_off" used below undeclared.  Also fixed the
// invalid "(... *x)" casts, as in max_pool2d_rop_kernel.
x = (GLOBAL_MEM const DTYPE_INPUT_0 *)(((GLOBAL_MEM char *)x) + x_off);
ex = (GLOBAL_MEM const DTYPE_INPUT_1 *)(((GLOBAL_MEM char *)ex) + ex_off);
z = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((GLOBAL_MEM char *)z) + z_off);
// grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < nthreads;
index += LDIM_0 * GDIM_0) {
// Decompose the flat index into (n, c, pd, ph, pw) of the pooled output.
const ga_size pw = index % pooled_width;
const ga_size ph = (index / pooled_width) % pooled_height;
const ga_size pd = (index / pooled_width / pooled_height) % pooled_depth;
const ga_size c = (index / pooled_width / pooled_height / pooled_depth) % channels;
const ga_size n = (index / pooled_width / pooled_height / pooled_depth / channels);
// C-style casts replace static_cast for consistency with the other
// kernels in this file (cluda kernel sources are not guaranteed C++).
ga_int dstart = (ga_int)(pd*stride_d) - (ga_int)(pad_d);
const ga_size dend = min(dstart + kernel_d, depth);
ga_int hstart = (ga_int)(ph*stride_h) - (ga_int)(pad_h);
const ga_size hend = min(hstart + kernel_h, height);
ga_int wstart = (ga_int)(pw*stride_w) - (ga_int)(pad_w);
const ga_size wend = min(wstart + kernel_w, width);
dstart = max(dstart, 0);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
const ga_size offset = (n*channels + c) * depth * height * width;
GLOBAL_MEM const DTYPE_INPUT_0* x_slice = x + offset;
GLOBAL_MEM const DTYPE_INPUT_1* ex_slice = ex + offset;
// Seed with the first in-window element, then scan the rest.
DTYPE_OUTPUT_0 maxval = x_slice[(dstart*height + hstart)*width + wstart];
DTYPE_OUTPUT_0 collector = ex_slice[(dstart*height + hstart)*width + wstart];
for (ga_size d=dstart; d < dend; ++d) {
for (ga_size h=hstart; h < hend; ++h) {
for (ga_size w=wstart; w < wend; ++w) {
// maximum in the region
if (x_slice[(d*height + h)*width + w] > maxval) {
maxval = x_slice[(d*height + h)*width + w];
collector = ex_slice[(d*height + h)*width + w];
}
}
}
}
z[index] = collector;
}
}
#section support_code
// output shape for a given input padded shape, window shape and stride
// - ignore_border: only complete windows contribute -> (in_dim - ws)/st + 1
// - otherwise an incomplete window at the border adds one more output; the
//   two sub-branches distinguish st > ws (windows with gaps between them)
//   from overlapping/adjacent windows, clamping at 0 for degenerate sizes.
#define OUTPUT_DIMS(in_dim, ws, st, ignore_border) \
(ignore_border ? (in_dim - ws)/st + 1 : \
(st > ws ? (in_dim - 1)/st + 1 : \
std::max<ssize_t>(0, (in_dim - 1 - ws + st)/st) + 1))
#section support_code_struct
// Driver for GpuMaxPoolRop: validates inputs, computes the pooled output
// shape via OUTPUT_DIMS, allocates z with ex's dtype, and dispatches the
// 2D or 3D R-op kernel.  Returns 0 on success, 1 with a Python exception
// set on failure.
int APPLY_SPECIFIC(max_pool_rop)(PyGpuArrayObject *x,
                                 PyGpuArrayObject *ex,
                                 PyArrayObject *ws,
                                 PyArrayObject *stride,
                                 PyArrayObject *pad,
                                 PyGpuArrayObject **z,
                                 PARAMS_TYPE* params) {
  // The kernels index with flat C-order arithmetic, so inputs must be
  // C-contiguous.
  if (!GpuArray_IS_C_CONTIGUOUS(&x->ga) || !GpuArray_IS_C_CONTIGUOUS(&ex->ga))
    {
      PyErr_Format(PyExc_ValueError,
                   "GpuMaxPoolRop: requires data to be C-contiguous");
      return 1;
    }
  // ndims = number of pooled dimensions (2 or 3); arrays carry two leading
  // (batch, channel) dimensions on top of that.
  size_t ndims = PyArray_DIM(ws, 0);
  if (PyGpuArray_NDIM(x) != ndims + 2 || PyGpuArray_NDIM(ex) != ndims + 2)
    {
      PyErr_SetString(PyExc_ValueError, "GpuMaxPoolRop: rank error");
      return 1;
    }
  // prepare output: batch/channel dims are copied from x, spatial dims
  // follow OUTPUT_DIMS applied to the padded input.
  const size_t* x_dims = PyGpuArray_DIMS(x);
  size_t z_dims[5]; // avoid warning if use 2 + nd
  size_t w[3];
  size_t s[3];
  size_t p[3];
  z_dims[0] = x_dims[0];
  z_dims[1] = x_dims[1];
  int nonzero_padding = 0;
  for (int i = 0; i < ndims; i++) {
    w[i] = *((npy_int64*)PyArray_GETPTR1(ws, i));
    s[i] = *((npy_int64*)PyArray_GETPTR1(stride, i));
    p[i] = *((npy_int64*)PyArray_GETPTR1(pad, i));
    z_dims[2 + i] = OUTPUT_DIMS(x_dims[2 + i] + 2*p[i], w[i], s[i], params->ignore_border);
    if (p[i] > 0) {
      nonzero_padding = 1;
    }
  }
  if (!params->ignore_border && nonzero_padding) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMaxPoolRop: padding works only with ignore_border=True");
    return 1;
  }
  if (aesara_prep_output(z, PyGpuArray_NDIM(ex), z_dims,
                         ex->ga.typecode, GA_C_ORDER, params->context) != 0)
    {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMaxPoolRop: failed to allocate memory");
      return 1;
    }
  {
    // scope for running kernel
    int err;
    if (ndims == 2) {
      // One thread per element of z (the pooled output).
      size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3];
      err = max_pool2d_rop_kernel_scall(1, &num_kernels, 0, num_kernels,
                                        z_dims[0], z_dims[1], z_dims[2], z_dims[3],
                                        x_dims[2], x_dims[3],
                                        x->ga.data, x->ga.offset,
                                        ex->ga.data, ex->ga.offset,
                                        w[0], w[1], s[0], s[1], p[0], p[1],
                                        (*z)->ga.data, (*z)->ga.offset);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuMaxPoolRop: max_pool2d_rop_kernel %s.",
                     GpuKernel_error(&k_max_pool2d_rop_kernel, err));
        return 1;
      }
    }
    else if (ndims == 3) {
      size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3] * z_dims[4];
      err = max_pool3d_rop_kernel_scall(1, &num_kernels, 0, num_kernels,
                                        z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4],
                                        x_dims[2], x_dims[3], x_dims[4],
                                        x->ga.data, x->ga.offset,
                                        ex->ga.data, ex->ga.offset,
                                        w[0], w[1], w[2], s[0], s[1], s[2],
                                        p[0], p[1], p[2],
                                        (*z)->ga.data, (*z)->ga.offset);
      if (err != GA_NO_ERROR) {
        // Fixed: this branch previously queried the 2D kernel object
        // (k_max_pool2d_rop_kernel) for the 3D kernel's error string.
        PyErr_Format(PyExc_RuntimeError,
                     "GpuMaxPoolRop: max_pool3d_rop_kernel %s.",
                     GpuKernel_error(&k_max_pool3d_rop_kernel, err));
        return 1;
      }
    }
  }
  return 0;
}
// modified from pytorch
// https://github.com/pytorch/pytorch/blob/master/torch/lib/THC/THCTensorTopK.cuh
// original license below:
/*
Copyright (c) 2016- Facebook, Inc (Adam Paszke)
Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
Copyright (c) 2011-2013 NYU (Clement Farabet)
Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
and IDIAP Research Institute nor the names of its contributors may be
used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
// __ldg (load through the read-only data cache) only exists on compute
// capability >= 3.5; fall back to a plain dereference on older devices.
#if __CUDA_ARCH__ < 350
#define __ldg(ptr) (*(ptr))
#endif
typedef ptrdiff_t ssize_t;
// Lane index of the calling thread within its warp (PTX %laneid register).
// NOTE(review): this asm string escapes laneid with a single '%' while the
// lanemask helpers below use '%%'; confirm it assembles as intended.
__device__ __forceinline__ int lane_id() {
int id;
asm("mov.s32 %0, %laneid;" : "=r"(id) );
return id;
}
// Bitmask of lanes whose id is strictly lower than the calling lane's.
__device__ __forceinline__ unsigned lane_mask_lt() {
unsigned mask;
asm("mov.u32 %0, %%lanemask_lt;" : "=r"(mask));
return mask;
}
// Bitmask of lanes whose id is lower than or equal to the calling lane's.
__device__ __forceinline__ unsigned lane_mask_le() {
unsigned mask;
asm("mov.u32 %0, %%lanemask_le;" : "=r"(mask));
return mask;
}
// Bitmask of lanes whose id is strictly greater than the calling lane's.
__device__ __forceinline__ unsigned lane_mask_gt() {
unsigned mask;
asm("mov.u32 %0, %%lanemask_gt;" : "=r"(mask));
return mask;
}
// Bitmask of lanes whose id is greater than or equal to the calling lane's.
__device__ __forceinline__ unsigned lane_mask_ge() {
unsigned mask;
asm("mov.u32 %0, %%lanemask_ge;" : "=r"(mask));
return mask;
}
// Bitfield<T>: thin wrappers over the PTX bit-field extract (bfe) and
// insert (bfi) instructions, specialized for 32- and 64-bit unsigned keys.
template <typename T>
struct Bitfield {};
template <>
struct Bitfield<unsigned int> {
// Extract `len` bits of `val` starting at bit position `pos`.
static __device__ __forceinline__
unsigned int get(unsigned int val, int pos, int len) {
unsigned int ret;
asm("bfe.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(val), "r"(pos), "r"(len));
return ret;
}
// Return `val` with `len` bits at position `pos` replaced by `toInsert`.
static __device__ __forceinline__
unsigned int set(unsigned int val, unsigned int toInsert, int pos, int len) {
unsigned int ret;
asm("bfi.b32 %0, %1, %2, %3, %4;" :
"=r"(ret) : "r"(toInsert), "r"(val), "r"(pos), "r"(len));
return ret;
}
};
template <>
struct Bitfield<unsigned long long int> {
// Extract `len` bits of `val` starting at bit position `pos`.
static __device__ __forceinline__
unsigned long long int get(unsigned long long int val, int pos, int len) {
unsigned long long int ret;
asm("bfe.u64 %0, %1, %2, %3;" : "=l"(ret) : "l"(val), "r"(pos), "r"(len));
return ret;
}
// Return `val` with `len` bits at position `pos` replaced by `toInsert`.
static __device__ __forceinline__
unsigned long long int set(unsigned long long int val, unsigned long long int toInsert, int pos, int len) {
unsigned long long int ret;
asm("bfi.b64 %0, %1, %2, %3, %4;" :
"=l"(ret) : "l"(toInsert), "l"(val), "r"(pos), "r"(len));
return ret;
}
};
template <typename T>
struct RadixConfig {
  // Converts a type (maybe float) to an integer representation with the same
  // sorting; i.e., for floats f1, f2:
  //   if f1 < f2 then convert(f1) < convert(f2)
  // We use this to enable radix selection of floating-point values.
  // This also gives a relative order for NaNs, but that's ok, as they
  // will all be adjacent
  typedef unsigned int RadixType;
  static inline __device__ RadixType convert(T v) {
    return (RadixType)v;
  }
  // Fixed: the generic deconvert previously declared its return type as
  // `float` instead of `T`, silently converting any type that fell through
  // to this default implementation.
  static inline __device__ T deconvert(RadixType v) {
    return (T)v;
  }
};
// Floats: flip only the sign bit for non-negatives and ALL bits for
// negatives, so the unsigned bit patterns sort in the same order as the
// original float values.
template <>
struct RadixConfig<float> {
typedef unsigned int RadixType;
static inline __device__ RadixType convert(float v) {
RadixType x = __float_as_int(v);
RadixType mask = (x & 0x80000000) ? 0xffffffff : 0x80000000;
return (x ^ mask);
}
static inline __device__ float deconvert(RadixType v) {
RadixType mask = (v & 0x80000000) ? 0x80000000 : 0xffffffff;
return __int_as_float(v ^ mask);
}
};
// Doubles: same sign-flip trick on 64 bits; the mask is all-ones when the
// sign bit is set, otherwise just the sign bit.
template <>
struct RadixConfig<double> {
typedef unsigned long long RadixType;
static inline __device__ RadixType convert(double v) {
RadixType x = __double_as_longlong(v);
RadixType mask = -((x >> 63)) | 0x8000000000000000;
return (x ^ mask);
}
static inline __device__ double deconvert(RadixType v) {
RadixType mask = ((v >> 63) - 1) | 0x8000000000000000;
return __longlong_as_double(v ^ mask);
}
};
// Signed char: bias by 128 so the unsigned representation preserves order.
template <>
struct RadixConfig<char> {
typedef unsigned int RadixType;
static inline __device__ RadixType convert(char v) {
return 128u + v;
}
static inline __device__ char deconvert(RadixType v) {
return v - 128;
}
};
// g++ makes difference between 'signed char' (ga_byte, int8) and 'char'.
// Same code as for char.
template <>
struct RadixConfig<ga_byte> {
typedef unsigned int RadixType;
static inline __device__ RadixType convert(ga_byte v) {
return 128u + v;
}
static inline __device__ ga_byte deconvert(RadixType v) {
return v - 128;
}
};
// short: flip the sign bit (XOR with 2^15) to map onto order-preserving
// unsigned values; undone by subtracting 2^15.
template <>
struct RadixConfig<short> {
typedef unsigned int RadixType;
static inline __device__ RadixType convert(short v) {
assert(sizeof(short) == 2);
return 32768u ^ v;
}
static inline __device__ short deconvert(RadixType v) {
return v - 32768;
}
};
// int: bias by 2^31.
template <>
struct RadixConfig<int> {
typedef unsigned int RadixType;
static inline __device__ RadixType convert(int v) {
assert(sizeof(int) == 4);
return 2147483648u + v;
}
static inline __device__ int deconvert(RadixType v) {
return v - 2147483648u;
}
};
// long long: bias by 2^63.
template <>
struct RadixConfig<long long> {
typedef unsigned long long RadixType;
static inline __device__ RadixType convert(long long v) {
assert(sizeof(long long) == 8);
return 9223372036854775808ull + v;
}
static inline __device__ long long deconvert(RadixType v) {
return v - 9223372036854775808ull;
}
};
/* NB: This specialization for ga_half does know that ga_half is a struct with only one member of type ga_ushort.
 * So, if ga_half implementation changes, this code should change too.
 * TODO: Maybe should gpuarray provide abstract functions to manipulate ga_half internal structure? e.g:
 * unsigned short ga_half2bits(ga_half value);
 * ga_half ga_bits2half(unsigned short bits);
 */
// Half precision: same sign-flip trick as the float specialization, but on
// the 16-bit pattern stored in ga_half's single `data` member.
template <>
struct RadixConfig<ga_half> {
typedef unsigned int RadixType;
static inline __device__ RadixType convert(ga_half v) {
// mask = all-ones if the sign bit (bit 15) is set, else just bit 15.
RadixType mask = -(((RadixType)v.data >> 15)) | 0x8000;
return (v.data ^ mask);
}
static inline __device__ ga_half deconvert(RadixType v) {
RadixType mask = ((v >> 15) - 1) | 0x8000;
ga_half out = {(unsigned short)(v ^ mask)};
return out;
}
};
// $$inp_t should be replaced in c_code
// we cannot use templated kernel because gpuarray API does not support it
#define NDIM $ndim
#define INPUT_TYPE $inp_t
#define INDEX_TYPE $out_t
#define bitsof(T) (sizeof(T)*8)
#define radix_t RadixConfig<INPUT_TYPE>::RadixType
#define WRITE_VALUE $write_value
#define WRITE_INDEX $write_index
#if RADIX_SIZE > 32
#error "RADIX_SIZE must be smaller than warp size (32)"
#endif
// Overload so atomicAdd also accepts signed 64-bit operands: reinterpret
// the bits as unsigned long long and use the built-in unsigned overload.
void __device__ atomicAdd(long long *dst, long long &src) {
atomicAdd(
reinterpret_cast<unsigned long long*>(dst),
reinterpret_cast<unsigned long long&>(src));
}
template <typename T>
static inline __device__ T binary_cumsum(
int idx, int warp_id, T* smem, bool value) {
// cumsum within 1D thread block, which adds up `value` of all threads
// whose id is *no greater than* the current thread
// binary_cumsum(1, 0, 1, 0, 1) -> (1, 1, 2, 2, 3)
// cumsum within warp
// __ballot packs each lane's boolean into one warp-wide bitmask; popc of
// the bits at or below this lane gives the inclusive warp-local cumsum.
unsigned int warp_bits = __ballot(value);
T warp_sum = __popc(lane_mask_le() & warp_bits);
// Lane 0 of each warp publishes the warp's total to shared memory.
if (lane_id() == 0)
smem[warp_id] = __popc(warp_bits);
local_barrier();
// cumsum across warps in one thread
if (idx == 0) {
T sum = smem[0];
for (int i = 1; i < blockDim.x / GA_WARP_SIZE; ++i) {
sum += smem[i];
smem[i] = sum;
}
}
local_barrier();
// load the carry from the preceding warp
if (warp_id >= 1) {
warp_sum = warp_sum+smem[warp_id - 1];
}
return warp_sum;
}
template <typename T>
static inline __device__ T binary_cumsum_exclusive(
int idx, int warp_id, T* smem, bool value) {
// cumsum within 1D thread block, which adds up `value` of all threads
// whose id is *less than* the current thread
// binary_cumsum_excl(1, 0, 1, 0, 1) -> (0, 1, 1, 2, 2)
// cumsum within warp
// Same structure as binary_cumsum, but lane_mask_lt() excludes the calling
// lane's own bit, making the warp-local scan exclusive.
unsigned int warp_bits = __ballot(value);
T warp_sum = __popc(lane_mask_lt() & warp_bits);
// Lane 0 of each warp publishes the warp's total to shared memory.
if (lane_id() == 0)
smem[warp_id] = __popc(warp_bits);
local_barrier();
// cumsum across warps in one thread
if (idx == 0) {
T sum = smem[0];
for (int i = 1; i < blockDim.x / GA_WARP_SIZE; ++i) {
sum += smem[i];
smem[i] = sum;
}
}
local_barrier();
// load the carry from the preceding warp
if (warp_id >= 1)
warp_sum += smem[warp_id - 1];
return warp_sum;
}
// apply raw(byte) offset to pointer
template <typename T>
static __device__ inline T* ptr_add(T *ptr, ssize_t offset) {
return (T*)((char*)ptr + offset);
}
// get array element using raw(byte) offset
template <typename T>
static __device__ inline T& ptr_at(T *ptr, ssize_t offset) {
return *((T*)((char*)ptr + offset));
}
// read array element using raw(byte) offset
// (goes through __ldg, i.e. the read-only data cache where available)
template <typename T>
static __device__ inline T ptr_read_cached(T *ptr, ssize_t offset) {
return __ldg(((T*)((char*)ptr + offset)));
}
/* NB: __ldg is not defined for ga_half, so we must specialize ptr_read_cached.
 * To do it, I try to use a built-in type that should have the same size as ga_half.
 * Based on current ga_half implementation (2017/11/27), it should be ga_ushort.
 * This code must be updated every time ga_half implementation size changes,
 * until a better code be provided. */
#define GA_HALF_STD_TYPE ga_ushort
static __device__ inline ga_half ptr_read_cached(ga_half *ptr, ssize_t offset) {
// Compile-time size check: the array below gets a negative size (an error)
// if sizeof(GA_HALF_STD_TYPE) != sizeof(ga_half).
int check_ga_half_std_type[ ( ( sizeof(GA_HALF_STD_TYPE) - sizeof(ga_half) ) ? -1 : 1 ) ];
GA_HALF_STD_TYPE out = __ldg(((GA_HALF_STD_TYPE*)((char*)ptr + offset)));
ga_half real_out;
*(GA_HALF_STD_TYPE*)(&real_out) = out;
return real_out;
}
#undef GA_HALF_STD_TYPE
/* Comparisons involving ga_half and conversions from integers (e.g. 0, 1) to ga_half lead to compilation errors.
* Following functions are provided to bypass these issues. */
template<typename T>
static __device__ inline T aesara_zero() {return 0;}
template<>
__device__ inline ga_half aesara_zero() {return ga_float2half(0);}
template<typename T>
static __device__ inline T aesara_one() {return 1;}
template<>
__device__ inline ga_half aesara_one() {return ga_float2half(1);}
// Comparison helpers: aesara_{eq,ne,lt,gt,le,ge}(a, b).
// The fully generic versions just forward to the built-in operator.  Since
// ga_half supports no comparison operators directly, each comparator also has
// three ga_half overloads (half/other, other/half, half/half) that first
// widen the half operand(s) to float with ga_half2float.
template<typename A, typename B> static __device__ inline bool aesara_eq(const A& a, const B& b) {return a == b;}
template<typename A, typename B> static __device__ inline bool aesara_ne(const A& a, const B& b) {return a != b;}
template<typename A, typename B> static __device__ inline bool aesara_lt(const A& a, const B& b) {return a < b;}
template<typename A, typename B> static __device__ inline bool aesara_gt(const A& a, const B& b) {return a > b;}
template<typename A, typename B> static __device__ inline bool aesara_le(const A& a, const B& b) {return a <= b;}
template<typename A, typename B> static __device__ inline bool aesara_ge(const A& a, const B& b) {return a >= b;}
template<typename T> static __device__ inline bool aesara_eq(const ga_half& a, const T& b) {return ga_half2float(a) == b;}
template<typename T> static __device__ inline bool aesara_ne(const ga_half& a, const T& b) {return ga_half2float(a) != b;}
template<typename T> static __device__ inline bool aesara_lt(const ga_half& a, const T& b) {return ga_half2float(a) < b;}
template<typename T> static __device__ inline bool aesara_gt(const ga_half& a, const T& b) {return ga_half2float(a) > b;}
template<typename T> static __device__ inline bool aesara_le(const ga_half& a, const T& b) {return ga_half2float(a) <= b;}
template<typename T> static __device__ inline bool aesara_ge(const ga_half& a, const T& b) {return ga_half2float(a) >= b;}
template<typename T> static __device__ inline bool aesara_eq(const T& a, const ga_half& b) {return a == ga_half2float(b);}
template<typename T> static __device__ inline bool aesara_ne(const T& a, const ga_half& b) {return a != ga_half2float(b);}
template<typename T> static __device__ inline bool aesara_lt(const T& a, const ga_half& b) {return a < ga_half2float(b);}
template<typename T> static __device__ inline bool aesara_gt(const T& a, const ga_half& b) {return a > ga_half2float(b);}
template<typename T> static __device__ inline bool aesara_le(const T& a, const ga_half& b) {return a <= ga_half2float(b);}
template<typename T> static __device__ inline bool aesara_ge(const T& a, const ga_half& b) {return a >= ga_half2float(b);}
static __device__ inline bool aesara_eq(const ga_half& a, const ga_half& b) {return ga_half2float(a) == ga_half2float(b);}
static __device__ inline bool aesara_ne(const ga_half& a, const ga_half& b) {return ga_half2float(a) != ga_half2float(b);}
static __device__ inline bool aesara_lt(const ga_half& a, const ga_half& b) {return ga_half2float(a) < ga_half2float(b);}
static __device__ inline bool aesara_gt(const ga_half& a, const ga_half& b) {return ga_half2float(a) > ga_half2float(b);}
static __device__ inline bool aesara_le(const ga_half& a, const ga_half& b) {return ga_half2float(a) <= ga_half2float(b);}
static __device__ inline bool aesara_ge(const ga_half& a, const ga_half& b) {return ga_half2float(a) >= ga_half2float(b);}
#define RADIX_BITS 4
#define RADIX_SIZE (1<<RADIX_BITS)
#define RADIX_MASK(n) ((RADIX_SIZE-1) << (n*RADIX_BITS))
#define RADIX_DIGITS(T) (bitsof(T)/RADIX_BITS)
// works when length on axis is within max allowed threads in block (1024)
// k_topk_dense: dense top-k selection.  One thread block handles one slice of
// the input along the selected axis, one thread per element, so this kernel
// is only usable when the axis length fits in a single block.  The `$...`
// markers are placeholders substituted by the Python code generator before
// compilation.
extern "C" __global__ void k_topk_dense(
        $dims
        // size_t dims_1, ssize_t dims_2, ... , dims_$${NDIM}
        $dstv
        // INPUT_TYPE *dstv
        $dstv_offset
        // size_t offset
        $dstv_strides
        // ssize_t dstv_strides_0, ssize_t dstv_strides_1, ... , dstv_strides_$${NDIM}
        $dsti
        // INDEX_TYPE *dsti
        $dsti_offset
        // size_t offset
        $dsti_strides
        // ssize_t dsti_strides_0, ssize_t dsti_strides_1, ... , dsti_strides_$${NDIM}
        ssize_t k,
        INPUT_TYPE* src,
        size_t src_offset,
        $src_strides
        // ssize_t src_strides_0, ssize_t src_strides_1, ... , src_strides_$${NDIM}
        size_t size) {
    __shared__ int smem[32 * RADIX_SIZE];
    __shared__ int k2;
    const unsigned int idx = threadIdx.x;
    // is_topk: this thread's element is still a candidate for the top-k set;
    // is_topkth: it is still a candidate for being the k-th (boundary) value.
    bool is_topk= (idx < size);
    bool is_topkth = is_topk;
    size_t out_idx;
    const unsigned char warp_id = idx / GA_WARP_SIZE;
    // 0. get the slice for thread block to work on
    size_t gid = blockIdx.x, gidx;
    $set_slice
    // $$set_slice expands into:
    //for(int i=1; i<NDIM; i++) {
    //  gidx = gid % dims_$${i};
    //  gid /= dims_$${i};
    //  dsti = ptr_add(dsti, gidx*dsti_strides_$${i};
    //  dstv = ptr_add(dstv, gidx*dstv_strides_$${i};
    //  src = ptr_add(src, gidx*src_strides_$${i});
    //}
    // get input and its radix friendly form
    const INPUT_TYPE xval = is_topk ? ptr_at(src, idx*src_strides_0) : aesara_zero<INPUT_TYPE>();
    radix_t x = RadixConfig<INPUT_TYPE>::convert(xval);
    // resolve negative k
    // (negative k requests the smallest k; complementing x reverses the radix order)
    if (k<0) { x = ~x; k = -k; }
    if (idx==0)
        k2 = k;
    // 1. filter is_topk and is_topkth using radix select
    // Walk the radix digits from most to least significant; at each step the
    // per-digit histogram determines which bucket holds the k-th value.
    #pragma unroll
    for (int i=bitsof(INPUT_TYPE)-RADIX_BITS; i>=0; i-=RADIX_BITS) {
        const int digit = Bitfield<radix_t>::get(x, i, RADIX_BITS);
        /*int digit = (x>>i) & (RADIX_SIZE-1);*/
        // count within warp
        #pragma unroll
        for (int bin=0; bin<RADIX_SIZE; ++bin) {
            bool vote = (bin == digit) && is_topkth;
            unsigned int votes = __ballot(vote);
            if (lane_id()==0)
                smem[bin + RADIX_SIZE*warp_id] = __popc(votes);
        }
        local_barrier();
        // sum counts across all warps
        if (idx < RADIX_SIZE) {
            int sum = smem[idx];
            #pragma unroll
            for(int w=RADIX_SIZE; w<blockDim.x*RADIX_SIZE / GA_WARP_SIZE; w+=RADIX_SIZE)
                sum += smem[idx + w];
            smem[idx] = sum;
        }
        local_barrier();
        // find the bucket and update k2
        // smem[:RADIX_SIZE:-1] = k2 - cumsum(smem[:RADIX_SIZE-1:-1])
        if (idx == 0) {
            int sum = k2;
            #pragma unroll
            for (int bin=RADIX_SIZE-1; bin>=0; --bin) {
                sum -= smem[bin];
                smem[bin] = sum;
                k2 = (sum > 0) ? sum : k2;
            }
            // sentinel: makes smem[digit+1] well-defined when digit == RADIX_SIZE-1
            smem[RADIX_SIZE] = 1;
        }
        local_barrier();
        if (is_topkth) {
            is_topk &= (smem[digit+1] > 0);
            is_topkth &= (smem[digit] <= 0) && (smem[digit+1] > 0);
        }
        local_barrier();
    }
    // set k2 as number of exceeding values
    if (idx==0) {
        #pragma unroll
        for (int bin=RADIX_SIZE-1; bin>=0; --bin) {
            if (smem[bin] <= 0)
                break;
            k2 = smem[bin];
        }
    }
    local_barrier();
    // 2. find the index of output array, if exists
    if (k2 != 0) {
        // top_kth value may not be unique, so we need to
        // perform binary cumsum on is_topkth to drop exceeding top-kth values
        out_idx = binary_cumsum_exclusive(idx, warp_id, smem, is_topkth);
        if ((out_idx >= k2) && is_topkth)
            is_topk = false;
        local_barrier();
    }
    // perform binary cumsum on is_topk to determine the indices to put result
    out_idx = binary_cumsum_exclusive(idx, warp_id, smem, is_topk);
    if (is_topk) {
#if WRITE_VALUE == 1
        ptr_at(dstv, out_idx * dstv_strides_0) = xval;
#endif
#if WRITE_INDEX == 1
        ptr_at(dsti, out_idx * dsti_strides_0) = (INDEX_TYPE)idx;
#endif
    }
}
#define RADIX_BITS 2
#define RADIX_SIZE (1<<RADIX_BITS)
#define RADIX_DIGITS(T) (bitsof(T)/RADIX_BITS)
#define COUNT_TYPE $count_t
#define KERNEL_NAME $kname
// if count_t is int, work for array size within [1025, 2^31-1]
// if count_t is long long, work for array size within [2^31, 2^63-1]
// find_pattern: scan `data` (slice_size elements, byte stride `stride`) for
// the unique element whose radix representation matches `known_bits` under
// `known_bits_mask`.  The match is broadcast to all threads of the block via
// shared memory (smem[0] = found flag, smem[1] = value).  Returns 0 when no
// element matches.
template <typename DataType, typename RadixType, typename CountType>
__device__ DataType find_pattern(DataType* smem,
                                 DataType* data,
                                 CountType slice_size,
                                 CountType stride,
                                 RadixType known_bits,
                                 RadixType known_bits_mask) {
    if (threadIdx.x < 32)
        smem[threadIdx.x] = aesara_zero<DataType>();
    local_barrier();
    // All threads participate in the loop, in order to sync on the flag
    for (CountType i = threadIdx.x; i < (slice_size + (CountType)blockDim.x-1); i += blockDim.x) {
        bool in_range = (i < slice_size);
        DataType v = in_range ? ptr_read_cached(data, i*stride) : aesara_zero<DataType>();
        if (in_range && ((RadixConfig<DataType>::convert(v) & known_bits_mask) == known_bits)) {
            // There should not be conflicts if we are using find_pattern,
            // since the result is unique
            smem[0] = aesara_one<DataType>();
            smem[1] = v; // can't use val as the flag, since it could be 0
        }
        local_barrier();
        DataType found = smem[0];
        DataType val = smem[1];
        local_barrier();
        // Check to see if a thread found the value
        if (aesara_ne(found, 0))
            return val;
    }
    // Fallback (no match in the slice); when a unique match exists the loop
    // above returns it.
    return aesara_zero<DataType>();
}
// This function counts the distribution of all input values in a
// slice we are selecting by radix digit at `radix_digit_pos`, but only
// those that pass the filter `((v & known_bits_mask) == known_bits)`.
// This produces and broadcasts the seen counts for a single block only.
// `smem` must have at least `RADIX_SIZE` elements.
// On return, counts[j] holds (identically in every thread) the number of
// filtered elements whose digit at `radix_digit_pos` equals j.
template <typename DataType, typename RadixType, typename CountType>
__device__ void count_radix_masked(CountType counts[RADIX_SIZE],
                                   CountType* smem,
                                   RadixType known_bits,
                                   RadixType known_bits_mask,
                                   int radix_digit_pos,
                                   CountType slice_size,
                                   CountType stride,
                                   DataType* data) {
    // Clear out per-thread counts from a previous round
    #pragma unroll
    for (int i = 0; i < RADIX_SIZE; ++i)
        counts[i] = 0;
    if (threadIdx.x < RADIX_SIZE)
        smem[threadIdx.x] = 0;
    local_barrier();
    // Scan over all the data. Upon a read, the warp will accumulate
    // counts per each digit in the radix using warp voting.
    for (CountType i = threadIdx.x; i < slice_size; i += blockDim.x) {
        RadixType val = RadixConfig<DataType>::convert(ptr_read_cached(data, i*stride));
        bool has_val = ((val & known_bits_mask) == known_bits);
        RadixType digit_in_radix = Bitfield<RadixType>::get(val, radix_digit_pos, RADIX_BITS);
        #pragma unroll
        for (int j = 0; j < RADIX_SIZE; ++j) {
            bool vote = has_val && (digit_in_radix == j);
            counts[j] += __popc(__ballot(vote));
        }
    }
    // Now, for each warp, sum values
    // (__ballot gives every lane the same mask, so all lanes of a warp hold
    // identical counts; a single lane accumulates them to avoid double counting)
    if (lane_id() == 0) {
        for (int i=0; i<RADIX_SIZE; ++i)
            atomicAdd(&smem[i], counts[i]);
    }
    /*
    // not sure why, but this just give wrong results
    if (lane_id() < RADIX_SIZE)
        atomicAdd(&smem[lane_id()], counts[lane_id()]);
    */
    local_barrier();
    // For each thread, read in the total counts
    #pragma unroll
    for (unsigned int i = 0; i < RADIX_SIZE; ++i)
        counts[i] = smem[i];
    local_barrier();
}
// radix_select: cooperatively find the k-th largest (order=true) or k-th
// smallest (order=false) element of `data` (slice_size elements, byte stride
// `stride`) and write it to *top_kth.  All threads of the block participate
// and all observe the same result.  `smem` is scratch shared memory, also
// reused (cast to DataType*) by find_pattern.
template <typename DataType, typename RadixType, typename CountType>
__device__ void radix_select(DataType* data,
                             CountType k,
                             bool order,
                             CountType slice_size,
                             CountType stride,
                             CountType* smem,
                             DataType* top_kth) {
    // Per-thread buckets into which we accumulate digit counts in our
    // radix
    // NOTE(review): `register` is deprecated since C++11 and removed in
    // C++17; it has no effect here and could safely be dropped.
    register CountType counts[RADIX_SIZE];
    // We only consider elements x such that (x & known_bits_mask) == known_bits
    // Initially, we consider all elements of the array, so the above
    // statement is true regardless of input.
    RadixType known_bits = 0, known_bits_mask = 0;
    // We are looking for the top k_to_find-th element when iterating over
    // digits; this count gets reduced by elimination when counting
    // successive digits
    CountType k_to_find = abs(k);
    // We start at the most significant digit in our radix, scanning
    // through to the least significant digit
    #pragma unroll
    for (int digit_pos = bitsof(DataType) - RADIX_BITS;
         digit_pos >= 0; digit_pos -= RADIX_BITS) {
        // Count radix distribution for the current position and reduce
        // across all threads
        count_radix_masked<DataType, RadixType, CountType>(
            counts, smem,
            known_bits, known_bits_mask, digit_pos,
            slice_size, stride, data);
        // All threads participate in the comparisons below to know the
        // final result
        #define CHECK_RADIX(i) \\
        int count = counts[i]; \\
        /* All threads have the same value in counts here, so all */ \\
        /* threads will return from the function. */ \\
        if (count == 1 && k_to_find == 1) { \\
            /* There is a unique answer. */ \\
            known_bits = Bitfield<RadixType>::set( \\
                known_bits, i, digit_pos, RADIX_BITS); \\
            known_bits_mask = Bitfield<RadixType>::set( \\
                known_bits_mask, RADIX_SIZE-1, digit_pos, RADIX_BITS); \\
            /* The answer is now the unique element v such that: */ \\
            /* (v & known_bits_mask) == known_bits */ \\
            /* However, we do not yet know what the actual element is. We */ \\
            /* need to perform a search through the data to find the */ \\
            /* element that matches this pattern. */ \\
            *top_kth = find_pattern<DataType, RadixType, CountType>( \\
                (DataType*) smem, data, slice_size, \\
                stride, known_bits, known_bits_mask); \\
            return; \\
        } \\
        if (count >= k_to_find) { \\
            known_bits = Bitfield<RadixType>::set(known_bits, i, digit_pos, RADIX_BITS); \\
            known_bits_mask = Bitfield<RadixType>::set( \\
                known_bits_mask, RADIX_SIZE-1, digit_pos, RADIX_BITS); \\
            /* The top-Kth element v must now be one such that: */ \\
            /* (v & known_bits_mask == known_bits) */ \\
            /* but we haven't narrowed it down; we must check the next */ \\
            /* least-significant digit */ \\
            break; \\
        } \\
        k_to_find -= count
        // For descending selection scan digit buckets high-to-low, for
        // ascending selection low-to-high.
        if (order) {
            #pragma unroll
            for (int i=RADIX_SIZE - 1; i >= 0; --i) {
                CHECK_RADIX(i);
            }
        } else {
            #pragma unroll
            for (int i=0; i < RADIX_SIZE; ++i) {
                CHECK_RADIX(i);
            }
        }
        #undef CHECK_RADIX
    } // end digit_pos for
    // There is no unique result, but there is a non-unique result
    // matching `known_bits` exactly
    *top_kth = RadixConfig<DataType>::deconvert(known_bits);
}
// Top-k kernel for slices longer than one thread block: first radix-selects
// the k-th value, then streams over the slice writing out qualifying
// elements.  One thread block processes one slice; `$...` markers are
// placeholders substituted by the Python code generator before compilation.
extern "C" __global__ void KERNEL_NAME(
        $dims
        // size_t dims_1, ssize_t dims_2, ... , dims_$${NDIM}
        $dstv
        // INPUT_TYPE *dstv
        $dstv_offset
        // size_t offset
        $dstv_strides
        // ssize_t dstv_strides_0, ssize_t dstv_strides_1, ... , dstv_strides_$${NDIM}
        $dsti
        // INDEX_TYPE *dsti
        $dsti_offset
        // size_t offset
        $dsti_strides
        // ssize_t dsti_strides_0, ssize_t dsti_strides_1, ... , dsti_strides_$${NDIM}
        ssize_t k,
        INPUT_TYPE* src,
        size_t src_offset,
        $src_strides
        // ssize_t src_strides_0, ssize_t src_strides_1, ... , src_strides_$${NDIM}
        size_t size) {
    __shared__ COUNT_TYPE smem[32];
    INPUT_TYPE topkth_value;
    // order==true: largest-k requested; order==false: smallest-k (negative k).
    const bool order = (k>0);
    k = (order ? k : -k);
    const int idx = threadIdx.x;
    const int warp_id = idx / GA_WARP_SIZE;
    // get the slice for thread block to work on
    // size <- the axis to work on
    // dims_1+ <- batched dimensions
    unsigned int gid = blockIdx.x, gidx;
    $set_slice
    // $$set_slice expands into:
    //for(int i=1; i<NDIM; i++) {
    //  gidx = gid % dims_$${i};
    //  gid /= dims_$${i};
    //  dsti = ptr_add(dsti, gidx*dsti_strides_$${i});
    //  dstv = ptr_add(dstv, gidx*dstv_strides_$${i});
    //  src = ptr_add(src, gidx*src_strides_$${i});
    //}
    radix_select<INPUT_TYPE, radix_t, COUNT_TYPE>(
        src, k, order, size, src_strides_0,
        smem, &topkth_value);
    // Every value that is strictly less/greater than `pattern`
    // (depending on sort dir) in sorted int format is in the top-K.
    // The top-K value itself might not be unique.
    //
    // Since there are a variable number of elements that we see that
    // are within the top-k, we don't know at what index to write out
    // the resulting values.
    // In order to get this, we perform an exclusive cumsum of
    // `has_topk`. This will return the resulting index into which we
    // need to write the result, if a thread has a result.
    // All threads need to participate in the loop and the cumsum
    // but not necessarily in the load; hence loop bounds being rounded
    // up to a multiple of the block dim.
    COUNT_TYPE iter_bound = size + blockDim.x-1;
    INDEX_TYPE write_base = 0;
    // BUGFIX: the loop counter must be COUNT_TYPE, matching the second loop
    // below.  A plain `int` overflows for slices longer than 2^31-1 elements,
    // which the COUNT_TYPE = long long configuration is documented to support.
    for (COUNT_TYPE i = idx; i < iter_bound; i += blockDim.x) {
        bool in_range = (i < size);
        INPUT_TYPE v = in_range ? ptr_read_cached(src, i*src_strides_0) : aesara_zero<INPUT_TYPE>();
        bool has_topk;
        if (order) {
            has_topk = in_range && (aesara_gt(v, topkth_value));
        } else {
            has_topk = in_range && (aesara_lt(v, topkth_value));
        }
        int index = binary_cumsum_exclusive(idx, warp_id, smem, has_topk);
        // carry = number of qualifying elements seen by the whole block this pass
        int carry = smem[blockDim.x / 32 - 1];
        if (has_topk) {
            COUNT_TYPE write_idx = write_base + index;
#if WRITE_VALUE == 1
            ptr_at(dstv, write_idx * dstv_strides_0) = v;
#endif
#if WRITE_INDEX == 1
            ptr_at(dsti, write_idx * dsti_strides_0) = (INDEX_TYPE)i;
#endif
        }
        write_base += carry;
    }
    // Second pass: fill the remaining output slots with copies of the
    // (possibly non-unique) top-k-th value itself.
    COUNT_TYPE topk_remaining = (k - write_base);
    for (COUNT_TYPE i = idx; i < iter_bound; i += blockDim.x) {
        bool in_range = (i < size);
        INPUT_TYPE v = in_range ? ptr_read_cached(src, i*src_strides_0) : aesara_zero<INPUT_TYPE>();
        bool has_topk = in_range && (aesara_eq(v, topkth_value));
        int index = binary_cumsum_exclusive(idx, warp_id, smem, has_topk);
        int carry = smem[blockDim.x / 32 - 1];
        if (has_topk && index < topk_remaining) {
            COUNT_TYPE write_idx = write_base + index;
#if WRITE_VALUE == 1
            ptr_at(dstv, write_idx * dstv_strides_0) = v;
#endif
#if WRITE_INDEX == 1
            ptr_at(dsti, write_idx * dsti_strides_0) = (INDEX_TYPE)i;
#endif
        }
        if (carry >= topk_remaining)
            break;
        topk_remaining -= carry;
        write_base += carry;
    }
}
import os
import sys
from aesara.configdefaults import config
from aesara.gpuarray import pygpu
from aesara.gpuarray.basic_ops import (
as_gpuarray_variable,
gpu_contiguous,
gpuarray_helper_inc_dir,
infer_context_name,
)
from aesara.gpuarray.elemwise import GpuDimShuffle
from aesara.gpuarray.type import GpuArrayType, gpu_context_type
from aesara.gradient import grad_undefined
from aesara.graph.basic import Apply
from aesara.graph.opt import local_optimizer
from aesara.link.c.op import _NoPythonExternalCOp
from aesara.tensor.basic import as_tensor_variable
from aesara.tensor.basic_opt import register_canonicalize
from aesara.tensor.blas import batched_dot
from aesara.tensor.nnet.ctc import ctc_available
class GpuConnectionistTemporalClassification(_NoPythonExternalCOp):
    """
    GPU wrapper for Baidu CTC loss function.

    The C implementation lives in ``./c_code/ctc_wrapper.c`` and links against
    Baidu's ``warpctc`` library (plus ``gpuarray``).

    Parameters
    ----------
    compute_grad
        If set to True, enables the computation of gradients of the CTC loss function.
    """

    __props__ = ("compute_grad",)

    # ExternalCOp bookkeeping: (activations, labels, input_lengths) in;
    # (costs, gradients) out.
    _cop_num_inputs = 3
    _cop_num_outputs = 2

    # C source file and entry-point symbol for the external C Op.
    func_file = "./c_code/ctc_wrapper.c"
    func_name = "APPLY_SPECIFIC(ctc_cost_gpu)"

    # The Op's params object is the GPU context (see get_params).
    params_type = gpu_context_type

    def __init__(self, compute_grad=True):
        """Raise RuntimeError if the warp-ctc library cannot be found."""
        if not ctc_available():
            raise RuntimeError(
                "Baidu CTC is not available and "
                "GpuConnectionistTemporalClassification Op "
                "can not be constructed."
            )
        self.compute_grad = compute_grad
        # Return only the cost. Gradient will be returned by grad()
        self.default_output = 0
        super().__init__(self.func_file, self.func_name)

    def c_lib_dirs(self, **kwargs):
        """Directories to search for libwarpctc at link time."""
        lib_dirs = []
        if ctc_available.path is not None:
            lib_dirs += [ctc_available.path]
        return lib_dirs

    def c_compile_args(self, **kwargs):
        """Embed an rpath to the warp-ctc library so it is found at runtime."""
        if ctc_available.path is not None:
            # Quote the path when it contains spaces (not supported by the
            # macOS linker, hence the platform check).
            if sys.platform != "darwin" and " " in ctc_available.path:
                return ['-Wl,-rpath,"' + ctc_available.path + '"']
            else:
                return ["-Wl,-rpath," + ctc_available.path]
        return []

    def c_libraries(self, **kwargs):
        """Link against warp-ctc and libgpuarray."""
        return ["warpctc", "gpuarray"]

    def c_header_dirs(self, **kwargs):
        """Include dirs: gpuarray helpers, pygpu headers, CUDA, warp-ctc."""
        dirs = [
            gpuarray_helper_inc_dir(),
            pygpu.get_include(),
            config.cuda__include_path,
        ]
        if config.ctc__root != "":
            dirs.append(os.path.join(config.ctc__root, "include"))
        return dirs

    def c_headers(self, **kwargs):
        """Headers required by the C implementation."""
        return [
            "ctc.h",
            "numpy_compat.h",
            "gpuarray/ext_cuda.h",
            "gpuarray_helper.h",
            "gpuarray/types.h",
            "gpuarray_api.h",
            "gpuarray/array.h",
            "gpuarray/util.h",
            "gpuarray/extension.h",
        ]

    def get_params(self, node):
        # Pass the GPU context of the activations input to the C code.
        return node.inputs[0].type.context

    def make_node(self, activations, labels, input_lengths):
        """
        Build the Apply node.

        ``activations`` must be a float32 3-D GPU tensor; ``labels`` an int32
        2-D tensor and ``input_lengths`` an int32 1-D tensor (both kept on the
        CPU).  Outputs are the per-example costs and, when ``compute_grad`` is
        enabled, the gradients w.r.t. the activations.
        """
        context_name = infer_context_name(activations)
        t_activations = as_gpuarray_variable(activations, context_name=context_name)
        # Ensure activations array is C-contiguous
        t_activations = gpu_contiguous(t_activations)
        # Labels and input lengths are always on the CPU
        t_labels = as_tensor_variable(labels)
        t_input_lengths = as_tensor_variable(input_lengths)
        if t_activations.type.dtype != "float32":
            raise TypeError("activations must use the float32 type.")
        if t_activations.ndim != 3:
            raise ValueError("activations must have 3 dimensions.")
        if t_labels.type.dtype != "int32":
            raise TypeError("labels must use the int32 type.")
        if t_labels.ndim != 2:
            raise ValueError("labels must have 2 dimensions.")
        if t_input_lengths.type.dtype != "int32":
            raise TypeError("input_lengths must use the int32 type.")
        if t_input_lengths.ndim != 1:
            raise ValueError("input_lengths must have 1 dimension.")
        costs = GpuArrayType(
            dtype="float32", broadcastable=(False,), context_name=context_name
        )()
        outputs = [costs]
        if self.compute_grad:
            gradients = GpuArrayType(
                dtype="float32",
                broadcastable=(
                    False,
                    False,
                    False,
                ),
                context_name=context_name,
            )()
            outputs += [gradients]
        return Apply(
            self, inputs=[t_activations, t_labels, t_input_lengths], outputs=outputs
        )

    def L_op(self, inputs, outputs, output_grads):
        """Chain-rule composition of the Op-computed CTC gradient."""
        # Gradients computed by Op
        assert self.compute_grad and len(outputs) == 2
        gradients = outputs[1]
        assert gradients is not None
        # Gradients of original function, to compose chain rule
        grad_op = output_grads[0]
        # Transpose to (minibatch, time, alphabet) for batched_dot, then back.
        grad_shuffle = GpuDimShuffle(
            input_broadcastable=(
                False,
                False,
                False,
            ),
            new_order=(1, 0, 2),
        )(gradients)
        grad_bdot = batched_dot(grad_op, grad_shuffle)
        grad_shuffle_reverse = GpuDimShuffle(
            input_broadcastable=(
                False,
                False,
                False,
            ),
            new_order=(1, 0, 2),
        )(grad_bdot)
        return [
            grad_shuffle_reverse,
            grad_undefined(self, 1, inputs[1]),
            grad_undefined(self, 2, inputs[2]),
        ]
def gpu_ctc(activations, labels, input_lengths):
    """
    Compute CTC loss function on the GPU.

    Parameters
    ----------
    activations
        Three-dimensional tensor, which has a shape of (t, m, p), where
        t is the time index, m is the minibatch index, and p is the index
        over the probabilities of each symbol in the alphabet. The memory
        layout is assumed to be in C-order, which consists in the slowest
        to the fastest changing dimension, from left to right. In this case,
        p is the fastest changing dimension.
    labels
        A 2-D tensor of all the labels for the minibatch. In each row, there
        is a sequence of target labels. Negative values are assumed to be padding,
        and thus are ignored. Blank symbol is assumed to have index 0 in the
        alphabet.
    input_lengths
        A 1-D tensor with the number of time steps for each sequence in
        the minibatch.

    Returns
    -------
    1-D array
        Cost of each example in the minibatch.
    """
    ctc_op = GpuConnectionistTemporalClassification()
    return ctc_op(activations, labels, input_lengths)
# Rewrite rule: drop the gradient output when nothing in the graph uses it.
@register_canonicalize("fast_compile")
@local_optimizer([GpuConnectionistTemporalClassification])
def local_gpu_ctc_no_grad(fgraph, node):
    if not isinstance(node.op, GpuConnectionistTemporalClassification):
        return False
    if len(node.outputs) <= 1:
        # Already a cost-only Op; nothing to do.
        return False
    if len(fgraph.clients[node.outputs[1]]) != 0:
        # The gradient output is consumed somewhere; keep it.
        return False
    cost_only = GpuConnectionistTemporalClassification(compute_grad=False)
    return [cost_only(*node.inputs), None]
"""
Declarations of cuDNN types and constants used in Aesara gpuarray DNN module.
For every cuDNN API supported by Aesara, this module defines a class that
provides the set of cuDNN definitions to be used in Aesara Ops.
Use :func:`get_definitions` to get the right cuDNN definitions
for a given cuDNN version.
Currently supported cuDNN APIs:
- v5.1*
- v6.0*
- v7.0*
"""
from aesara.link.c.type import CEnumType
# Canonical dtype names used throughout the cuDNN wrappers.
HALF = "float16"
FLOAT = "float32"
DOUBLE = "float64"

# (storage dtype, compute precision) pairs recognized by cuDNN.
TRUE_HALF_CONFIG = (HALF, HALF)
PSEUDO_HALF_CONFIG = (HALF, FLOAT)
FLOAT_CONFIG = (FLOAT, FLOAT)
DOUBLE_CONFIG = (DOUBLE, DOUBLE)


def is_true_half_config(dtype, precision):
    """True when both storage and compute use float16."""
    return (dtype, precision) == TRUE_HALF_CONFIG


def is_pseudo_half_config(dtype, precision):
    """True for float16 storage with float32 compute."""
    return (dtype, precision) == PSEUDO_HALF_CONFIG


def is_float_config(dtype, precision):
    """True when both storage and compute use float32."""
    return (dtype, precision) == FLOAT_CONFIG


def is_double_config(dtype, precision):
    """True when both storage and compute use float64."""
    return (dtype, precision) == DOUBLE_CONFIG
# NB: Some cuDNN algorithms are listed in cuDNN enums but not implemented.
# We still register them here because we try to exactly copy cuDNN enums
# in Python side, but they will have no aliases associated, to help
# exclude them from lists of supported algorithms.
class CuDNNV51:
    """
    cuDNN definitions (enums, supported algorithms, dtype configurations)
    for the cuDNN v5.1 API.  Later API versions subclass this and override
    what changed.
    """

    version = 5

    cudnnConvolutionMode_t = CEnumType(
        ("CUDNN_CONVOLUTION", "conv"),
        ("CUDNN_CROSS_CORRELATION", "cross"),
        ctype="cudnnConvolutionMode_t",
    )

    cudnnDataType_t = CEnumType(
        ("CUDNN_DATA_FLOAT", "float32"),
        ("CUDNN_DATA_DOUBLE", "float64"),
        ("CUDNN_DATA_HALF", "float16"),
        ctype="cudnnDataType_t",
    )

    cudnnConvolutionFwdAlgo_t = CEnumType(
        ("CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM", "none"),
        ("CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM", "small"),
        ("CUDNN_CONVOLUTION_FWD_ALGO_GEMM", "large"),
        # not implemented:
        # (a bare string registers the constant without an alias, keeping it
        # out of the lists of supported algorithms)
        ("CUDNN_CONVOLUTION_FWD_ALGO_DIRECT"),
        ("CUDNN_CONVOLUTION_FWD_ALGO_FFT", "fft"),
        ("CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING", "fft_tiling"),
        ("CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD", "winograd"),
        # TODO: Not yet tested/documented:
        ("CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED", "winograd_non_fused"),
        ctype="cudnnConvolutionFwdAlgo_t",
    )

    # Forward algorithms usable for 3D (volumetric) convolutions.
    conv3d_fwd_algorithms = ("none", "small", "fft_tiling")

    # Every aliased forward algorithm is treated as deterministic in v5.1.
    deterministic_fwd_algorithms = cudnnConvolutionFwdAlgo_t.get_aliases()

    cudnnConvolutionBwdFilterAlgo_t = CEnumType(
        ("CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0", "none"),
        ("CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1", "deterministic"),
        ("CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT", "fft"),
        ("CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3", "small"),
        # TODO: not yet tested/documented:
        ("CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED", "winograd_non_fused"),
        ctype="cudnnConvolutionBwdFilterAlgo_t",
    )

    conv3d_bwd_filter_algorithms = ("none", "small")

    deterministic_bwd_filter_algorithms = ("deterministic", "fft", "winograd_non_fused")

    cudnnConvolutionBwdDataAlgo_t = CEnumType(
        ("CUDNN_CONVOLUTION_BWD_DATA_ALGO_0", "none"),
        ("CUDNN_CONVOLUTION_BWD_DATA_ALGO_1", "deterministic"),
        ("CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT", "fft"),
        ("CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING", "fft_tiling"),
        ("CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD", "winograd"),
        # TODO: not yet tested/documented:
        ("CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED", "winograd_non_fused"),
        ctype="cudnnConvolutionBwdDataAlgo_t",
    )

    conv3d_bwd_data_algorithms = ("none", "deterministic", "fft_tiling")

    deterministic_bwd_data_algorithms = (
        "deterministic",
        "fft",
        "fft_tiling",
        "winograd",
        "winograd_non_fused",
    )

    cudnnPoolingMode_t = CEnumType(
        ("CUDNN_POOLING_MAX", "max"),
        ("CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING", "average_inc_pad"),
        ("CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING", "average_exc_pad"),
        ctype="cudnnPoolingMode_t",
    )

    cudnnSoftmaxAlgorithm_t = CEnumType(
        ("CUDNN_SOFTMAX_FAST", "fast"),
        ("CUDNN_SOFTMAX_ACCURATE", "accurate"),
        ("CUDNN_SOFTMAX_LOG", "log"),
        ctype="cudnnSoftmaxAlgorithm_t",
    )

    cudnnSoftmaxMode_t = CEnumType(
        ("CUDNN_SOFTMAX_MODE_INSTANCE", "instance"),
        ("CUDNN_SOFTMAX_MODE_CHANNEL", "channel"),
        ctype="cudnnSoftmaxMode_t",
    )

    cudnnBatchNormMode_t = CEnumType(
        ("CUDNN_BATCHNORM_PER_ACTIVATION", "per-activation"),
        ("CUDNN_BATCHNORM_SPATIAL", "spatial"),
        ctype="cudnnBatchNormMode_t",
    )

    # cudnnReduceTensorOp_t was introduced in cuDNN v6; define it here as an
    # empty enum so that code referencing it does not crash under cuDNN 5.
    cudnnReduceTensorOp_t = CEnumType()

    def get_supported_dtype_configs(self, check_runtime=None):
        """
        Return the tuple of data type configurations supported by this version of cuDNN.

        This is currently convenient for all supported cuDNN versions, as Aesara does not
        yet support new data types (like INT8, INT8x4, etc.).

        ``check_runtime`` may be a function that tests if a data type configuration is supported.::

            is_supported = check_runtime(dtype, precision)

        .. warning::

            From documentation for cudnnConvolutionForward (for both v5.1 and v6):

            .. code-block::

                TRUE_HALF_CONFIG is only supported on architectures with true fp16 support
                (compute capability 5.3 and 6.0)

            This seems to be a general remark about f16 support (not only for FWD).
            It can be checked at runtime only.
        """
        if check_runtime is None or check_runtime(*TRUE_HALF_CONFIG):
            return (TRUE_HALF_CONFIG, PSEUDO_HALF_CONFIG, FLOAT_CONFIG, DOUBLE_CONFIG)
        return (PSEUDO_HALF_CONFIG, FLOAT_CONFIG, DOUBLE_CONFIG)

    def fwd_algo_supports_dtype_config(self, algo, dtype, precision, ndim):
        """
        Return whether forward algorithm ``algo`` (given by alias) supports
        the (``dtype``, ``precision``) configuration for an ``ndim``-D
        convolution, per the cuDNN v5.1 documentation.
        """
        algorithms = self.cudnnConvolutionFwdAlgo_t
        algo = algorithms.fromalias(algo)
        if algo == algorithms.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM:
            return not is_true_half_config(dtype, precision)
        if algo == algorithms.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM:
            return ndim == 2 or not is_true_half_config(dtype, precision)
        if algo == algorithms.CUDNN_CONVOLUTION_FWD_ALGO_GEMM:
            return ndim == 2 and not is_true_half_config(dtype, precision)
        # CUDNN_CONVOLUTION_FWD_ALGO_DIRECT: not implemented.
        if algo == algorithms.CUDNN_CONVOLUTION_FWD_ALGO_FFT:
            return ndim == 2 and (
                is_pseudo_half_config(dtype, precision)
                or is_float_config(dtype, precision)
            )
        if algo == algorithms.CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING:
            if ndim == 2:
                return is_pseudo_half_config(dtype, precision) or is_float_config(
                    dtype, precision
                )
            if ndim == 3:
                return not is_true_half_config(dtype, precision)
        if algo == algorithms.CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD:
            return ndim == 2 and (
                is_pseudo_half_config(dtype, precision)
                or is_float_config(dtype, precision)
            )
        if algo == algorithms.CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED:
            # NB: "If wDesc 's filter (height, width) is (5,5), data type config TRUE_HALF_CONFIG is not supported".
            # We could not check it before being in C code.
            return ndim == 2 and not is_double_config(dtype, precision)
        return False

    def bwd_filter_algo_supports_dtype_config(self, algo, dtype, precision, ndim):
        """
        Return whether backward-filter algorithm ``algo`` (given by alias)
        supports the (``dtype``, ``precision``) configuration for an
        ``ndim``-D convolution.
        """
        # NB: Aesara does not support float16 precision anymore for backward cuDNN convolutions.
        if is_true_half_config(dtype, precision):
            return False
        algorithms = self.cudnnConvolutionBwdFilterAlgo_t
        algo = algorithms.fromalias(algo)
        if algo == algorithms.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0:
            return not is_true_half_config(dtype, precision)
        if algo == algorithms.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1:
            return ndim == 2
        if algo == algorithms.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT:
            return ndim == 2 and (
                is_pseudo_half_config(dtype, precision)
                or is_float_config(dtype, precision)
            )
        if algo == algorithms.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3:
            return not is_true_half_config(dtype, precision)
        if algo == algorithms.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED:
            # NB: "If wDesc 's filter (height, width) is (5,5), data type config TRUE_HALF_CONFIG is not supported".
            # We could not check it before being in C code.
            return ndim == 2 and not is_double_config(dtype, precision)
        return False

    def bwd_data_algo_supports_dtype_config(self, algo, dtype, precision, ndim):
        """
        Return whether backward-data algorithm ``algo`` (given by alias)
        supports the (``dtype``, ``precision``) configuration for an
        ``ndim``-D convolution.
        """
        # NB: Aesara does not support float16 precision anymore for backward cuDNN convolutions.
        if is_true_half_config(dtype, precision):
            return False
        algorithms = self.cudnnConvolutionBwdDataAlgo_t
        algo = algorithms.fromalias(algo)
        if algo == algorithms.CUDNN_CONVOLUTION_BWD_DATA_ALGO_0:
            return not is_true_half_config(dtype, precision)
        if algo == algorithms.CUDNN_CONVOLUTION_BWD_DATA_ALGO_1:
            # CUDNN_CONVOLUTION_BWD_DATA_ALGO_1: all data type configs supported.
            # NB: Let's avoid float16 precision, as some strange errors may be encountered
            # with that precision ( see https://github.com/Theano/Theano/pull/5932/ )
            return not is_true_half_config(dtype, precision)
        if algo == algorithms.CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT:
            return ndim == 2 and (
                is_pseudo_half_config(dtype, precision)
                or is_float_config(dtype, precision)
            )
        if algo == algorithms.CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING:
            if ndim == 2:
                return is_pseudo_half_config(dtype, precision) or is_float_config(
                    dtype, precision
                )
            if ndim == 3:
                return not is_true_half_config(dtype, precision)
        if algo == algorithms.CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD:
            return ndim == 2 and (
                is_pseudo_half_config(dtype, precision)
                or is_float_config(dtype, precision)
            )
        if algo == algorithms.CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED:
            # NB: "If wDesc 's filter (height, width) is (5,5), data type config TRUE_HALF_CONFIG is not supported".
            # We could not check it before being in C code.
            return ndim == 2 and not is_double_config(dtype, precision)
        return False
class CuDNNV6(CuDNNV51):
    """Definitions for cuDNN v6.

    Extends :class:`CuDNNV51` with the data types, pooling mode,
    backward-filter algorithms and tensor-reduction operators introduced
    in cuDNN v6, and widens the dtype-configuration checks for the new
    FFT_TILING algorithms.
    """

    version = 6
    cudnnDataType_t = CEnumType(
        ("CUDNN_DATA_FLOAT", "float32"),
        ("CUDNN_DATA_DOUBLE", "float64"),
        ("CUDNN_DATA_HALF", "float16"),
        # new in v6
        ("CUDNN_DATA_INT8", "int8"),
        ("CUDNN_DATA_INT32", "int32"),
        # ('CUDNN_DATA_INT8X4', 'int8x4'),
        ctype="cudnnDataType_t",
    )
    cudnnPoolingMode_t = CEnumType(
        ("CUDNN_POOLING_MAX", "max"),
        ("CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING", "average_inc_pad"),
        ("CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING", "average_exc_pad"),
        # new in v6:
        ("CUDNN_POOLING_MAX_DETERMINISTIC", "max_deterministic"),
        ctype="cudnnPoolingMode_t",
    )
    cudnnConvolutionBwdFilterAlgo_t = CEnumType(
        ("CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0", "none"),
        ("CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1", "deterministic"),
        ("CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT", "fft"),
        ("CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3", "small"),
        # BUG FIX: this entry was a bare parenthesized string — not a
        # (constant, alias) pair — which is not a valid CEnumType argument.
        # The algorithm is not exposed, so keep it commented out:
        # ("CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD", "winograd"),
        ("CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED", "winograd_non_fused"),
        # new in v6:
        ("CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING", "fft_tiling"),
        ctype="cudnnConvolutionBwdFilterAlgo_t",
    )
    deterministic_bwd_filter_algorithms = (
        CuDNNV51.deterministic_bwd_filter_algorithms + ("fft_tiling",)
    )
    cudnnReduceTensorOp_t = CEnumType(
        ("CUDNN_REDUCE_TENSOR_ADD", "add"),
        ("CUDNN_REDUCE_TENSOR_MUL", "mul"),
        ("CUDNN_REDUCE_TENSOR_MIN", "minimum"),
        ("CUDNN_REDUCE_TENSOR_MAX", "maximum"),
        ("CUDNN_REDUCE_TENSOR_AMAX", "absmax"),
        ("CUDNN_REDUCE_TENSOR_AVG", "avg"),
        ("CUDNN_REDUCE_TENSOR_NORM1", "norm1"),
        ("CUDNN_REDUCE_TENSOR_NORM2", "norm2"),
        ctype="cudnnReduceTensorOp_t",
    )

    def fwd_algo_supports_dtype_config(self, algo, dtype, precision, ndim):
        """Extend the v5.1 forward-algo check with the v6 FFT_TILING rules."""
        is_supported = super().fwd_algo_supports_dtype_config(
            algo, dtype, precision, ndim
        )
        if not is_supported:
            algorithms = self.cudnnConvolutionFwdAlgo_t
            algo = algorithms.fromalias(algo)
            if algo == algorithms.CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING:
                # NB: For cuDNN V6:
                # "Data Type Config Support: PSEUDO_HALF_CONFIG, FLOAT_CONFIG
                # (DOUBLE_CONFIG is also supported when the task can be handled by 1D FFT,
                # ie, one of the filter dimension, width or height is 1)"
                # Could be checked only in C code. By default, let's allow DOUBLE_CONFIG.
                return ndim == 2 and (
                    is_pseudo_half_config(dtype, precision)
                    or is_float_config(dtype, precision)
                    or is_double_config(dtype, precision)
                )
        return is_supported

    def bwd_filter_algo_supports_dtype_config(self, algo, dtype, precision, ndim):
        """Extend the v5.1 backward-filter check for the new FFT_TILING algo."""
        is_supported = super().bwd_filter_algo_supports_dtype_config(
            algo, dtype, precision, ndim
        )
        if not is_supported:
            algorithms = self.cudnnConvolutionBwdFilterAlgo_t
            algo = algorithms.fromalias(algo)
            if algo == algorithms.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING:
                return ndim == 2 and (
                    is_pseudo_half_config(dtype, precision)
                    or is_float_config(dtype, precision)
                    or is_double_config(dtype, precision)
                )
        return is_supported

    def bwd_data_algo_supports_dtype_config(self, algo, dtype, precision, ndim):
        """Extend the v5.1 backward-data check for the new FFT_TILING algo."""
        is_supported = super().bwd_data_algo_supports_dtype_config(
            algo, dtype, precision, ndim
        )
        if not is_supported:
            algorithms = self.cudnnConvolutionBwdDataAlgo_t
            algo = algorithms.fromalias(algo)
            if algo == algorithms.CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING:
                # NB: For cuDNN V6:
                # "Data Type Config Support: PSEUDO_HALF_CONFIG, FLOAT_CONFIG
                # (DOUBLE_CONFIG is also supported when the task can be handled by 1D FFT,
                # ie, one of the filter dimension, width or height is 1)"
                # Could be checked only in C code. By default, let's allow DOUBLE_CONFIG.
                return ndim == 2 and (
                    is_pseudo_half_config(dtype, precision)
                    or is_float_config(dtype, precision)
                    or is_double_config(dtype, precision)
                )
        return is_supported
class CuDNNV7(CuDNNV6):
    """Definitions for cuDNN v7.

    Adds the math-type (Tensor Core selection) and determinism
    enumerations on top of the v6 definitions.
    """

    version = 7
    cudnnMathType_t = CEnumType(
        ("CUDNN_DEFAULT_MATH", "non_tensor_op"),
        ("CUDNN_TENSOR_OP_MATH", "tensor_op"),
        ctype="cudnnMathType_t",
    )
    cudnnDeterminism_t = CEnumType(
        ("CUDNN_NON_DETERMINISTIC", "non_deterministic"),
        ("CUDNN_DETERMINISTIC", "deterministic"),
        ctype="cudnnDeterminism_t",
    )
def get_definitions(cudnn_version=None):
    """
    Return cuDNN definitions to be used by Aesara for the given cuDNN version.

    ``cudnn_version`` must be None or an integer
    (typically the version returned by :func:`Aesara.gpuarray.dnn.version`).
    If None, return definitions for the most recent supported cuDNN version.
    """
    if cudnn_version is not None:
        major = cudnn_version // 1000
        if major == 5:
            return CuDNNV51()
        if major == 6:
            return CuDNNV6()
    # By default, use definitions for the last supported cuDNN version.
    return CuDNNV7()
# (git-diff viewer artifact) This source diff could not be displayed because it is too large.
import aesara
from aesara.compile import optdb
from aesara.gpuarray.basic_ops import (
GpuAllocEmpty,
GpuArrayType,
as_gpuarray_variable,
gpu_contiguous,
infer_context_name,
)
from aesara.gpuarray.dnn import (
GpuDnnBatchNorm,
GpuDnnBatchNormInference,
GpuDnnConv,
GpuDnnConvDesc,
GpuDnnConvGradI,
GpuDnnConvGradW,
GpuDnnPoolGrad,
GpuDnnReduction,
GpuDnnSoftmax,
GpuDnnSoftmaxGrad,
cudnn,
dnn_available,
dnn_conv,
dnn_conv3d,
dnn_pool,
get_precision,
local_abstractconv3d_cudnn_graph,
local_abstractconv_cudnn_graph,
version,
)
from aesara.gpuarray.elemwise import GpuCAReduceCuda, GpuElemwise
from aesara.gpuarray.nnet import GpuSoftmax
from aesara.gpuarray.opt_util import (
alpha_merge,
inplace_allocempty,
op_lifter,
output_merge,
pad_dims,
unpad_dims,
)
from aesara.gpuarray.optdb import (
gpu_seqopt,
pool_db,
pool_db2,
register_inplace,
register_opt,
register_opt2,
)
from aesara.gpuarray.reduction import GpuMaxAndArgmax
from aesara.gpuarray.type import list_contexts
from aesara.graph.opt import GlobalOptimizer, copy_stack_trace, local_optimizer
from aesara.scalar import Log
from aesara.tensor.math import Argmax
from aesara.tensor.nnet.abstract_conv import (
AbstractConv2d,
AbstractConv2d_gradInputs,
AbstractConv2d_gradWeights,
AbstractConv3d,
AbstractConv3d_gradInputs,
AbstractConv3d_gradWeights,
assert_conv_shape,
get_conv_output_shape,
)
from aesara.tensor.nnet.basic import LogSoftmax, SoftmaxGrad
from aesara.tensor.shape import shape_i_op
from aesara.tensor.signal.pool import AveragePoolGrad, MaxPoolGrad, Pool
@local_optimizer([AbstractConv2d, AbstractConv3d])
def local_abstractconv_cudnn(fgraph, node):
    """Lift an abstract 2D/3D convolution on GPU data to its cuDNN graph."""
    context = infer_context_name(*node.inputs)
    if not isinstance(node.inputs[0].type, GpuArrayType):
        return
    if node.op.unshared:
        return None
    border = node.op.border_mode
    if isinstance(border, tuple) and any(isinstance(p, tuple) for p in border):
        # Asymmetric padding not yet supported
        return None
    if isinstance(node.op, AbstractConv2d):
        builder = local_abstractconv_cudnn_graph
    elif isinstance(node.op, AbstractConv3d):
        builder = local_abstractconv3d_cudnn_graph
    else:
        return None
    replacement = builder(node.op, context, node.inputs, node.outputs)
    copy_stack_trace(node.outputs, replacement)
    return replacement
@local_optimizer(
    [AbstractConv2d, AbstractConv2d_gradWeights, AbstractConv2d_gradInputs]
)
def local_abstractconv_cudnn_alt(fgraph, node):
    """Alternative cuDNN lifting for 2D abstract convolutions.

    Re-expresses each operation as a different cuDNN convolution:
    the forward pass via a backprop direction hint, the weight gradient
    as a forward convolution over dimshuffled operands, and the input
    gradient as a full-mode forward convolution.  Returns ``None``
    whenever the configuration is unsupported so other lifters can try.
    """
    if not isinstance(
        node.op, (AbstractConv2d, AbstractConv2d_gradWeights, AbstractConv2d_gradInputs)
    ):
        return
    # Filter dilation in these alternatives requires cuDNN v6+.
    if version(raises=False) < 6000 and node.op.filter_dilation != (1, 1):
        return None
    if node.op.unshared:
        return None
    if isinstance(node.op.border_mode, tuple) and any(
        isinstance(p, tuple) for p in node.op.border_mode
    ):
        # Asymmetric padding not yet supported
        return None
    inp1 = node.inputs[0]
    inp2 = node.inputs[1]
    if not dnn_available(inp1.type.context_name):
        return
    op = node.op
    border_mode = node.op.border_mode
    subsample = node.op.subsample
    filter_dilation = node.op.filter_dilation
    num_groups = node.op.num_groups
    precision, _ = get_precision(None, [inp1, inp2])
    if node.op.filter_flip:
        conv_mode = "conv"
    else:
        conv_mode = "cross"
    if isinstance(op, AbstractConv2d):
        # Forward convolution, computed through a backprop direction hint.
        if border_mode == "half" or subsample != (1, 1) or num_groups != 1:
            return None
        if border_mode == "full":
            direction_hint = "bprop inputs"
        elif border_mode == "valid" and filter_dilation == (1, 1):
            direction_hint = "bprop weights"
        else:
            return None
        rval = dnn_conv(
            inp1,
            inp2,
            border_mode=border_mode,
            subsample=subsample,
            dilation=filter_dilation,
            direction_hint=direction_hint,
            conv_mode=conv_mode,
            num_groups=num_groups,
        )
    elif isinstance(op, AbstractConv2d_gradWeights):
        # Weight gradient computed as a forward conv over batch-swapped
        # (dimshuffled) image and gradient tensors.
        if (
            border_mode == "valid"
            and subsample == (1, 1)
            and filter_dilation == (1, 1)
            and num_groups == 1
        ):
            img = gpu_contiguous(inp1)
            topgrad = gpu_contiguous(inp2)
            ctx_name = infer_context_name(img, topgrad)
            img = gpu_contiguous(img.dimshuffle(1, 0, 2, 3))
            topgrad = gpu_contiguous(topgrad.dimshuffle(1, 0, 2, 3))
            ishape = [shape_i_op(i)(img) for i in range(img.ndim)]
            tshape = [shape_i_op(i)(topgrad) for i in range(topgrad.ndim)]
            out_shp = get_conv_output_shape(
                ishape,
                tshape,
                border_mode=border_mode,
                subsample=subsample,
                filter_dilation=filter_dilation,
            )
            out_shp = assert_conv_shape(out_shp)
            out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp)
            desc = GpuDnnConvDesc(
                border_mode=border_mode,
                subsample=subsample,
                dilation=filter_dilation,
                conv_mode="cross",
                precision=precision,
            )(out.shape)
            conv = GpuDnnConv(algo=None, num_groups=num_groups)(img, topgrad, out, desc)
            if conv_mode == "conv":
                # Undo the "cross" correlation by flipping spatial axes.
                conv = conv[:, :, ::-1, ::-1]
            rval = as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3), ctx_name)
        else:
            return None
    elif isinstance(op, AbstractConv2d_gradInputs):
        # Input gradient computed as a full-mode forward convolution with
        # channel-swapped kernels and inverted conv mode.
        if border_mode == "valid" and subsample == (1, 1) and num_groups == 1:
            kerns = gpu_contiguous(inp1.dimshuffle(1, 0, 2, 3))
            topgrad = gpu_contiguous(inp2)
            ctx_name = infer_context_name(kerns, topgrad)
            conv_mode = "cross" if conv_mode == "conv" else "conv"
            desc = GpuDnnConvDesc(
                border_mode="full",
                subsample=subsample,
                dilation=filter_dilation,
                conv_mode=conv_mode,
                precision=precision,
            )(kerns.shape)
            tshape = [shape_i_op(i)(topgrad) for i in range(topgrad.ndim)]
            kshape = [shape_i_op(i)(kerns) for i in range(kerns.ndim)]
            shape = get_conv_output_shape(
                tshape,
                kshape,
                border_mode="full",
                subsample=subsample,
                filter_dilation=filter_dilation,
            )
            shape = assert_conv_shape(shape)
            out = GpuAllocEmpty(dtype=topgrad.dtype, context_name=ctx_name)(*shape)
            rval = GpuDnnConv(algo=None, num_groups=num_groups)(
                topgrad, kerns, out, desc
            )
        else:
            return None
    return [rval]
@local_optimizer(
    [AbstractConv3d, AbstractConv3d_gradWeights, AbstractConv3d_gradInputs]
)
def local_abstractconv3d_cudnn_alt(fgraph, node):
    """Alternative cuDNN lifting for 3D abstract convolutions.

    3D counterpart of :func:`local_abstractconv_cudnn_alt`: forward conv
    via a backprop direction hint, weight gradient as a forward conv over
    dimshuffled operands, input gradient as a full-mode forward conv.
    Returns ``None`` when the configuration is unsupported.
    """
    if not isinstance(
        node.op, (AbstractConv3d, AbstractConv3d_gradWeights, AbstractConv3d_gradInputs)
    ):
        return
    # Filter dilation in these alternatives requires cuDNN v6+.
    if version(raises=False) < 6000 and node.op.filter_dilation != (1, 1, 1):
        return None
    inp1 = node.inputs[0]
    inp2 = node.inputs[1]
    if not dnn_available(inp1.type.context_name):
        return
    op = node.op
    border_mode = node.op.border_mode
    subsample = node.op.subsample
    filter_dilation = node.op.filter_dilation
    num_groups = node.op.num_groups
    precision, _ = get_precision(None, [inp1, inp2])
    if node.op.filter_flip:
        conv_mode = "conv"
    else:
        conv_mode = "cross"
    if isinstance(op, AbstractConv3d):
        if border_mode == "half" or subsample != (1, 1, 1) or num_groups > 1:
            return None
        if border_mode == "full":
            direction_hint = "bprop inputs"
        elif border_mode == "valid" and filter_dilation == (1, 1, 1):
            direction_hint = "bprop weights"
        else:
            return None
        # NOTE(review): dnn_conv3d is called with fgraph as first argument,
        # unlike the 2D path's dnn_conv — confirm the expected signature.
        rval = dnn_conv3d(
            fgraph,
            inp1,
            inp2,
            border_mode=border_mode,
            subsample=subsample,
            dilation=filter_dilation,
            direction_hint=direction_hint,
            conv_mode=conv_mode,
        )
    elif isinstance(op, AbstractConv3d_gradWeights):
        # Weight gradient as a forward conv over batch-swapped operands.
        if (
            border_mode == "valid"
            and subsample == (1, 1, 1)
            and filter_dilation == (1, 1, 1)
            and num_groups == 1
        ):
            img = gpu_contiguous(inp1)
            topgrad = gpu_contiguous(inp2)
            ctx_name = infer_context_name(img, topgrad)
            img = gpu_contiguous(img.dimshuffle(1, 0, 2, 3, 4))
            topgrad = gpu_contiguous(topgrad.dimshuffle(1, 0, 2, 3, 4))
            ishape = [shape_i_op(i)(img) for i in range(img.ndim)]
            tshape = [shape_i_op(i)(topgrad) for i in range(topgrad.ndim)]
            out_shp = get_conv_output_shape(
                ishape,
                tshape,
                border_mode=border_mode,
                subsample=subsample,
                filter_dilation=filter_dilation,
            )
            out_shp = assert_conv_shape(out_shp)
            out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp)
            desc = GpuDnnConvDesc(
                border_mode=border_mode,
                subsample=subsample,
                dilation=filter_dilation,
                conv_mode="cross",
                num_groups=num_groups,
                precision=precision,
            )(out.shape)
            conv = GpuDnnConv(algo=None, num_groups=num_groups)(img, topgrad, out, desc)
            if conv_mode == "conv":
                # Undo the "cross" correlation by flipping spatial axes.
                conv = conv[:, :, ::-1, ::-1, ::-1]
            rval = as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3, 4), ctx_name)
        else:
            return None
    elif isinstance(op, AbstractConv3d_gradInputs):
        # Input gradient as a full-mode forward conv with channel-swapped
        # kernels and inverted conv mode.
        if border_mode == "valid" and subsample == (1, 1, 1) and num_groups == 1:
            kerns = gpu_contiguous(inp1.dimshuffle(1, 0, 2, 3, 4))
            topgrad = gpu_contiguous(inp2)
            ctx_name = infer_context_name(kerns, topgrad)
            conv_mode = "cross" if conv_mode == "conv" else "conv"
            desc = GpuDnnConvDesc(
                border_mode="full",
                subsample=subsample,
                dilation=filter_dilation,
                conv_mode=conv_mode,
                num_groups=num_groups,
                precision=precision,
            )(kerns.shape)
            tshape = [shape_i_op(i)(topgrad) for i in range(topgrad.ndim)]
            kshape = [shape_i_op(i)(kerns) for i in range(kerns.ndim)]
            shape = get_conv_output_shape(
                tshape,
                kshape,
                border_mode="full",
                subsample=subsample,
                filter_dilation=filter_dilation,
            )
            shape = assert_conv_shape(shape)
            out = GpuAllocEmpty(dtype=topgrad.dtype, context_name=ctx_name)(*shape)
            rval = GpuDnnConv(algo=None, num_groups=num_groups)(
                topgrad, kerns, out, desc
            )
        else:
            return None
    return [rval]
@local_optimizer([AbstractConv2d_gradWeights, AbstractConv3d_gradWeights])
def local_abstractconv_gw_cudnn(fgraph, node):
    """Lift an abstract conv weight-gradient on GPU data to its cuDNN graph."""
    context = infer_context_name(*node.inputs)
    if not isinstance(node.inputs[0].type, GpuArrayType):
        return
    if node.op.unshared:
        return None
    border = node.op.border_mode
    if isinstance(border, tuple) and any(isinstance(p, tuple) for p in border):
        # Asymmetric padding not yet supported
        return None
    if isinstance(node.op, AbstractConv2d_gradWeights):
        builder = local_abstractconv_cudnn_graph
    elif isinstance(node.op, AbstractConv3d_gradWeights):
        builder = local_abstractconv3d_cudnn_graph
    else:
        return None
    replacement = builder(node.op, context, node.inputs, node.outputs)
    copy_stack_trace(node.outputs, replacement)
    return replacement
@local_optimizer([AbstractConv2d_gradInputs, AbstractConv3d_gradInputs])
def local_abstractconv_gi_cudnn(fgraph, node):
    """Lift an abstract conv input-gradient on GPU data to its cuDNN graph."""
    context = infer_context_name(*node.inputs)
    if not isinstance(node.inputs[0].type, GpuArrayType):
        return
    if node.op.unshared:
        return None
    border = node.op.border_mode
    if isinstance(border, tuple) and any(isinstance(p, tuple) for p in border):
        # Asymmetric padding not yet supported
        return None
    if isinstance(node.op, AbstractConv2d_gradInputs):
        builder = local_abstractconv_cudnn_graph
    elif isinstance(node.op, AbstractConv3d_gradInputs):
        builder = local_abstractconv3d_cudnn_graph
    else:
        return None
    replacement = builder(node.op, context, node.inputs, node.outputs)
    copy_stack_trace(node.outputs, replacement)
    return replacement
@inplace_allocempty(GpuDnnConv, 2)
def local_dnn_conv_inplace(node, inputs):
    """Use the in-place variant of GpuDnnConv on a freshly allocated output."""
    inplace_op = GpuDnnConv(
        algo=node.op.algo, inplace=True, num_groups=node.op.num_groups
    )
    return [inplace_op(*inputs)]
@inplace_allocempty(GpuDnnConvGradW, 2)
def local_dnn_convgw_inplace(node, inputs):
    """Use the in-place variant of GpuDnnConvGradW on a fresh output buffer."""
    inplace_op = GpuDnnConvGradW(
        algo=node.op.algo, inplace=True, num_groups=node.op.num_groups
    )
    return [inplace_op(*inputs)]
@inplace_allocempty(GpuDnnConvGradI, 2)
def local_dnn_convgi_inplace(node, inputs):
    """Use the in-place variant of GpuDnnConvGradI on a fresh output buffer."""
    inplace_op = GpuDnnConvGradI(
        algo=node.op.algo, inplace=True, num_groups=node.op.num_groups
    )
    return [inplace_op(*inputs)]
# Register the in-place rewrites for the three cuDNN convolution Ops so the
# buffer produced by GpuAllocEmpty can be written directly.
optdb.register(
    "local_dnna_conv_inplace",
    aesara.graph.opt.in2out(
        local_dnn_conv_inplace,
        local_dnn_convgw_inplace,
        local_dnn_convgi_inplace,
        name="local_dnna_conv_inplace",
    ),
    "fast_run",
    "inplace",
    "gpuarray",
    "cudnn",
    position=70.0,
)
@register_opt("cudnn")
@alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5)
def local_dnn_conv_alpha_merge(node, *inputs):
    """Fold a surrounding scalar multiply into GpuDnnConv's alpha/beta."""
    merged = GpuDnnConv(algo=node.op.algo, num_groups=node.op.num_groups)
    return [merged(*inputs)]
@register_opt("cudnn")
@alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5)
def local_dnn_convw_alpha_merge(node, *inputs):
    """Fold a surrounding scalar multiply into GpuDnnConvGradW's alpha/beta."""
    merged = GpuDnnConvGradW(algo=node.op.algo, num_groups=node.op.num_groups)
    return [merged(*inputs)]
@register_opt("cudnn")
@alpha_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5)
def local_dnn_convi_alpha_merge(node, *inputs):
    """Fold a surrounding scalar multiply into GpuDnnConvGradI's alpha/beta."""
    merged = GpuDnnConvGradI(algo=node.op.algo, num_groups=node.op.num_groups)
    return [merged(*inputs)]
@register_opt("cudnn")
@output_merge(GpuDnnConv, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_conv_output_merge(node, *inputs):
    """Merge an addition with an existing buffer into GpuDnnConv's output."""
    merged_inputs = list(inputs)
    # The output buffer (input index 2) must be contiguous for cuDNN.
    merged_inputs[2] = gpu_contiguous(merged_inputs[2])
    merged = GpuDnnConv(algo=node.op.algo, num_groups=node.op.num_groups)
    return [merged(*merged_inputs)]
@register_opt("cudnn")
@output_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_convw_output_merge(node, *inputs):
    """Merge an addition with an existing buffer into GpuDnnConvGradW's output."""
    merged_inputs = list(inputs)
    # The output buffer (input index 2) must be contiguous for cuDNN.
    merged_inputs[2] = gpu_contiguous(merged_inputs[2])
    merged = GpuDnnConvGradW(algo=node.op.algo, num_groups=node.op.num_groups)
    return [merged(*merged_inputs)]
@register_opt("cudnn")
@output_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_convi_output_merge(node, *inputs):
    """Merge an addition with an existing buffer into GpuDnnConvGradI's output."""
    merged_inputs = list(inputs)
    # The output buffer (input index 2) must be contiguous for cuDNN.
    merged_inputs[2] = gpu_contiguous(merged_inputs[2])
    merged = GpuDnnConvGradI(algo=node.op.algo, num_groups=node.op.num_groups)
    return [merged(*merged_inputs)]
def local_gpua_pool_dnn_alternative(fgraph, op, ctx_name, inputs, outputs):
    """Lift a Pool Op to cuDNN pooling (ignore_border only, 2D/3D)."""
    if not dnn_available(ctx_name):
        return
    if not op.ignore_border:
        return
    img, ws, stride, pad = inputs
    nd = op.ndim
    if nd not in (2, 3):
        return
    img = gpu_contiguous(as_gpuarray_variable(img, ctx_name))
    mode = op.mode
    # dnn_pool expects exactly 2 non-pooling dimensions
    if img.ndim == nd + 2:
        return dnn_pool(img, ws, stride=stride, pad=pad, mode=mode)
    # Otherwise reshape to 4D/5D with 2 leading non-pooling dimensions,
    # pool, then restore the original shape.
    img_padded = pad_dims(img, 2, nd)
    pooled = dnn_pool(img_padded, ws, stride=stride, pad=pad, mode=mode)
    return unpad_dims(pooled, img, 2, nd)
# Register the cuDNN pooling lifter in both pooling optimizer databases:
# pool_db matches through op_lifter (CPU Pool -> GPU), pool_db2 through a
# plain local optimizer.  position=0 makes cuDNN the first candidate tried.
pool_db.register(
    "local_gpua_pool_dnn_alternative",
    op_lifter([Pool])(local_gpua_pool_dnn_alternative),
    "gpuarray",
    "fast_compile",
    "fast_run",
    "cudnn",
    position=0,
)
pool_db2.register(
    "local_gpua_pool_dnn_alternative",
    local_optimizer([Pool])(local_gpua_pool_dnn_alternative),
    "gpuarray",
    "fast_compile",
    "fast_run",
    "cudnn",
    position=0,
)
def local_gpua_pool_dnn_grad_stride(fgraph, op, ctx_name, inputs, outputs):
    """Lift MaxPoolGrad to GpuDnnPoolGrad (ignore_border only, 2D/3D)."""
    if not dnn_available(ctx_name):
        return
    if not op.ignore_border:
        return
    inp, out, out_grad, ws, stride, pad = inputs
    nd = op.ndim
    if nd not in (2, 3):
        return

    def _prep(v):
        # Move to the target context and make contiguous for cuDNN.
        return gpu_contiguous(as_gpuarray_variable(v, ctx_name))

    inp = _prep(inp)
    out = _prep(out)
    out_grad = _prep(out_grad)
    grad_op = GpuDnnPoolGrad(mode=op.mode)
    # the GPU ops expect exactly 2 non-pooling dimensions
    if inp.ndim == nd + 2:
        return grad_op(inp, out, out_grad, ws, stride, pad)
    # Otherwise reshape to 4D/5D with 2 non-pooling dimensions, then undo.
    ret_padded = grad_op(
        pad_dims(inp, 2, nd),
        pad_dims(out, 2, nd),
        pad_dims(out_grad, 2, nd),
        ws,
        stride,
        pad,
    )
    return unpad_dims(ret_padded, inp, 2, nd)
# Register the cuDNN max-pooling gradient lifter in both pooling databases
# (op_lifter form and plain local-optimizer form).
pool_db.register(
    "local_gpua_pool_dnn_grad_stride",
    op_lifter([MaxPoolGrad])(local_gpua_pool_dnn_grad_stride),
    "gpuarray",
    "fast_compile",
    "fast_run",
    "cudnn",
    position=0,
)
pool_db2.register(
    "local_gpua_pool_dnn_grad_stride",
    local_optimizer([MaxPoolGrad])(local_gpua_pool_dnn_grad_stride),
    "gpuarray",
    "fast_compile",
    "fast_run",
    "cudnn",
    position=0,
)
def local_gpua_avg_pool_dnn_grad_stride(fgraph, op, ctx_name, inputs, outputs):
    """Lift AveragePoolGrad to GpuDnnPoolGrad (ignore_border only, 2D/3D)."""
    if not dnn_available(ctx_name):
        return
    if not op.ignore_border:
        return
    inp, out_grad, ws, stride, pad = inputs
    nd = op.ndim
    if nd not in (2, 3):
        return
    inp = gpu_contiguous(as_gpuarray_variable(inp, ctx_name))
    out_grad = gpu_contiguous(as_gpuarray_variable(out_grad, ctx_name))
    grad_op = GpuDnnPoolGrad(mode=op.mode)
    # the GPU ops expect exactly 2 non-pooling dimensions
    if inp.ndim == nd + 2:
        # We reuse out_grad because cuDNN does not use the value of the `out`
        # argument but still checks its shape for average pooling. This
        # has been observed in v2 and v3 as far as I know.
        return grad_op(inp, out_grad, out_grad, ws, stride, pad)
    # Otherwise reshape to 4D/5D with 2 non-pooling dimensions, then undo.
    inp_padded = pad_dims(inp, 2, nd)
    out_grad_padded = pad_dims(out_grad, 2, nd)
    ret_padded = grad_op(
        inp_padded, out_grad_padded, out_grad_padded, ws, stride, pad
    )
    return unpad_dims(ret_padded, inp, 2, nd)
# Register the cuDNN average-pooling gradient lifter in both pooling
# databases (op_lifter form and plain local-optimizer form).
pool_db.register(
    "local_gpua_avg_pool_dnn_grad_stride",
    op_lifter([AveragePoolGrad])(local_gpua_avg_pool_dnn_grad_stride),
    "gpuarray",
    "fast_compile",
    "fast_run",
    "cudnn",
    position=0,
)
pool_db2.register(
    "local_gpua_avg_pool_dnn_grad_stride",
    local_optimizer([AveragePoolGrad])(local_gpua_avg_pool_dnn_grad_stride),
    "gpuarray",
    "fast_compile",
    "fast_run",
    "cudnn",
    position=0,
)
@register_opt("cudnn", "fast_compile")
@local_optimizer([GpuSoftmax])
def local_softmax_dnn(fgraph, node):
    """Replace GpuSoftmax with the cuDNN accurate softmax."""
    if not isinstance(node.op, GpuSoftmax):
        return
    if not dnn_available(node.outputs[0].type.context_name):
        return
    # cuDNN softmax operates on 4D tensors: append two broadcastable axes.
    ins = gpu_contiguous(node.inputs[0].dimshuffle(0, 1, "x", "x"))
    out = GpuDnnSoftmax("accurate", "channel")(ins)
    out = as_gpuarray_variable(out.dimshuffle(0, 1), out.type.context_name)
    return [out]
@register_opt("cudnn", "stabilize")
@local_optimizer([GpuElemwise])
def local_log_softmax_dnn(fgraph, node):
    """Fuse log(GpuDnnSoftmax(x)) into a single cuDNN log-softmax."""
    # This looks for GpuDnnSoftmax so we know that we have cudnn.
    if not isinstance(node.op, GpuElemwise):
        return
    if not isinstance(node.op.scalar_op, Log):
        return
    softmax_out = node.inputs[0]
    if softmax_out.owner is None:
        return
    if not isinstance(softmax_out.owner.op, GpuDnnSoftmax):
        return
    # Only fuse when the softmax output is not used anywhere else.
    if len(fgraph.clients[softmax_out]) != 1:
        return
    softmax_node = softmax_out.owner
    log_softmax = GpuDnnSoftmax("log", softmax_node.op.mode)
    return [log_softmax(softmax_node.inputs[0])]
@register_opt("cudnn", "fast_compile")
@op_lifter([LogSoftmax])
@register_opt2([LogSoftmax], "fast_compile", "cudnn")
def local_gpua_logsoftmax_to_dnn(op, ctx_name, inputs, outputs):
    """Lift a 2D LogSoftmax to the cuDNN log-softmax."""
    inp = inputs[0]
    if inp.ndim != 2:
        return
    if not dnn_available(ctx_name):
        return
    # GpuDnnSoftmax wants a 4D tensor: append two broadcastable axes.
    reshaped = inp.dimshuffle(0, 1, "x", "x")
    reshaped.tag.context_name = ctx_name
    # Apply GpuDnnSoftmax and return the 2D result.
    result = GpuDnnSoftmax("log", "channel")(gpu_contiguous(reshaped))
    return [result.dimshuffle(0, 1)]
@register_opt("cudnn", "fast_compile")
@op_lifter([SoftmaxGrad])
@register_opt2([SoftmaxGrad], "cudnn", "fast_compile")
def local_gpua_softmax_dnn_grad(op, ctx_name, inputs, outputs):
    """Lift a 2D SoftmaxGrad to the cuDNN softmax gradient."""
    if not dnn_available(ctx_name):
        return
    gpu_inputs = [as_gpuarray_variable(n, ctx_name) for n in inputs]
    if any(n.ndim != 2 for n in gpu_inputs):
        return
    # cuDNN wants 4D inputs; spread the two axes as (0, 'x', 1, 'x').
    shuffled = [gpu_contiguous(n.dimshuffle(0, "x", 1, "x")) for n in gpu_inputs]
    out = GpuDnnSoftmaxGrad("accurate", "instance")(shuffled[0], shuffled[1])
    return [out.dimshuffle(0, 2)]
@register_opt("cudnn")
@local_optimizer([GpuCAReduceCuda])
def local_dnn_reduction(fgraph, node):
    """Replace a GpuCAReduceCuda with a cuDNN tensor reduction when possible.

    Requires cuDNN v6+, at most 8 input dimensions and float16/float32/
    float64 data with a compatible accumulator dtype.  ``pre_scalar_op``
    patterns (sqr+add, abs+add, abs+maximum) are mapped onto the dedicated
    norm2/norm1/absmax cuDNN reductions.
    """
    if not isinstance(node.op, GpuCAReduceCuda):
        return
    if not dnn_available(node.inputs[0].type.context_name):
        return
    if version(raises=False) < 6000:
        return
    if node.inputs[0].ndim > 8:
        return
    acc_dtype = node.op._acc_dtype(node.inputs[0].dtype)
    if node.inputs[0].dtype != node.outputs[0].dtype:
        # We can mix float16 and float32, but not float64.
        if node.inputs[0].dtype == "float64" or node.outputs[0].dtype == "float64":
            return
        if acc_dtype != "float32":
            return
    if node.inputs[0].dtype not in ("float16", "float32", "float64"):
        return
    # Accumulator dtype must match the input dtype for float64/float32, and
    # may not be float64 for float16 inputs.
    if node.inputs[0].dtype == "float64" and acc_dtype != "float64":
        return
    if node.inputs[0].dtype == "float32" and acc_dtype != "float32":
        return
    if node.inputs[0].dtype == "float16" and acc_dtype == "float64":
        return

    def _identity(a):
        # Post-processing used when the reduction result needs no fix-up.
        return a

    def _square(a):
        # NOTE(review): presumably cuDNN's norm2 applies a square root, so the
        # result is squared to recover sum(sqr(x)) — confirm against the
        # cudnnReduceTensor documentation.
        return GpuElemwise(aesara.scalar.basic.sqr)(a)

    scal = node.op.scalar_op.name
    post = _identity
    if node.op.pre_scalar_op is not None:
        if isinstance(node.op.scalar_op, aesara.scalar.basic.Add):
            if isinstance(node.op.pre_scalar_op, aesara.scalar.basic.Sqr):
                scal = "norm2"
                post = _square
            elif isinstance(node.op.pre_scalar_op, aesara.scalar.basic.Abs):
                scal = "norm1"
            else:
                return
        elif isinstance(
            node.op.scalar_op, aesara.scalar.basic.ScalarMaximum
        ) and isinstance(node.op.pre_scalar_op, aesara.scalar.basic.Abs):
            scal = "absmax"
        else:
            return
    if not cudnn.cudnnReduceTensorOp_t.has_alias(scal):
        return
    ret = GpuDnnReduction(scal, node.op.axis, acc_dtype, node.op.dtype, False)(
        node.inputs[0]
    )
    new_out = [post(ret)]
    copy_stack_trace(node.outputs, new_out)
    return new_out
@register_opt("cudnn")
@local_optimizer([GpuMaxAndArgmax])
def local_cudnn_maxandargmax(fgraph, node):
    """Replace GpuMaxAndArgmax with a cuDNN reduction when supported."""
    if not isinstance(node.op, GpuMaxAndArgmax):
        return
    inp = node.inputs[0]
    if not dnn_available(inp.type.context_name):
        return
    # cuDNN reductions require cuDNN v6+ and at most 8 dimensions.
    if version(raises=False) < 6000:
        return
    if inp.ndim > 8:
        return
    if inp.dtype != node.outputs[0].dtype:
        return
    if inp.dtype not in ("float16", "float32", "float64"):
        return
    # order of the axes influences the output indices
    if node.op.axis is not None and tuple(sorted(node.op.axis)) != node.op.axis:
        return
    out_dtype = node.outputs[0].dtype
    max_out, arg_out = GpuDnnReduction(
        "maximum", node.op.axis, out_dtype, out_dtype, True
    )(inp)
    # cudnn can only return int32 indices
    arg_out = as_gpuarray_variable(
        arg_out.astype("int64"), node.outputs[1].type.context_name
    )
    return (max_out, arg_out)
@register_opt("cudnn", "fast_compile")
@op_lifter([Argmax])
@register_opt2([Argmax], "fast_compile", "cudnn")
def local_dnn_argmax(op, ctx_name, inputs, outputs):
    """Lift Argmax to a cuDNN reduction (v6+, ndim <= 8, float dtypes)."""
    if not dnn_available(ctx_name):
        return
    if version(raises=False) < 6000:
        return
    inp = inputs[0]
    if inp.ndim > 8:
        return
    if inp.dtype not in ("float16", "float32", "float64"):
        return
    # order of the axes influences the output indices
    if op.axis is not None and tuple(sorted(op.axis)) != op.axis:
        return
    _, indices = GpuDnnReduction(
        "maximum", op.axis, inp.dtype, inp.dtype, True
    )(*inputs)
    # cuDNN only produces int32 indices; upcast to the expected int64.
    return [as_gpuarray_variable(indices.astype("int64"), ctx_name)]
class NoCuDNNRaise(GlobalOptimizer):
    # Global optimizer that fails fast when the "cudnn" tag is enabled but
    # cuDNN cannot be used in one of the registered GPU contexts.
    def apply(self, fgraph):
        """
        Raise an error if cuDNN cannot be used.
        """
        for c in list_contexts():
            if not dnn_available(c):
                # Make an assert error as we want Aesara to fail, not
                # just skip this optimization.
                raise AssertionError(
                    "cuDNN optimization was enabled, but Aesara was not able "
                    "to use it for context "
                    + str(c)
                    + ". We got this error: \n"
                    + dnn_available.msg
                )
# Run first (position=0) so an unusable cuDNN setup fails early and loudly.
gpu_seqopt.register("NoCuDNNRaise", NoCuDNNRaise(), "cudnn", position=0)
@register_inplace()
@local_optimizer([GpuDnnBatchNorm], inplace=True)
def local_batch_norm_inplace_output(fgraph, node):
    """Make GpuDnnBatchNorm write its main output in place."""
    op = node.op
    if not isinstance(op, GpuDnnBatchNorm) or op.inplace_output:
        return
    return GpuDnnBatchNorm(
        mode=op.mode,
        running_averages=op.running_averages,
        inplace_running_mean=op.inplace_running_mean,
        inplace_running_var=op.inplace_running_var,
        inplace_output=True,
    )(*node.inputs)
@register_inplace()
@local_optimizer([GpuDnnBatchNorm], inplace=True)
def local_batch_norm_inplace_running_mean(fgraph, node):
    """Make GpuDnnBatchNorm update its running mean in place."""
    op = node.op
    if not isinstance(op, GpuDnnBatchNorm):
        return
    if not op.running_averages or op.inplace_running_mean:
        return
    return GpuDnnBatchNorm(
        mode=op.mode,
        running_averages=op.running_averages,
        inplace_running_mean=True,
        inplace_running_var=op.inplace_running_var,
        inplace_output=op.inplace_output,
    )(*node.inputs)
@register_inplace()
@local_optimizer([GpuDnnBatchNorm], inplace=True)
def local_batch_norm_inplace_running_var(fgraph, node):
    """Make GpuDnnBatchNorm update its running variance in place."""
    op = node.op
    if not isinstance(op, GpuDnnBatchNorm):
        return
    if not op.running_averages or op.inplace_running_var:
        return
    return GpuDnnBatchNorm(
        mode=op.mode,
        running_averages=op.running_averages,
        inplace_running_mean=op.inplace_running_mean,
        inplace_running_var=True,
        inplace_output=op.inplace_output,
    )(*node.inputs)
@register_inplace()
@local_optimizer([GpuDnnBatchNormInference], inplace=True)
def local_batch_norm_inference_inplace(fgraph, node):
    """Switch GpuDnnBatchNormInference to its in-place variant."""
    op = node.op
    if not isinstance(op, GpuDnnBatchNormInference) or op.inplace:
        return
    return [GpuDnnBatchNormInference(mode=op.mode, inplace=True)(*node.inputs)]
# (git-diff viewer artifact) This source diff could not be displayed because it is too large.
from aesara.graph.basic import Apply
from aesara.graph.op import _NoPythonOp
from aesara.tensor.extra_ops import CumOp
try:
from pygpu import gpuarray
except ImportError:
pass
import aesara.scalar as scalar
from aesara.gpuarray.basic_ops import (
GpuKernelBaseCOp,
GpuReshape,
Kernel,
as_gpuarray_variable,
gpuarray_helper_inc_dir,
infer_context_name,
)
from aesara.gpuarray.opt import op_lifter, register_opt, register_opt2
from aesara.gpuarray.type import gpu_context_type
from aesara.link.c.params_type import ParamsType
class GpuCumOp(GpuKernelBaseCOp, _NoPythonOp):
"""
Parameters
----------
axis
Can not be None. If you want the array flattened, do it before.
"""
SUPPORTED_NDIMS = 3
__props__ = ("axis", "mode")
params_type = ParamsType(axis=scalar.int32, context=gpu_context_type)
def __init__(self, axis, mode="add"):
assert axis is not None
self.axis = int(axis)
self.mode = mode
def __eq__(self, other):
if type(other) != type(self):
return False
return self.axis == other.axis and self.mode == other.mode
    def __hash__(self):
        # Combine the two identity-defining properties; consistent with __eq__.
        return hash(self.axis) ^ hash(self.mode)
    def c_code_cache_version(self):
        # Bump this tuple whenever the generated C/CUDA code changes so the
        # compilation cache is invalidated.
        return (7,)
    def c_headers(self, **kwargs):
        # Headers required by the generated C code (numpy compatibility,
        # libgpuarray types and the Aesara-side GPU helper header).
        return ["<numpy_compat.h>", "<gpuarray/types.h>", "<gpuarray_helper.h>"]
    def c_header_dirs(self, **kwargs):
        # Directory containing gpuarray_helper.h.
        return [gpuarray_helper_inc_dir()]
    def get_params(self, node):
        # Bundle the axis and the input's GPU context into the ParamsType
        # object consumed by the generated C code.
        return self.params_type.get_params(self, context=node.inputs[0].type.context)
def make_node(self, x):
assert x.type.dtype == "float32", "Only float32 supported for GpuCumOp"
context_name = infer_context_name(x)
x = as_gpuarray_variable(x, context_name)
if x.ndim > GpuCumOp.SUPPORTED_NDIMS:
raise NotImplementedError(
"Only cum op on 1D, 2D and\
3D arrays are supported right now!"
)
if self.axis >= x.ndim or self.axis < -x.ndim:
raise ValueError(f"axis(={self.axis}) out of bounds")
return Apply(self, [x], [x.type()])
def gpu_kernels(self, node, nodename):
kernels = []
# cumadd
kname = "k_cumadd"
op = {"mul": "*", "add": "+"}[self.mode]
k_var = "k_cumadd_" + nodename
dtype_x = node.inputs[0].dtype
flags = Kernel.get_flags(dtype_x)
code = (
"""#include "cluda.h"
KERNEL void %(kname)s(float* input, ga_size input_offset,
float* output, ga_size output_offset,
ga_ssize inputStrides_x, ga_ssize inputStrides_y, ga_ssize inputStrides_z,
ga_ssize outputStrides_x, ga_ssize outputStrides_y, ga_ssize outputStrides_z,
const int offsetY, const int offsetZ,
const int beforeLastElementIdx, const int lastElementIdx){
input = (float *)(((char *)input) + input_offset);
output = (float *)(((char *)output) + output_offset);
int idY = blockIdx.y + offsetY;
int idZ = blockIdx.z + offsetZ;
int dataOffsetY_input = idY * inputStrides_y + idZ * inputStrides_z;
int dataOffsetY_output = idY * outputStrides_y + idZ * outputStrides_z;
int idx_last_input = lastElementIdx*inputStrides_x + dataOffsetY_input;
int idx_last_output = lastElementIdx*outputStrides_x + dataOffsetY_output;
int idx_beforelast = beforeLastElementIdx*outputStrides_x + dataOffsetY_output;
output[idx_last_output] = input[idx_last_input] %(op)s output[idx_beforelast];
}
"""
% locals()
)
params = [
gpuarray.GpuArray,
gpuarray.SIZE,
gpuarray.GpuArray,
gpuarray.SIZE,
gpuarray.SSIZE,
gpuarray.SSIZE,
gpuarray.SSIZE,
gpuarray.SSIZE,
gpuarray.SSIZE,
gpuarray.SSIZE,
"intc",
"intc",
"intc",
"intc",
]
kernels.append(
Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)
)
# blockCumOp
kname = "k_blockCumOp"
k_var = "k_blockCumOp_" + nodename
params = [
gpuarray.GpuArray,
gpuarray.SIZE,
gpuarray.GpuArray,
gpuarray.SIZE,
gpuarray.SIZE,
gpuarray.SSIZE,
gpuarray.SSIZE,
gpuarray.SSIZE,
gpuarray.SSIZE,
gpuarray.SSIZE,
gpuarray.SSIZE,
"int32",
"int32",
gpuarray.GpuArray,
gpuarray.SIZE,
]
code = (
"""#include "cluda.h"
// helper functions
WITHIN_KERNEL
void k_reductionPhase(float* partialCumOp) {
// Traverse down from leaves to root building partial sums at internal nodes in the tree.
for (unsigned int stride = 1; stride <= blockDim.x; stride *= 2) {
local_barrier();
unsigned int index = (threadIdx.x + 1) * (stride * 2) - 1;
if (index < blockDim.x*2) {
partialCumOp[index] %(op)s= partialCumOp[index - stride];
}
}
}
WITHIN_KERNEL
void k_fetchData(float* partialCumOp, float* input, int globalThreadID,
ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z,
int offsetY, int offsetZ) {
// blockIdx.y and blockIdx.z represents the current independent cum op
int idY = blockIdx.y + offsetY;
int idZ = blockIdx.z + offsetZ; int offset = idY * dataStrides_y + idZ * dataStrides_z;
int idx_even = (globalThreadID*2 ) * dataStrides_x + offset;
int idx_odd = (globalThreadID*2 + 1) * dataStrides_x + offset;
partialCumOp[threadIdx.x*2] = input[idx_even];
partialCumOp[threadIdx.x*2 + 1] = input[idx_odd];
}
WITHIN_KERNEL
void k_reversePhase(float* partialCumOp) {
// Traverse back up the tree building the scan from the partial sums
for (unsigned int stride = exp2(ceil(log2((float)blockDim.x))); stride > 0; stride /= 2) {
local_barrier();
unsigned int index = (threadIdx.x + 1) * (stride * 2) - 1;
if (index + stride < blockDim.x*2) {
partialCumOp[index + stride] %(op)s= partialCumOp[index];
}
}
}
WITHIN_KERNEL
void k_pushData(float* partialCumOp, float* output, int globalThreadID,
ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z,
int offsetY, int offsetZ) {
local_barrier();
// blockIdx.y and blockIdx.z represents the current independent cum op
int idY = blockIdx.y + offsetY;
int idZ = blockIdx.z + offsetZ;
int offset = idY * dataStrides_y + idZ * dataStrides_z;
int idx_even = (globalThreadID*2 ) * dataStrides_x + offset;
int idx_odd = (globalThreadID*2 + 1) * dataStrides_x + offset;
output[idx_even] = partialCumOp[threadIdx.x*2];
output[idx_odd] = partialCumOp[threadIdx.x*2 + 1];
}
KERNEL void k_blockCumOp(float* input, ga_size input_offset,
float* output, ga_size output_offset,
size_t nbElementsPerCumOp, ga_ssize inputStrides_x,
ga_ssize inputStrides_y, ga_ssize inputStrides_z,
ga_ssize outputStrides_x, ga_ssize outputStrides_y,
ga_ssize outputStrides_z, int offsetY,
int offsetZ, float* blockSum, ga_size blockSum_offset) {
input = (float *)(((char *)input) + input_offset);
output = (float *)(((char *)output) + output_offset);
blockSum = (float *)(((char *)blockSum) + blockSum_offset);
// Regarding blockIdx and threadIdx, 'CumOp' is always performed along the X axis.
// The Y and Z axis of the grid will contain all independent cumops of the 2D/3D case.
int globalThreadID = blockIdx.x * blockDim.x + threadIdx.x;
// Check if current thread has data to process.
if (globalThreadID >= (nbElementsPerCumOp+1)/2) {
return;
}
extern __shared__ float partialCumOp[];
// Load data in shared memory
k_fetchData(partialCumOp, input, globalThreadID, inputStrides_x, inputStrides_y, inputStrides_z, offsetY, offsetZ);
// Use a dichotomy approach to compute the cum op (i.e. balanced binary tree).
// The tree is sweeped from the leaves to the root and from the root to the leaves.
// Similar to http://www.umiacs.umd.edu/~ramani/cmsc828e_gpusci/ScanTalk.pdf
k_reductionPhase(partialCumOp);
k_reversePhase(partialCumOp);
// Write the final output to global memory
k_pushData(partialCumOp, output, globalThreadID, outputStrides_x, outputStrides_y, outputStrides_z, offsetY, offsetZ);
if (blockSum != NULL){
if (threadIdx.x == blockDim.x - 1) {
blockSum[blockIdx.x*(gridDim.y*gridDim.z) + (blockIdx.y + offsetY)*gridDim.z + blockIdx.z + offsetZ] = partialCumOp[threadIdx.x*2 + 1];
}
}
}
"""
% locals()
)
kernels.append(
Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)
)
# k_finalCumOp
kname = "k_finalCumOp"
k_var = "k_finalCumOp_" + nodename
code = (
"""#include "cluda.h"
KERNEL void k_finalCumOp(float* output, ga_size output_offset,
float* blockSum, ga_size blockSum_offset,
size_t nbElementsPerCumOp,
ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z,
int offsetY, int offsetZ) {
output = (float *)(((char *)output) + output_offset);
blockSum = (float *)(((char *)blockSum) + blockSum_offset);
int globalThreadID = (blockIdx.x + 1) * blockDim.x + threadIdx.x;
// Check if current has data to process.
if (globalThreadID >= (nbElementsPerCumOp+1)/2)
return;
int idY = blockIdx.y + offsetY;
int idZ = blockIdx.z + offsetZ;
const float currentBlockSum = blockSum[blockIdx.x*(gridDim.y*gridDim.z) + idY*gridDim.z + idZ];
int offset = idY * dataStrides_y + idZ * dataStrides_z;
int idx_even = (globalThreadID*2 ) * dataStrides_x + offset;
int idx_odd = (globalThreadID*2 + 1) * dataStrides_x + offset;
output[idx_even] %(op)s= currentBlockSum;
output[idx_odd] %(op)s= currentBlockSum;
}
"""
% locals()
)
params = [
gpuarray.GpuArray,
gpuarray.SIZE,
gpuarray.GpuArray,
gpuarray.SIZE,
gpuarray.SIZE,
gpuarray.SSIZE,
gpuarray.SSIZE,
gpuarray.SSIZE,
"int32",
"int32",
]
kernels.append(
Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)
)
return kernels
def c_code(self, node, nodename, inp, out, sub):
    """Return the C code that prepares the output and runs the cum-op.

    The emitted code checks the CUDA context, allocates (or reuses) the
    output array with ``aesara_prep_output``, queries the device launch
    limits and finally calls the ``cumOp_<nodename>`` helper generated by
    :meth:`c_support_code_struct`.
    """
    # The kernels in this Op are written in CUDA C; other gpuarray
    # backends (e.g. OpenCL) are not supported.
    if node.inputs[0].type.context.kind != b"cuda":
        raise NotImplementedError("cuda only")
    # NOTE(review): `needAllocation` below is computed but never read in
    # this template — presumably left over from an older allocation path.
    return """
const size_t* shape = PyGpuArray_DIMS(%(x)s);
bool needAllocation = !%(z)s || PyGpuArray_NDIM(%(x)s) != PyGpuArray_NDIM(%(z)s);
int axis = %(params)s->axis;
if (axis < 0) {
// Convert negative axis to positive axis.
axis += PyGpuArray_NDIM(%(x)s);
}
if (aesara_prep_output(&%(z)s, PyGpuArray_NDIM(%(x)s), PyGpuArray_DIMS(%(x)s),
%(x)s->ga.typecode, GA_C_ORDER, %(params)s->context) != 0) {
%(fail)s;
}
{ // Namespace for kernel calls //
size_t max_threads_dim0;
size_t max_grid_size1;
size_t max_grid_size2;
int err;
err = gpucontext_property(%(params)s->context->ctx, GA_CTX_PROP_MAXLSIZE0, &max_threads_dim0);
if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims0");
%(fail)s;
}
err = gpucontext_property(%(params)s->context->ctx, GA_CTX_PROP_MAXGSIZE1, &max_grid_size1);
if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_grid_size1");
%(fail)s;
}
err = gpucontext_property(%(params)s->context->ctx, GA_CTX_PROP_MAXGSIZE2, &max_grid_size2);
if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_grid_size2");
%(fail)s;
}
if (cumOp_%(nodename)s(%(x)s, %(z)s, axis, max_threads_dim0, max_grid_size1, max_grid_size2) == -1){
%(fail)s;
}
}
""" % dict(
        x=inp[0],
        z=out[0],
        nodename=nodename,
        fail=sub["fail"],
        params=sub["params"],
    )
def c_support_code_struct(self, node, nodename):
    """Return C support code defining the ``cumOp_<nodename>`` driver.

    The generated function normalizes the input to a 3-D view (axis to
    scan mapped onto X), launches ``k_blockCumOp`` over tiles bounded by
    the device grid limits, recursively scans the per-block partial sums
    when more than one block is needed, and handles an odd-length axis
    with a final ``k_cumadd`` call.
    """
    # NOTE(review): the early `return -1` after the `blockCumOp call
    # failed` / `finalCumOp call failed` / `cumadd call failed` errors
    # skip the Py_DECREF of `deviceBlockSum`, leaking a reference; only
    # the recursive-failure path and the success path release it.
    code = (
        """
int cumOp_%(nodename)s(PyGpuArrayObject* input, PyGpuArrayObject* output, int axis, size_t maxThreads, size_t maxGridY, size_t maxGridZ) {
size_t shape[3] = { 1, 1, 1 };
ssize_t inputStrides_x;
ssize_t inputStrides_y;
ssize_t inputStrides_z;
ssize_t outputStrides_x;
ssize_t outputStrides_y;
ssize_t outputStrides_z;
switch (PyGpuArray_NDIM(input))
{
case 1:
shape[0] = PyGpuArray_DIMS(input)[0];
inputStrides_x = PyGpuArray_STRIDES(input)[0] / sizeof(float);
outputStrides_x = PyGpuArray_STRIDES(output)[0] / sizeof(float);
break;
case 2:
shape[0] = PyGpuArray_DIMS(input)[0];
shape[1] = PyGpuArray_DIMS(input)[1];
inputStrides_x = PyGpuArray_STRIDES(input)[0] / sizeof(float);
inputStrides_y = PyGpuArray_STRIDES(input)[1] / sizeof(float);
outputStrides_x = PyGpuArray_STRIDES(output)[0] / sizeof(float);
outputStrides_y = PyGpuArray_STRIDES(output)[1] / sizeof(float);
break;
case 3:
shape[0] = PyGpuArray_DIMS(input)[0];
shape[1] = PyGpuArray_DIMS(input)[1];
shape[2] = PyGpuArray_DIMS(input)[2];
inputStrides_x = PyGpuArray_STRIDES(input)[0] / sizeof(float);
inputStrides_y = PyGpuArray_STRIDES(input)[1] / sizeof(float);
inputStrides_z = PyGpuArray_STRIDES(input)[2] / sizeof(float);
outputStrides_x = PyGpuArray_STRIDES(output)[0] / sizeof(float);
outputStrides_y = PyGpuArray_STRIDES(output)[1] / sizeof(float);
outputStrides_z = PyGpuArray_STRIDES(output)[2] / sizeof(float);
break;
default:
PyErr_SetString(PyExc_RuntimeError, "Unsupported Axis");
return -1;
}
if (shape[axis] <= 1) {
int err = pygpu_move(output, input);
return err;
}
// Perform cum op on array of even size.
size_t nbElementsPerCumOp = shape[axis] - (shape[axis] %% 2);
// Determine how many elements can be processed in one block.
size_t dimBlockX = ((nbElementsPerCumOp > 2*maxThreads ? 2*maxThreads : nbElementsPerCumOp)+1)/2;
// Determine how many blocks are needed in total.
size_t dimGridX = (nbElementsPerCumOp+2*dimBlockX-1) / (2*dimBlockX); // Nb. of blocks needed per cum op.
size_t dimGridY; // Nb. of independent cum ops (width).
size_t dimGridZ; // Nb. of independent cum ops (height).
ssize_t tmp;
switch (axis)
{
case 0:
dimGridY = shape[1];
dimGridZ = shape[2];
break;
case 1:
dimGridY = shape[0];
dimGridZ = shape[2];
tmp = inputStrides_x;
inputStrides_x = inputStrides_y;
inputStrides_y = tmp;
tmp = outputStrides_x;
outputStrides_x = outputStrides_y;
outputStrides_y = tmp;
break;
case 2:
dimGridY = shape[1];
dimGridZ = shape[0];
tmp = inputStrides_x;
inputStrides_x = inputStrides_z;
inputStrides_z = tmp;
tmp = outputStrides_x;
outputStrides_x = outputStrides_z;
outputStrides_z = tmp;
break;
default:
PyErr_SetString(PyExc_RuntimeError, "Unsupported Axis");
return -1;
}
const size_t shapeBlockSum[2] = { dimGridX, dimGridY*dimGridZ };
PyGpuArrayObject* deviceBlockSum = pygpu_empty(2, shapeBlockSum, output->ga.typecode,
GA_C_ORDER, input->context, Py_None);
if (deviceBlockSum == NULL){
return -1;
}
// Perform `maxGridY`*`maxGridZ` cum ops in parallel.
for (size_t offsetY = 0; offsetY < dimGridY; offsetY += maxGridY){
size_t localDimGridY = (dimGridY - offsetY < maxGridY) ? (dimGridY - offsetY) : (maxGridY);
for (size_t offsetZ = 0; offsetZ < dimGridZ; offsetZ += maxGridZ){
size_t localDimGridZ = (dimGridZ - offsetZ < maxGridZ) ? (dimGridZ - offsetZ) : (maxGridZ);
size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ};
size_t dimBlock[3] = {dimBlockX, 1, 1}; // One cum op per block.
size_t sharedBytes = (2*dimBlockX) * sizeof(float);
int err = k_blockCumOp_call(3, dimGrid, dimBlock, sharedBytes, input->ga.data, input->ga.offset, output->ga.data, output->ga.offset, nbElementsPerCumOp, inputStrides_x, inputStrides_y, inputStrides_z, outputStrides_x, outputStrides_y, outputStrides_z, offsetY, offsetZ, deviceBlockSum->ga.data, deviceBlockSum->ga.offset);
if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "blockCumOp call failed");
return -1;
}
if (dimGridX > 1) {
// Do a cum op over the blockSum (recursive).
if (cumOp_%(nodename)s(deviceBlockSum, deviceBlockSum, 0, maxThreads, maxGridY, maxGridZ) == -1){
Py_DECREF(deviceBlockSum);
return -1;
}
// Since there are more than one block (i.e. `dimGridX > 1`)
// report partial cum ops of previous blocks to subsequents ones.
size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ};
size_t dimBlock[3] = {dimBlockX, 1, 1};
int err = k_finalCumOp_call(3, dimGrid, dimBlock, sharedBytes, output->ga.data, output->ga.offset, deviceBlockSum->ga.data, deviceBlockSum->ga.offset, nbElementsPerCumOp, outputStrides_x, outputStrides_y, outputStrides_z, offsetY, offsetZ);
if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "finalCumOp call failed");
return -1;
}
}
// If shape[axis] is odd, the last element is compute manually
if (shape[axis] != nbElementsPerCumOp){
size_t dimGrid[3] = {1, localDimGridY, localDimGridZ};
size_t dimBlock[3] = {1, 1, 1};
int err = k_cumadd_call(3, dimGrid, dimBlock, sharedBytes, input->ga.data, input->ga.offset, output->ga.data, output->ga.offset, inputStrides_x, inputStrides_y, inputStrides_z, outputStrides_x, outputStrides_y, outputStrides_z, offsetY, offsetZ, shape[axis] - 2, shape[axis] - 1);
if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "cumadd call failed");
return -1;
}
}
}
}
Py_XDECREF(deviceBlockSum);
return 0;
}
"""
        % locals()
    )
    return super().c_support_code_struct(node, nodename) + code
# GpuCumsumOp exists only to serve backward compatibility.
# Once an object is created, it will be converted to CumOp object.
class GpuCumsumOp(GpuKernelBaseCOp, _NoPythonOp):
    """Backward-compatibility shim for the old cumsum-only GPU Op.

    Instantiating this class never yields a ``GpuCumsumOp``: ``__new__``
    constructs a ``GpuCumOp`` in "add" mode instead, so unpickling or
    legacy code paths transparently get the generalized Op.
    """

    SUPPORTED_NDIMS = 3
    __props__ = ("axis",)

    def __new__(typ, *args, **kwargs):
        # Build a GpuCumOp rather than an instance of this class and
        # pre-select the additive mode that cumsum corresponds to.
        instance = object.__new__(GpuCumOp, *args, **kwargs)
        instance.mode = "add"
        return instance
@register_opt("fast_compile")
@op_lifter([CumOp])
@register_opt2([CumOp], "fast_compile")
def local_gpua_cumop(op, ctx_name, inputs, outputs):
if inputs[0].dtype != "float32":
return False
axis = op.axis
x = inputs[0]
if axis is not None and x.ndim > GpuCumOp.SUPPORTED_NDIMS:
return False
x = as_gpuarray_variable(x, ctx_name)
if axis is None and x.ndim > 1:
x = GpuReshape(1)(x, (-1,))
# ``gpu_cumop`` assume array has been flattened if needed.
if axis is None:
axis = 0
return GpuCumOp(axis, op.mode)(x)
import numpy as np
from aesara.gpuarray.basic_ops import (
as_gpuarray_variable,
gpu_contiguous,
infer_context_name,
)
from aesara.gpuarray.opt import op_lifter, register_opt, register_opt2
from aesara.gpuarray.type import GpuArrayType
from aesara.gradient import DisconnectedType
from aesara.graph.basic import Apply
from aesara.graph.op import _NoPythonOp
from aesara.tensor.basic import as_tensor_variable
from aesara.tensor.fft import IRFFTOp
from aesara.tensor.math import sqrt
from aesara.tensor.subtensor import set_subtensor
from aesara.tensor.type import integer_dtypes
try:
import pygpu
pygpu_available = True
except ImportError:
pygpu_available = False
try:
import pycuda.driver
pycuda_available = True
except ImportError:
pycuda_available = False
try:
import skcuda
from skcuda import fft
skcuda_available = True
except Exception:
skcuda_available = False
class CuRFFTOp(_NoPythonOp):
    """Forward real-to-complex FFT on the GPU via scikit-cuda/cuFFT.

    Input is a float32 GPU array of shape (m, ..., n); the output adds a
    trailing length-2 axis holding the real/imaginary parts of the
    n//2+1 non-redundant coefficients.
    """

    __props__ = ()

    def output_type(self, inp):
        """Return the output type: same dtype/context, one extra dim."""
        # add one extra dim for real/imag
        return GpuArrayType(
            inp.dtype,
            broadcastable=[False] * (inp.type.ndim + 1),
            context_name=inp.type.context_name,
        )

    def make_node(self, inp, s=None):
        """Build the Apply node; ``s`` is the transform shape (no padding)."""
        # A shape parameter s can be provided as an input. For now this is used to
        # manage odd transform sizes.
        # Later this could be extended to handle padding and trunkation,
        # following numpy's interface. However, cuFFT expects array that match
        # the shape given to the plan, so padding will have to be done in the op.
        # The effect of padding on gradients has yet to be investigated.
        # NOTE(review): the error messages below say "CuFFTOp" although this
        # class is named CuRFFTOp — probably a leftover from an earlier name.
        if not skcuda_available:
            raise RuntimeError("skcuda is needed for CuFFTOp")
        if not pygpu_available:
            raise RuntimeError("pygpu is needed for CuFFTOp")
        if not pycuda_available:
            raise RuntimeError("pycuda is needed for CuFFTOp")
        inp = gpu_contiguous(as_gpuarray_variable(inp, infer_context_name(inp)))
        # If no shape is provided as input, default to input data shape.
        if s is None:
            s = inp.shape[1:]
        s = as_tensor_variable(s)
        assert inp.dtype == "float32"
        assert s.ndim == 1
        assert s.dtype in integer_dtypes
        return Apply(self, [inp, s], [self.output_type(inp)()])

    def make_thunk(self, node, storage_map, _, _2, impl=None):
        """Return a thunk that plans (lazily) and runs the cuFFT forward FFT."""
        inputs = [storage_map[v] for v in node.inputs]
        outputs = [storage_map[v] for v in node.outputs]
        # Initialize cuda context to the input's.
        with node.inputs[0].type.context:
            skcuda.misc.init()
        # One-element lists act as mutable cells shared with the closure,
        # caching the cuFFT plan across calls with the same input shape.
        plan_input_shape = [None]
        plan = [None]

        def thunk():
            input_shape = inputs[0][0].shape
            s = inputs[1][0]
            # Since padding is not supported, assert s matches input shape.
            assert (input_shape[1:] == s).all()
            # construct output shape
            output_shape = [input_shape[0]] + list(s)
            # DFT of real input is symmetric, no need to store
            # redundant coefficients
            output_shape[-1] = output_shape[-1] // 2 + 1
            # extra dimension with length 2 for real/imag
            output_shape += [2]
            output_shape = tuple(output_shape)
            z = outputs[0]
            # only allocate if there is no previous allocation of the
            # right size.
            if z[0] is None or z[0].shape != output_shape:
                z[0] = pygpu.zeros(
                    output_shape, context=inputs[0][0].context, dtype="float32"
                )
            input_pycuda = inputs[0][0]
            # I thought we'd need to change the type on output_pycuda
            # so it is complex64, but as it turns out skcuda.fft
            # doesn't really care either way and treats the array as
            # if it is complex64 anyway.
            output_pycuda = z[0]
            with input_pycuda.context:
                # only initialise plan if necessary
                if plan[0] is None or plan_input_shape[0] != input_shape:
                    plan_input_shape[0] = input_shape
                    plan[0] = fft.Plan(
                        s, np.float32, np.complex64, batch=input_shape[0]
                    )
                # Sync GPU variables before computation
                input_pycuda.sync()
                output_pycuda.sync()
                fft.fft(input_pycuda, output_pycuda, plan[0])
                # Sync results to ensure output contains completed computation
                pycuda.driver.Context.synchronize()

        thunk.inputs = inputs
        thunk.outputs = outputs
        thunk.lazy = False
        return thunk

    def grad(self, inputs, output_grads):
        """Gradient via the inverse transform, with symmetry correction."""
        (gout,) = output_grads
        s = inputs[1]
        # Divide the last dimension of the output gradients by 2, they are
        # double-counted by the real-IFFT due to symmetry, except the first
        # and last elements (for even transforms) which are unique.
        idx = (
            [slice(None)] * (gout.ndim - 2)
            + [slice(1, (s[-1] // 2) + (s[-1] % 2))]
            + [slice(None)]
        )
        gout = set_subtensor(gout[idx], gout[idx] * 0.5)
        return [cuirfft_op(gout, s), DisconnectedType()()]

    def connection_pattern(self, node):
        """The shape input is disconnected from the output."""
        # Specify that shape input parameter has no connection to graph and gradients.
        return [[True], [False]]


# Module-level singleton used by `curfft` and by CuIRFFTOp.grad.
curfft_op = CuRFFTOp()
class CuIRFFTOp(_NoPythonOp):
    """Inverse complex-to-real FFT on the GPU via scikit-cuda/cuFFT.

    Input is a float32 GPU array of shape (m, ..., n//2+1, 2) whose
    trailing axis holds real/imaginary parts; output drops that axis and
    yields a real array of shape (m, ..., n). The result is NOT rescaled
    here (see the note in the thunk); `cuirfft` applies normalization.
    """

    __props__ = ()

    def output_type(self, inp):
        """Return the output type: same dtype/context, one fewer dim."""
        # remove extra dim for real/imag
        return GpuArrayType(
            inp.dtype,
            broadcastable=[False] * (inp.type.ndim - 1),
            context_name=inp.type.context_name,
        )

    def make_node(self, inp, s=None):
        """Build the Apply node; ``s`` is the real output shape (no padding)."""
        # A shape parameter is expected as an input. For now this is used to
        # manage odd transform sizes.
        # Later this could be extended to handle padding and trunkation,
        # following numpy's interface. However, cuFFT expects array that match
        # the shape given to the plan, so padding will have to be done in the op.
        # The effect of padding on gradients has yet to be investigated.
        # NOTE(review): the error messages below say "CuIFFTOp" although this
        # class is named CuIRFFTOp — probably a leftover from an earlier name.
        if not skcuda_available:
            raise RuntimeError("skcuda is needed for CuIFFTOp")
        if not pygpu_available:
            raise RuntimeError("pygpu is needed for CuIFFTOp")
        if not pycuda_available:
            raise RuntimeError("pycuda is needed for CuIFFTOp")
        inp = gpu_contiguous(as_gpuarray_variable(inp, infer_context_name(inp)))
        # If no shape is provided as input, calculate shape assuming even real transform.
        if s is None:
            s = inp.shape[1:-1]
            s = set_subtensor(s[-1], (s[-1] - 1) * 2)
        s = as_tensor_variable(s)
        assert inp.dtype == "float32"
        assert s.ndim == 1
        return Apply(self, [inp, s], [self.output_type(inp)()])

    def make_thunk(self, node, storage_map, _, _2, impl=None):
        """Return a thunk that plans (lazily) and runs the cuFFT inverse FFT."""
        inputs = [storage_map[v] for v in node.inputs]
        outputs = [storage_map[v] for v in node.outputs]
        # Initialize cuda context to the input's.
        with node.inputs[0].type.context:
            skcuda.misc.init()
        # One-element lists act as mutable cells shared with the closure,
        # caching the cuFFT plan across calls with the same input shape.
        plan_input_shape = [None]
        plan = [None]

        def thunk():
            input_shape = inputs[0][0].shape
            s = inputs[1][0]
            # Since padding is not supported, assert that last dimension corresponds to
            # input forward transform size.
            assert (input_shape[1:-2] == s[:-1]).all()
            assert ((input_shape[-2] - 1) * 2 + s[-1] % 2 == s[-1]).all()
            # construct output shape
            # chop off the extra length-2 dimension for real/imag
            output_shape = [input_shape[0]] + list(s)
            output_shape = tuple(output_shape)
            z = outputs[0]
            # only allocate if there is no previous allocation of the
            # right size.
            if z[0] is None or z[0].shape != output_shape:
                z[0] = pygpu.zeros(
                    output_shape, context=inputs[0][0].context, dtype="float32"
                )
            input_pycuda = inputs[0][0]
            # input_pycuda is a float32 array with an extra dimension,
            # but will be interpreted by skcuda as a complex64
            # array instead.
            output_pycuda = z[0]
            with input_pycuda.context:
                # only initialise plan if necessary
                if plan[0] is None or plan_input_shape[0] != input_shape:
                    plan_input_shape[0] = input_shape
                    plan[0] = fft.Plan(
                        s, np.complex64, np.float32, batch=output_shape[0]
                    )
                # Sync GPU variables before computation
                input_pycuda.sync()
                output_pycuda.sync()
                fft.ifft(input_pycuda, output_pycuda, plan[0])
                # strangely enough, enabling rescaling here makes it run
                # very, very slowly, so do this rescaling manually
                # afterwards!
                # Sync results to ensure output contains completed computation
                pycuda.driver.Context.synchronize()

        thunk.inputs = inputs
        thunk.outputs = outputs
        thunk.lazy = False
        return thunk

    def grad(self, inputs, output_grads):
        """Gradient via the forward transform, with symmetry correction."""
        (gout,) = output_grads
        s = inputs[1]
        gf = curfft_op(gout, s)
        # Multiply the last dimension of the gradient by 2, they represent
        # both positive and negative frequencies, except the first
        # and last elements (for even transforms) which are unique.
        idx = (
            [slice(None)] * (gf.ndim - 2)
            + [slice(1, (s[-1] // 2) + (s[-1] % 2))]
            + [slice(None)]
        )
        gf = set_subtensor(gf[idx], gf[idx] * 2)
        return [gf, DisconnectedType()()]

    def connection_pattern(self, node):
        """The shape input is disconnected from the output."""
        # Specify that shape input parameter has no connection to graph and gradients.
        return [[True], [False]]


# Module-level singleton used by `cuirfft` and by CuRFFTOp.grad.
cuirfft_op = CuIRFFTOp()
def curfft(inp, norm=None):
    r"""
    Fast Fourier transform of a real-valued input, computed on the GPU.

    The input must be a real-valued float32 variable of dimensions
    (m, ..., n); m FFTs of size (..., n) are performed. The result is a
    GpuArray of dimensions (m, ..., n//2+1, 2): the second-to-last axis
    holds the n//2+1 non-trivial coefficients and the last axis holds
    their real and imaginary parts as float32.

    Parameters
    ----------
    inp
        Real-valued float32 array of size (m, ..., n): m inputs of
        size (..., n).
    norm : {None, 'ortho', 'no_norm'}
        Normalization of the transform. As in numpy, *None* (default)
        normalizes only the inverse transform by n, 'ortho' gives the
        unitary transform (:math:`1/\sqrt n` both ways), and 'no_norm'
        leaves the transform unnormalized.
    """
    transform_shape = inp.shape[1:]
    # 'ortho' is the only mode that scales the forward transform.
    if _unitary(norm) == "ortho":
        scaling = sqrt(transform_shape.prod().astype("float32"))
    else:
        scaling = 1
    return curfft_op(inp, transform_shape) / scaling
def cuirfft(inp, norm=None, is_odd=False):
    r"""
    Inverse fast Fourier transform with real-valued output, on the GPU.

    The input is a variable of dimensions (m, ..., n//2+1, 2) of type
    float32, holding the non-trivial elements of m real-valued Fourier
    transforms of initial size (..., n), real and imaginary parts stored
    as a pair of float32 arrays. The output is a real-valued float32
    variable of dimensions (m, ..., n) giving the m inverse FFTs.

    Parameters
    ----------
    inp
        Array of float32 of size (m, ..., n//2+1, 2), containing m inputs
        with n//2+1 non-trivial elements on the last dimension and real
        and imaginary parts stored as separate arrays.
    norm : {None, 'ortho', 'no_norm'}
        Normalization of transform. Following numpy, default *None* normalizes
        only the inverse transform by n, 'ortho' yields the unitary transform
        (:math:`1/\sqrt n` forward and inverse). In addition, 'no_norm' leaves
        the transform unnormalized.
    is_odd : {True, False}
        Set to True to get a real inverse transform output with an odd last
        dimension of length (N-1)*2 + 1 for an input last dimension of
        length N.

    Raises
    ------
    ValueError
        If `is_odd` is not a boolean or `norm` is not a recognized mode.
    """
    # Validate before touching `inp`, so a bad flag fails fast.
    if is_odd not in (True, False):
        # Fixed: the message previously referred to a nonexistent "id_odd".
        raise ValueError(f"Invalid value {is_odd} for is_odd, must be True or False")
    # Recover the real transform length n from the n//2+1 stored coefficients.
    s = inp.shape[1:-1]
    if is_odd:
        s = set_subtensor(s[-1], (s[-1] - 1) * 2 + 1)
    else:
        s = set_subtensor(s[-1], (s[-1] - 1) * 2)
    cond_norm = _unitary(norm)
    scaling = 1
    # The kernel does not rescale (see CuIRFFTOp.make_thunk), so apply the
    # numpy-style normalization here.
    if cond_norm is None:
        scaling = s.prod().astype("float32")
    elif cond_norm == "ortho":
        scaling = sqrt(s.prod().astype("float32"))
    return cuirfft_op(inp, s) / scaling
def _unitary(norm):
if norm not in (None, "ortho", "no_norm"):
raise ValueError(
f"Invalid value {norm} for norm, must be None, 'ortho' or 'no norm'"
)
return norm
if skcuda_available:

    @register_opt("fast_compile")
    @op_lifter([IRFFTOp])
    @register_opt2([IRFFTOp], "fast_compile")
    def local_gpua_curfft_op(op, ctx_name, inputs, outputs):
        # NOTE(review): this lifter is registered for IRFFTOp — the very same
        # op as local_gpua_cuirfft_op below — yet it returns the *forward*
        # transform op. It presumably should target RFFTOp instead; confirm
        # before relying on either registration.
        return curfft_op

    @register_opt("fast_compile")
    @op_lifter([IRFFTOp])
    @register_opt2([IRFFTOp], "fast_compile")
    def local_gpua_cuirfft_op(op, ctx_name, inputs, outputs):
        # Replace a host IRFFTOp with its GPU (cuFFT-backed) implementation.
        return cuirfft_op
def work_dtype(dtype):
    """
    Return the data type to use for intermediate (working) memory.

    float16 storage is accumulated in float32; every other dtype is used
    unchanged.
    """
    return "float32" if dtype == "float16" else dtype
def load_w(dtype):
    """
    Return the name of the conversion function to apply when loading data.

    float16 values must be widened with ``ga_half2float``; other dtypes
    need no wrapper, so the empty string is returned.

    Typical usage::

        code = '%s(ival)' % (load_w(input_type),)
    """
    if dtype != "float16":
        return ""
    return "ga_half2float"
def write_w(dtype):
    """
    Return the name of the conversion function to apply when storing data.

    float16 values must be narrowed with ``ga_float2half``; other dtypes
    need no wrapper, so the empty string is returned.

    Typical usage::

        code = 'res = %s(oval)' % (write_w(output_type),)
    """
    if dtype != "float16":
        return ""
    return "ga_float2half"
"""
Helper routines for generating gpu kernels for nvcc.
"""
try:
from pygpu import gpuarray
except ImportError:
pass
def nvcc_kernel(name, params, body):
    """
    Return the C source of a kernel function named `name`.

    Parameters
    ----------
    name
        Name given to the generated KERNEL function.
    params
        The parameters to the function as one or more strings; they are
        joined with commas into the signature.
    body
        The [nested] list of statements for the body of the function.
        Nested lists/tuples are flattened one level, and the statements
        are joined with ';' characters.
    """
    signature_args = ", ".join(params)
    # Flatten one level of nesting so callers can group related statements.
    statements = []
    for entry in body:
        if isinstance(entry, (list, tuple)):
            statements.extend(entry)
        else:
            statements.append(entry)
    joined = ";\n".join(statements)
    return (
        '#include "cluda.h"\n'
        f"KERNEL void {name} ({signature_args})\n"
        "{\n"
        f"{joined};\n"
        "}\n"
    )
def code_version(version):
    """
    Decorator attaching a cache-version tuple to a code generator.

    The wrapped function gains a ``code_version`` attribute used by the
    compilation-cache machinery; `version` must be a tuple.
    """
    if isinstance(version, tuple):

        def _attach(fn):
            fn.code_version = version
            return fn

        return _attach
    raise TypeError("version must be tuple", version)
# Empty version tuple: marks generated code that carries no cache version.
UNVERSIONED = ()
@code_version((2,))
def inline_reduce(N, buf, pos, count, manner_fn):
    """
    Return C++ code reducing a contiguous buffer in shared memory.

    The answer is left in position 0 of the buffer; the rest of the
    buffer is trashed.

    Parameters
    ----------
    N
        Length of the buffer.
    buf
        Buffer pointer (should live in GPU shared memory, it is read many
        times).
    pos
        Index of executing thread.
    count
        Number of executing threads.
    manner_fn
        A function taking two argument strings ``a`` and ``b`` and
        returning the C code combining them — e.g.
        ``lambda a, b: f"{a} + {b}"`` for a sum reduction.
    """
    strided_combine = manner_fn(f"{buf}[{pos}]", f"{buf}[i]")
    tree_combine = manner_fn(f"{buf}[{pos}]", f"{buf}[{pos}+_n]")
    template = """
{
// This function trashes buf[1..warpSize],
// leaving the reduction result in buf[0].
if (%(pos)s < warpSize) {
for (int i = %(pos)s + warpSize; i < %(N)s; i += warpSize)
{
%(buf)s[%(pos)s] = %(loop_line)s;
}
}
__syncthreads();
//reduce so that %(pos)s 0 has the reduction of everything
for (unsigned int _n = warpSize / 2; _n > 0; _n /= 2) {
if (%(pos)s < _n && %(pos)s + _n < %(N)s)
%(buf)s[%(pos)s] = %(r_n)s;
__syncthreads();
}
}
"""
    return template % {
        "N": N,
        "buf": buf,
        "pos": pos,
        "loop_line": strided_combine,
        "r_n": tree_combine,
    }
@code_version(inline_reduce.code_version)
def inline_reduce_max(N, buf, pos, count):
    """Shared-memory reduction taking the elementwise maximum."""
    combine = lambda lhs, rhs: f"max({lhs}, {rhs})"
    return inline_reduce(N, buf, pos, count, combine)
@code_version(inline_reduce.code_version)
def inline_reduce_sum(N, buf, pos, count):
    """Shared-memory reduction computing the sum."""
    combine = lambda lhs, rhs: f"{lhs} + {rhs}"
    return inline_reduce(N, buf, pos, count, combine)
@code_version(inline_reduce.code_version)
def inline_reduce_min(N, buf, pos, count):
    """Shared-memory reduction taking the elementwise minimum."""
    combine = lambda lhs, rhs: f"min({lhs}, {rhs})"
    return inline_reduce(N, buf, pos, count, combine)
@code_version(inline_reduce.code_version)
def inline_reduce_prod(N, buf, pos, count):
    """Shared-memory reduction computing the product."""
    combine = lambda lhs, rhs: f"{lhs} * {rhs}"
    return inline_reduce(N, buf, pos, count, combine)
@code_version((2,) + inline_reduce_max.code_version + inline_reduce_sum.code_version)
def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
    """
    Generate CUDA statements computing a softmax over a shared buffer.

    On entry, `buf` and `buf2` must hold two identical copies of the
    softmax input. On return, `buf` holds the softmax and `buf2` the
    un-normalized softmax.

    Parameters
    ----------
    N
        Length of the buffer.
    threadPos
        Index of executing thread.
    threadCount
        Number of executing threads.
    dtype
        Dtype of the softmax's output.

    Notes
    -----
    `buf` and `buf2` should be in GPU shared memory — they are accessed
    many times. The generated code uses ``__i`` as its loop variable.
    """
    ctype = gpuarray.dtype_to_ctype(dtype)
    loop_head = f"for(int __i={threadPos}; __i<{N}; __i+={threadCount}){{"
    # Stage 1: row maximum into buf[0] (trashes the rest of buf).
    statements = [inline_reduce_max(N, buf, threadPos, threadCount)]
    statements += [
        "__syncthreads()",
        f"{ctype} row_max = {buf}[0]",
        "__syncthreads()",
        # Stage 2: exponentiate shifted values.
        loop_head,
        f"{buf}[__i] = exp({buf2}[__i] - row_max)",
        f"{buf}[__i] = {buf}[__i]",
        "}",
        "__syncthreads()",
        # Stage 3: row sum of the exponentials.
        inline_reduce_sum(N, buf, threadPos, threadCount),
        "__syncthreads()",
        f"{ctype} row_sum = {buf}[0]",
        "__syncthreads()",
        # Stage 4: normalize each exp() result by the sum.
        loop_head,
        f"{buf}[__i] = {buf2}[__i] / row_sum",
        "}",
        "__syncthreads()",
    ]
    return statements
@code_version((3,))
def inline_reduce_fixed_shared(
    N,
    buf,
    x,
    stride_x,
    load_x,
    pos,
    count,
    manner_fn,
    manner_init,
    b="",
    stride_b="",
    load_b="",
    dtype="float32",
):
    """
    Return C++ code for a function that reduces a contiguous buffer.

    This function leaves the answer in position 0 of the buffer. The
    rest of the buffer is trashed by this function.

    Parameters
    ----------
    N
        Length of the buffer.
    buf
        Buffer pointer of size warpSize * sizeof(dtype).
    x
        Input data.
    stride_x
        Input data stride.
    load_x
        Wrapper to read from x.
    pos
        Index of executing thread.
    count
        Number of executing threads.
    manner_fn
        A function that accepts strings of arguments a and b, and
        returns c code for their reduction.
        return "%(a)s + %(b)s"
        for a sum reduction.
    manner_init
        A function that accepts strings of arguments a and return c
        code for its initialization.
    b
        Optional, pointer to the bias.
    stride_b
        Optional, the stride of b if b is provided.
    load_b
        Optional, wrapper to read from b if b is provided.
    dtype
        Optional, the dtype of the output.

    Notes
    -----
    `buf` should be in gpu shared memory, we access it many times.
    """
    # When a bias is supplied, each loaded element is x[i] + b[i]; both
    # reads go through their dtype-specific load wrappers.
    if b:
        init = manner_init(
            "%(load_x)s(%(x)s[%(pos)s * %(stride_x)s]) +"
            " %(load_b)s(%(b)s[%(pos)s * %(stride_b)s])" % locals()
        )
        loop_line = manner_fn(
            "red",
            manner_init(
                "%(load_x)s(%(x)s[i * %(stride_x)s]) + "
                "%(load_b)s(%(b)s[i * %(stride_b)s])" % locals()
            ),
        )
    else:
        init = manner_init(f"{load_x}({x}[{pos} * {stride_x}])")
        loop_line = manner_fn(
            "red",
            manner_init(f"{load_x}({x}[i * {stride_x}])"),
        )
    # Combining expressions for the shared-memory phases of the reduction.
    loop_line2 = manner_fn(f"{buf}[{pos}]", f"{buf}[i]")
    r_n = manner_fn(f"{buf}[{pos}]", f"{buf}[{pos}+_n]")
    ctype = gpuarray.dtype_to_ctype(dtype)
    return (
        """
{
// This function trashes buf[1..n_threads],
// leaving the reduction result in buf[0].
%(ctype)s red = %(init)s;
#pragma unroll 16
for (int i = %(pos)s + %(count)s; i<%(N)s; i += %(count)s) {
red = %(loop_line)s;
}
buf[%(pos)s] = red;
__syncthreads();
if (%(pos)s < warpSize) {
for (int i = %(pos)s + warpSize; i < %(count)s; i += warpSize) {
%(buf)s[%(pos)s] = %(loop_line2)s;
}
}
__syncthreads();
//reduce so that %(pos)s 0 has the reduction of everything
for (unsigned int _n = warpSize / 2; _n > 0; _n /= 2) {
if (%(pos)s < _n && %(pos)s + _n < %(N)s)
%(buf)s[%(pos)s] = %(r_n)s;
__syncthreads();
}
}
"""
        % locals()
    )
@code_version(inline_reduce_fixed_shared.code_version)
def inline_reduce_fixed_shared_max(
    N,
    buf,
    x,
    stride_x,
    load_x,
    pos,
    count,
    b="",
    stride_b="",
    load_b="",
    dtype="float32",
):
    """Fixed-shared-memory reduction specialized to the maximum.

    Thin wrapper around :func:`inline_reduce_fixed_shared` whose combine
    step is ``max(a, b)`` and whose per-element initialization is the
    identity.
    """
    combine = lambda lhs, rhs: f"max({lhs}, {rhs})"
    identity = lambda expr: expr
    return inline_reduce_fixed_shared(
        N,
        buf,
        x,
        stride_x,
        load_x,
        pos,
        count,
        combine,
        identity,
        b,
        stride_b,
        load_b,
        dtype,
    )
@code_version((2,) + inline_reduce_max.code_version + inline_reduce_sum.code_version)
def inline_softmax_fixed_shared(
    N,
    buf,
    x,
    stride_x,
    load_x,
    sm,
    sm_stride,
    write_sm,
    threadPos,
    threadCount,
    b="",
    stride_b="",
    load_b="",
    dtype="float32",
):
    """
    Generate code to perform softmax with a fixed amount of shared
    memory.

    On entry, `buf` is assumed to be empty. On exit, `buf[0]` holds the
    last reduction result and the softmax rows have been written to `sm`.

    Parameters
    ----------
    N
        Length of the buffer, at least warpSize(32).
    buf
        A shared memory buffer of size warpSize * sizeof(dtype).
    x
        A ptr to the gpu memory where the row is stored.
    stride_x
        The stride between each element in x.
    load_x
        Wrapper to read from x.
    sm
        A ptr to the gpu memory to store the result.
    sm_stride
        The stride between each sm element.
    write_sm
        Wrapper before writing to sm.
    threadPos
        Index of executing thread.
    threadCount
        Number of executing threads.
    b
        Optional, pointer to the bias.
    stride_b
        Optional, the stride of b if b is provided.
    load_b
        Optional, wrapper to read from b if b is provided.
    dtype
        Optional, the dtype of the softmax's output if not float32.

    Notes
    -----
    `buf` should be in gpu shared memory, we access it many times.
    We use tx as an int variable in a loop.
    """
    # NOTE(review): the version tuple above references inline_reduce_max /
    # inline_reduce_sum although this helper calls the fixed-shared
    # variants — confirm the cache-version intent.
    ctype = gpuarray.dtype_to_ctype(dtype)
    ret = [
        # get max of buf (trashing all but buf[0])
        inline_reduce_fixed_shared_max(
            N,
            buf,
            x,
            stride_x,
            load_x,
            threadPos,
            threadCount,
            b,
            stride_b,
            load_b,
            dtype,
        ),
        "__syncthreads()",
        f"{ctype} row_max = {buf}[0]",
        "__syncthreads()",
        # Sum of exp(x - row_max), re-reading x from global memory.
        inline_reduce_fixed_shared(
            N,
            buf,
            x,
            stride_x,
            load_x,
            threadPos,
            threadCount,
            lambda a, b: f"{a} + {b}",
            lambda a: f"exp({a} - row_max)",
            b,
            stride_b,
            load_b,
            dtype,
        ),
        "__syncthreads()",
        f"{ctype} row_sum = {buf}[0]",
        "__syncthreads()",
        "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
    ]
    # This set all value correctly
    if b:
        ret += [
            "%(sm)s[tx * %(sm_stride)s] = "
            " %(write_sm)s(exp(%(load_x)s(%(x)s[tx * %(stride_x)s]) +"
            " %(load_b)s(%(b)s[tx * %(stride_b)s]) - row_max)"
            " / row_sum)" % locals()
        ]
    else:
        ret += [
            "%(sm)s[tx * %(sm_stride)s] = "
            "%(write_sm)s(exp(%(load_x)s(%(x)s[tx * %(stride_x)s]) - row_max)"
            " / row_sum)" % locals()
        ]
    ret += [
        "}",
        "__syncthreads()",
    ]
    return ret
import warnings
import numpy as np
import pkg_resources
from numpy.linalg.linalg import LinAlgError
from aesara.configdefaults import config
from aesara.gpuarray.basic_ops import (
CGpuKernelBase,
as_gpuarray_variable,
gpu_contiguous,
gpuarray_helper_inc_dir,
infer_context_name,
)
from aesara.gpuarray.type import GpuArrayType, gpu_context_type
from aesara.graph.basic import Apply
from aesara.graph.op import Op
from aesara.link.c.op import ExternalCOp
from aesara.link.c.params_type import ParamsType
from aesara.scalar import bool as bool_t
from aesara.tensor import basic as at
from aesara.tensor import math as tm
# Optional-dependency probes: each GPU backend (pygpu/libgpuarray, scikit-cuda's
# cusolver and cublas wrappers) may be absent, so availability is recorded in
# module-level flags that the Ops below consult in `make_node`.
try:
    import pygpu
    from pygpu.basic import tril, triu

    pygpu_available = True
except ImportError:
    pygpu_available = False

cusolver_available = False
try:
    import skcuda
    from skcuda import cusolver

    cusolver_available = True
# skcuda can fail in several ways at import time (missing CUDA libs raise
# OSError/RuntimeError, packaging problems raise DistributionNotFound).
except (ImportError, OSError, RuntimeError, pkg_resources.DistributionNotFound):
    pass

cublas_available = False
try:
    from skcuda import cublas

    cublas_available = True
except (ImportError, OSError, RuntimeError, pkg_resources.DistributionNotFound):
    pass
if cusolver_available:
    # Add cusolver call as it is missing in skcuda
    # SPOTRS
    # Declare the ctypes signature for the single-precision potrs entry point
    # so it can be called through skcuda's loaded cusolver library handle.
    cusolver._libcusolver.cusolverDnSpotrs.restype = int
    cusolver._libcusolver.cusolverDnSpotrs.argtypes = [
        cusolver.ctypes.c_void_p,
        cusolver.ctypes.c_int,
        cusolver.ctypes.c_int,
        cusolver.ctypes.c_int,
        cusolver.ctypes.c_void_p,
        cusolver.ctypes.c_int,
        cusolver.ctypes.c_void_p,
        cusolver.ctypes.c_int,
        cusolver.ctypes.c_void_p,
    ]

    def cusolverDnSpotrs(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo):
        """
        Solve real single precision linear system for hermitian matrices.

        References
        ----------
        `cusolverDn<t>potrs <http://docs.nvidia.com/cuda/cusolver/index.html#cuds-lt-t-gt-potrs>`_

        """
        status = cusolver._libcusolver.cusolverDnSpotrs(
            handle, uplo, n, nrhs, int(A), lda, int(B), ldb, int(devInfo)
        )
        cusolver.cusolverCheckStatus(status)

    # DPOTRS
    # TODO: Are they still missing in skucda?
    cusolver._libcusolver.cusolverDnDpotrs.restype = int
    cusolver._libcusolver.cusolverDnDpotrs.argtypes = [
        cusolver.ctypes.c_void_p,
        cusolver.ctypes.c_int,
        cusolver.ctypes.c_int,
        cusolver.ctypes.c_int,
        cusolver.ctypes.c_void_p,
        cusolver.ctypes.c_int,
        cusolver.ctypes.c_void_p,
        cusolver.ctypes.c_int,
        cusolver.ctypes.c_void_p,
    ]

    def cusolverDnDpotrs(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo):
        """Double precision counterpart of `cusolverDnSpotrs`."""
        status = cusolver._libcusolver.cusolverDnDpotrs(
            handle, uplo, n, nrhs, int(A), lda, int(B), ldb, int(devInfo)
        )
        cusolver.cusolverCheckStatus(status)
def attach_cusolver_handle_to_context(ctx):
    """Lazily attach a cuSOLVER dense handle to the GPU context *ctx*.

    The handle is created on first use and cached as ``ctx.cusolver_handle``;
    later calls are no-ops.
    """
    if getattr(ctx, "cusolver_handle", None) is None:
        with ctx:
            ctx.cusolver_handle = cusolver.cusolverDnCreate()
def attach_cublas_handle_to_context(ctx):
    """Lazily attach a cuBLAS handle to the GPU context *ctx*.

    The handle is created on first use and cached as ``ctx.cublas_handle``;
    later calls are no-ops.
    """
    if getattr(ctx, "cublas_handle", None) is None:
        with ctx:
            ctx.cublas_handle = cublas.cublasCreate()
# Matrix structures accepted by GpuCusolverSolve / gpu_solve; it is a subset
# of all cases available in slinalg's MATRIX_STRUCTURE.
MATRIX_STRUCTURES_SOLVE = (
    "general",
    "symmetric",
    "lower_triangular",
    "upper_triangular",
)
class GpuCusolverSolve(Op):
    """
    CUSOLVER GPU solver OP.

    Solves ``op(A) x = b`` on the GPU: a Cholesky factorization (potrf/potrs)
    is used when ``A_structure == "symmetric"``, an LU factorization
    (getrf/getrs) otherwise.

    Parameters
    ----------
    trans
        Whether to take the transpose of the input matrix or not.

    """

    __props__ = ("A_structure", "trans", "inplace")

    def __init__(self, A_structure="general", trans="N", inplace=False):
        self.trans = trans
        self.inplace = inplace
        self.A_structure = A_structure
        if self.inplace:
            # The factorization overwrites input 0 (A) in place.
            self.destroy_map = {0: [0]}
        assert A_structure in MATRIX_STRUCTURES_SOLVE
        super().__init__()

    def make_node(self, inp1, inp2):
        # inp1 is the matrix A, inp2 the right-hand side b; both must be
        # 2-d, contiguous, share a context and a dtype.
        if not cusolver_available:
            raise RuntimeError(
                "CUSOLVER is not available and "
                "GpuCusolverSolve Op can not be constructed."
            )
        if skcuda.__version__ <= "0.5.1":
            warnings.warn(
                "The GpuSolve op requires scikit-cuda > 0.5.1 to work with CUDA 8"
            )
        context_name = infer_context_name(inp1, inp2)

        inp1 = as_gpuarray_variable(inp1, context_name)
        inp2 = as_gpuarray_variable(inp2, context_name)

        inp1 = gpu_contiguous(inp1)
        inp2 = gpu_contiguous(inp2)

        assert inp1.ndim == 2
        assert inp2.ndim == 2
        assert inp1.dtype == inp2.dtype

        return Apply(
            self,
            [inp1, inp2],
            [
                GpuArrayType(
                    inp1.dtype,
                    broadcastable=inp1.broadcastable,
                    context_name=context_name,
                )()
            ],
        )

    def prepare_node(self, node, storage_map, compute_map, impl):
        # Ensure the node's GPU context carries a cuSOLVER handle.
        ctx = node.inputs[0].type.context
        attach_cusolver_handle_to_context(ctx)

    def check_dev_info(self, dev_info):
        # cuSOLVER writes a positive value into devInfo on failure.
        val = np.asarray(dev_info)[0]
        if val > 0:
            raise LinAlgError("A is singular")

    def perform(self, node, inputs, outputs):
        context = inputs[0][0].context

        # Size of the matrices to invert.
        z = outputs[0]

        # Matrix.
        A = inputs[0]

        # Solution vectors.
        b = inputs[1]

        assert len(A.shape) == 2
        assert len(b.shape) == 2

        if self.trans in ("T", "C"):
            trans = 1
            l, n = A.shape
            k, m = b.shape
        elif self.trans == "N":
            trans = 0
            n, l = A.shape
            k, m = b.shape
        else:
            raise ValueError("Invalid value for trans")
        if l != n:
            raise ValueError("A must be a square matrix")
        if n != k:
            raise ValueError("A and b must be aligned.")

        lda = max(1, n)
        ldb = max(1, k)

        # We copy A and b as cusolver operates inplace
        b = pygpu.array(b, copy=True, order="F")
        if not self.inplace:
            A = pygpu.array(A, copy=True)
        A_ptr = A.gpudata
        b_ptr = b.gpudata

        # cusolver expects a F ordered matrix, but A is not explicitly
        # converted between C and F order, instead we switch the
        # "transpose" flag.
        if A.flags["C_CONTIGUOUS"]:
            trans = 1 - trans

        if A.dtype == "float32":
            potrf_bufferSize = cusolver.cusolverDnSpotrf_bufferSize
            potrf = cusolver.cusolverDnSpotrf
            potrs = cusolverDnSpotrs
            getrf_bufferSize = cusolver.cusolverDnSgetrf_bufferSize
            getrf = cusolver.cusolverDnSgetrf
            getrs = cusolver.cusolverDnSgetrs
        elif A.dtype == "float64":
            potrf_bufferSize = cusolver.cusolverDnDpotrf_bufferSize
            potrf = cusolver.cusolverDnDpotrf
            potrs = cusolverDnDpotrs
            getrf_bufferSize = cusolver.cusolverDnDgetrf_bufferSize
            getrf = cusolver.cusolverDnDgetrf
            getrs = cusolver.cusolverDnDgetrs
        else:
            raise ValueError("Unsupported dtype")

        if self.A_structure == "symmetric":
            # Cholesky path: factor with potrf, then solve with potrs.
            with context:
                workspace_size = potrf_bufferSize(
                    context.cusolver_handle, 0, n, A_ptr, lda
                )

            workspace = pygpu.zeros(workspace_size, dtype=A.dtype, context=context)

            dev_info = pygpu.zeros((1,), dtype="int32", context=context)

            workspace_ptr = workspace.gpudata
            dev_info_ptr = dev_info.gpudata

            with context:
                potrf(
                    context.cusolver_handle,
                    0,
                    n,
                    A_ptr,
                    lda,
                    workspace_ptr,
                    workspace_size,
                    dev_info_ptr,
                )
                self.check_dev_info(dev_info)

                potrs(
                    context.cusolver_handle,
                    0,
                    n,
                    m,
                    A_ptr,
                    lda,
                    b_ptr,
                    ldb,
                    dev_info_ptr,
                )
        else:
            # general case for A: LU factor with getrf, solve with getrs.
            with context:
                workspace_size = getrf_bufferSize(
                    context.cusolver_handle, n, n, A_ptr, lda
                )

            workspace = pygpu.zeros(workspace_size, dtype=A.dtype, context=context)

            pivots = pygpu.zeros(n, dtype="int32", context=context)

            dev_info = pygpu.zeros((1,), dtype="int32", context=context)

            workspace_ptr = workspace.gpudata
            pivots_ptr = pivots.gpudata
            dev_info_ptr = dev_info.gpudata

            with context:
                getrf(
                    context.cusolver_handle,
                    n,
                    n,
                    A_ptr,
                    lda,
                    workspace_ptr,
                    pivots_ptr,
                    dev_info_ptr,
                )
                self.check_dev_info(dev_info)

                getrs(
                    context.cusolver_handle,
                    trans,
                    n,
                    m,
                    A_ptr,
                    lda,
                    pivots_ptr,
                    b_ptr,
                    ldb,
                    dev_info_ptr,
                )

        # b was overwritten with the solution by cuSOLVER.
        z[0] = b

    def L_op(self, inputs, outputs, output_gradients):
        # Modified from aesara/tensor/slinalg.py
        # Gradient of a solve: b_bar solves the transposed system,
        # A_bar = -b_bar c^T.
        A, b = inputs
        c = outputs[0]
        c_bar = output_gradients[0]
        # FIXME: triangular structure would use GpuCublasTriangularsolve?
        # no need to handle A_structure like slinalg.py?
        trans_solve_op = GpuCusolverSolve("general")
        b_bar = trans_solve_op(A.T, c_bar)
        A_bar = -tm.outer(b_bar, c) if c.ndim == 1 else -b_bar.dot(c.T)
        return [A_bar, b_bar]
class GpuCublasTriangularSolve(Op):
    """
    CUBLAS GPU Triangular Solve Op.

    Solves ``op(A) x = b`` for a triangular matrix ``A`` with cuBLAS
    trsv (vector right-hand side) or trsm (matrix right-hand side).

    Parameters
    ----------
    lower
        Whether system is lower-triangular (True) or upper-triangular (False).
    trans
        Whether to take the transpose of the input matrix or not.

    """

    __props__ = ("trans", "lower")

    def __init__(self, lower=True, trans="N"):
        self.trans = trans
        self.lower = lower
        super().__init__()

    def make_node(self, inp1, inp2):
        # inp1 is the triangular matrix A (2-d); inp2 the right-hand side b
        # (1-d or 2-d). Both are made contiguous on the same context.
        if not cublas_available:
            raise RuntimeError(
                "CUBLAS is not available and "
                "GpuCublasTriangularSolve Op "
                "can not be constructed."
            )
        context_name = infer_context_name(inp1, inp2)

        inp1 = as_gpuarray_variable(inp1, context_name)
        inp2 = as_gpuarray_variable(inp2, context_name)

        inp1 = gpu_contiguous(inp1)
        inp2 = gpu_contiguous(inp2)

        assert inp1.ndim == 2
        assert inp2.ndim in (1, 2)
        assert inp1.dtype == inp2.dtype

        return Apply(
            self,
            [inp1, inp2],
            [
                GpuArrayType(
                    inp1.dtype,
                    broadcastable=inp2.broadcastable,
                    context_name=context_name,
                )()
            ],
        )

    def prepare_node(self, node, storage_map, compute_map, impl):
        # Ensure the node's GPU context carries a cuBLAS handle.
        ctx = node.inputs[0].type.context
        attach_cublas_handle_to_context(ctx)

    def perform(self, node, inputs, outputs):
        ctx = node.inputs[0].type.context

        # Solution set
        x = outputs[0]

        # Matrix.
        A = inputs[0]

        # right hand side
        b = inputs[1]

        assert len(A.shape) == 2
        assert len(b.shape) in (1, 2)

        # implicitly deal with the difference between C order
        # and fortran order by flipping the trans and lower flags
        lower = not self.lower
        trans = self.trans
        if trans in ("T", "C"):
            trans = "N"
            l, n = A.shape
        elif trans == "N":
            trans = "T"
            n, l = A.shape
        else:
            raise ValueError("Invalid value for trans")

        if b.ndim == 2:
            k, m = b.shape
        else:
            (k,) = b.shape
            m = 1

        if l != n:
            raise ValueError("A must be a square matrix")
        if n != k:
            raise ValueError("A and b must be aligned.")

        lda = max(1, n)
        ldb = max(1, k)

        # solution overwrites right hand side on exit
        b = pygpu.array(b, copy=True, order="F")

        A_ptr = A.gpudata
        b_ptr = b.gpudata

        # unit scalar used for multiplication
        alpha = 1.0
        # indicates matrix A is on left of B
        side = "l"
        # set whether upper or lower part of matrix A stored
        uplo = "l" if lower else "u"
        # indicates elements on diagonal of matrix A may not be unity
        diag = "n"

        if A.dtype == "float32":
            trsv = cublas.cublasStrsv
            trsm = cublas.cublasStrsm
        elif A.dtype == "float64":
            trsv = cublas.cublasDtrsv
            trsm = cublas.cublasDtrsm
        else:
            raise ValueError("Unsupported dtype")

        with ctx:
            if b.ndim == 1:
                # matrix vector solve
                trsv(ctx.cublas_handle, uplo, trans, diag, n, A_ptr, lda, b_ptr, 1)
            else:
                trsm(
                    ctx.cublas_handle,
                    side,
                    uplo,
                    trans,
                    diag,
                    n,
                    m,
                    alpha,
                    A_ptr,
                    lda,
                    b_ptr,
                    ldb,
                )

        x[0] = b

    def L_op(self, inputs, outputs, output_gradients):
        # Modified from aesara/tensor/slinalg.py
        A, b = inputs
        c = outputs[0]
        c_bar = output_gradients[0]

        # The backward solve works on A.T, hence the flipped `lower`.
        trans_solve_op = GpuCublasTriangularSolve(not self.lower)
        b_bar = trans_solve_op(A.T, c_bar)

        A_bar = -tm.outer(b_bar, c) if c.ndim == 1 else -b_bar.dot(c.T)

        # Only the stored triangle of A contributes to the gradient.
        if self.lower:
            A_bar = at.tril(A_bar)
        else:
            A_bar = at.triu(A_bar)
        return [A_bar, b_bar]
def gpu_solve(A, b, A_structure="general", trans="N"):
    """Solve ``op(A) x = b`` on the GPU, dispatching on A's structure.

    Triangular structures are routed to cuBLAS; every other structure
    goes through the cuSOLVER-based solver.
    """
    triangular_lower = {"lower": True, "upper": False}
    if A_structure in triangular_lower:
        op = GpuCublasTriangularSolve(triangular_lower[A_structure], trans)
        return op(A, b)
    return GpuCusolverSolve(A_structure, trans)(A, b)
def gpu_solve_lower_triangular(A, b, trans="N"):
    """Solve the lower-triangular system ``op(A) x = b`` on the GPU."""
    op = GpuCublasTriangularSolve(True, trans)
    return op(A, b)
def gpu_solve_upper_triangular(A, b, trans="N"):
    """Solve the upper-triangular system ``op(A) x = b`` on the GPU."""
    op = GpuCublasTriangularSolve(False, trans)
    return op(A, b)
class GpuCholesky(Op):
    """
    CUSOLVER GPU Cholesky Op.

    Given a real positive definite matrix `A` returns either a lower
    triangular matrix `L` such that `A == dot(L, L.T)` if `lower == True`
    else returns an upper triangular matrix `U` such that `A == dot(U.T, U)`
    if `lower == False`.

    Parameters
    ----------
    lower
        Whether to return a lower rather than upper triangular decomposition.

    """

    __props__ = ("lower", "inplace")

    def __init__(self, lower=True, inplace=False):
        self.lower = lower
        self.inplace = inplace
        if self.inplace:
            # The factorization overwrites the input buffer in place.
            self.destroy_map = {0: [0]}
        super().__init__()

    def clone_inplace(self):
        # Hook for the inplace-optimization pass to substitute a
        # destructive version of this Op.
        return self.__class__(lower=self.lower, inplace=True)

    def make_node(self, inp):
        if not cusolver_available:
            raise RuntimeError(
                "CUSOLVER is not available and "
                "GpuCholesky Op can not be constructed."
            )
        if skcuda.__version__ <= "0.5.1":
            warnings.warn(
                "The GpuCholesky op requires scikit-cuda > " "0.5.1 to work with CUDA 8"
            )
        if not pygpu_available:
            raise RuntimeError(
                "Missing pygpu or triu/tril functions." "Install or update libgpuarray."
            )
        context_name = infer_context_name(inp)

        inp = as_gpuarray_variable(inp, context_name)

        inp = gpu_contiguous(inp)

        assert inp.ndim == 2

        return Apply(self, [inp], [inp.type()])

    def prepare_node(self, node, storage_map, compute_map, impl):
        # Ensure the node's GPU context carries a cuSOLVER handle.
        ctx = node.inputs[0].type.context
        attach_cusolver_handle_to_context(ctx)

    def perform(self, node, inputs, outputs):
        context = inputs[0][0].context

        # Input matrix.
        A = inputs[0]

        l, n = A.shape
        if l != n:
            raise ValueError("A must be a square matrix")

        lda = max(1, n)

        # cusolver operates on F ordered matrices, but A is expected
        # to be symmetric so it does not matter.
        # We copy A if needed
        if self.inplace:
            L = A
        else:
            L = pygpu.array(A, copy=True)

        # The output matrix will contain only the upper or lower
        # triangular factorization of A. If L is C ordered (it
        # probably is as it is the default in Aesara) we just switch
        # the fill mode parameter of cusolver
        l_parameter = 0 if self.lower else 1
        if L.flags["C_CONTIGUOUS"]:
            l_parameter = 1 - l_parameter

        L_ptr = L.gpudata

        if A.dtype == "float32":
            potrf_bufferSize = cusolver.cusolverDnSpotrf_bufferSize
            potrf = cusolver.cusolverDnSpotrf
        elif A.dtype == "float64":
            potrf_bufferSize = cusolver.cusolverDnDpotrf_bufferSize
            potrf = cusolver.cusolverDnDpotrf
        else:
            raise ValueError("Unsupported dtype")

        with context:
            workspace_size = potrf_bufferSize(
                context.cusolver_handle, l_parameter, n, L_ptr, lda
            )

            workspace = pygpu.zeros(workspace_size, dtype=A.dtype, context=context)

            dev_info = pygpu.zeros((1,), dtype="int32", context=context)

            workspace_ptr = workspace.gpudata
            dev_info_ptr = dev_info.gpudata

            potrf(
                context.cusolver_handle,
                l_parameter,
                n,
                L_ptr,
                lda,
                workspace_ptr,
                workspace_size,
                dev_info_ptr,
            )

            # Positive devInfo means the leading minor of that order is
            # not positive definite.
            val_dev_info = np.asarray(dev_info)[0]
            if val_dev_info > 0:
                raise LinAlgError("Cholesky decomposition failed (is A SPD?)")

        # cusolver leaves the elements in the matrix outside the considered
        # upper or lower triangle unchanged, so we need to put zeros outside
        # the triangle
        if self.lower:
            tril(L)
        else:
            triu(L)

        outputs[0][0] = L

    def L_op(self, inputs, outputs, gradients):
        # Modified from aesara/tensor/slinalg.py
        # No handling for on_error = 'nan'
        dz = gradients[0]
        chol_x = outputs[0]

        # this is for nan mode
        #
        # ok = ~tm.any(tm.isnan(chol_x))
        # chol_x = at.switch(ok, chol_x, 1)
        # dz = at.switch(ok, dz, 1)

        # deal with upper triangular by converting to lower triangular
        if not self.lower:
            chol_x = chol_x.T
            dz = dz.T

        def tril_and_halve_diagonal(mtx):
            """Extracts lower triangle of square matrix and halves diagonal."""
            return at.tril(mtx) - at.diag(at.diagonal(mtx) / 2.0)

        def conjugate_solve_triangular(outer, inner):
            """Computes L^{-T} P L^{-1} for lower-triangular L."""
            return gpu_solve_upper_triangular(
                outer.T, gpu_solve_upper_triangular(outer.T, inner.T).T
            )

        s = conjugate_solve_triangular(
            chol_x, tril_and_halve_diagonal(chol_x.T.dot(dz))
        )

        if self.lower:
            grad = at.tril(s + s.T) - at.diag(at.diagonal(s))
        else:
            grad = at.triu(s + s.T) - at.diag(at.diagonal(s))
        return [grad]
def gpu_cholesky(A, lower=True):
    """Return the Cholesky factor of `A`, computed on the GPU via cuSOLVER."""
    op = GpuCholesky(lower)
    return op(A)
# TODO: add support for float64
class GpuMagmaBase(ExternalCOp):
    """Common plumbing for magma-backed Ops.

    Supplies the headers, include directories, libraries and library search
    paths needed to compile against magma, and makes sure magma is
    initialized exactly once per GPU context before a node runs.
    """

    def c_headers(self, **kwargs):
        headers = [
            "gpuarray/types.h",
            "gpuarray/array.h",
            "gpuarray/ext_cuda.h",
            "gpuarray_helper.h",
            "magma.h",
        ]
        return headers

    def c_header_dirs(self, **kwargs):
        include_dirs = [
            gpuarray_helper_inc_dir(),
            pygpu.get_include(),
            config.cuda__include_path,
        ]
        # The magma include path is optional in the configuration.
        magma_inc = config.magma__include_path
        if magma_inc:
            include_dirs.append(magma_inc)
        return include_dirs

    def c_libraries(self, **kwargs):
        return ["magma"]

    def c_lib_dirs(self, **kwargs):
        magma_lib = config.magma__library_path
        return [magma_lib] if magma_lib else []

    def prepare_node(self, node, storage_map, compute_map, impl):
        from skcuda.magma import magma_init

        # Initialize magma once per context; the flag is cached on the
        # context object itself.
        ctx = node.inputs[0].type.context
        if getattr(ctx, "is_magma_initialized", False):
            return
        with ctx:
            magma_init()
            ctx.is_magma_initialized = True
class GpuMagmaSVD(GpuMagmaBase):
    """Computes the svd of a matrix :math:`A` using magma library.

    .. warning::

        Because of implementation constraints, this Op returns outputs
        in order ``S, U, VT``. Use :func:`aesara.gpuarray.linalg.gpu_svd`
        to get them in expected order ``U, S, VT``.

    """

    __props__ = ("full_matrices", "compute_uv")
    _cop_num_inputs = 1
    _cop_num_outputs = 3
    check_input = False
    params_type = ParamsType(full_matrices=bool_t, context=gpu_context_type)

    def __init__(self, full_matrices=True, compute_uv=True):
        self.full_matrices = full_matrices
        self.compute_uv = compute_uv
        # Implementation lives in the external C file; only float32 inputs
        # are supported (checked in make_node).
        ExternalCOp.__init__(self, ["c_code/magma_svd.c"], "APPLY_SPECIFIC(magma_svd)")

    def make_node(self, A):
        ctx_name = infer_context_name(A)
        A = as_gpuarray_variable(A, ctx_name)
        A = gpu_contiguous(A)
        if A.ndim != 2:
            raise LinAlgError("Matrix rank error")
        if A.dtype != "float32":
            raise TypeError("only `float32` is supported for now")
        if self.compute_uv:
            return Apply(
                self,
                [A],
                # return S, U, VT
                [
                    GpuArrayType(
                        A.dtype, broadcastable=[False], context_name=ctx_name
                    )(),
                    A.type(),
                    A.type(),
                ],
            )
        else:
            return Apply(
                self,
                [A],
                # return only S
                [GpuArrayType(A.dtype, broadcastable=[False], context_name=ctx_name)()],
            )

    def prepare_node(self, node, storage_map, compute_map, impl):
        super().prepare_node(node, storage_map, compute_map, impl)
        # Check node to prevent eventual errors with old pickled nodes.
        if self.compute_uv:
            A, B, C = node.outputs
            # We expect order: S (vector), U (matrix), VT (matrix)
            assert A.type.ndim == 1 and B.type.ndim == C.type.ndim == 2, (
                "Due to implementation constraints, GpuMagmaSVD interface has changed and now returns (S, U, VT) "
                "instead of (U, S, VT). Either update your code, or use gpu_svd() to get the expected (U, S, VT) order."
            )

    def get_params(self, node):
        return self.params_type.get_params(self, context=node.inputs[0].type.context)

    def infer_shape(self, fgraph, node, shapes):
        # S always has length K = min(M, N); U and VT shapes depend on
        # whether the full or reduced factorization was requested.
        (x_shape,) = shapes
        M, N = x_shape
        K = tm.minimum(M, N)
        s_shape = (K,)
        if self.compute_uv:
            u_shape = (M, M) if self.full_matrices else (M, K)
            vt_shape = (N, N) if self.full_matrices else (K, N)
            return [s_shape, u_shape, vt_shape]
        else:
            return [s_shape]
def gpu_svd(a, full_matrices=1, compute_uv=1):
    """
    This function performs the SVD on GPU.

    The underlying :class:`GpuMagmaSVD` op emits its outputs in ``S, U, VT``
    order; this wrapper reorders them to the conventional ``U, S, VT``.

    Parameters
    ----------
    full_matrices : bool, optional
        If True (default), u and v have the shapes (M, M) and (N, N),
        respectively.
        Otherwise, the shapes are (M, K) and (K, N), respectively,
        where K = min(M, N).
    compute_uv : bool, optional
        Whether or not to compute u and v in addition to s.
        True by default.

    Returns
    -------
    U, V, D : matrices

    """
    result = GpuMagmaSVD(full_matrices, compute_uv)(a)
    if not compute_uv:
        return result
    S, U, VT = result
    return [U, S, VT]
class GpuMagmaMatrixInverse(GpuMagmaBase):
    """Computes the inverse of a matrix :math:`A` using magma library."""

    __props__ = ("inplace",)
    check_input = False
    params_type = ParamsType(inplace=bool_t, context=gpu_context_type)

    def __init__(self, inplace=False):
        # Implementation lives in the external C file; only float32 inputs
        # are supported (checked in make_node).
        ExternalCOp.__init__(self, ["c_code/magma_inv.c"], "APPLY_SPECIFIC(magma_inv)")
        self.inplace = inplace
        if self.inplace:
            # The inversion overwrites the input buffer in place.
            self.destroy_map = {0: [0]}

    def clone_inplace(self):
        # Hook for the inplace-optimization pass.
        return self.__class__(inplace=True)

    def make_node(self, A):
        ctx_name = infer_context_name(A)
        A = as_gpuarray_variable(A, ctx_name)
        A = gpu_contiguous(A)
        if A.ndim != 2:
            raise LinAlgError("Matrix rank error")
        if A.dtype != "float32":
            raise TypeError("only `float32` is supported for now")
        return Apply(self, [A], [A.type()])

    def get_params(self, node):
        return self.params_type.get_params(self, context=node.inputs[0].type.context)

    def infer_shape(self, fgraph, node, shapes):
        # The inverse has the same shape as the input.
        return shapes
def gpu_matrix_inverse(a):
    """Compute the inverse of matrix `a` on the GPU via the magma library.

    Returns
    -------
    a_inv: matrix
    """
    op = GpuMagmaMatrixInverse()
    return op(a)
class GpuMagmaCholesky(GpuMagmaBase, CGpuKernelBase):
    """Computes the cholesky decomposition of a matrix :math:`A` using magma
    library.
    """

    __props__ = ("lower", "inplace")
    check_input = False
    params_type = ParamsType(lower=bool_t, inplace=bool_t, context=gpu_context_type)

    def __init__(self, lower=True, inplace=False):
        self.lower = lower
        # Implementation lives in the external C file; only float32 inputs
        # are supported (checked in make_node).
        ExternalCOp.__init__(
            self, ["c_code/magma_cholesky.c"], "APPLY_SPECIFIC(magma_cholesky)"
        )
        self.inplace = inplace
        if self.inplace:
            # The factorization overwrites the input buffer in place.
            self.destroy_map = {0: [0]}

    def clone_inplace(self):
        # Hook for the inplace-optimization pass.
        return self.__class__(lower=self.lower, inplace=True)

    def make_node(self, A):
        ctx_name = infer_context_name(A)
        A = as_gpuarray_variable(A, ctx_name)
        A = gpu_contiguous(A)
        if A.ndim != 2:
            raise LinAlgError("Matrix rank error")
        if A.dtype != "float32":
            raise TypeError("only `float32` is supported for now")
        return Apply(self, [A], [A.type()])

    def get_params(self, node):
        return self.params_type.get_params(self, context=node.inputs[0].type.context)

    def infer_shape(self, fgraph, node, shapes):
        # The factor has the same shape as the input.
        return [shapes[0]]
class GpuMagmaQR(GpuMagmaBase, CGpuKernelBase):
    """Computes the qr decomposition of a matrix :math:`A` using magma
    library.

    Parameters
    ----------
    complete : If False, returns only ``R``.

    .. warning::

        Because of implementation constraints, this Op returns outputs
        in order ``R, Q``. Use :func:`aesara.gpuarray.linalg.gpu_qr`
        to get them in expected order ``Q, R``.

    """

    __props__ = ("complete",)
    _cop_num_inputs = 1
    _cop_num_outputs = 2
    check_input = False
    params_type = ParamsType(complete=bool_t, context=gpu_context_type)

    def __init__(self, complete=True):
        self.complete = complete
        # Implementation lives in the external C file; only float32 inputs
        # are supported (checked in make_node).
        ExternalCOp.__init__(self, ["c_code/magma_qr.c"], "APPLY_SPECIFIC(magma_qr)")

    def make_node(self, A):
        ctx_name = infer_context_name(A)
        A = as_gpuarray_variable(A, ctx_name)
        A = gpu_contiguous(A)
        if A.ndim != 2:
            raise LinAlgError("Matrix rank error")
        if A.dtype != "float32":
            raise TypeError("only `float32` is supported for now")
        if self.complete:
            return Apply(
                self,
                [A],
                # return R, Q
                [A.type(), A.type()],
            )
        else:
            return Apply(
                self,
                [A],
                # return R
                [A.type()],
            )

    def get_params(self, node):
        return self.params_type.get_params(self, context=node.inputs[0].type.context)
def gpu_qr(a, complete=True):
    """
    This function performs the QR on GPU.

    The underlying :class:`GpuMagmaQR` op emits its outputs in ``R, Q``
    order; this wrapper reorders them to the conventional ``Q, R``.

    Parameters
    ----------
    complete : bool, optional
        If `False`, returns only r.

    Returns
    -------
    Q, R : matrices

    """
    result = GpuMagmaQR(complete)(a)
    if not complete:
        return result
    R, Q = result
    return [Q, R]
class GpuMagmaEigh(GpuMagmaBase):
    """Computes the eigen decomposition of a symmetric matrix :math:`A` using magma
    library.

    Parameters
    ----------
    UPLO : Specifies whether the calculation is done with the lower triangular
           part of matrix (`L`, default) or the upper triangular part (`U`).
    compute_v : If `True`, computes eigenvalues and eigenvectors (`True`,
                default). If `False`, computes only eigenvalues of matrix.
    """

    __props__ = ("lower", "compute_v")
    _cop_num_inputs = 1
    _cop_num_outputs = 2
    check_input = False
    params_type = ParamsType(lower=bool_t, compute_v=bool_t, context=gpu_context_type)

    def __init__(self, UPLO="L", compute_v=True):
        assert UPLO in ("L", "U")
        # Stored as a boolean so it can be passed through params_type.
        self.lower = UPLO == "L"
        self.compute_v = compute_v
        # Implementation lives in the external C file; only float32 inputs
        # are supported (checked in make_node).
        ExternalCOp.__init__(
            self, ["c_code/magma_eigh.c"], "APPLY_SPECIFIC(magma_eigh)"
        )

    def make_node(self, A):
        ctx_name = infer_context_name(A)
        A = as_gpuarray_variable(A, ctx_name)
        A = gpu_contiguous(A)
        if A.ndim != 2:
            raise LinAlgError("Matrix rank error")
        if A.dtype != "float32":
            raise TypeError("only `float32` is supported for now")
        if self.compute_v:
            return Apply(
                self,
                [A],
                # return D, V
                [
                    GpuArrayType(
                        A.dtype, broadcastable=[False], context_name=ctx_name
                    )(),
                    A.type(),
                ],
            )
        else:
            return Apply(
                self,
                [A],
                # return D
                [GpuArrayType(A.dtype, broadcastable=[False], context_name=ctx_name)()],
            )

    def get_params(self, node):
        return self.params_type.get_params(self, context=node.inputs[0].type.context)
# TODO test dtype != float32
import warnings
try:
import pygpu
except ImportError:
pass
import aesara
import aesara.sandbox.multinomial
from aesara.gpuarray.basic_ops import (
GpuKernelBaseCOp,
Kernel,
as_gpuarray_variable,
gpuarray_helper_inc_dir,
infer_context_name,
)
from aesara.gpuarray.elemwise import GpuDimShuffle
from aesara.gpuarray.fp16_help import load_w, work_dtype, write_w
from aesara.gpuarray.opt import op_lifter, register_opt, register_opt2
from aesara.gpuarray.type import GpuArrayType
from aesara.graph.basic import Apply
from aesara.graph.op import _NoPythonOp
from aesara.scalar import as_scalar
from aesara.tensor.basic import get_scalar_constant_value
from aesara.tensor.exceptions import NotScalarConstantError
class GPUAMultinomialFromUniform(GpuKernelBaseCOp, _NoPythonOp):
    """Draw one multinomial (one-hot) sample per row of `pvals` on the GPU.

    Each row of `pvals` is one categorical distribution; `unis` supplies one
    uniform sample per row.  The result is written transposed
    (``pvals.shape[::-1]``) for write speed — see the kernel comment below.
    """

    __props__ = ("odtype",)
    _f16_ok = True

    def __init__(self, odtype):
        # NOTE(review): `self` is passed as an argument to the parent
        # initializer here, which looks unusual — confirm the base class
        # accepts it before changing.
        super().__init__(self)
        self.odtype = odtype

    def get_params(self, node):
        # The kernel runs in the GPU context of the output.
        return node.outputs[0].type.context

    def c_headers(self, **kwargs):
        return ["<numpy_compat.h>", "gpuarray_helper.h"]

    def c_header_dirs(self, **kwargs):
        return [gpuarray_helper_inc_dir()]

    def make_node(self, pvals, unis):
        ctx_name = infer_context_name(pvals, unis)
        pvals = as_gpuarray_variable(pvals, ctx_name)
        unis = as_gpuarray_variable(unis, ctx_name)
        valid_dtypes = ("float32", "float16", "float64")
        assert pvals.dtype in valid_dtypes
        assert unis.dtype in valid_dtypes
        if pvals.ndim != 2:
            raise NotImplementedError("pvals ndim should be 2", pvals.ndim)
        if unis.ndim != 1:
            raise NotImplementedError("unis ndim should be 1", unis.ndim)
        if self.odtype == "auto":
            odtype = pvals.dtype
        else:
            odtype = self.odtype
        # The output is the transpose of pvals' shape, hence the flipped
        # broadcastable pattern.
        br = (pvals.broadcastable[1], pvals.broadcastable[0])
        out = GpuArrayType(broadcastable=br, dtype=odtype, context_name=ctx_name)()
        return Apply(self, [pvals, unis], [out])

    def gpu_kernels(self, node, name):
        # Resolve the C type names and fp16 load/store wrappers for the
        # actual input/output dtypes of this node.
        out_ctype = pygpu.gpuarray.dtype_to_ctype(node.outputs[0].dtype)
        pvals_ctype = pygpu.gpuarray.dtype_to_ctype(node.inputs[0].dtype)
        unis_ctype = pygpu.gpuarray.dtype_to_ctype(node.inputs[1].dtype)
        work_ctype = pygpu.gpuarray.dtype_to_ctype(work_dtype(node.inputs[0].dtype))
        write_out_ctype = write_w(node.outputs[0].dtype)
        load_in_ctype = load_w(node.inputs[0].dtype)
        code = """#include "cluda.h"

KERNEL void k_multi_warp_multinomial(
    const ga_size nb_multi,
    const ga_size nb_outcomes,
    GLOBAL_MEM %(pvals_ctype)s *global_pvals,
    const ga_size global_pvals_offset,
    const ga_ssize pvals_row_stride,
    const ga_ssize pvals_col_stride,
    GLOBAL_MEM %(unis_ctype)s *global_unis,
    const ga_size global_unis_offset,
    const ga_ssize unis_stride,
    GLOBAL_MEM %(out_ctype)s *global_outs,
    const ga_size global_outs_offset,
    const ga_ssize outs_row_stride,
    const ga_ssize outs_col_stride
)
{
    global_pvals = (GLOBAL_MEM %(pvals_ctype)s *)(((GLOBAL_MEM char *)global_pvals) + global_pvals_offset);
    global_unis = (GLOBAL_MEM %(unis_ctype)s *)(((GLOBAL_MEM char *)global_unis) + global_unis_offset);
    global_outs = (GLOBAL_MEM %(out_ctype)s *)(((GLOBAL_MEM char *)global_outs) + global_outs_offset);
    // each thread takes care of one multinomial draw
    int n = LDIM_0*GID_0 + LID_0;
    if (n < nb_multi)
    {
        %(work_ctype)s cummul = 0.;
        bool done = false;
        const %(work_ctype)s unis_n = %(load_in_ctype)s(global_unis[n*unis_stride]);
        for (ga_size m = 0; m < nb_outcomes; ++m)
        {
            %(work_ctype)s current_out = 0;
            if (!done)
            {
                cummul += %(load_in_ctype)s(global_pvals[m * pvals_col_stride + n * pvals_row_stride]);
                if (unis_n < cummul)
                {
                    current_out = 1;
                    done = true;
                }
            }
            //write out transposed for speed.
            global_outs[n * outs_col_stride +
                        m * outs_row_stride] = %(write_out_ctype)s(current_out);
        }
    }
}
""" % dict(
            out_ctype=out_ctype,
            write_out_ctype=write_out_ctype,
            work_ctype=work_ctype,
            pvals_ctype=pvals_ctype,
            unis_ctype=unis_ctype,
            load_in_ctype=load_in_ctype,
        )
        return [
            Kernel(
                code=code,
                name="k_multi_warp_multinomial",
                params=[
                    pygpu.gpuarray.SIZE,
                    pygpu.gpuarray.SIZE,
                    pygpu.gpuarray.GpuArray,
                    pygpu.gpuarray.SIZE,
                    pygpu.gpuarray.SSIZE,
                    pygpu.gpuarray.SSIZE,
                    pygpu.gpuarray.GpuArray,
                    pygpu.gpuarray.SIZE,
                    pygpu.gpuarray.SSIZE,
                    pygpu.gpuarray.GpuArray,
                    pygpu.gpuarray.SIZE,
                    pygpu.gpuarray.SSIZE,
                    pygpu.gpuarray.SSIZE,
                ],
                flags=Kernel.get_flags(node.outputs[0].dtype),
                objvar="k_multi_warp_multinomial_" + name,
            )
        ]

    def c_code(self, node, name, inp, outputs, sub):
        pvals, unis = inp
        (out,) = outputs
        fail = sub["fail"]
        ctx = sub["params"]
        kname = self.gpu_kernels(node, name)[0].objvar
        out_typecode = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
        pvals_typecode = pygpu.gpuarray.dtype_to_typecode(node.inputs[0].dtype)
        unis_typecode = pygpu.gpuarray.dtype_to_typecode(node.inputs[1].dtype)
        # NOTE(review): inside the C below, "2<<15 - 1" parses as
        # 2 << 14 == 32768 because of C operator precedence; (2<<15)-1 == 65535
        # may have been intended.  Behavior is conservative either way —
        # confirm before changing, as it alters the launch configuration.
        s = (
            """
        PyGpuArrayObject * pvals = %(pvals)s;
        PyGpuArrayObject * unis = %(unis)s;
        PyGpuArrayObject * out = %(out)s;
        size_t dims[2];
        if (PyGpuArray_NDIM(pvals) != 2)
        {
            PyErr_Format(PyExc_TypeError, "pvals wrong rank");
            %(fail)s
        }
        if (PyGpuArray_NDIM(unis) != 1)
        {
            PyErr_Format(PyExc_TypeError, "unis wrong rank");
            %(fail)s
        }
        if (PyGpuArray_DIMS(unis)[0] != PyGpuArray_DIMS(pvals)[0])
        {
            PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[0]");
            %(fail)s
        }
        dims[0] = PyGpuArray_DIMS(pvals)[1];
        dims[1] = PyGpuArray_DIMS(pvals)[0];
        if (aesara_prep_output(&out, 2, dims, %(out_typecode)s,
                               GA_C_ORDER, %(ctx)s) != 0){
            %(fail)s
        }
        %(out)s = out;
        GpuArray_memset(&(out->ga), 0);
        { // NESTED SCOPE
        int nb_multi = PyGpuArray_DIMS(pvals)[0];
        int nb_outcomes = PyGpuArray_DIMS(pvals)[1];
        //TODO : change this for a beautiful constant
        int max_nb_blocks = 2<<15 - 1;
        size_t nb_blocks = max_nb_blocks + 1;
        size_t nb_threads=16; // so it really starts at 32, because of the *2
        do
        {
            nb_threads*=2;
            if (nb_multi %% nb_threads == 0)
                nb_blocks = nb_multi/nb_threads;
            else
                nb_blocks = (int)((float)nb_multi/(float)nb_threads + 1.);
        } while (nb_blocks > max_nb_blocks);

        //printf("\\nN=%%i b=%%i t=%%i t*b=%%i",
        //         nb_multi, nb_blocks, nb_threads, nb_blocks*nb_threads);

        // TODO : next line is a bit hardcoded...
        if (nb_threads > 512)
        {
            PyErr_Format(
                PyExc_ValueError,
                "Multinomial is not implemented for so many rows in the matrix (%%i)",
                nb_multi);
            %(fail)s
        }
        assert(nb_blocks*nb_threads >= nb_multi);

        int err = k_multi_warp_multinomial_call(
            1, &nb_blocks, &nb_threads, 0,
            PyGpuArray_DIMS(out)[1], PyGpuArray_DIMS(out)[0], pvals->ga.data, pvals->ga.offset,
            PyGpuArray_STRIDES(pvals)[0]/gpuarray_get_elsize(%(pvals_typecode)s),
            PyGpuArray_STRIDES(pvals)[1]/gpuarray_get_elsize(%(pvals_typecode)s),
            unis->ga.data, unis->ga.offset,
            PyGpuArray_STRIDES(unis)[0]/gpuarray_get_elsize(%(unis_typecode)s), out->ga.data,
            out->ga.offset, PyGpuArray_STRIDES(out)[0]/gpuarray_get_elsize(%(out_typecode)s),
            PyGpuArray_STRIDES(out)[1]/gpuarray_get_elsize(%(out_typecode)s));
        if (err != GA_NO_ERROR) {
            PyErr_Format(
                PyExc_RuntimeError,
                "gpuarray error: %%s: %%s.\\n",
                "k_multi_warp_%(name)s",
                GpuKernel_error(&%(kname)s, err));
            %(fail)s;
        }
        } // END NESTED SCOPE
        """
            % locals()
        )
        return s

    def c_code_cache_version(self):
        return (7,)
class GPUAChoiceFromUniform(GpuKernelBaseCOp, _NoPythonOp):
    """
    The output is transposed compared to MultinomialWOReplacementFromUniform.
    We must insert a Transpose op after it.
    The optimization that moves it to the gpu does it.
    """

    # odtype: requested output dtype ("auto" resolves to int64 in make_node);
    # replace: whether sampling is done with replacement.
    __props__ = ("odtype", "replace")

    def __init__(self, odtype, replace=False):
        # NOTE(review): `self` is passed as a positional argument to the
        # parent __init__ here — looks suspicious; confirm against
        # GpuKernelBaseCOp's constructor signature.
        super().__init__(self)
        self.odtype = odtype
        self.replace = replace

    def __setstate__(self, state):
        # Backward-compatible unpickling: pickles made before the `replace`
        # attribute existed default to replace=False.
        self.__dict__.update(state)
        if "replace" not in state:
            self.replace = False

    def get_params(self, node):
        # The C code receives the GPU context of the output array.
        return node.outputs[0].type.context

    def c_headers(self, **kwargs):
        return ["<numpy_compat.h>", "gpuarray_helper.h"]

    def c_header_dirs(self, **kwargs):
        return [gpuarray_helper_inc_dir()]

    def make_node(self, pvals, unis, n):
        """Build the Apply node.

        Parameters
        ----------
        pvals
            2-D float32 matrix of per-row outcome probabilities.
        unis
            1-D float32 vector of uniform draws (length checked at runtime
            in c_code to be pvals.shape[0] * n).
        n
            Scalar number of samples to draw per row.
        """
        assert pvals.dtype == "float32"
        assert unis.dtype == "float32"
        ctx_name = infer_context_name(pvals, unis)
        pvals = as_gpuarray_variable(pvals, ctx_name)
        unis = as_gpuarray_variable(unis, ctx_name)
        if pvals.ndim != 2:
            raise NotImplementedError("pvals ndim should be 2", pvals.ndim)
        if unis.ndim != 1:
            raise NotImplementedError("unis ndim should be 1", unis.ndim)
        if self.odtype == "auto":
            odtype = "int64"
        else:
            odtype = self.odtype
        # Only int64 output is implemented (the kernel writes ga_long).
        assert odtype == "int64", odtype
        # Output broadcast pattern is transposed relative to pvals
        # (see class docstring: the kernel writes out transposed).
        br = (pvals.broadcastable[1], pvals.broadcastable[0])
        out = GpuArrayType(broadcastable=br, dtype=odtype, context_name=ctx_name)()
        return Apply(self, [pvals, unis, as_scalar(n)], [out])

    def gpu_kernels(self, node, name):
        # `replace` is baked into the kernel source as a 0/1 constant; when 0,
        # each drawn outcome's probability is zeroed in the row copy and the
        # running pvals_sum is reduced (sampling without replacement).
        replace = int(self.replace)
        code = """#include "cluda.h"

KERNEL void k_multi_warp_multinomial_wor(
    const ga_size nb_multi,
    const ga_size nb_outcomes,
    const ga_size n_samples,
    GLOBAL_MEM float * global_pvals_copy,
    const ga_size global_pvals_offset,
    const ga_ssize pvals_row_stride,
    const ga_ssize pvals_col_stride,
    GLOBAL_MEM float * global_unis,
    const ga_size global_unis_offset,
    const ga_ssize unis_stride,
    GLOBAL_MEM ga_long * global_outs,
    const ga_size global_outs_offset,
    const ga_ssize outs_row_stride,
    const ga_ssize outs_col_stride
)
{
    global_pvals_copy = (GLOBAL_MEM float *)(((GLOBAL_MEM char *)global_pvals_copy) + global_pvals_offset);
    global_unis = (GLOBAL_MEM float *)(((GLOBAL_MEM char *)global_unis) + global_unis_offset);
    global_outs = (GLOBAL_MEM ga_long *)(((GLOBAL_MEM char *)global_outs) + global_outs_offset);

    // each thread takes care of one multinomial-wor n_samples-draw
    int n = LDIM_0*GID_0 + LID_0;
    if (n < nb_multi)
    {
        // Sum of the remaining p_vals in global_pvals_copy[n]
        float pvals_sum = 1.;
        for (int c = 0; c < n_samples; ++c)
        {
            float cummul = 0.;
            const float unis_n = global_unis[(c * nb_multi + n)*unis_stride] * pvals_sum;
            for (ga_size m = 0; m < nb_outcomes; ++m)
            {
                float pvals_nm = global_pvals_copy[m * pvals_col_stride + n * pvals_row_stride];
                cummul += pvals_nm;
                if (unis_n < cummul)
                {
                    // write out transposed for speed.
                    global_outs[n * outs_col_stride +
                                c * outs_row_stride] = m;
                    if (! %(replace)s )
                    {
                        global_pvals_copy[m * pvals_col_stride + n * pvals_row_stride] = 0.0;
                        pvals_sum -= pvals_nm;
                    }
                    break;
                }
            }
        }
    }
}
""" % {
            "replace": replace
        }
        return [
            Kernel(
                code=code,
                name="k_multi_warp_multinomial_wor",
                params=[
                    pygpu.gpuarray.SIZE,
                    pygpu.gpuarray.SIZE,
                    pygpu.gpuarray.SIZE,
                    pygpu.gpuarray.GpuArray,
                    pygpu.gpuarray.SIZE,
                    pygpu.gpuarray.SSIZE,
                    pygpu.gpuarray.SSIZE,
                    pygpu.gpuarray.GpuArray,
                    pygpu.gpuarray.SIZE,
                    pygpu.gpuarray.SSIZE,
                    pygpu.gpuarray.GpuArray,
                    pygpu.gpuarray.SIZE,
                    pygpu.gpuarray.SSIZE,
                    pygpu.gpuarray.SSIZE,
                ],
                flags=Kernel.get_flags(node.outputs[0].dtype),
                objvar="k_multi_warp_multinomial_wor_" + name,
            )
        ]

    def c_code(self, node, name, inp, outputs, sub):
        # Validates shapes, copies pvals when sampling without replacement
        # (the kernel mutates its pvals argument), allocates the transposed
        # (n_samples, nb_rows) int64 output, then launches one thread per row.
        # NOTE(review): `2<<15 - 1` below parses as `2 << 14` in C because
        # `-` binds tighter than `<<`; presumably `(2<<15) - 1` was intended
        # — confirm before reuse (the existing TODO hints at this).
        pvals, unis, n = inp
        (out,) = outputs
        replace = int(self.replace)
        fail = sub["fail"]
        ctx = sub["params"]
        kname = self.gpu_kernels(node, name)[0].objvar
        s = (
            """
        PyGpuArrayObject * pvals = %(pvals)s;
        PyGpuArrayObject * unis = %(unis)s;
        const size_t n_samples = %(n)s;
        PyGpuArrayObject * out = %(out)s;

        // create a copy of pvals matrix
        PyGpuArrayObject * pvals_copy = NULL;
        size_t dims[2];

        if (PyGpuArray_NDIM(pvals) != 2)
        {
            PyErr_Format(PyExc_TypeError, "pvals wrong rank");
            %(fail)s
        }
        if (PyGpuArray_NDIM(unis) != 1)
        {
            PyErr_Format(PyExc_TypeError, "unis wrong rank");
            %(fail)s
        }
        if ( n_samples > (PyGpuArray_DIMS(pvals)[1]) )
        {
            PyErr_Format(PyExc_ValueError, "Cannot sample without replacement n samples bigger than the size of the distribution.");
            %(fail)s;
        }
        if (PyGpuArray_DIMS(unis)[0] != PyGpuArray_DIMS(pvals)[0] * n_samples)
        {
            PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[0] * n");
            %(fail)s
        }
        if (! %(replace)s) {
            pvals_copy = pygpu_copy(pvals, GA_C_ORDER);
        } else {
            pvals_copy = pvals;
            Py_INCREF(pvals_copy);
        }
        dims[0] = n_samples;
        dims[1] = PyGpuArray_DIMS(pvals)[0];
        if (aesara_prep_output(&out, 2, dims, GA_LONG,
                               GA_C_ORDER, %(ctx)s) != 0){
            Py_DECREF(pvals_copy);
            %(fail)s
        }
        %(out)s = out;
        { // NESTED SCOPE
        int nb_multi = PyGpuArray_DIMS(pvals)[0];
        int nb_outcomes = PyGpuArray_DIMS(pvals)[1];
        //TODO : change this for a beautiful constant
        int max_nb_blocks = 2<<15 - 1;
        size_t nb_blocks = max_nb_blocks + 1;
        size_t nb_threads=16; // so it really starts at 32, because of the *2
        do
        {
            nb_threads*=2;
            if (nb_multi %% nb_threads == 0)
                nb_blocks = nb_multi/nb_threads;
            else
                nb_blocks = (int)((float)nb_multi/(float)nb_threads + 1.);
        } while (nb_blocks > max_nb_blocks);

        // TODO : next line is a bit hardcoded...
        if (nb_threads > 512)
        {
            PyErr_Format(
                PyExc_ValueError,
                "Multinomial is not implemented for so many rows in the matrix (%%i)",
                nb_multi);
            Py_DECREF(pvals_copy);
            %(fail)s
        }
        assert(nb_blocks*nb_threads >= nb_multi);

        int err = k_multi_warp_multinomial_wor_call(1, &nb_blocks, &nb_threads, 0, PyGpuArray_DIMS(pvals)[0], PyGpuArray_DIMS(pvals)[1], n_samples, pvals_copy->ga.data, pvals_copy->ga.offset, PyGpuArray_STRIDES(pvals)[0]/sizeof(float), PyGpuArray_STRIDES(pvals)[1]/sizeof(float), unis->ga.data, unis->ga.offset, PyGpuArray_STRIDES(unis)[0]/sizeof(float), out->ga.data, out->ga.offset, PyGpuArray_STRIDES(out)[0]/8, PyGpuArray_STRIDES(out)[1]/8);
        if (err != GA_NO_ERROR) {
            PyErr_Format(
                PyExc_RuntimeError,
                "gpuarray error: %%s: %%s.\\n",
                "k_multi_warp_%(name)s",
                GpuKernel_error(&%(kname)s, err));
            Py_DECREF(pvals_copy);
            %(fail)s;
        }

        Py_DECREF(pvals_copy);
        } // END NESTED SCOPE
        """
            % locals()
        )
        return s

    def c_code_cache_version(self):
        # Bump to invalidate cached compiled modules when the C above changes.
        return (10,)
@register_opt("fast_compile")
@op_lifter([aesara.sandbox.multinomial.MultinomialFromUniform])
@register_opt2([aesara.sandbox.multinomial.MultinomialFromUniform], "fast_compile")
def local_gpua_multinomial(op, context_name, inputs, outputs):
# TODO : need description for function
if len(inputs) == 2:
p, u = inputs
n_samples = 1
else:
p, u, n_samples = inputs
try:
if get_scalar_constant_value(n_samples) != 1:
return None
except NotScalarConstantError:
return None
(m,) = outputs
gpu_op = GPUAMultinomialFromUniform(op.odtype)
return GpuDimShuffle([False, False], [1, 0])(gpu_op(p, u))
@register_opt("fast_compile")
@op_lifter([aesara.sandbox.multinomial.ChoiceFromUniform])
@register_opt2([aesara.sandbox.multinomial.ChoiceFromUniform], "fast_compile")
def local_gpua_multinomial_wor(op, context_name, inputs, outputs):
# TODO : need description for function
p, u, n = inputs
(m,) = outputs
if (p.dtype == u.dtype == "float32") and (m.dtype == "int64"):
gpu_op = GPUAChoiceFromUniform(**op._props_dict())
return GpuDimShuffle([False, False], [1, 0])(gpu_op(p, u, n))
class GPUAMultinomialWOReplacementFromUniform(GPUAChoiceFromUniform):
    # Deprecated alias kept for backward compatibility: behaves exactly like
    # GPUAChoiceFromUniform but emits a DeprecationWarning at construction
    # (stacklevel=2 points the warning at the caller's line).
    def __init__(self, *args, **kwargs):
        warnings.warn(
            "GPUAMultinomialWOReplacementFromUniform is deprecated, "
            "use GPUAChoiceFromUniform instead.",
            DeprecationWarning,
            stacklevel=2,
        )
        super().__init__(*args, **kwargs)
import aesara.tensor as at
from aesara.graph.basic import Apply
from aesara.graph.op import _NoPythonOp
from aesara.link.c.params_type import ParamsType
from aesara.tensor.nnet.neighbours import Images2Neibs
from aesara.tensor.type import integer_dtypes
try:
from pygpu import gpuarray
except ImportError:
pass
from aesara.gpuarray.basic_ops import (
GpuKernelBaseCOp,
Kernel,
as_gpuarray_variable,
infer_context_name,
)
from aesara.gpuarray.type import GpuArrayType, gpu_context_type
class GpuImages2Neibs(GpuKernelBaseCOp, Images2Neibs, _NoPythonOp):
    """
    Images2Neibs for the GPU.
    """

    # Runtime parameters passed to the C code: the border-mode enum value
    # and the GPU context the kernels are launched in.
    params_type = ParamsType(mode=Images2Neibs.BORDER_MODE, context=gpu_context_type)

    def get_params(self, node):
        return self.params_type.get_params(self, context=node.inputs[0].type.context)

    def make_node(self, ten4, neib_shape, neib_step=None):
        """Build the Apply node.

        Parameters
        ----------
        ten4
            4-D input tensor (moved to the GPU).
        neib_shape
            Length-2 integer vector: patch height and width.
        neib_step
            Length-2 integer vector: step between patches; defaults to
            ``neib_shape`` (non-overlapping patches).
        """
        ten4 = as_gpuarray_variable(ten4, infer_context_name(ten4))
        neib_shape = at.as_tensor_variable(neib_shape)
        if neib_step is None:
            neib_step = neib_shape
        else:
            neib_step = at.as_tensor_variable(neib_step)

        assert ten4.ndim == 4
        assert neib_shape.ndim == 1
        assert neib_step.ndim == 1
        assert neib_shape.dtype in integer_dtypes
        assert neib_step.dtype in integer_dtypes

        # Output is a 2-D matrix: one row per extracted patch.
        return Apply(
            self,
            [ten4, neib_shape, neib_step],
            [
                GpuArrayType(
                    broadcastable=(False, False),
                    dtype=ten4.type.dtype,
                    context_name=ten4.type.context_name,
                )()
            ],
        )

    def c_code_cache_version(self):
        return (14,)

    def c_headers(self, **kwargs):
        return ["<numpy_compat.h>", "<gpuarray/types.h>"]

    def gpu_kernels(self, node, nodename):
        # Two kernel variants are generated:
        #   - k_multi_warp_less: one thread per output element within a patch
        #     (usable only when the thread block exactly covers the patch);
        #   - k_multi_warp: general version that loops over the patch.
        # c_code picks between them at launch time.
        dtype_ten4 = node.inputs[0].dtype
        dtype_z = node.outputs[0].dtype
        flags = Kernel.get_flags(dtype_ten4, dtype_z)
        type_ten4 = gpuarray.dtype_to_ctype(dtype_ten4)
        type_z = gpuarray.dtype_to_ctype(dtype_z)
        # `BORDER_MODE`'s c_support_code() contains C constants definitions that are useful here.
        mode_constants = self.BORDER_MODE.c_support_code()
        kernels = []
        kname = "k_multi_warp_less"
        k_var = "k_multi_warp_less_" + nodename
        code = """#include "cluda.h"

        // a version that uses less registers but doesn't work in all cases.
        %(mode_constants)s
        KERNEL void %(kname)s(
            const ga_int mode,
            const ga_int nb_batch,
            const ga_int nb_stack,
            const ga_int height,
            const ga_int width,
            const ga_int c,
            const ga_int d,
            const ga_int step_x,
            const ga_int step_y,
            const ga_int grid_c,
            const ga_int grid_d,
            const ga_size stride0, const ga_size stride1,
            const ga_size stride2, const ga_size stride3,
            GLOBAL_MEM const %(type_ten4)s * global_ten4, const ga_size offset_ten4,
            const ga_size out_s0, const ga_size out_s1,
            GLOBAL_MEM %(type_z)s * global_out, const ga_size offset_out
        )
        {
            const ga_int wrap_centered_half_idx_shift_x = c/2;
            const ga_int wrap_centered_half_idx_shift_y = d/2;
            global_ten4 = (GLOBAL_MEM const %(type_ten4)s *)(((GLOBAL_MEM char *)global_ten4)+offset_ten4);
            global_out = (GLOBAL_MEM %(type_z)s *)(((GLOBAL_MEM char *)global_out)+offset_out);

            for(ga_int tblock = GID_0*LDIM_2+LID_2;
                tblock<nb_batch*nb_stack*grid_c*grid_d;
                tblock+=GDIM_0*LDIM_2){
                const ga_int b = tblock%%grid_d;
                ga_int left = tblock/grid_d;
                const ga_int a = left%%grid_c;
                left = left/grid_c;
                const ga_int s = left%%nb_stack;
                left = left/nb_stack;
                const ga_int n = left;

                if(n>nb_batch)continue;
                if(s>nb_stack)continue;
                if(a>grid_c)continue;
                if(b>grid_d)continue;
                ga_int z_row = b + grid_d*(a + grid_c*
                                           (s + nb_stack*n));
                ga_int i = LID_1;     // loop over c
                {
                    ga_int ten4_2 = i + a * step_x;
                    if(mode == MODE_WRAP_CENTERED) {
                        ten4_2 -= wrap_centered_half_idx_shift_x;
                        if ( ten4_2 < 0 )
                            ten4_2 += height;
                        else if (ten4_2 >= height)
                            ten4_2 -= height;
                    } else if (mode == MODE_HALF) {
                        ten4_2 -= wrap_centered_half_idx_shift_x;
                    } else if (mode == MODE_FULL) {
                        ten4_2 -= c - 1;
                    }
                    ga_int j = LID_0;  // loop over d
                    {
                        ga_int ten4_3 = j + b * step_y;
                        if(mode == MODE_WRAP_CENTERED){
                            ten4_3 -= wrap_centered_half_idx_shift_y;
                            if ( ten4_3 < 0 )
                                ten4_3 += width;
                            else if (ten4_3 >= width)
                                ten4_3 -= width;
                        } else if (mode == MODE_HALF) {
                            ten4_3 -= wrap_centered_half_idx_shift_y;
                        } else if (mode == MODE_FULL) {
                            ten4_3 -= d - 1;
                        }

                        ga_int z_col = j + d * i;
                        ga_int z_idx = z_col * out_s1 +
                                       z_row * out_s0;
                        if(ten4_2 < 0 || ten4_2 >= height || ten4_3 < 0 || ten4_3 >= width){
                            global_out[z_idx] = 0;
                        } else {
                            ga_int ten4_idx = stride3*ten4_3 +
                                              stride2*ten4_2 +
                                              stride1*s + stride0*n;
                            global_out[z_idx] = global_ten4[ten4_idx];
                        }
                    }
                }
            }
        }""" % dict(
            kname=kname,
            type_ten4=type_ten4,
            type_z=type_z,
            mode_constants=mode_constants,
        )
        params = [
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "uintp",
            "uintp",
            "uintp",
            "uintp",
            gpuarray.GpuArray,
            "uintp",
            "uintp",
            "uintp",
            gpuarray.GpuArray,
            "uintp",
        ]
        kernels.append(
            Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)
        )
        kname = "k_multi_warp"
        k_var = "k_multi_warp_" + nodename
        code = """#include "cluda.h"

        %(mode_constants)s
        KERNEL void %(kname)s(
            const ga_int mode,
            const ga_int nb_batch,
            const ga_int nb_stack,
            const ga_int height,
            const ga_int width,
            const ga_int c,
            const ga_int d,
            const ga_int step_x,
            const ga_int step_y,
            const ga_int grid_c,
            const ga_int grid_d,
            const ga_size stride0, const ga_size stride1,
            const ga_size stride2, const ga_size stride3,
            GLOBAL_MEM const %(type_ten4)s * global_ten4, const ga_size offset_ten4,
            const ga_size out_s0, const ga_size out_s1,
            GLOBAL_MEM %(type_z)s * global_out, const ga_size offset_out
        )
        {
            const ga_int wrap_centered_half_idx_shift_x = c/2;
            const ga_int wrap_centered_half_idx_shift_y = d/2;
            global_ten4 = (GLOBAL_MEM const %(type_ten4)s *)(((GLOBAL_MEM char *)global_ten4)+offset_ten4);
            global_out = (GLOBAL_MEM %(type_z)s *)(((GLOBAL_MEM char *)global_out)+offset_out);

            for(ga_int tblock = GID_0*LDIM_2+LID_2;
                tblock<nb_batch*nb_stack*grid_c*grid_d;
                tblock+=GDIM_0*LDIM_2){
                const ga_int b = tblock%%grid_d;
                ga_int left = tblock/grid_d;
                const ga_int a = left%%grid_c;
                left = left/grid_c;
                const ga_int s = left%%nb_stack;
                left = left/nb_stack;
                const ga_int n = left;

                if(n>nb_batch)continue;
                if(s>nb_stack)continue;
                if(a>grid_c)continue;
                if(b>grid_d)continue;
                ga_int z_row = b + grid_d*(a + grid_c*
                                           (s + nb_stack*n));
                // loop over c
                for (ga_int i = LID_1; i < c; i+=LDIM_1)
                {
                    ga_int ten4_2 = i + a * step_x;
                    if(mode == MODE_WRAP_CENTERED) {
                        ten4_2 -= wrap_centered_half_idx_shift_x;
                        if ( ten4_2 < 0 )
                            ten4_2 += height;
                        else if (ten4_2 >= height)
                            ten4_2 -= height;
                    } else if (mode == MODE_HALF) {
                        ten4_2 -= wrap_centered_half_idx_shift_x;
                    } else if (mode == MODE_FULL) {
                        ten4_2 -= c - 1;
                    }
                    // loop over d
                    for (ga_int j = LID_0; j < d; j+=LDIM_0)
                    {
                        ga_int ten4_3 = j + b * step_y;
                        if(mode == MODE_WRAP_CENTERED) {
                            ten4_3 -= wrap_centered_half_idx_shift_y;
                            if ( ten4_3 < 0 )
                                ten4_3 += width;
                            else if (ten4_3 >= width)
                                ten4_3 -= width;
                        } else if (mode == MODE_HALF) {
                            ten4_3 -= wrap_centered_half_idx_shift_y;
                        } else if (mode == MODE_FULL) {
                            ten4_3 -= d - 1;
                        }

                        ga_int z_col = j + d * i;
                        ga_int z_idx = z_col * out_s1 +
                                       z_row * out_s0;
                        if(ten4_2 < 0 || ten4_2 >= height || ten4_3 < 0 || ten4_3 >= width){
                            global_out[z_idx] = 0;
                        } else {
                            ga_int ten4_idx = stride3*ten4_3 +
                                              stride2*ten4_2 +
                                              stride1*s + stride0*n;
                            global_out[z_idx] = global_ten4[ten4_idx];
                        }
                    }
                }
            }
        }
        """ % dict(
            kname=kname,
            type_ten4=type_ten4,
            type_z=type_z,
            mode_constants=mode_constants,
        )
        params = [
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "uintp",
            "uintp",
            "uintp",
            "uintp",
            gpuarray.GpuArray,
            "uintp",
            "uintp",
            "uintp",
            gpuarray.GpuArray,
            "uintp",
        ]
        kernels.append(
            Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)
        )
        return kernels

    def c_support_code(self, **kwargs):
        # Integer ceiling division helper used when computing the patch grid.
        return """
        template <typename T>
        static T ceil_intdiv(T a, T b)
        {
            return (a/b) + ((a % b) ? 1: 0);
        }
        """

    def c_code(self, node, name, inp, out, sub):
        # Generated C: validate inputs, compute the patch grid (grid_c,
        # grid_d) per border mode, allocate the 2-D output, choose between
        # the two kernel variants, and launch.
        err_check = """
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "gpuarray error: *fptr: %%s.",
                             GpuKernel_error(fptr, err));
                %(fail)s;
            }
        """ % dict(
            fail=sub["fail"]
        )
        # NB: To reduce C code variability:
        # For itemsize_ten4, I use GpuArray_ITEMSIZE(&ten4->ga) instead of np.dtype(node.inputs[0].dtype).itemsize
        # For itemsize_z, I use itemsize_ten4, as ten4 and z have same type properties (deduced from make_node)
        # For typecode_z, I use ten4->ga.typecode (for same reason as above)
        return """
        int grid_c = -1;
        int grid_d = -1;
        size_t itemsize_ten4 = GpuArray_ITEMSIZE(&%(ten4)s->ga);
        size_t itemsize_z = itemsize_ten4;
        int typecode_z = %(ten4)s->ga.typecode;

        {
            if (PyGpuArray_NDIM(%(ten4)s) != 4)
            {
                PyErr_Format(PyExc_TypeError,
                             "GpuImages2Neibs: pvals wrong rank");
                %(fail)s;
            }
            if (PyArray_NDIM(%(neib_shape)s) != 1)
            {
                PyErr_Format(PyExc_TypeError,
                             "GpuImages2Neibs: unis wrong rank");
                %(fail)s;
            }

            if (PyArray_DIMS(%(neib_shape)s)[0] != 2)
            {
                PyErr_Format(PyExc_ValueError,
                             "GpuImages2Neibs: neib_shape has to contain two"
                             " elements");
                %(fail)s;
            }

            const int c = *(npy_%(dtype_neib_shape)s*) PyArray_GETPTR1(
                                                     %(neib_shape)s, 0);
            const int d = *(npy_%(dtype_neib_shape)s*) PyArray_GETPTR1(
                                                     %(neib_shape)s, 1);
            const npy_intp step_x = (npy_intp) *(npy_%(dtype_neib_step)s*)
                                         PyArray_GETPTR1(%(neib_step)s, 0);
            const npy_intp step_y = (npy_intp) *(npy_%(dtype_neib_step)s*)
                                         PyArray_GETPTR1(%(neib_step)s, 1);

            if (step_x <=0 || step_y <=0)
            {
                PyErr_Format(PyExc_ValueError,
                             "neib_step wrong step ; values <= 0. Got %%lld %%lld.",
                             (long long) step_x, (long long) step_y);
                %(fail)s;
            }

            if (c <=0 || d <=0)
            {
                PyErr_Format(PyExc_ValueError,
                             "neib_shape values <= 0. Got %%lld %%lld.",
                             (long long)c, (long long)d);
                %(fail)s;
            }

            if (%(params)s->mode == MODE_WRAP_CENTERED) {
                if (c%%2!=1 || d%%2!=1){
                    PyErr_Format(PyExc_TypeError,
                                 "GpuImages2Neibs: in mode wrap_centered need patch with odd shapes");
                    %(fail)s;
                }
                if ( PyGpuArray_DIMS(%(ten4)s)[2] < c ||
                     PyGpuArray_DIMS(%(ten4)s)[3] < d)
                {
                    PyErr_Format(PyExc_TypeError,
                                 "GpuImages2Neibs: in wrap_centered mode,"
                                 " don't support image shapes smaller then"
                                 " the patch shapes: neib_shape=(%%d,%%d),"
                                 " ten4[2:]=[%%d,%%d]",
                                 c, d, PyGpuArray_DIMS(%(ten4)s)[2],
                                 PyGpuArray_DIMS(%(ten4)s)[3]);
                    %(fail)s;
                }
                grid_c = ceil_intdiv(((PyGpuArray_DIMS(%(ten4)s))[2]),
                                     (size_t)step_x);
                grid_d = ceil_intdiv(((PyGpuArray_DIMS(%(ten4)s))[3]),
                                     (size_t)step_y);

            } else if (%(params)s->mode == MODE_VALID) {
                if ( ((PyGpuArray_DIMS(%(ten4)s))[2] < c) ||
                     ((((PyGpuArray_DIMS(%(ten4)s))[2]-c) %% step_x)!=0))
                {
                    PyErr_Format(PyExc_TypeError, "GpuImages2Neibs:"
                                 " neib_shape[0]=%%d, neib_step[0]=%%d and"
                                 " ten4.shape[2]=%%d not consistent",
                                 c, step_x,
                                 PyGpuArray_DIMS(%(ten4)s)[2]);
                    %(fail)s;
                }
                if ( ((PyGpuArray_DIMS(%(ten4)s))[3] < d) ||
                     ((((PyGpuArray_DIMS(%(ten4)s))[3]-d) %% step_y)!=0))
                {
                    PyErr_Format(PyExc_TypeError, "GpuImages2Neibs:"
                                 " neib_shape[1]=%%d, neib_step[1]=%%d and"
                                 " ten4.shape[3]=%%d not consistent",
                                 d, step_y,
                                 PyGpuArray_DIMS(%(ten4)s)[3]);
                    %(fail)s;
                }
                //number of patch in height
                grid_c = 1+(((PyGpuArray_DIMS(%(ten4)s))[2]-c)/step_x);
                //number of patch in width
                grid_d = 1+(((PyGpuArray_DIMS(%(ten4)s))[3]-d)/step_y);
            } else if (%(params)s->mode == MODE_IGNORE_BORDERS) {
                //number of patch in height
                grid_c = 1+(((PyGpuArray_DIMS(%(ten4)s))[2]-c)/step_x);
                //number of patch in width
                grid_d = 1+(((PyGpuArray_DIMS(%(ten4)s))[3]-d)/step_y);
            } else if (%(params)s->mode == MODE_HALF) {
                if ( ((PyGpuArray_DIMS(%(ten4)s))[2] < c) ||
                     ((((PyGpuArray_DIMS(%(ten4)s))[2]-(c%%2)) %% step_x)!=0))
                {
                    PyErr_Format(PyExc_TypeError, "GpuImages2Neibs:"
                                 " neib_shape[0]=%%d, neib_step[0]=%%d and"
                                 " ten4.shape[2]=%%d not consistent",
                                 c, step_x,
                                 PyGpuArray_DIMS(%(ten4)s)[2]);
                    %(fail)s;
                }
                if ( ((PyGpuArray_DIMS(%(ten4)s))[3] < d) ||
                     ((((PyGpuArray_DIMS(%(ten4)s))[3]-(d%%2)) %% step_y)!=0))
                {
                    PyErr_Format(PyExc_TypeError, "GpuImages2Neibs:"
                                 " neib_shape[1]=%%d, neib_step[1]=%%d and"
                                 " ten4.shape[3]=%%d not consistent",
                                 d, step_y,
                                 PyGpuArray_DIMS(%(ten4)s)[3]);
                    %(fail)s;
                }
                //number of patch in height
                grid_c = 1+(((PyGpuArray_DIMS(%(ten4)s))[2]-(c%%2))/step_x);
                //number of patch in width
                grid_d = 1+(((PyGpuArray_DIMS(%(ten4)s))[3]-(d%%2))/step_y);
            } else if (%(params)s->mode == MODE_FULL) {
                if ( ((PyGpuArray_DIMS(%(ten4)s))[2] < c) ||
                     ( (((PyGpuArray_DIMS(%(ten4)s))[2]+c-2) %% step_x)!=0))
                {
                    PyErr_Format(PyExc_TypeError,
                                 "neib_shape[0]=%%ld, neib_step[0]=%%ld and"
                                 " ten4.shape[2]=%%ld not consistent",
                                 (long int)c, (long int)step_x,
                                 (long int)(PyGpuArray_DIMS(%(ten4)s)[2]));
                    %(fail)s;
                }
                if ( ((PyGpuArray_DIMS(%(ten4)s))[3] < d) ||
                     ( (((PyGpuArray_DIMS(%(ten4)s))[3]+d-2) %% step_y)!=0))
                {
                    PyErr_Format(PyExc_TypeError,
                                 "neib_shape[1]=%%ld, neib_step[1]=%%ld and"
                                 " ten4.shape[3]=%%ld not consistent",
                                 (long int)d, (long int)step_y,
                                 (long int)(PyGpuArray_DIMS(%(ten4)s)[3]));
                    %(fail)s;
                }
                //number of patch in height
                grid_c = 1+(((PyGpuArray_DIMS(%(ten4)s))[2]+c-2)/step_x);
                //number of patch in width
                grid_d = 1+(((PyGpuArray_DIMS(%(ten4)s))[3]+d-2)/step_y);
            } else {
                PyErr_Format(PyExc_TypeError,
                             "GpuImages2Neibs:: unknown mode %%d", %(params)s->mode);
                %(fail)s;
            }

            // new dimensions for z
            const int z_dim1 = c * d;
            const int z_dim0 =  grid_c
                                * grid_d
                                * PyGpuArray_DIMS(%(ten4)s)[1]
                                * PyGpuArray_DIMS(%(ten4)s)[0];

            if ((NULL == %(z)s)
                || (PyGpuArray_DIMS(%(z)s)[0] != z_dim0)
                || (PyGpuArray_DIMS(%(z)s)[1] != z_dim1))
            {
                Py_XDECREF(%(z)s);
                size_t dims[2];
                dims[0] = z_dim0;
                dims[1] = z_dim1;
                %(z)s = pygpu_empty(2, dims, typecode_z,
                                    GA_C_ORDER, %(params)s->context, Py_None);
                if (!%(z)s)
                {
                    PyErr_SetString(PyExc_MemoryError, "GpuImages2Neibs:"
                                    " failed to alloc z output");
                    %(fail)s;
                }
            }
        }

        { // NESTED SCOPE

        const int mode = %(params)s->mode;
        const int nb_batch = PyGpuArray_DIMS(%(ten4)s)[0];
        const int nb_stack = PyGpuArray_DIMS(%(ten4)s)[1];
        const int height = PyGpuArray_DIMS(%(ten4)s)[2];
        const int width = PyGpuArray_DIMS(%(ten4)s)[3];

        const int c = *(npy_%(dtype_neib_shape)s*) PyArray_GETPTR1(
                                                     %(neib_shape)s, 0);
        const int d = *(npy_%(dtype_neib_shape)s*) PyArray_GETPTR1(
                                                     %(neib_shape)s, 1);
        const npy_intp step_x = (npy_intp) *(npy_%(dtype_neib_step)s*)
                                         PyArray_GETPTR1(%(neib_step)s, 0);
        const npy_intp step_y = (npy_intp) *(npy_%(dtype_neib_step)s*)
                                         PyArray_GETPTR1(%(neib_step)s, 1);

        size_t threads_per_block[3] = {d, c, 1};
        //get the max threads per blocks
        size_t max_threads_dim;
        int err = gpucontext_property(%(params)s->context->ctx, GA_CTX_PROP_MAXLSIZE0, &max_threads_dim);
        if (err != GA_NO_ERROR){
            PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims");
            %(fail)s;
        }
        while(threads_per_block[0]*threads_per_block[1]>max_threads_dim && threads_per_block[1]>1)threads_per_block[1]--;
        while(threads_per_block[0]*threads_per_block[1]>max_threads_dim && threads_per_block[0]>1)threads_per_block[0]--;

        //Make bigger block to have better memory access pattern and
        //a higher core utilisation. for smaller patch size

        while(c*d*(threads_per_block[2]+1) < 128 && threads_per_block[2]<64 &&
              threads_per_block[2]<PyGpuArray_DIMS(%(z)s)[0]){
            threads_per_block[2]++;
        }
        int nb_block;
        if (PyGpuArray_DIMS(%(z)s)[0] %% threads_per_block[2] == 0)
            nb_block = PyGpuArray_DIMS(%(z)s)[0] / threads_per_block[2];
        else
            nb_block = (PyGpuArray_DIMS(%(z)s)[0] / threads_per_block[2]) + 1;
        size_t n_blocks[3] = {std::min(32*1024,nb_block), 1, 1};

        GpuKernel *fptr;
        if(threads_per_block[0]==d && threads_per_block[1]==c){
            fptr = &k_multi_warp_less_%(name)s;
        }else{
            fptr = &k_multi_warp_%(name)s;
        }
        /*
        printf("%%zu %%zu %%zu %%zu %%zu %%zu %%zu\\n",
               max_threads_dim, threads_per_block[0], threads_per_block[1], threads_per_block[2],
               n_blocks[0], n_blocks[1], n_blocks[2]);
        */
        size_t stride_A0 = PyGpuArray_STRIDES(%(ten4)s)[0] / itemsize_ten4;
        size_t stride_A1 = PyGpuArray_STRIDES(%(ten4)s)[1] / itemsize_ten4;
        size_t stride_A2 = PyGpuArray_STRIDES(%(ten4)s)[2] / itemsize_ten4;
        size_t stride_A3 = PyGpuArray_STRIDES(%(ten4)s)[3] / itemsize_ten4;
        size_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0] / itemsize_z;
        size_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[1] / itemsize_z;
        void *kernel_params[] = {(void *)&mode,
                                 (void *)&nb_batch,
                                 (void *)&nb_stack,
                                 (void *)&height, (void *)&width,
                                 (void *)&c, (void *)&d,
                                 (void *)&step_x, (void *)&step_y,
                                 (void *)&grid_c, (void *)&grid_d,
                                 (void *)&stride_A0,
                                 (void *)&stride_A1,
                                 (void *)&stride_A2,
                                 (void *)&stride_A3,
                                 (void *)%(ten4)s->ga.data,
                                 (void *)&%(ten4)s->ga.offset,
                                 (void *)&stride_Z0,
                                 (void *)&stride_Z1,
                                 (void *)%(z)s->ga.data,
                                 (void *)&%(z)s->ga.offset};
        err = GpuKernel_call(fptr, 3, n_blocks, threads_per_block, 0, kernel_params);
        %(err_check)s
        } // END NESTED SCOPE
        """ % dict(
            ten4=inp[0],
            neib_shape=inp[1],
            neib_step=inp[2],
            z=out[0],
            dtype_neib_shape=node.inputs[1].dtype,
            dtype_neib_step=node.inputs[2].dtype,
            err_check=err_check,
            name=name,
            params=sub["params"],
            fail=sub["fail"],
        )
from io import StringIO
import numpy as np
from aesara.graph.basic import Apply
from aesara.graph.op import _NoPythonOp
try:
import pygpu
from pygpu import gpuarray
except ImportError:
pass
from aesara.gpuarray.basic_ops import (
GpuKernelBaseCOp,
Kernel,
as_gpuarray_variable,
gpuarray_helper_inc_dir,
infer_context_name,
)
from aesara.gpuarray.fp16_help import load_w, work_dtype, write_w
from aesara.gpuarray.type import GpuArrayType
class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBaseCOp, _NoPythonOp):
    """
    Implement CrossentropySoftmaxArgmax1HotWithBias on the gpu.
    """

    # Three inputs (x, b, y_idx) and three outputs (nll, sm, am).
    nin = 3
    nout = 3
    __props__ = ()
    # Kernel supports float16 inputs via the fp16_help load/write wrappers.
    _f16_ok = True

    def make_node(self, x, b, y_idx):
        """Build the Apply node.

        Parameters
        ----------
        x
            2-D activations (one row per example).
        b
            1-D bias added to each row.
        y_idx
            1-D vector of target class indices.

        Returns an Apply with outputs: nll (per-row negative log-likelihood,
        x's dtype), sm (row-wise softmax, same type as x), am (per-row argmax,
        same type as y_idx).
        """
        ctx_name = infer_context_name(x, b, y_idx)
        x = as_gpuarray_variable(x, ctx_name)
        b = as_gpuarray_variable(b, ctx_name)
        y_idx = as_gpuarray_variable(y_idx, ctx_name)
        nll = GpuArrayType(
            x.type.dtype, y_idx.type.broadcastable, context_name=ctx_name
        )()
        sm = x.type()
        am = y_idx.type()
        return Apply(self, [x, b, y_idx], [nll, sm, am])

    def c_headers(self, **kwargs):
        return ["<numpy_compat.h>", "<gpuarray/types.h>", "gpuarray_helper.h"]

    def c_header_dirs(self, **kwargs):
        return [gpuarray_helper_inc_dir()]

    def gpu_kernels(self, node, nodename):
        # One kernel: each block handles one (or more, grid-strided) rows;
        # threads cooperate through a dynamically-sized shared buffer
        # (per_thread_values) to reduce the row max/argmax and the softmax sum.
        dtype_x = node.inputs[0].dtype
        dtype_b = node.inputs[1].dtype
        dtype_y_idx = node.inputs[2].dtype
        work_x = work_dtype(dtype_x)
        work_b = work_dtype(dtype_b)
        load_x = load_w(dtype_x)
        load_b = load_w(dtype_b)
        write_x = write_w(dtype_x)
        write_b = write_w(dtype_b)
        flags = Kernel.get_flags(dtype_x, dtype_b, dtype_y_idx)
        type_x = gpuarray.dtype_to_ctype(dtype_x)
        type_b = gpuarray.dtype_to_ctype(dtype_b)
        work_x = gpuarray.dtype_to_ctype(work_x)
        type_y_idx = gpuarray.dtype_to_ctype(dtype_y_idx)
        kname = "k_xent_sm_1hot_bias"
        k_var = "k_xent_sm_1hot_bias_" + nodename
        # `f` selects the float suffix for libm calls (fmaxf/expf/logf).
        # NOTE(review): presumably non-CUDA (OpenCL) backends take the
        # unsuffixed path — confirm against cluda conventions.
        if node.inputs[0].type.context.kind != b"cuda":
            f = ""
        else:
            f = "" if dtype_x == "float64" else "f"
        params = [
            gpuarray.SIZE,
            gpuarray.SIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SSIZE,
            gpuarray.SSIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SSIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SSIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SSIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SSIZE,
            gpuarray.SSIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SSIZE,
        ]
        sio = StringIO()
        print(
            """#include "cluda.h"

        KERNEL void %(kname)s(const ga_size M, const ga_size N,
            GLOBAL_MEM const %(type_x)s* x_data, const ga_size offset_x, const ga_ssize xs0, const ga_ssize xs1,
            GLOBAL_MEM const %(type_b)s* b, const ga_size offset_b, const ga_ssize bs0,
            GLOBAL_MEM const %(type_y_idx)s* y_idx_data, const ga_size offset_y_idx, const ga_ssize y_idxs0,
            GLOBAL_MEM %(type_x)s* nll_data, const ga_size offset_nll, const ga_ssize nlls0,
            GLOBAL_MEM %(type_x)s* sm_data, const ga_size offset_sm, const ga_ssize sms0, const ga_ssize sms1,
            GLOBAL_MEM %(type_y_idx)s* am_data, const ga_size offset_am, const ga_ssize ams0 GA_DECL_SHARED_PARAM(%(work_x)s, per_thread_values))
        {
          x_data = (GLOBAL_MEM const %(type_x)s *)(((GLOBAL_MEM char *)x_data)+offset_x);
          b = (GLOBAL_MEM const %(type_b)s *)(((GLOBAL_MEM char *)b)+offset_b);
          y_idx_data = (GLOBAL_MEM const %(type_y_idx)s *)(((GLOBAL_MEM char *)y_idx_data)+offset_y_idx);
          nll_data = (GLOBAL_MEM %(type_x)s *)(((GLOBAL_MEM char *)nll_data)+offset_nll);
          sm_data = (GLOBAL_MEM %(type_x)s *)(((GLOBAL_MEM char *)sm_data)+offset_sm);
          am_data = (GLOBAL_MEM %(type_y_idx)s *)(((GLOBAL_MEM char *)am_data)+offset_am);

          for (ga_int row = GID_0; row < M; row += GDIM_0){

            GLOBAL_MEM const %(type_x)s* x = x_data + xs0 * row;
            GLOBAL_MEM %(type_x)s* sm = sm_data + sms0 * row;
            GA_DECL_SHARED_BODY(%(work_x)s, per_thread_values);
            LOCAL_MEM %(work_x)s row_max, sum, sum_inv;
            LOCAL_MEM ga_int row_max_threadIdx;
            %(work_x)s per_thread_row_max, per_thread_sum;
            ga_int per_thread_row_max_j;

            // COMPUTE ROW MAX AND ARGMAX

            // compute separate per-thread maximums and argmaxes
            per_thread_row_max = NAN;
            per_thread_row_max_j = 0;

            for (ga_int j = LID_0; j < N; j += LDIM_0)
            {
              %(work_x)s row_ij = %(load_x)s(x[j * xs1]) + %(load_b)s(b[j * bs0]);
              per_thread_row_max_j = (row_ij > per_thread_row_max) ? j : per_thread_row_max_j;
              per_thread_row_max = fmax%(f)s(row_ij, per_thread_row_max);
            }
            per_thread_values[LID_0] = per_thread_row_max;
            local_barrier();

            if (LID_0 == 0) {
              row_max = NAN;
              row_max_threadIdx = 0;
              for (ga_int j = 0; j < LDIM_0; j++)
              {
                %(work_x)s per_thread_max = per_thread_values[j];
                row_max_threadIdx = (per_thread_max > row_max) ? j : row_max_threadIdx;
                row_max = fmax%(f)s(per_thread_max, row_max);
              }
            }
            local_barrier();
            // The thread with the highest max writes out which of its
            // values was the winner.
            if (LID_0 == row_max_threadIdx) am_data[row * ams0] = per_thread_row_max_j;

            // COMPUTE SOFTMAX
            per_thread_sum = 0.0;
            for (ga_int j = LID_0; j < N; j += LDIM_0)
            {
              %(work_x)s row_ij = %(load_x)s(x[j * xs1]) + %(load_b)s(b[j * bs0]);
              %(work_x)s sm_ij = exp%(f)s(row_ij - row_max);
              per_thread_sum += sm_ij;
              sm[j * sms1] = %(write_x)s(sm_ij);
            }
            per_thread_values[LID_0] = per_thread_sum;
            local_barrier();

            if (LID_0 == 0) {
              sum = 0.0;
              for (ga_int j = 0; j < LDIM_0; j++) {
                sum += per_thread_values[j];
              }
              sum_inv = 1.0 / sum;
            }
            local_barrier();

            for (ga_int j = LID_0; j < N; j += LDIM_0) {
              sm[j * sms1] = %(write_x)s(%(load_x)s(sm[j * sms1]) * sum_inv);
            }

            if (LID_0 == 0) {
              const %(type_y_idx)s y_idx = (ga_int)y_idx_data[row * y_idxs0];
              if ((y_idx >= N || y_idx < 0)) {
                // raise some suspicion.
                nll_data[row * nlls0] = %(write_x)s(0.0);
              } else {
                nll_data[row * nlls0] = %(write_x)s(
                   - %(load_x)s(x[y_idx * xs1])
                   - %(load_b)s(b[y_idx * bs0])
                   + row_max + log%(f)s(sum));
              }
            }
          }
        }
        """
            % locals(),
            file=sio,
        )
        return [
            Kernel(
                code=sio.getvalue(),
                name=kname,
                params=params,
                flags=flags,
                objvar=k_var,
            )
        ]

    def c_code(self, node, nodename, inp, out, sub):
        # Validate input shapes, prepare the three outputs, and launch the
        # kernel with one block per row (capped) and a shared-memory buffer
        # of one work-dtype slot per thread.
        itemsize_x = np.dtype(node.inputs[0].dtype).itemsize
        worksize_x = np.dtype(work_dtype(node.inputs[0].dtype)).itemsize
        itemsize_b = np.dtype(node.inputs[1].dtype).itemsize
        itemsize_y_idx = np.dtype(node.inputs[2].dtype).itemsize
        itemsize_nll = np.dtype(node.outputs[0].dtype).itemsize
        itemsize_sm = np.dtype(node.outputs[1].dtype).itemsize
        itemsize_am = np.dtype(node.outputs[2].dtype).itemsize
        x, b, y_idx = inp
        nll, sm, am = out
        fail = sub["fail"]
        ctx = sub["params"]
        k_var = f"k_xent_sm_1hot_bias_{nodename}"
        err_check = (
            """
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "gpuarray error: %(k_var)s: %%s.",
                             GpuKernel_error(&%(k_var)s, err));
                %(fail)s;
            }
            """
            % locals()
        )
        sio = StringIO()
        print(
            """
        if (PyGpuArray_DIMS(%(x)s)[0] !=
            PyGpuArray_DIMS(%(y_idx)s)[0])
        {
            PyErr_SetString(PyExc_ValueError,
                            "dimension mismatch in x,y_idx arguments");
            %(fail)s;
        }
        if (PyGpuArray_DIMS(%(x)s)[1] != PyGpuArray_DIMS(%(b)s)[0])
        {
            PyErr_SetString(PyExc_ValueError,
                            "dimension mismatch in x,b arguments");
            %(fail)s;
        }
        if (aesara_prep_output(&%(nll)s, 1, PyGpuArray_DIMS(%(y_idx)s), %(x)s->ga.typecode, GA_C_ORDER, %(ctx)s)) %(fail)s
        if (aesara_prep_output(&%(sm)s, 2, PyGpuArray_DIMS(%(x)s), %(x)s->ga.typecode, GA_C_ORDER, %(ctx)s)) %(fail)s
        if (aesara_prep_output(&%(am)s, 1, PyGpuArray_DIMS(%(y_idx)s), %(y_idx)s->ga.typecode, GA_C_ORDER, %(ctx)s)) %(fail)s
        {
            size_t n_blocks = std::min(PyGpuArray_DIM(%(x)s, 0), (size_t)4096);
            size_t n_threads = std::min(PyGpuArray_DIM(%(x)s, 1), (size_t)256);
            size_t n_shared = n_threads * %(worksize_x)s;
     //TODO: launch more threads per row and do parallel sum and max reductions
            int err = k_xent_sm_1hot_bias_call(
                1, &n_blocks, &n_threads, n_shared,
                PyGpuArray_DIMS(%(x)s)[0],
                PyGpuArray_DIMS(%(x)s)[1],
                %(x)s->ga.data, %(x)s->ga.offset,
                PyGpuArray_STRIDE(%(x)s, 0) / %(itemsize_x)s,
                PyGpuArray_STRIDE(%(x)s, 1) / %(itemsize_x)s,
                %(b)s->ga.data, %(b)s->ga.offset,
                PyGpuArray_STRIDE(%(b)s, 0) / %(itemsize_b)s,
                %(y_idx)s->ga.data, %(y_idx)s->ga.offset,
                PyGpuArray_STRIDE(%(y_idx)s, 0) / %(itemsize_y_idx)s,
                %(nll)s->ga.data, %(nll)s->ga.offset,
                PyGpuArray_STRIDE(%(nll)s, 0) / %(itemsize_nll)s,
                %(sm)s->ga.data, %(sm)s->ga.offset,
                PyGpuArray_STRIDE(%(sm)s, 0) / %(itemsize_sm)s,
                PyGpuArray_STRIDE(%(sm)s, 1) / %(itemsize_sm)s,
                %(am)s->ga.data, %(am)s->ga.offset,
                PyGpuArray_STRIDE(%(am)s, 0) / %(itemsize_am)s);
            %(err_check)s
        }
        """
            % locals(),
            file=sio,
        )
        return sio.getvalue()

    def c_code_cache_version(self):
        # Bump to invalidate cached compiled modules when the C above changes.
        return (14,)
# Module-level singleton instance of the Op; presumably referenced by the
# GPU graph rewrites elsewhere (mirrors the CPU-side lowercase alias) —
# confirm against the callers.
gpu_crossentropy_softmax_argmax_1hot_with_bias = (
    GpuCrossentropySoftmaxArgmax1HotWithBias()
)
class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBaseCOp, _NoPythonOp):
"""
Implement CrossentropySoftmax1HotWithBiasDx on the gpu.
Gradient wrt x of the CrossentropySoftmax1Hot Op.
"""
nin = 3
nout = 1
__props__ = ()
_f16_ok = True
    def make_node(self, dnll, sm, y_idx):
        """Build the Apply node.

        Parameters
        ----------
        dnll
            Gradient of the cost w.r.t. the per-row NLL (scalar or vector;
            the generated C handles both — see c_code's dims0/strides0 logic).
        sm
            Softmax output of the forward pass.
        y_idx
            Target class indices.

        The single output (dx) has the same type as ``sm``.
        """
        # Move all three inputs into one common GPU context.
        ctx_name = infer_context_name(dnll, sm, y_idx)
        dnll = as_gpuarray_variable(dnll, ctx_name)
        sm = as_gpuarray_variable(sm, ctx_name)
        y_idx = as_gpuarray_variable(y_idx, ctx_name)
        return Apply(self, [dnll, sm, y_idx], [sm.type()])
    def c_code_cache_version(self):
        # Bump to invalidate cached compiled modules when the generated C
        # for this Op changes.
        return (14,)
    def c_headers(self, **kwargs):
        # Headers needed by the generated C (numpy compatibility shims and
        # the libgpuarray type definitions).
        return ["<numpy_compat.h>", "<gpuarray/types.h>"]
def c_code(self, node, nodename, inp, out, sub):
typecode_dx = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
itemsize_dnll = np.dtype(node.inputs[0].dtype).itemsize
itemsize_sm = np.dtype(node.inputs[1].dtype).itemsize
itemsize_y_idx = np.dtype(node.inputs[2].dtype).itemsize
itemsize_dx = np.dtype(node.outputs[0].dtype).itemsize
dtype_dnll = node.inputs[0].dtype
dtype_sm = node.inputs[1].dtype
dtype_y_idx = node.inputs[2].dtype
dtype_dx = node.outputs[0].dtype
type_intp = gpuarray.dtype_to_ctype(np.intp)
dnll, sm, y_idx = inp
(dx,) = out
fail = sub["fail"]
ctx = sub["params"]
k_var = "kCrossEntropySoftmax1HotWithBiasDx_" + nodename
err_check = (
"""
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(k_var)s: %%s.",
GpuKernel_error(&%(k_var)s, err));
%(fail)s;
}
"""
% locals()
)
return (
"""
// Get `dnll.shape[0]` or set it to zero if `dnll` is a scalar.
const ssize_t %(dnll)s_dims0 = (PyGpuArray_NDIM(%(dnll)s) > 0 ?
PyGpuArray_DIMS(%(dnll)s)[0] :
(ssize_t) 0);
// Get `dnll.strides[0]` and set it to zero if `dnll` is a scalar
// or a vector with just one element.
const ssize_t %(dnll)s_strides0 = (%(dnll)s_dims0 > 1 ?
PyGpuArray_STRIDES(%(dnll)s)[0] :
(ssize_t) 0);
if ((PyGpuArray_NDIM(%(dnll)s) > 1)
|| (PyGpuArray_NDIM(%(sm)s) != 2)
|| (PyGpuArray_NDIM(%(y_idx)s) != 1))
{
PyErr_SetString(PyExc_ValueError, "rank error");
%(fail)s;
}
if (%(dnll)s_dims0 !=
PyGpuArray_DIMS(%(sm)s)[0] && %(dnll)s_dims0 > 1)
{
PyErr_Format(PyExc_ValueError,
"dnll.shape[0] == %%i, but sm.shape[0] == %%i",
%(dnll)s_dims0,
PyGpuArray_DIMS(%(sm)s)[0]);
%(fail)s;
}
if (%(dnll)s_dims0 !=
PyGpuArray_DIMS(%(y_idx)s)[0] && %(dnll)s_dims0 > 1)
{
PyErr_SetString(PyExc_ValueError,
"dnll.shape[0] != y_idx.shape[0]");
%(fail)s;
}
if (PyGpuArray_DIMS(%(sm)s)[0] !=
PyGpuArray_DIMS(%(y_idx)s)[0])
{
PyErr_SetString(PyExc_ValueError,
"sm.shape[0] != y_idx.shape[0]");
%(fail)s;
}
if ((NULL == %(dx)s)
|| (PyGpuArray_DIMS(%(dx)s)[0] !=
PyGpuArray_DIMS(%(sm)s)[0])
|| (PyGpuArray_DIMS(%(dx)s)[1] !=
PyGpuArray_DIMS(%(sm)s)[1]))
{
Py_XDECREF(%(dx)s);
%(dx)s = pygpu_empty(2, PyGpuArray_DIMS(%(sm)s),
%(typecode_dx)s, GA_C_ORDER,
%(ctx)s, Py_None);
if (!%(dx)s) {
%(fail)s
}
}
{
size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(dx)s)[0], (size_t)256), 1, 1};
size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(%(dx)s)[1], (size_t)256), 1, 1};
ssize_t stride_DNLL0 = %(dnll)s_strides0 / %(itemsize_dnll)s;
ssize_t stride_SM0 = PyGpuArray_STRIDES(%(sm)s)[0] / %(itemsize_sm)s;
ssize_t stride_SM1 = PyGpuArray_STRIDES(%(sm)s)[1] / %(itemsize_sm)s;
ssize_t stride_YIDX0 = PyGpuArray_STRIDES(%(y_idx)s)[0] / %(itemsize_y_idx)s;
ssize_t stride_DX0 = PyGpuArray_STRIDES(%(dx)s)[0] / %(itemsize_dx)s;
ssize_t stride_DX1 = PyGpuArray_STRIDES(%(dx)s)[1] / %(itemsize_dx)s;
void *kernel_params[] = {
(void *)&PyGpuArray_DIMS(%(dx)s)[0],
(void *)&PyGpuArray_DIMS(%(dx)s)[1],
(void *)%(dnll)s->ga.data, (void *)&%(dnll)s->ga.offset,
(void *)&stride_DNLL0,
(void *)%(sm)s->ga.data, (void *)&%(sm)s->ga.offset,
(void *)&stride_SM0, (void *)&stride_SM1,
(void *)%(y_idx)s->ga.data, (void *)&%(y_idx)s->ga.offset,
(void *)&stride_YIDX0,
(void *)%(dx)s->ga.data, (void *)&%(dx)s->ga.offset,
(void *)&stride_DX0, (void *)&stride_DX1};
int err = GpuKernel_call(&%(k_var)s, 3, n_blocks, threads_per_block, 0, kernel_params);
%(err_check)s
}
assert(%(dx)s);
"""
% locals()
)
def gpu_kernels(self, node, nodename):
dtype_dnll = node.inputs[0].dtype
dtype_sm = node.inputs[1].dtype
dtype_y_idx = node.inputs[2].dtype
dtype_dx = node.outputs[0].dtype
work_dnll = work_dtype(dtype_dnll)
load_dnll = load_w(dtype_dnll)
load_sm = load_w(dtype_sm)
write_dx = write_w(dtype_dx)
flags = Kernel.get_flags(dtype_dnll, dtype_sm, dtype_y_idx, dtype_dx)
wtype_dnll = gpuarray.dtype_to_ctype(work_dnll)
type_dnll = gpuarray.dtype_to_ctype(dtype_dnll)
type_sm = gpuarray.dtype_to_ctype(dtype_sm)
type_y_idx = gpuarray.dtype_to_ctype(dtype_y_idx)
type_dx = gpuarray.dtype_to_ctype(dtype_dx)
kname = "kCrossEntropySoftmax1HotWithBiasDx"
k_var = "kCrossEntropySoftmax1HotWithBiasDx_" + nodename
params = [
gpuarray.SIZE,
gpuarray.SIZE,
gpuarray.GpuArray,
gpuarray.SIZE,
gpuarray.SSIZE,
gpuarray.GpuArray,
gpuarray.SIZE,
gpuarray.SSIZE,
gpuarray.SSIZE,
gpuarray.GpuArray,
gpuarray.SIZE,
gpuarray.SSIZE,
gpuarray.GpuArray,
gpuarray.SIZE,
gpuarray.SSIZE,
gpuarray.SSIZE,
]
sio = StringIO()
print(
"""#include "cluda.h"
KERNEL void %(kname)s(
const ga_size N, const ga_size K,
GLOBAL_MEM const %(type_dnll)s* dnll, const ga_size offset_dnll, const ga_ssize dnll_s0,
GLOBAL_MEM const %(type_sm)s* sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1,
GLOBAL_MEM const %(type_y_idx)s* y_idx, const ga_size offset_y_idx, const ga_ssize y_idx_s0,
GLOBAL_MEM %(type_dx)s* dx, const ga_size offset_dx, const ga_ssize dx_s0, const ga_ssize dx_s1)
{
dnll = (GLOBAL_MEM const %(type_dnll)s *)(((GLOBAL_MEM char *)dnll)+offset_dnll);
sm = (GLOBAL_MEM const %(type_sm)s *)(((GLOBAL_MEM char *)sm)+offset_sm);
y_idx = (GLOBAL_MEM const %(type_y_idx)s *)(((GLOBAL_MEM char *)y_idx)+offset_y_idx);
dx = (GLOBAL_MEM %(type_dx)s *)(((GLOBAL_MEM char *)dx)+offset_dx);
for (ga_int i = GID_0; i < N; i += GDIM_0)
{
%(wtype_dnll)s dnll_i = %(load_dnll)s(dnll[i * dnll_s0]);
%(type_y_idx)s y_i = y_idx[i * y_idx_s0];
for (ga_int j = LID_0; j < K; j += LDIM_0)
{
if (y_i == j)
{
dx[i * dx_s0 + j * dx_s1] =
%(write_dx)s(dnll_i *
(%(load_sm)s(sm[i * sm_s0 + j * sm_s1]) - 1.0));
}
else
{
dx[i * dx_s0 + j * dx_s1] =
%(write_dx)s(dnll_i *
%(load_sm)s(sm[i * sm_s0 + j * sm_s1]));
}
}
}
}
"""
% locals(),
file=sio,
)
return [
Kernel(
code=sio.getvalue(),
name=kname,
params=params,
flags=flags,
objvar=k_var,
)
]
# Singleton instance used by the GPU graph optimizers.
gpu_crossentropy_softmax_1hot_with_bias_dx = GpuCrossentropySoftmax1HotWithBiasDx()
class GpuSoftmax(GpuKernelBaseCOp, _NoPythonOp):
    """
    Implement Softmax on the gpu.
    """

    __props__ = ()
    # float16 inputs are accepted; reductions run in the work dtype.
    _f16_ok = True

    def make_node(self, x):
        # Output has the same type as the (GPU-transferred) input.
        x = as_gpuarray_variable(x, infer_context_name(x))
        return Apply(self, [x], [x.type()])

    def infer_shape(self, fgraph, node, shape):
        # Softmax is elementwise over rows: output shape == input shape.
        return shape

    def c_code_cache_version(self):
        return (17,)

    def c_headers(self, **kwargs):
        return ["<numpy_compat.h>", "<gpuarray/types.h>"]

    def c_code(self, node, nodename, inp, out, sub):
        dtype_x = node.inputs[0].dtype
        work_x = work_dtype(dtype_x)
        dtype_z = node.outputs[0].dtype
        itemsize_x = np.dtype(dtype_x).itemsize
        itemsize_z = np.dtype(dtype_z).itemsize
        typecode = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
        (x,) = inp
        (z,) = out
        fail = sub["fail"]
        ctx = sub["params"]
        # fmt_str/msg are set at the kernel-call site so the error message
        # names whichever of the two kernels was actually launched.
        err_check = (
            """
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError, fmt_str, msg);
                %(fail)s;
            }
            """
            % locals()
        )
        return (
            """
        if (PyGpuArray_NDIM(%(x)s) != 2)
        {
            PyErr_SetString(PyExc_ValueError, "rank error");
            %(fail)s;
        }
        if ((NULL == %(z)s) ||
            (PyGpuArray_DIMS(%(z)s)[0] !=
             PyGpuArray_DIMS(%(x)s)[0]) ||
            (PyGpuArray_DIMS(%(z)s)[1] !=
             PyGpuArray_DIMS(%(x)s)[1]))
        {
            Py_XDECREF(%(z)s);
            %(z)s = pygpu_empty(2, PyGpuArray_DIMS(%(x)s),
                                %(typecode)s, GA_C_ORDER,
                                %(ctx)s, Py_None);
            if (!%(z)s) {
                %(fail)s
            }
        }
        {
            size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)(32 * 1024)), 1, 1};
            //TODO, detect the maximum number of thread per block.
            size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)256), 1, 1}; // TODO: Read GA_CTX_PROP_MAXLSIZE0
            size_t shmem_sz = PyGpuArray_DIMS(%(x)s)[1] *
                              2 * sizeof(npy_%(work_x)s);
            ssize_t stride_X0 = PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s;
            ssize_t stride_X1 = PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s;
            ssize_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s;
            ssize_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s;
            const char *fmt_str, *msg;
            void *kernel_params[] = {
                (void *)&PyGpuArray_DIMS(%(x)s)[0],
                (void *)&PyGpuArray_DIMS(%(x)s)[1],
                (void *)%(x)s->ga.data, (void *)&%(x)s->ga.offset,
                (void *)&stride_X0, (void *)&stride_X1,
                (void *)%(z)s->ga.data, (void *)&%(z)s->ga.offset,
                (void *)&stride_Z0, (void *)&stride_Z1};
            int err = GA_NO_ERROR;
            if (PyGpuArray_DIMS(%(x)s)[0] > 0)
            {
                //Those numbers are based on not too recent GPU
                //to make them compatible with more GPU.
                //TODO: read the information from the card.
                if(shmem_sz < (32 * 1024 - 500)){
                    err = GpuKernel_call(&kSoftmax_%(nodename)s, 3,
                                         n_blocks, threads_per_block, shmem_sz,
                                         kernel_params);
                    fmt_str = "gpuarray error: kSoftmax_%(nodename)s: %%s";
                    msg = GpuKernel_error(&kSoftmax_%(nodename)s, err);
                }else{
                    err = GpuKernel_call(&kSoftmax_fixed_shared%(nodename)s, 3,
                                         n_blocks, threads_per_block,
                                         threads_per_block[0] * sizeof(npy_%(work_x)s),
                                         kernel_params);
                    fmt_str = "gpuarray error: kSoftmax_fixed_shared%(nodename)s: %%s";
                    msg = GpuKernel_error(&kSoftmax_fixed_shared%(nodename)s, err);
                }
                %(err_check)s
            }
        }
        assert(%(z)s);
        """
            % locals()
        )

    def gpu_kernels(self, node, nodename):
        # Two kernel variants are generated:
        #  - kSoftmax: caches the whole row (2*N work elements) in shared
        #    memory; used when that fits in shared memory.
        #  - kSoftmax_fixed_shared: uses only one work element per thread
        #    of shared memory and re-reads the row from global memory;
        #    fallback for wide rows.
        dtype_x = node.inputs[0].dtype
        dtype_sm = node.outputs[0].dtype
        load_x = load_w(dtype_x)
        write_sm = write_w(node.outputs[0].dtype)
        work_sm = work_dtype(dtype_sm)
        flags = Kernel.get_flags(dtype_x, dtype_sm)
        type_x = gpuarray.dtype_to_ctype(dtype_x)
        type_sm = gpuarray.dtype_to_ctype(dtype_sm)
        type_acc = gpuarray.dtype_to_ctype(work_sm)
        # NOTE(review): ``ctype`` duplicates ``type_acc``; both are kept for
        # the ``% locals()`` substitutions below.
        ctype = gpuarray.dtype_to_ctype(work_sm)
        # Parameter signature: must match the KERNEL declarations below
        # (shared-memory buffer is passed via GA_DECL_SHARED_PARAM).
        params = [
            gpuarray.SIZE,
            gpuarray.SIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SSIZE,
            gpuarray.SSIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SSIZE,
            gpuarray.SSIZE,
        ]
        kernels = []
        kname = "kSoftmax"
        k_var = "kSoftmax_" + nodename
        code = (
            """#include "cluda.h"

        KERNEL void %(kname)s (const ga_size M, const ga_size N,
                       GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
                       GLOBAL_MEM %(type_sm)s * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1 GA_DECL_SHARED_PARAM(%(type_acc)s, buf))
        {
            GA_DECL_SHARED_BODY(%(type_acc)s, buf);
            LOCAL_MEM_ARG %(type_acc)s * buf2 = buf + N;
            x = (GLOBAL_MEM const %(type_x)s *)(((GLOBAL_MEM char *)x)+offset_x);
            sm = (GLOBAL_MEM %(type_sm)s *)(((GLOBAL_MEM char *)sm)+offset_sm);
            for (ga_int blockIDX = GID_0; blockIDX < M; blockIDX += GDIM_0) {
                for (ga_int tx = LID_0; tx< N; tx += LDIM_0) {
                    buf[tx] = %(load_x)s(x[blockIDX * sx0 + tx * sx1]);
                    buf2[tx] = buf[tx];
                }
                local_barrier();
                {
                    // This function trashes buf[1..GA_WARP_SIZE],
                    // leaving the reduction result in buf[0].
                    if (LID_0 < GA_WARP_SIZE) {
                        for (ga_int i = LID_0 + GA_WARP_SIZE; i < N; i += GA_WARP_SIZE)
                        {
                            buf[LID_0] = max(buf[LID_0], buf[i]);
                        }
                    }
                    local_barrier();
                    //reduce so that LID_0 0 has the reduction of everything
                    for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) {
                        if (LID_0 < _n && LID_0 + _n < N)
                            buf[LID_0] = max(buf[LID_0], buf[LID_0+_n]);
                        local_barrier();
                    }
                }
                %(ctype)s row_max = buf[0];
                local_barrier();
                for(ga_int __i=LID_0; __i<N; __i+=LDIM_0){
                    buf[__i] = exp(buf2[__i] - row_max);
                    buf2[__i] = buf[__i];
                }
                local_barrier();
                {
                    // This function trashes buf[1..GA_WARP_SIZE],
                    // leaving the reduction result in buf[0].
                    if (LID_0 < GA_WARP_SIZE) {
                        for (ga_int i = LID_0 + GA_WARP_SIZE; i < N; i += GA_WARP_SIZE)
                        {
                            buf[LID_0] = buf[LID_0] + buf[i];
                        }
                    }
                    local_barrier();
                    //reduce so that LID_0 0 has the reduction of everything
                    for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) {
                        if (LID_0 < _n && LID_0 + _n < N)
                            buf[LID_0] = buf[LID_0] + buf[LID_0+_n];
                        local_barrier();
                    }
                }
                %(ctype)s row_sum = buf[0];
                local_barrier();
                for(ga_int __i=LID_0; __i<N; __i+=LDIM_0) {
                    buf[__i] = buf2[__i] / row_sum;
                }
                local_barrier();
                for (ga_int tx = LID_0; tx< N; tx += LDIM_0) {
                    sm[blockIDX * sm_s0 + tx * sm_s1] = %(write_sm)s(buf[tx]);
                }
                local_barrier();
            }
        }
        """
            % locals()
        )
        kernels.append(
            Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)
        )
        kname = "kSoftmax_fixed_shared"
        k_var = "kSoftmax_fixed_shared" + nodename
        code = (
            """#include "cluda.h"

        KERNEL void %(kname)s (const ga_size M, const ga_size N,
                       GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
                       GLOBAL_MEM %(type_sm)s * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1 GA_DECL_SHARED_PARAM(%(type_acc)s, buf))
        {
            GA_DECL_SHARED_BODY(%(type_acc)s, buf);
            x = (GLOBAL_MEM const %(type_x)s *)(((GLOBAL_MEM char *)x)+offset_x);
            sm = (GLOBAL_MEM %(type_sm)s *)(((GLOBAL_MEM char *)sm)+offset_sm);
            for (ga_int blockIDX = GID_0; blockIDX < M; blockIDX += GDIM_0){
                GLOBAL_MEM const %(type_x)s *x_ptr = &x[blockIDX * sx0];
                GLOBAL_MEM %(type_sm)s *sm_ptr = &sm[blockIDX * sm_s0];
                {
                    // This function trashes buf[1..n_threads],
                    // leaving the reduction result in buf[0].
                    %(ctype)s red = %(load_x)s(x_ptr[LID_0 * sx1]);
                    #pragma unroll 16
                    for (ga_int i = LID_0 + LDIM_0; i<N; i += LDIM_0) {
                        red = max(red, %(load_x)s(x_ptr[i * sx1]));
                    }
                    buf[LID_0] = red;
                    local_barrier();
                    if (LID_0 < GA_WARP_SIZE) {
                        for (ga_int i = LID_0 + GA_WARP_SIZE; i < LDIM_0; i += GA_WARP_SIZE) {
                            buf[LID_0] = max(buf[LID_0], buf[i]);
                        }
                    }
                    local_barrier();
                    //reduce so that LID_0 0 has the reduction of everything
                    for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) {
                        if (LID_0 < _n && LID_0 + _n < N)
                            buf[LID_0] = max(buf[LID_0], buf[LID_0+_n]);
                        local_barrier();
                    }
                }
                %(ctype)s row_max = buf[0];
                local_barrier();
                {
                    // This function trashes buf[1..n_threads],
                    // leaving the reduction result in buf[0].
                    %(ctype)s red = exp(%(load_x)s(x_ptr[LID_0 * sx1]) - row_max);
                    #pragma unroll 16
                    for (ga_int i = LID_0 + LDIM_0; i<N; i += LDIM_0) {
                        red = red + exp(%(load_x)s(x_ptr[i * sx1]) - row_max);
                    }
                    buf[LID_0] = red;
                    local_barrier();
                    if (LID_0 < GA_WARP_SIZE) {
                        for (ga_int i = LID_0 + GA_WARP_SIZE; i < LDIM_0; i += GA_WARP_SIZE) {
                            buf[LID_0] = buf[LID_0] + buf[i];
                        }
                    }
                    local_barrier();
                    //reduce so that LID_0 0 has the reduction of everything
                    for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) {
                        if (LID_0 < _n && LID_0 + _n < N)
                            buf[LID_0] = buf[LID_0] + buf[LID_0+_n];
                        local_barrier();
                    }
                }
                %(ctype)s row_sum = buf[0];
                local_barrier();
                for (ga_int tx = LID_0; tx< N; tx += LDIM_0){
                    sm_ptr[tx * sm_s1] = %(write_sm)s(exp(%(load_x)s(x_ptr[tx * sx1]) - row_max) / row_sum);
                }
                local_barrier();
            }
        }
        """
            % locals()
        )
        kernels.append(
            Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)
        )
        return kernels
# Singleton instance used by the GPU graph optimizers.
gpu_softmax = GpuSoftmax()
class GpuSoftmaxWithBias(GpuKernelBaseCOp, _NoPythonOp):
    """
    Implement SoftmaxWithBias on the gpu.
    """

    nin = 2
    nout = 1
    __props__ = ()
    # float16 inputs are accepted; reductions run in the work dtype.
    _f16_ok = True

    def make_node(self, x, b):
        # Move both inputs to a common GPU context; output matches `x`.
        ctx_name = infer_context_name(x, b)
        x = as_gpuarray_variable(x, ctx_name)
        b = as_gpuarray_variable(b, ctx_name)
        return Apply(self, [x, b], [x.type()])

    def infer_shape(self, fgraph, node, shape):
        # Output shape is the shape of `x` (the bias only broadcasts).
        return [shape[0]]

    def c_code_cache_version(self):
        return (16,)

    def c_headers(self, **kwargs):
        return ["<numpy_compat.h>", "<gpuarray/types.h>"]

    def c_code(self, node, nodename, inp, out, sub):
        dtype_x = node.inputs[0].dtype
        dtype_b = node.inputs[1].dtype
        dtype_z = node.outputs[0].dtype
        work_x = work_dtype(dtype_x)
        itemsize_x = np.dtype(dtype_x).itemsize
        itemsize_b = np.dtype(dtype_b).itemsize
        itemsize_z = np.dtype(dtype_z).itemsize
        typecode = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
        x, b = inp
        (z,) = out
        fail = sub["fail"]
        ctx = sub["params"]
        # fmt_str/msg are set at the kernel-call site so the error message
        # names whichever of the two kernels was actually launched.
        err_check = (
            """
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError, fmt_str, msg);
                %(fail)s;
            }
            """
            % locals()
        )
        return (
            """
        if (PyGpuArray_NDIM(%(x)s) != 2)
        {
            PyErr_SetString(PyExc_ValueError, "rank error input");
            %(fail)s;
        }
        if (PyGpuArray_NDIM(%(b)s) != 1)
        {
            PyErr_SetString(PyExc_ValueError, "rank error for the bias");
            %(fail)s;
        }
        if ((PyGpuArray_DIMS(%(x)s)[1] !=
            PyGpuArray_DIMS(%(b)s)[0]))
        {
            PyErr_Format(PyExc_ValueError,
                         "number of columns in x (%%ld)"
                         " does not match length of b (%%ld)",
                         (long int)PyGpuArray_DIMS(%(x)s)[1],
                         (long int)PyGpuArray_DIMS(%(b)s)[0]);
            %(fail)s;
        }
        if ((NULL == %(z)s)
            || (PyGpuArray_DIMS(%(z)s)[0] !=
                PyGpuArray_DIMS(%(x)s)[0])
            || (PyGpuArray_DIMS(%(z)s)[1] !=
                PyGpuArray_DIMS(%(x)s)[1]))
        {
            Py_XDECREF(%(z)s);
            %(z)s = pygpu_empty(2, PyGpuArray_DIMS(%(x)s),
                                %(typecode)s, GA_C_ORDER,
                                %(ctx)s, Py_None);
            if (!%(z)s) {
                %(fail)s
            }
        }
        {
            size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)(32*1024)), 1, 1};
            //TODO, detect the maximum number of thread per block.
            size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)256), 1, 1}; // TODO: Read GA_CTX_PROP_MAXLSIZE0
            size_t shmem_sz = PyGpuArray_DIMS(%(x)s)[1] *
                              2 * sizeof(npy_%(work_x)s);
            ssize_t stride_X0 = PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s;
            ssize_t stride_X1 = PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s;
            ssize_t stride_B0 = PyGpuArray_STRIDES(%(b)s)[0] / %(itemsize_b)s;
            ssize_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s;
            ssize_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s;
            const char *fmt_str, *msg;
            void *kernel_params[] = {
                (void *)&PyGpuArray_DIMS(%(x)s)[0],
                (void *)&PyGpuArray_DIMS(%(x)s)[1],
                (void *)%(x)s->ga.data, (void *)&%(x)s->ga.offset,
                (void *)&stride_X0, (void *)&stride_X1,
                (void *)%(b)s->ga.data, (void *)&%(b)s->ga.offset,
                (void *)&stride_B0,
                (void *)%(z)s->ga.data, (void *)&%(z)s->ga.offset,
                (void *)&stride_Z0, (void *)&stride_Z1};
            int err = GA_NO_ERROR;
            if (PyGpuArray_DIMS(%(x)s)[0] > 0)
            {
                if(shmem_sz < (32 * 1024 - 500)){
                    err = GpuKernel_call(&kSoftmaxWithBias_%(nodename)s, 3,
                                         n_blocks, threads_per_block, shmem_sz,
                                         kernel_params);
                    fmt_str = "gpuarray error: kSoftmaxWithBias_%(nodename)s: %%s";
                    msg = GpuKernel_error(&kSoftmaxWithBias_%(nodename)s, err);
                }else{
                    err = GpuKernel_call(&kSoftmaxWithBias_fixed_shared%(nodename)s,
                                         3, n_blocks, threads_per_block,
                                         threads_per_block[0] * sizeof(npy_%(work_x)s),
                                         kernel_params);
                    fmt_str = "gpuarray error: kSoftmaxWithBias_fixed_shared%(nodename)s: %%s";
                    msg = GpuKernel_error(&kSoftmaxWithBias_fixed_shared%(nodename)s, err);
                }
                %(err_check)s
            }
        }
        assert(%(z)s);
        """
            % locals()
        )

    def gpu_kernels(self, node, nodename):
        # Same two-variant scheme as GpuSoftmax (full-row shared-memory
        # kernel plus a fixed-shared-memory fallback), with the bias `b`
        # added to each row element before the softmax.
        dtype_x = node.inputs[0].dtype
        dtype_b = node.inputs[1].dtype
        dtype_sm = node.outputs[0].dtype
        load_x = load_w(node.inputs[0].dtype)
        load_b = load_w(node.inputs[1].dtype)
        write_sm = write_w(node.outputs[0].dtype)
        work_sm = work_dtype(node.outputs[0].dtype)
        flags = Kernel.get_flags(dtype_x, dtype_b, dtype_sm)
        type_x = gpuarray.dtype_to_ctype(dtype_x)
        type_b = gpuarray.dtype_to_ctype(dtype_b)
        type_sm = gpuarray.dtype_to_ctype(dtype_sm)
        type_acc = gpuarray.dtype_to_ctype(work_sm)
        # NOTE(review): ``ctype`` duplicates ``type_acc``; both are kept for
        # the ``% locals()`` substitutions below.
        ctype = gpuarray.dtype_to_ctype(work_sm)
        # Parameter signature: must match the KERNEL declarations below.
        params = [
            gpuarray.SIZE,
            gpuarray.SIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SSIZE,
            gpuarray.SSIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SSIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SSIZE,
            gpuarray.SSIZE,
        ]
        kernels = []
        kname = "kSoftmaxWithBias"
        k_var = "kSoftmaxWithBias_" + nodename
        code = (
            """#include "cluda.h"

        KERNEL void %(kname)s (const ga_size M, const ga_size N,
                       GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
                       GLOBAL_MEM const %(type_b)s * b, const ga_size offset_b, const ga_ssize sb0,
                       GLOBAL_MEM %(type_sm)s * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1 GA_DECL_SHARED_PARAM(%(type_acc)s, buf))
        {
            GA_DECL_SHARED_BODY(%(type_acc)s, buf);
            LOCAL_MEM_ARG %(type_acc)s * buf2 = buf + N;
            x = (GLOBAL_MEM const %(type_x)s *)(((GLOBAL_MEM char *)x)+offset_x);
            b = (GLOBAL_MEM const %(type_b)s *)(((GLOBAL_MEM char *)b)+offset_b);
            sm = (GLOBAL_MEM %(type_sm)s *)(((GLOBAL_MEM char *)sm)+offset_sm);
            for (ga_int blockIDX = GID_0; blockIDX < M; blockIDX += GDIM_0){
                for (ga_int tx = LID_0; tx< N; tx += LDIM_0){
                    buf[tx] = %(load_x)s(x[blockIDX * sx0 + tx * sx1]);
                    buf[tx] += %(load_b)s(b[tx * sb0]);
                    buf2[tx] = buf[tx];
                }
                local_barrier();
                {
                    // This function trashes buf[1..GA_WARP_SIZE],
                    // leaving the reduction result in buf[0].
                    if (LID_0 < GA_WARP_SIZE) {
                        for (ga_int i = LID_0 + GA_WARP_SIZE; i < N; i += GA_WARP_SIZE)
                        {
                            buf[LID_0] = max(buf[LID_0], buf[i]);
                        }
                    }
                    local_barrier();
                    //reduce so that LID_0 0 has the reduction of everything
                    for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) {
                        if (LID_0 < _n && LID_0 + _n < N)
                            buf[LID_0] = max(buf[LID_0], buf[LID_0+_n]);
                        local_barrier();
                    }
                }
                %(ctype)s row_max = buf[0];
                local_barrier();
                for(ga_int __i=LID_0; __i<N; __i+=LDIM_0){;
                    buf[__i] = exp(buf2[__i] - row_max);
                    buf2[__i] = buf[__i];
                }
                local_barrier();
                {
                    // This function trashes buf[1..GA_WARP_SIZE],
                    // leaving the reduction result in buf[0].
                    if (LID_0 < GA_WARP_SIZE) {
                        for (ga_int i = LID_0 + GA_WARP_SIZE; i < N; i += GA_WARP_SIZE)
                        {
                            buf[LID_0] = buf[LID_0] + buf[i];
                        }
                    }
                    local_barrier();
                    //reduce so that LID_0 0 has the reduction of everything
                    for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) {
                        if (LID_0 < _n && LID_0 + _n < N)
                            buf[LID_0] = buf[LID_0] + buf[LID_0+_n];
                        local_barrier();
                    }
                }
                %(ctype)s row_sum = buf[0];
                local_barrier();
                for(ga_int __i=LID_0; __i<N; __i+=LDIM_0){
                    buf[__i] = buf2[__i] / row_sum;
                }
                local_barrier();
                for (ga_int tx = LID_0; tx< N; tx += LDIM_0){
                    sm[blockIDX * sm_s0 + tx * sm_s1] = %(write_sm)s(buf[tx]);
                }
                local_barrier();
            }
        }
        """
            % locals()
        )
        kernels.append(
            Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)
        )
        kname = "kSoftmaxWithBias_fixed_shared"
        k_var = "kSoftmaxWithBias_fixed_shared" + nodename
        code = (
            """#include "cluda.h"

        KERNEL void %(kname)s (const ga_size M, const ga_size N,
                       GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
                       GLOBAL_MEM const %(type_b)s * b, const ga_size offset_b, const ga_ssize sb0,
                       GLOBAL_MEM %(type_sm)s * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1 GA_DECL_SHARED_PARAM(%(type_acc)s, buf))
        {
            GA_DECL_SHARED_BODY(%(type_acc)s, buf);
            x = (GLOBAL_MEM const %(type_x)s *)(((GLOBAL_MEM char *)x)+offset_x);
            b = (GLOBAL_MEM const %(type_b)s *)(((GLOBAL_MEM char *)b)+offset_b);
            sm = (GLOBAL_MEM %(type_sm)s *)(((GLOBAL_MEM char *)sm)+offset_sm);
            for (ga_int blockIDX = GID_0; blockIDX < M; blockIDX += GDIM_0){
                GLOBAL_MEM const %(type_x)s *x_ptr = &x[blockIDX * sx0];
                GLOBAL_MEM %(type_sm)s *sm_ptr = &sm[blockIDX * sm_s0];
                {
                    // This function trashes buf[1..n_threads],
                    // leaving the reduction result in buf[0].
                    %(ctype)s red = %(load_x)s(x_ptr[LID_0 * sx1]) + %(load_b)s(b[LID_0 * sb0]);
                    #pragma unroll 16
                    for (ga_int i = LID_0 + LDIM_0; i<N; i += LDIM_0) {
                        red = max(red, %(load_x)s(x_ptr[i * sx1]) + %(load_b)s(b[i * sb0]));
                    }
                    buf[LID_0] = red;
                    local_barrier();
                    if (LID_0 < GA_WARP_SIZE) {
                        for (ga_int i = LID_0 + GA_WARP_SIZE; i < LDIM_0; i += GA_WARP_SIZE) {
                            buf[LID_0] = max(buf[LID_0], buf[i]);
                        }
                    }
                    local_barrier();
                    //reduce so that LID_0 0 has the reduction of everything
                    for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) {
                        if (LID_0 < _n && LID_0 + _n < N)
                            buf[LID_0] = max(buf[LID_0], buf[LID_0+_n]);
                        local_barrier();
                    }
                }
                %(ctype)s row_max = buf[0];
                local_barrier();
                {
                    // This function trashes buf[1..n_threads],
                    // leaving the reduction result in buf[0].
                    %(ctype)s red = exp(%(load_x)s(x_ptr[LID_0 * sx1]) + %(load_b)s(b[LID_0 * sb0]) - row_max);
                    #pragma unroll 16
                    for (ga_int i = LID_0 + LDIM_0; i<N; i += LDIM_0) {
                        red = red + exp(%(load_x)s(x_ptr[i * sx1]) + %(load_b)s(b[i * sb0]) - row_max);
                    }
                    buf[LID_0] = red;
                    local_barrier();
                    if (LID_0 < GA_WARP_SIZE) {
                        for (ga_int i = LID_0 + GA_WARP_SIZE; i < LDIM_0; i += GA_WARP_SIZE) {
                            buf[LID_0] = buf[LID_0] + buf[i];
                        }
                    }
                    local_barrier();
                    //reduce so that LID_0 0 has the reduction of everything
                    for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) {
                        if (LID_0 < _n && LID_0 + _n < N)
                            buf[LID_0] = buf[LID_0] + buf[LID_0+_n];
                        local_barrier();
                    }
                }
                %(ctype)s row_sum = buf[0];
                local_barrier();
                for (ga_int tx = LID_0; tx< N; tx += LDIM_0){
                    sm_ptr[tx * sm_s1] = %(write_sm)s(exp(%(load_x)s(x_ptr[tx * sx1]) + %(load_b)s(b[tx * sb0]) - row_max) / row_sum);
                }
                local_barrier();
            }
        }
        """
            % locals()
        )
        kernels.append(
            Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)
        )
        return kernels
# Singleton instance used by the GPU graph optimizers.
gpu_softmax_with_bias = GpuSoftmaxWithBias()
This source diff could not be displayed because it is too large. You can view the blob instead.
from functools import wraps
import numpy as np
from aesara import scalar as aes
from aesara.gpuarray.basic_ops import (
GpuAllocEmpty,
GpuFromHost,
GpuReshape,
HostFromGpu,
host_from_gpu,
)
from aesara.gpuarray.elemwise import GpuDimShuffle, GpuElemwise
from aesara.gpuarray.type import GpuArrayType, get_context, move_to_gpu
from aesara.graph.basic import Constant
from aesara.graph.op import Op
from aesara.graph.opt import copy_stack_trace, local_optimizer
from aesara.tensor.basic import as_tensor, cast, get_scalar_constant_value, join
from aesara.tensor.elemwise import DimShuffle
from aesara.tensor.exceptions import NotScalarConstantError
from aesara.tensor.math import prod
from aesara.tensor.shape import shape_padright
from aesara.tensor.type import TensorType
# Define a few operations to use in optimizations,
# in order to avoid introducing new CPU Ops, or useless ones.
def safe_to_gpu(x, ctx_name):
    """Transfer `x` to the GPU context `ctx_name` if it is a CPU tensor.

    Variables that are not of ``TensorType`` (e.g. already on the GPU)
    are returned unchanged.
    """
    if not isinstance(x.type, TensorType):
        return x
    return GpuFromHost(ctx_name)(x)
def safe_to_cpu(x):
    """Move `x` back to the host if it lives on the GPU; otherwise no-op."""
    return x.transfer("cpu") if isinstance(x.type, GpuArrayType) else x
def grab_cpu_scalar(v, nd):
    """
    Get a scalar variable value from the tree at `v`.

    This function will dig through transfers and dimshuffles to get
    the constant value. If no such constant is found, it returns None.

    Parameters
    ----------
    v
        Aesara variable to extract the constant value from.
    nd : int
        Expected number of dimensions for the variable (for
        broadcasted constants).
    """
    node = v.owner
    if node is None:
        # Leaf variable: accept only a constant that broadcasts in all
        # `nd` dimensions, collapsed down to a 0-d scalar.
        if isinstance(v, Constant) and v.broadcastable == (True,) * nd:
            return v.dimshuffle(())
        return None
    op = node.op
    if isinstance(op, (GpuDimShuffle, DimShuffle)) and op.new_order == ("x",) * nd:
        # Peel off a pure broadcast dimshuffle and recurse on its input.
        inner = node.inputs[0]
        return grab_cpu_scalar(inner, inner.ndim)
    if isinstance(op, (GpuFromHost, HostFromGpu)):
        # Transfers do not change the value; look through them.
        return grab_cpu_scalar(node.inputs[0], nd)
    return None
def find_node(fgraph, v, cls, ignore_clients=False):
    """
    Find the node that has an op of type `cls` in `v`.

    This digs through possibly redundant transfers to find the node
    that has the type `cls`. If `ignore_clients` is False (the
    default) it will only dig through nodes that have a single client
    to avoid duplicating computations.

    Parameters
    ----------
    v
        The variable to dig through
    cls : Op class
        The type of the node we are looking for
    ignore_clients : bool, optional
        Whether to ignore multiple clients or not.
    """
    if v.owner is not None and (ignore_clients or len(fgraph.clients[v]) == 1):
        if isinstance(v.owner.op, cls):
            return v.owner
        elif (
            # Look through a redundant GpuFromHost(HostFromGpu(x)) pair.
            isinstance(v.owner.op, GpuFromHost)
            and v.owner.inputs[0].owner is not None
            and (ignore_clients or len(fgraph.clients[v.owner.inputs[0]]) == 1)
            and isinstance(v.owner.inputs[0].owner.op, HostFromGpu)
        ):
            # NOTE(review): the recursive call does not forward
            # `ignore_clients`, so it reverts to False after one transfer
            # hop — confirm whether this is intentional.
            return find_node(fgraph, v.owner.inputs[0].owner.inputs[0], cls)
        else:
            return None
def is_equal(var, val):
    """
    Returns True if `var` is always equal to `val`.

    This will only return True if the variable will always be equal to
    the value. If it might not be true in some cases then it returns False.

    Parameters
    ----------
    var
        Variable to compare
    val
        Python value
    """
    try:
        # Only a graph constant can be "always equal" to a value.
        return get_scalar_constant_value(var) == val
    except NotScalarConstantError:
        return False
def alpha_merge(cls, alpha_in, beta_in):
    """
    Decorator to merge multiplication by a scalar on the output.

    This will find a pattern of `aes * <yourop>(some, params, alpha,
    beta)` and update it so that the scalar multiplication happens as
    part of your op.

    The op needs to accept an alpha and a beta scalar which act this way::

        out = Op() * alpha + out_like * beta

    Where out_like is a buffer that has the same size as the output
    and gets added to the "real" output of the operation. An example
    of an operation that respects this pattern is GEMM from blas.

    The decorated function must have this signature::

        maker(node, *inputs)

    The `node` argument you receive is the original apply node that
    contains your op. You should use it to grab relevant properties
    for your op so that the new version performs the same computation.
    The `*inputs` parameters contains the new inputs for your op. You
    MUST use those inputs instead of the ones on `node`. Note that
    this function can be as simple as::

        def maker(node, *inputs):
            return node.op(*inputs)

    Parameters
    ----------
    cls : op class
        The class of the op you want to merge
    alpha_in : int
        The input index for the alpha scalar for your op (in node.inputs).
    beta_in : int
        The input index for the beta scalar for your op (in node.inputs).

    Returns
    -------
    local optimizer
        an unregistered local optimizer that has the same name as the
        decorated function.

    Notes
    -----
    This was factored out since the code to deal with intervening
    transfers and correctness in the presence of different values of
    alpha and beta scaling factors is not trivial.
    """

    def wrapper(maker):
        @local_optimizer([GpuElemwise])
        @wraps(maker)
        def opt(fgraph, node):
            # Match an elementwise multiplication with exactly two inputs.
            if (
                isinstance(node.op, GpuElemwise)
                and node.op.scalar_op == aes.mul
                and node.nin == 2
            ):
                # One side must be the target op, the other a broadcasted
                # CPU scalar.
                targ = find_node(fgraph, node.inputs[0], cls)
                if targ is None:
                    targ = find_node(fgraph, node.inputs[1], cls)
                    if targ is None:
                        return
                    lr = grab_cpu_scalar(node.inputs[0], nd=targ.outputs[0].ndim)
                else:
                    lr = grab_cpu_scalar(node.inputs[1], nd=targ.outputs[0].ndim)
                if lr is None or lr.dtype != targ.outputs[0].dtype:
                    return None
                inputs = list(targ.inputs)
                try:
                    # Special-case constant multipliers 0 and 1 to avoid
                    # building useless multiplication nodes.
                    c = get_scalar_constant_value(lr)
                    if c == 0:
                        inputs[alpha_in] = lr
                        inputs[beta_in] = lr
                    elif c == 1:
                        inputs[alpha_in] = targ.inputs[alpha_in]
                        inputs[beta_in] = targ.inputs[beta_in]
                    else:
                        inputs[alpha_in] = lr * targ.inputs[alpha_in]
                        inputs[beta_in] = lr * targ.inputs[beta_in]
                except NotScalarConstantError:
                    # Non-constant scalar: scale both alpha and beta.
                    inputs[alpha_in] = lr * targ.inputs[alpha_in]
                    inputs[beta_in] = lr * targ.inputs[beta_in]
                new_out = maker(targ, *inputs)
                copy_stack_trace(node.outputs, new_out)
                return new_out

        return opt

    return wrapper
def output_merge(cls, alpha_in, beta_in, out_in):
    """
    Decorator to merge addition by a value on the output.

    This will find a pattern of `val * <yourop>(some, params, alpha,
    beta, out_like)` and update it so that the addition happens as
    part of your op.

    The op needs to accept an alpha and a beta scalar which act this way::

        out = Op() * alpha + out_like * beta

    Where out_like is a buffer that has the same size as the output
    and gets added to the "real" output of the operation. An example
    of an operation that respects this pattern is GEMM from blas.

    The decorated function must have this signature::

        maker(node, *inputs)

    The `node` argument you receive is the original apply node that
    contains your op. You should use it to grab relevant properties
    for your op so that the new version performs the same computation.
    The `*inputs` parameters contains the new inputs for your op. You
    MUST use those inputs instead of the ones on `node`. Note that
    this function can be as simple as::

        def maker(node, *inputs):
            return node.op(*inputs)

    Parameters
    ----------
    cls : op class
        The class of the op you want to merge
    alpha_in : int
        The input index for the alpha scalar for your op (in node.inputs).
    beta_in : int
        The input index for the beta scalar for your op (in node.inputs).
    out_in : int
        The input index for the out_like input for your op (in node.inputs).

    Returns
    -------
    local optimizer
        an unregistered local optimizer that has the same name as the
        decorated function.

    Notes
    -----
    This was factored out since the code to deal with intervening
    transfers and correctness in the presence of different values of
    alpha and beta scaling factors is not trivial.

    This also correctly handles the case where the added value is
    broadcasted (by not performing the replacement).
    """

    def wrapper(maker):
        @local_optimizer([GpuElemwise])
        @wraps(maker)
        def opt(fgraph, node):
            # Match an elementwise addition with exactly two inputs.
            if (
                isinstance(node.op, GpuElemwise)
                and node.op.scalar_op == aes.add
                and node.nin == 2
            ):
                targ = find_node(fgraph, node.inputs[0], cls)
                W = node.inputs[1]
                if targ is None:
                    targ = find_node(fgraph, node.inputs[1], cls)
                    W = node.inputs[0]
                if targ is None:
                    return None
                if W.dtype != targ.outputs[0].dtype:
                    return None
                if not is_equal(targ.inputs[beta_in], 0.0):
                    # other cases are too complex for now
                    return None
                if W.broadcastable != targ.inputs[out_in].broadcastable:
                    # Would need to explicitly tile the output to fill
                    # the full shape here. Disable for now.
                    return None
                inputs = list(targ.inputs)
                inputs[out_in] = W
                # out_like is now the added value, so beta becomes 1.
                dtype = inputs[beta_in].dtype
                one = aes.constant(np.asarray(1.0, dtype=dtype))
                inputs[beta_in] = one
                new_out = maker(targ, *inputs)
                copy_stack_trace(node.outputs, new_out)
                return new_out

        return opt

    return wrapper
def inplace_allocempty(op, idx):
    """
    Wrapper to make an inplace optimization that deals with AllocEmpty

    This will duplicate the alloc input if it has more than one client
    to allow the op to work on it inplace.

    The decorated function must have this signature::

        maker(node, inputs)

    The `node` argument you receive is the original apply node that
    contains your op. You should use it to grab relevant properties
    for your op so that the new version performs the same computation.
    You should also switch the op to work inplace. The `inputs`
    parameter contains the new inputs for your op. You MUST use
    those inputs instead of the ones on `node`. Note that this
    function can be as simple as::

        def maker(node, inputs):
            return [node.op.__class__(inplace=True)(*inputs)]

    Parameters
    ----------
    op : op class
        The op class to look for to make inplace
    idx : int
        The index of the (possibly) AllocEmpty input (in node.inputs).

    Returns
    -------
    local optimizer
        an unregistered inplace local optimizer that has the same name
        as the decorated function.
    """

    def wrapper(maker):
        @local_optimizer([op], inplace=True)
        @wraps(maker)
        def opt(fgraph, node):
            if not isinstance(node.op, op) or node.op.inplace:
                return
            inputs = list(node.inputs)
            alloc = inputs[idx]
            if (
                # Duplicate a shared AllocEmpty so the inplace op gets a
                # private buffer it is allowed to destroy.
                alloc.owner
                and isinstance(alloc.owner.op, GpuAllocEmpty)
                and len(fgraph.clients[alloc]) > 1
            ):
                alloc_op = GpuAllocEmpty(
                    alloc.owner.op.dtype, alloc.owner.op.context_name
                )
                inputs[idx] = alloc_op(*alloc.owner.inputs)
            new_out = maker(node, inputs)
            copy_stack_trace(node.outputs, new_out)
            return new_out

        return opt

    return wrapper
def pad_dims(input, leftdims, rightdims):
    """Reshapes the input to a (leftdims + rightdims) tensor

    This helper function is used to convert pooling inputs with arbitrary
    non-pooling dimensions to the correct number of dimensions for the
    GPU pooling ops.

    This reduces or expands the number of dimensions of the input to
    exactly `leftdims`, by adding extra dimensions on the left or by
    combining some existing dimensions on the left of the input.

    Use `unpad_dims` to reshape back to the original dimensions.

    Examples
    --------
    Given input of shape (3, 5, 7), ``pad_dims(input, 2, 2)``
    adds a singleton dimension and reshapes to (1, 3, 5, 7).
    Given that output from pad_dims, ``unpad_dims(output, input, 2, 2)``
    reshapes back to (3, 5, 7).

    Given input of shape (3, 5, 7, 9), ``pad_dims(input, 2, 2)``
    does not reshape and returns output with shape (3, 5, 7, 9).

    Given input of shape (3, 5, 7, 9, 11), ``pad_dims(input, 2, 2)``
    combines the first two dimensions and reshapes to (15, 7, 9, 11).

    Given input of shape (3, 5, 7, 9), ``pad_dims(input, 2, 3)``
    adds a singleton dimension and reshapes to (1, 3, 5, 7, 9).
    """
    assert input.ndim >= rightdims

    target_ndim = leftdims + rightdims
    if input.ndim == target_ndim:
        # Already the right rank: nothing to do.
        return input

    # The trailing `rightdims` axes (the pooled "image" axes) are preserved.
    img_shape = input.shape[-rightdims:]
    non_pool_ndim = input.ndim - rightdims

    if non_pool_ndim < leftdims:
        # Too few leading dimensions: pad on the left with singletons.
        singletons = as_tensor([1] * (leftdims - non_pool_ndim))
        target_shape = join(0, singletons, input.shape[:non_pool_ndim], img_shape)
    else:
        # Too many leading dimensions: fold the extras into one batch axis.
        batched_ndim = non_pool_ndim - leftdims + 1
        collapsed = prod(input.shape[:batched_ndim])
        # `join` needs a vector, so promote the scalar product.
        collapsed = shape_padright(collapsed, 1)
        target_shape = join(
            0, collapsed, input.shape[batched_ndim:non_pool_ndim], img_shape
        )

    # GpuReshape expects an int64 shape vector.
    target_shape = cast(target_shape, "int64")
    return GpuReshape(target_ndim)(input, target_shape)
def unpad_dims(output, input, leftdims, rightdims):
    """Reshapes the output after pad_dims.

    This reverts the padding by `pad_dims`.
    """
    if output.ndim == input.ndim:
        # `pad_dims` did not reshape, so there is nothing to undo.
        return output
    # Rebuild the original leading shape, keeping the (possibly pooled)
    # trailing `rightdims` axes from the output.
    restored_shape = join(0, input.shape[:-rightdims], output.shape[-rightdims:])
    return GpuReshape(input.ndim)(output, restored_shape)
def op_lifter(OP, cuda_only=False):
    """
    OP(..., host_from_gpu(), ...) -> host_from_gpu(GpuOP(...))

    gpu_from_host(OP(inp0, ...)) -> GpuOP(inp0, ...)

    Decorator factory: wraps `maker` into a local optimizer that lifts
    nodes of type `OP` onto the GPU when either an input already lives
    on the GPU or every client transfers the result to the GPU.

    Parameters
    ----------
    OP : Op class
        The op type to track and lift.
    cuda_only : bool
        If True, only lift when the target context is a CUDA context.
    """

    def f(maker):
        def local_opt(fgraph, node):
            if isinstance(node.op, OP):
                # Either one of our inputs is on the gpu or
                # all of our clients are on the gpu
                replace = False
                # TODO: Maybe set context_name with infer_context_name()?
                context_name = None
                # We replace if any input is a host_from_gpu
                for i in node.inputs:
                    if i.owner and i.owner.op == host_from_gpu and move_to_gpu(i):
                        # Take the context of the first GPU input found.
                        context_name = i.owner.inputs[0].type.context_name
                        replace = True
                        break

                if not replace:
                    # We replace if *all* clients are on the GPU
                    clients = [c for o in node.outputs for c in fgraph.clients[o]]
                    replace = len(clients) != 0
                    for c, idx in clients:
                        # "output" is the sentinel fgraph uses for graph outputs.
                        if c == "output" or not isinstance(c.op, GpuFromHost):
                            replace = False
                    # TODO: check that the clients want the same context?
                    if replace:
                        # All clients are GpuFromHost and we have at least one
                        context_name = clients[0][0].op.context_name

                # Check if we should replace
                if (
                    not replace
                    or (cuda_only and get_context(context_name).kind != b"cuda")
                    # Complex dtypes are not supported on the GPU backend.
                    or any("complex" in getattr(i, "dtype", "") for i in node.inputs)
                ):
                    return False

                # tag the inputs with the context in case
                # the context was derived from the outputs
                for i in node.inputs:
                    i.tag.context_name = context_name

                new_op = maker(node.op, context_name, node.inputs, node.outputs)

                # This is needed as sometimes new_op inherits from OP.
                if new_op and new_op != node.op:
                    if isinstance(new_op, Op):
                        new_outputs = new_op(*node.inputs, return_list=True)
                        to_cpu_fn = safe_to_cpu
                    elif isinstance(new_op, (tuple, list)):
                        # `maker` may return ready-made output variables.
                        new_outputs = new_op
                        to_cpu_fn = safe_to_cpu
                    else:  # suppose it is a variable on the GPU
                        new_outputs = [new_op]

                        def to_cpu_fn(x):
                            return x.transfer("cpu")

                    # copy stack traces onto gpu outputs
                    # also copy the stack traces onto HostFromGpu outputs
                    on_cpu = []
                    for old_output, new_output in zip(node.outputs, new_outputs):
                        copy_stack_trace(old_output, new_output)
                        cpu = to_cpu_fn(new_output)
                        on_cpu.append(cpu)
                        copy_stack_trace(old_output, cpu)
                    return on_cpu
            return False

        local_opt.__name__ = maker.__name__
        return local_optimizer(OP)(local_opt)

    return f
import time
from aesara.compile import optdb
from aesara.graph.basic import applys_between
from aesara.graph.opt import LocalOptGroup, TopoOptimizer, local_optimizer
from aesara.graph.optdb import (
EquilibriumDB,
LocalGroupDB,
OptimizationDatabase,
SequenceDB,
)
class GraphToGPULocalOptGroup(LocalOptGroup):
    """This is the equivalent of `LocalOptGroup` for `GraphToGPU`.

    The main difference is the function signature of the local
    optimizer that uses the `GraphToGPU` signature and not the normal
    `LocalOptimizer` signature.

    ``apply_all_opts=True`` is not supported

    """

    def __init__(self, *optimizers, **kwargs):
        super().__init__(*optimizers, **kwargs)
        assert self.apply_all_opts is False

    def transform(self, fgraph, op, context_name, inputs, outputs):
        """Try each tracked optimizer in turn; return the first replacement.

        Returns ``None`` when no optimizer applies.
        """
        if len(self.opts) == 0:
            return
        for opt in self.tracker.get_trackers(op):
            opt_start = time.time()
            new_repl = opt.transform(fgraph, op, context_name, inputs, outputs)
            opt_finish = time.time()
            if self.profile:
                # BUG FIX: accumulate the elapsed time (finish - start).
                # The previous code added ``opt_start - opt_finish``,
                # i.e. a negative duration, corrupting profile output.
                self.time_opts[opt] += opt_finish - opt_start
                self.process_count[opt] += 1
            if not new_repl:
                continue
            if self.profile:
                self.node_created[opt] += len(
                    list(applys_between(fgraph.variables, new_repl))
                )
                self.applied_true[opt] += 1

            return new_repl
# Rewrite databases for the GPU backend.
gpu_optimizer = EquilibriumDB()
gpu_cut_copies = EquilibriumDB()

# Not used for an EquilibriumOptimizer. It has the "tracks" that we need for GraphToGPUDB.
gpu_optimizer2 = EquilibriumDB()

gpu_seqopt = SequenceDB()

# do not add 'fast_run' to these two as this would always enable gpuarray mode
optdb.register(
    "gpuarray_opt",
    gpu_seqopt,
    "gpuarray",
    # Run one position before the destroy handler; 49.5 is the fallback
    # when "add_destroy_handler" is absent from the position map.
    position=optdb.__position__.get("add_destroy_handler", 49.5) - 1,
)

# Per-feature local databases.  The *2 variants use the GraphToGPU
# signature (via GraphToGPULocalOptGroup) instead of the plain
# LocalOptimizer signature.
pool_db = LocalGroupDB()
pool_db2 = LocalGroupDB(local_opt=GraphToGPULocalOptGroup)
pool_db2.__name__ = "pool_db2"
matrix_ops_db = LocalGroupDB()
matrix_ops_db2 = LocalGroupDB(local_opt=GraphToGPULocalOptGroup)
matrix_ops_db2.__name__ = "matrix_ops_db2"
abstract_batch_norm_db = LocalGroupDB()
abstract_batch_norm_db2 = LocalGroupDB(local_opt=GraphToGPULocalOptGroup)
abstract_batch_norm_db2.__name__ = "abstract_batch_norm_db2"
abstract_batch_norm_groupopt = LocalGroupDB()
abstract_batch_norm_groupopt.__name__ = "gpuarray_batchnorm_opts"
def register_opt(*tags, **kwargs):
    """Decorator registering `local_opt` in `gpu_optimizer`.

    The optimizer is registered under the "fast_run" and "gpuarray"
    tags plus any extra `tags`.  An explicit registration name may be
    supplied with the ``name`` keyword; otherwise the optimizer's own
    ``__name__`` is used.
    """

    def f(local_opt):
        # BUG FIX: ``(kwargs and kwargs.pop("name"))`` raised ``KeyError``
        # whenever *other* keyword arguments were passed without ``name``.
        # ``pop`` with a default handles every case.
        name = kwargs.pop("name", None) or local_opt.__name__
        gpu_optimizer.register(name, local_opt, "fast_run", "gpuarray", *tags)
        return local_opt

    return f
def register_opt2(tracks, *tags, **kwargs):
    """
    Decorator for the new GraphToGPU optimizer.

    Takes an extra parameter(Op) compared to register_opt decorator.

    Parameters
    ----------
    tracks : List of Op class Or Op instance or None
        The Node's Op to which optimization is being applied.
    tags : String
        The optimization tag to which the optimizer will be registered.

    """

    def f(local_opt):
        # BUG FIX: ``(kwargs and kwargs.pop("name"))`` raised ``KeyError``
        # whenever *other* keyword arguments were passed without ``name``.
        name = kwargs.pop("name", None) or local_opt.__name__
        if isinstance(local_opt, OptimizationDatabase):
            # A whole database can be registered as-is.
            opt = local_opt
        else:
            # Wrap the bare function into a LocalOptimizer tracking `tracks`.
            opt = local_optimizer(tracks)(local_opt)
        gpu_optimizer2.register(name, opt, "fast_run", "gpuarray", *tags)
        return local_opt

    return f
def register_inplace(*tags, **kwargs):
    """Decorator registering `local_opt` as an inplace rewrite in `optdb`.

    The optimizer is wrapped in a `TopoOptimizer` (with the inplace
    failure callback) and registered at position 60 under the
    "fast_run", "inplace" and "gpuarray" tags plus any extra `tags`.
    """

    def f(local_opt):
        # BUG FIX: ``(kwargs and kwargs.pop("name"))`` raised ``KeyError``
        # whenever *other* keyword arguments were passed without ``name``.
        name = kwargs.pop("name", None) or local_opt.__name__
        optdb.register(
            name,
            TopoOptimizer(local_opt, failure_callback=TopoOptimizer.warn_inplace),
            "fast_run",
            "inplace",
            "gpuarray",
            *tags,
            position=60,
        )
        return local_opt

    return f
# Register GPU convolution implementation
# They are tried in a specific order so we can control
# which ones take precedence over others.
abstractconv_groupopt = LocalGroupDB()
abstractconv_groupopt.__name__ = "gpuarray_abstractconv_opts"
# Registered under "fast_compile" in addition to register_opt's default tags.
register_opt("fast_compile")(abstractconv_groupopt)
class GraphToGPUDB(OptimizationDatabase):
    """
    Retrieves the list local optimizers based on the optimizer flag's value
    from EquilibriumOptimizer by calling the method query.

    """

    def query(self, *tags, **kwtags):
        # Local import — presumably to avoid an import cycle with
        # aesara.gpuarray.opt (TODO confirm).
        from aesara.gpuarray.opt import GraphToGPU

        opt = gpu_optimizer2.query(*tags, **kwtags)
        # Build a GraphToGPU pass from the raw local optimizers selected
        # by the query.
        return GraphToGPU(opt.local_optimizers_all, opt.local_optimizers_map)
import os
import sys
from typing import Set
class PathParser:
    """
    Class that allows to modify system's PATH environment variable
    at runtime. Currently used in ``aesara.gpuarray.dnn`` module
    on Windows only.

    **Examples**:

    ..code-block:: python

        aesara.pathparse.PathParser(pathToAdd1, pathToAdd2, ...)
        # PATH is then automatically updated for this execution.

    ..code-block:: python

        paths = aesara.pathparse.PathParser()
        paths.add(path1)
        paths.add(path2)
        # PATH is updated after each call to ``add()``.
    """

    # Kept for backward compatibility of ``PathParser.paths`` lookups; each
    # instance shadows this with its own set in ``__init__``.
    paths: Set = set()

    def __init__(self, *paths):
        # BUG FIX: ``paths`` used to be a single class-level set shared by
        # every instance, so paths added through one PathParser leaked into
        # all others.  Give each instance its own set.
        self.paths = set()
        self._parse()
        for path in paths:
            self._add(path)
        self._update()

    def _add(self, path):
        """Normalize `path` and record it (does not touch the environment)."""
        path = path.strip()
        if path:
            if sys.platform == "win32":
                # Windows is case-insensitive.
                path = path.lower()
            self.paths.add(os.path.abspath(path))

    def _update(self):
        """Write the recorded paths back to ``os.environ["PATH"]`` (sorted)."""
        os.environ["PATH"] = os.pathsep.join(sorted(self.paths))

    def _parse(self):
        """Seed the set with the entries currently in PATH."""
        for path in os.environ["PATH"].split(os.pathsep):
            self._add(path)

    def add(self, path):
        """Record `path` and immediately update the PATH variable."""
        self._add(path)
        self._update()

    def _debug(self):
        # Debug helper: print the recorded paths in sorted order.
        for path in sorted(self.paths):
            print(path)
import aesara
from aesara.gpuarray.basic_ops import (
CGpuKernelBase,
as_gpuarray_variable,
gpu_contiguous,
gpuarray_helper_inc_dir,
infer_context_name,
)
from aesara.gpuarray.type import gpu_context_type
from aesara.graph.basic import Apply
from aesara.link.c.params_type import ParamsType
from aesara.scalar import bool as bool_t
from aesara.tensor.basic import as_tensor_variable
from aesara.tensor.signal.pool import Pool, PoolingMode_t
from aesara.tensor.type import int_dtypes
try:
import pygpu
except ImportError:
# To make sure aesara is importable
pass
class GpuPool(CGpuKernelBase):
    """
    Implement the max and average pooling on the gpu.

    """

    __props__ = ("ignore_border", "mode", "ndim")
    # C-side parameters: border handling, pooling mode and the GPU context.
    params_type = ParamsType(
        ignore_border=bool_t, mode=PoolingMode_t, context=gpu_context_type
    )

    def __init__(self, ignore_border, mode="max", ndim=2):
        self.ndim = ndim
        self.ignore_border = ignore_border
        # "average" is normalized to the "average_inc_pad" alias.
        if mode == "average":
            mode = "average_inc_pad"
        self.mode = mode
        CGpuKernelBase.__init__(self, ["c_code/pool.c"], "APPLY_SPECIFIC(pool)")
        assert PoolingMode_t.has_alias(self.mode)
        assert self.ndim in (2, 3)

    def get_params(self, node):
        # Bind the GPU context of the image input to the C params struct.
        return self.params_type.get_params(self, context=node.inputs[0].type.context)

    def c_headers(self, **kwargs):
        return ["gpuarray_api.h", "gpuarray_helper.h", "numpy_compat.h"]

    def c_header_dirs(self, **kwargs):
        return [gpuarray_helper_inc_dir(), pygpu.get_include()]

    def make_node(self, inp, ws, stride=None, pad=None):
        """Build the Apply node.

        `inp` must have ``ndim + 2`` dimensions (leading batch/channel axes
        plus `ndim` pooled axes).  `ws`, `stride` and `pad` are 1-d integer
        vectors; `stride` defaults to `ws` and `pad` to all zeros.
        """
        ctx_name = infer_context_name(inp)
        inp = as_gpuarray_variable(inp, ctx_name)
        nd = self.ndim
        assert inp.ndim == nd + 2
        if stride is None:
            stride = ws
        if pad is None:
            pad = (0,) * nd
        elif isinstance(pad, (tuple, list)):
            # Early validation is only possible for literal pad/ws values.
            if max(pad) != 0 and not self.ignore_border:
                raise ValueError("Padding works only with ignore_border=True")
            if isinstance(ws, (tuple, list)):
                if any(pad[i] >= ws[i] for i in range(nd)):
                    raise ValueError("Padding must be smaller than strides")
        ws = as_tensor_variable(ws)
        stride = as_tensor_variable(stride)
        pad = as_tensor_variable(pad)
        assert ws.ndim == stride.ndim and ws.ndim == pad.ndim
        assert ws.ndim == 1
        if ws.dtype not in int_dtypes:
            raise TypeError("Window shape parameters must be ints.")
        if stride.dtype not in int_dtypes:
            raise TypeError("Stride parameters must be ints.")
        if pad.dtype not in int_dtypes:
            raise TypeError("Padding parameters must be ints.")
        ws = aesara.tensor.cast(ws, "int64")
        stride = aesara.tensor.cast(stride, "int64")
        pad = aesara.tensor.cast(pad, "int64")
        return Apply(self, [inp, ws, stride, pad], [inp.type()])

    def infer_shape(self, fgraph, node, in_shapes):
        # Delegate to the CPU Pool op's shape computation.
        ws, stride, pad = [node.inputs[1], node.inputs[2], node.inputs[3]]
        shp = Pool.out_shape(
            in_shapes[0], ws, self.ignore_border, stride, pad, self.ndim
        )
        return [shp]

    def grad(self, inp, grads):
        img, ws, stride, pad = inp
        (grad,) = grads
        grad = gpu_contiguous(grad)
        # ws/stride/pad are integer hyper-parameters: disconnected from the cost.
        disc = [aesara.gradient.DisconnectedType()() for i in inp[1:]]
        if self.mode == "max":
            # The max-pool gradient needs the pooled output as well.
            out = self(img, ws, stride, pad)
            g_out = GpuMaxPoolGrad(ndim=self.ndim, ignore_border=self.ignore_border)(
                img, out, grad, ws, stride, pad
            )
            return [g_out] + disc
        else:
            g_out = GpuAveragePoolGrad(
                ndim=self.ndim, ignore_border=self.ignore_border, mode=self.mode
            )(img, grad, ws, stride, pad)
            return [g_out] + disc

    def connection_pattern(self, node):
        # Only the image input (index 0) is connected to the output.
        return [[1], [0], [0], [0]]

    def R_op(self, inputs, eval_points):
        if self.mode != "max":
            # Rop for average or sum is simply pooling evaluated at eval point
            eval_inputs = [eval_points[0]] + inputs[1:]
            return [self(*eval_inputs)]

        # R_op can receive None as eval_points.
        # That means there is no differentiable path through that input.
        # If this implies that you cannot compute some outputs,
        # return None for those.
        if eval_points[0] is None:
            return [None]
        z = self(*inputs)
        x, ws, stride, pad = inputs
        return [
            GpuDownsampleFactorMaxGradGrad(self.ignore_border, self.mode, self.ndim)(
                x, z, eval_points[0], ws, stride, pad
            )
        ]
class GpuMaxPoolGrad(CGpuKernelBase):
    """
    Implement the grad of max pooling on the gpu.

    """

    __props__ = ("ignore_border", "mode", "ndim")

    def __init__(self, ignore_border, mode="max", ndim=2):
        self.ndim = ndim
        self.ignore_border = ignore_border
        self.mode = mode
        CGpuKernelBase.__init__(
            self, ["c_code/pool_max_grad.c"], "APPLY_SPECIFIC(max_pool_grad)"
        )
        # This op only implements the gradient of *max* pooling.
        assert mode == "max"
        assert ndim in (2, 3)

    def c_headers(self, **kwargs):
        return ["gpuarray_api.h", "gpuarray_helper.h", "numpy_compat.h"]

    def c_header_dirs(self, **kwargs):
        return [gpuarray_helper_inc_dir(), pygpu.get_include()]

    def make_node(self, inp, out, out_grad, ws, stride=None, pad=None):
        """Build the Apply node.

        `inp` is the pooling input, `out` the pooled output and
        `out_grad` the gradient w.r.t. `out`; all three must have
        ``ndim + 2`` dimensions.  `stride` defaults to `ws`, `pad` to zeros.
        """
        ctx_name = infer_context_name(inp, out, out_grad)
        nd = self.ndim
        inp = as_gpuarray_variable(inp, ctx_name)
        assert inp.ndim == nd + 2
        out = as_gpuarray_variable(out, ctx_name)
        assert out.ndim == nd + 2
        out_grad = as_gpuarray_variable(out_grad, ctx_name)
        assert out_grad.ndim == nd + 2

        assert out_grad.ndim == inp.ndim
        assert inp.ndim == out.ndim

        if stride is None:
            stride = ws
        if pad is None:
            pad = (0,) * nd
        ws = as_tensor_variable(ws)
        stride = as_tensor_variable(stride)
        pad = as_tensor_variable(pad)
        assert ws.ndim == stride.ndim and ws.ndim == pad.ndim
        assert ws.ndim == 1
        if ws.dtype not in int_dtypes:
            raise TypeError("Window shape parameters must be ints.")
        if stride.dtype not in int_dtypes:
            raise TypeError("Stride parameters must be ints.")
        if pad.dtype not in int_dtypes:
            raise TypeError("Padding parameters must be ints.")
        ws = aesara.tensor.cast(ws, "int64")
        stride = aesara.tensor.cast(stride, "int64")
        pad = aesara.tensor.cast(pad, "int64")
        return Apply(self, [inp, out, out_grad, ws, stride, pad], [inp.type()])

    def infer_shape(self, fgraph, node, in_shapes):
        # The gradient has the shape of the original pooling input.
        return [in_shapes[0]]

    def grad(self, inp, grads):
        x, maxout, gz, ws, stride, pad = inp
        (ggx,) = grads
        return [
            aesara.tensor.zeros_like(x),
            aesara.tensor.zeros_like(maxout),
            GpuDownsampleFactorMaxGradGrad(
                ndim=self.ndim, ignore_border=self.ignore_border
            )(x, maxout, ggx, ws, stride, pad),
        ] + [aesara.gradient.DisconnectedType()() for i in inp[3:]]

    def connection_pattern(self, node):
        # x, maxout and gz are connected; ws/stride/pad are not.
        return [[1], [1], [1], [0], [0], [0]]
class GpuAveragePoolGrad(CGpuKernelBase):
    """
    Implement the grad of average pooling on the gpu.

    """

    __props__ = ("ignore_border", "mode", "ndim")
    params_type = ParamsType(mode=PoolingMode_t, context=gpu_context_type)

    def __init__(self, ignore_border, mode="average_inc_pad", ndim=2):
        # BUG FIX: the default used to be mode="max", which always failed
        # the assertion below ("max" is not a valid average-grad mode), so
        # the default was unusable.  "average_inc_pad" matches the alias
        # that "average" maps to.
        self.ndim = ndim
        self.ignore_border = ignore_border
        if mode == "average":
            mode = "average_inc_pad"
        self.mode = mode
        CGpuKernelBase.__init__(
            self, ["c_code/pool_ave_grad.c"], "APPLY_SPECIFIC(ave_pool_grad)"
        )
        assert mode in ("sum", "average_inc_pad", "average_exc_pad")
        assert ndim in (2, 3)

    def get_params(self, node):
        # Bind the GPU context of the image input to the C params struct.
        return self.params_type.get_params(self, context=node.inputs[0].type.context)

    def c_headers(self, **kwargs):
        return ["gpuarray_api.h", "gpuarray_helper.h", "numpy_compat.h"]

    def c_header_dirs(self, **kwargs):
        return [gpuarray_helper_inc_dir(), pygpu.get_include()]

    def make_node(self, inp, out_grad, ws, stride=None, pad=None):
        """Build the Apply node.

        `inp` is the pooling input and `out_grad` the gradient w.r.t. the
        pooled output; both must have ``ndim + 2`` dimensions.  `stride`
        defaults to `ws`, `pad` to zeros.
        """
        ctx_name = infer_context_name(inp, out_grad)
        nd = self.ndim
        inp = as_gpuarray_variable(inp, ctx_name)
        assert inp.ndim == nd + 2
        out_grad = as_gpuarray_variable(out_grad, ctx_name)
        assert out_grad.ndim == nd + 2

        assert out_grad.ndim == inp.ndim

        if stride is None:
            stride = ws
        if pad is None:
            pad = (0,) * nd
        elif isinstance(pad, (tuple, list)):
            if max(pad) != 0 and self.mode != "average_exc_pad":
                raise ValueError("Padding must be zero for average_exc_pad")
        ws = as_tensor_variable(ws)
        stride = as_tensor_variable(stride)
        pad = as_tensor_variable(pad)
        assert ws.ndim == stride.ndim and ws.ndim == pad.ndim
        assert ws.ndim == 1
        if ws.dtype not in int_dtypes:
            raise TypeError("Window shape parameters must be ints.")
        if stride.dtype not in int_dtypes:
            raise TypeError("Stride parameters must be ints.")
        if pad.dtype not in int_dtypes:
            raise TypeError("Padding parameters must be ints.")
        ws = aesara.tensor.cast(ws, "int64")
        stride = aesara.tensor.cast(stride, "int64")
        pad = aesara.tensor.cast(pad, "int64")
        return Apply(self, [inp, out_grad, ws, stride, pad], [inp.type()])

    def infer_shape(self, fgraph, node, in_shapes):
        # The gradient has the shape of the original pooling input.
        return [in_shapes[0]]

    def grad(self, inp, grads):
        x, gz, ws, stride, pad = inp
        (ggx,) = grads
        return [
            aesara.tensor.zeros_like(x),
            # The grad of the average-pool grad is average pooling itself.
            GpuPool(ignore_border=self.ignore_border, ndim=self.ndim, mode=self.mode)(
                ggx, ws, stride, pad
            ),
        ] + [aesara.gradient.DisconnectedType()() for i in inp[2:]]

    def connection_pattern(self, node):
        # x and gz are connected; ws/stride/pad are not.
        return [[1], [1], [0], [0], [0]]
class GpuDownsampleFactorMaxGradGrad(CGpuKernelBase):
    """
    Implement the grad of downsample with max on the gpu.

    """

    __props__ = ("ignore_border", "mode", "ndim")

    def __init__(self, ignore_border, mode="max", ndim=2):
        self.ndim = ndim
        self.ignore_border = ignore_border
        self.mode = mode
        CGpuKernelBase.__init__(
            self, ["c_code/pool_grad_grad.c"], "APPLY_SPECIFIC(pool_grad_grad)"
        )
        # Only max pooling has this grad-of-grad implementation.
        assert self.mode == "max"
        assert self.ndim in (2, 3)

    def c_headers(self, **kwargs):
        return ["gpuarray_api.h", "gpuarray_helper.h", "numpy_compat.h"]

    def c_header_dirs(self, **kwargs):
        return [gpuarray_helper_inc_dir(), pygpu.get_include()]

    def make_node(self, inp, out, out_grad, ws, stride=None, pad=None):
        """Build the Apply node.

        `inp` is the pooling input, `out` the pooled output and
        `out_grad` the gradient being propagated; all must have
        ``ndim + 2`` dimensions.  `stride` defaults to `ws`, `pad` to zeros.
        """
        ctx_name = infer_context_name(inp, out, out_grad)
        nd = self.ndim
        inp = as_gpuarray_variable(inp, ctx_name)
        assert inp.ndim == nd + 2
        out = as_gpuarray_variable(out, ctx_name)
        # BUG FIX: the two assertions below were swapped — `out_grad` was
        # checked before its conversion and `out` only afterwards.  Check
        # each variable right after its own conversion, matching
        # GpuMaxPoolGrad.make_node.
        assert out.ndim == nd + 2
        out_grad = as_gpuarray_variable(out_grad, ctx_name)
        assert out_grad.ndim == nd + 2

        assert out_grad.ndim == inp.ndim
        assert inp.ndim == out.ndim

        if stride is None:
            stride = ws
        if pad is None:
            pad = (0,) * nd
        ws = as_tensor_variable(ws)
        stride = as_tensor_variable(stride)
        pad = as_tensor_variable(pad)
        assert ws.ndim == stride.ndim and ws.ndim == pad.ndim
        assert ws.ndim == 1
        if ws.dtype not in int_dtypes:
            raise TypeError("Window shape parameters must be ints.")
        if stride.dtype not in int_dtypes:
            raise TypeError("Stride parameters must be ints.")
        if pad.dtype not in int_dtypes:
            raise TypeError("Padding parameters must be ints.")
        ws = aesara.tensor.cast(ws, "int64")
        stride = aesara.tensor.cast(stride, "int64")
        pad = aesara.tensor.cast(pad, "int64")
        return Apply(self, [inp, out, out_grad, ws, stride, pad], [inp.type()])

    def infer_shape(self, fgraph, node, in_shapes):
        # The result has the shape of the pooled output (input index 1).
        return [in_shapes[1]]

    def grad(self, inp, grads):
        x, maxout, ggx, ws, stride, pad = inp
        (gz,) = grads
        return [
            aesara.tensor.zeros_like(x),
            aesara.tensor.zeros_like(maxout),
            GpuMaxPoolGrad(ignore_border=self.ignore_border, ndim=self.ndim)(
                x, maxout, gz, ws, stride, pad
            ),
        ] + [aesara.gradient.DisconnectedType()() for i in inp[3:]]

    def connection_pattern(self, node):
        # x, maxout and ggx are connected; ws/stride/pad are not.
        return [[1], [1], [1], [0], [0], [0]]
class GpuMaxPoolRop(CGpuKernelBase):
    """
    Implements the R-operator for the downsample operation.

    """

    __props__ = ("ignore_border", "mode", "ndim")
    params_type = ParamsType(ignore_border=bool_t, context=gpu_context_type)

    def __init__(self, ignore_border, mode="max", ndim=2):
        self.ndim = ndim
        self.ignore_border = ignore_border
        self.mode = mode
        CGpuKernelBase.__init__(
            self, ["c_code/pool_max_rop.c"], "APPLY_SPECIFIC(max_pool_rop)"
        )
        # Only the max mode has an R-operator kernel.
        assert mode == "max"
        assert ndim in (2, 3)

    def get_params(self, node):
        # Bind the GPU context of the image input to the C params struct.
        return self.params_type.get_params(self, context=node.inputs[0].type.context)

    def c_headers(self, **kwargs):
        return ["gpuarray_api.h", "gpuarray_helper.h", "numpy_compat.h"]

    def c_header_dirs(self, **kwargs):
        return [gpuarray_helper_inc_dir(), pygpu.get_include()]

    def make_node(self, inp, eval_point, ws, stride=None, pad=None):
        """Build the Apply node.

        `inp` and `eval_point` must both have ``ndim + 2`` dimensions.
        `stride` defaults to `ws`, `pad` to zeros.
        """
        ctx_name = infer_context_name(inp)
        nd = self.ndim
        inp = as_gpuarray_variable(inp, ctx_name)
        assert inp.ndim == nd + 2
        eval_point = as_gpuarray_variable(eval_point, ctx_name)
        assert eval_point.ndim == nd + 2

        if stride is None:
            stride = ws
        if pad is None:
            pad = (0,) * nd
        elif isinstance(pad, (tuple, list)):
            # Early validation is only possible for literal pad/ws values.
            if max(pad) != 0 and not self.ignore_border:
                raise ValueError("Padding works only with ignore_border=True")
            if isinstance(ws, (tuple, list)):
                if any(pad[i] >= ws[i] for i in range(nd)):
                    raise ValueError("Padding must be smaller than strides")
        ws = as_tensor_variable(ws)
        stride = as_tensor_variable(stride)
        pad = as_tensor_variable(pad)
        assert ws.ndim == stride.ndim and ws.ndim == pad.ndim
        assert ws.ndim == 1
        if ws.dtype not in int_dtypes:
            raise TypeError("Window shape parameters must be ints.")
        if stride.dtype not in int_dtypes:
            raise TypeError("Stride parameters must be ints.")
        if pad.dtype not in int_dtypes:
            raise TypeError("Padding parameters must be ints.")
        ws = aesara.tensor.cast(ws, "int64")
        stride = aesara.tensor.cast(stride, "int64")
        pad = aesara.tensor.cast(pad, "int64")
        return Apply(self, [inp, eval_point, ws, stride, pad], [eval_point.type()])

    def infer_shape(self, fgraph, node, in_shapes):
        # Delegate to the CPU Pool op's shape computation.
        ws, stride, pad = [node.inputs[2], node.inputs[3], node.inputs[4]]
        shp = Pool.out_shape(
            in_shapes[0], ws, self.ignore_border, stride, pad, self.ndim
        )
        return [shp]
from aesara.graph.basic import Apply
from aesara.link.c.op import COp
from aesara.link.c.type import Generic
from .basic_ops import as_gpuarray_variable, gpuarray_helper_inc_dir, infer_context_name
from .type import GpuArrayType
try:
import pygpu
except ImportError:
pass
class GpuMaxAndArgmax(COp):
    """
    GPU version of MaxAndArgmax

    """

    params_type = Generic()
    __props__ = ("axis",)
    # dtype of the argmax output.
    argmax_dtype = "int64"

    def __init__(self, axis):
        assert isinstance(axis, (list, tuple))
        self.axis = tuple(axis)

    def get_params(self, node):
        # The axes tuple is passed to the C code as the params object.
        return self.axis

    def make_node(self, X):
        context_name = infer_context_name(X)
        # We keep the original broadcastable flags for dimensions on which
        # we do not perform the max / argmax.
        all_axes = set(self.axis)
        broadcastable = [
            b for i, b in enumerate(X.type.broadcastable) if i not in all_axes
        ]
        inputs = [as_gpuarray_variable(X, context_name)]
        outputs = [
            GpuArrayType(X.type.dtype, broadcastable, context_name=context_name)(),
            GpuArrayType(self.argmax_dtype, broadcastable, context_name=context_name)(),
        ]
        return Apply(self, inputs, outputs)

    def c_headers(self, **kwargs):
        return ["<numpy_compat.h>", "<gpuarray_helper.h>"]

    def c_header_dirs(self, **kwargs):
        return [pygpu.get_include(), gpuarray_helper_inc_dir()]

    def c_code(self, node, name, input_names, output_names, sub):
        """Generate the C implementation (delegates to GpuArray_maxandargmax)."""
        # Recall: X = input_names[0]
        # Recall: axes = sub['params']
        # Recall: max, argmax = output_names
        # Recall: fail = sub['fail']
        max_typecode = pygpu.gpuarray.dtype_to_typecode(node.inputs[0].dtype)
        argmax_typecode = pygpu.gpuarray.dtype_to_typecode(self.argmax_dtype)
        ret = """
        #if PY_MAJOR_VERSION >= 3
            #ifndef PyInt_AS_LONG
                #define PyInt_AS_LONG PyLong_AS_LONG
            #endif
        #endif

        int err = 0;

        unsigned %(name)s_redux_len = PyTuple_GET_SIZE(%(axes)s);
        unsigned* %(name)s_axes_to_reduce = (unsigned*)malloc(%(name)s_redux_len * sizeof(unsigned));
        for (unsigned i = 0; i < %(name)s_redux_len; ++i) {
            PyObject* axis_object = PyTuple_GET_ITEM(%(axes)s, i);
            %(name)s_axes_to_reduce[i] = (unsigned) PyInt_AS_LONG(axis_object);
        }

        size_t %(name)s_input_ndim = PyGpuArray_NDIM(%(X)s);
        size_t %(name)s_output_ndim = %(name)s_input_ndim - %(name)s_redux_len;
        size_t* %(name)s_output_dims = (size_t*)malloc(%(name)s_output_ndim * sizeof(size_t));
        if (%(name)s_redux_len == 1) {
            for (unsigned i = 0; i < %(name)s_axes_to_reduce[0]; ++i) {
                %(name)s_output_dims[i] = PyGpuArray_DIM(%(X)s, i);
            }
            for (unsigned i = %(name)s_axes_to_reduce[0] + 1; i < %(name)s_input_ndim; ++i) {
                %(name)s_output_dims[i-1] = PyGpuArray_DIM(%(X)s, i);
            }
        } else {
            int64_t current_input_pos = -1;
            int64_t current_output_pos = -1;
            for (unsigned i = 0; i < %(name)s_redux_len; ++i) {
                for (++current_input_pos; current_input_pos < %(name)s_axes_to_reduce[i]; ++current_input_pos) {
                    %(name)s_output_dims[++current_output_pos] = PyGpuArray_DIM(%(X)s, current_input_pos);
                }
            }
            for (++current_input_pos; current_input_pos < %(name)s_input_ndim; ++current_input_pos) {
                %(name)s_output_dims[++current_output_pos] = PyGpuArray_DIM(%(X)s, current_input_pos);
            }
        }

        if (aesara_prep_output(&%(max)s, %(name)s_output_ndim, %(name)s_output_dims, %(max_typecode)s, GA_C_ORDER, %(X)s->context)) {
            PyErr_SetString(PyExc_RuntimeError, "GpuMaxAndArgmax: unable to prepare max output.");
            %(fail)s
        }
        if (aesara_prep_output(&%(argmax)s, %(name)s_output_ndim, %(name)s_output_dims, %(argmax_typecode)s, GA_C_ORDER, %(X)s->context)) {
            PyErr_SetString(PyExc_RuntimeError, "GpuMaxAndArgmax: unable to prepare argmax output.");
            %(fail)s
        }

        if (%(name)s_input_ndim == 0) {
            /* GpuArray_maxandargmax can't handle a 0-d array
             * because it expects that 1 <= redux_len <= input_ndim.
             * As input_ndim == 0, then 1 <= redux_len <= 0 is false.
             * To handle this case we copy input to max and we set argmax to 0.
             */
            if (GA_NO_ERROR != GpuArray_setarray(&%(max)s->ga, &%(X)s->ga)) {
                PyErr_SetString(PyExc_RuntimeError, "GpuMaxAndArgmax: unable to copy input to max when input is a scalar.");
                %(fail)s
            }
            if (GA_NO_ERROR != GpuArray_memset(&%(argmax)s->ga, 0)) {
                PyErr_SetString(PyExc_RuntimeError, "GpuMaxAndArgmax: unable to set argmax to 0 when input is a scalar.");
                %(fail)s
            }
        } else if (GA_NO_ERROR != (err =
            GpuArray_maxandargmax(&%(max)s->ga, &%(argmax)s->ga, &%(X)s->ga, %(name)s_redux_len, %(name)s_axes_to_reduce)
        )) {
            PyErr_Format(PyExc_RuntimeError,
                "GpuMaxAndArgmax: unable to compute gpuarray maxandargmax: error %%d: %%s (%%s).",
                err, gpuarray_error_str(err), GpuArray_error(&%(X)s->ga, err));
            %(fail)s
        }
        """
        return ret % {
            "X": input_names[0],
            "axes": sub["params"],
            "max": output_names[0],
            "argmax": output_names[1],
            "max_typecode": max_typecode,
            "argmax_typecode": argmax_typecode,
            "name": name,
            "fail": sub["fail"],
        }

    def c_code_cleanup(self, node, name, inputs, outputs, sub):
        # Free the buffers malloc'd in c_code.
        return """
        free(%(name)s_output_dims);
        free(%(name)s_axes_to_reduce);
        """ % {
            "name": name,
        }

    def c_code_cache_version(self):
        return (2,)
"""
GPU implementation of MRG31k3p random number generator for Aesara.
Generator code in SSJ package (L'Ecuyer & Simard).
http://www.iro.umontreal.ca/~simardr/ssj/indexe.html
"""
from aesara import tensor as at
from aesara.gpuarray.basic_ops import (
GpuFromHost,
GpuKernelBase,
Kernel,
as_gpuarray_variable,
host_from_gpu,
infer_context_name,
)
from aesara.gpuarray.fp16_help import write_w
from aesara.gpuarray.opt import register_opt, register_opt2
from aesara.gpuarray.type import GpuArrayType, gpu_context_type
from aesara.graph.basic import Apply
from aesara.graph.opt import local_optimizer
from aesara.sandbox.rng_mrg import mrg_uniform, mrg_uniform_base
from aesara.scalar import int32 as int_t
from aesara.tensor import as_tensor_variable, get_vector_length
class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
# GpuArray version
_f16_ok = True
params_type = mrg_uniform_base.params_type.extended(
otypecode=int_t, context=gpu_context_type
)
otypecode = property(lambda self: self.output_type.typecode)
    def make_node(self, rstate, size):
        """Build the Apply node: (rstate, size) -> (new rstate, samples)."""
        # error checking slightly redundant here, since
        # this op should not be called directly.
        #
        # call through MRG_RandomStream instead.
        broad = []
        for i in range(self.output_type.ndim):
            # A dimension is broadcastable when its requested size is the
            # constant 1.
            broad.append(at.extract_constant(size[i]) == 1)
        output_type = self.output_type.clone(broadcastable=broad)()
        rstate = as_gpuarray_variable(rstate, infer_context_name(rstate))
        return Apply(self, [rstate, size], [rstate.type(), output_type])
    def get_params(self, node):
        # Bind the GPU context of the rstate input to the params struct.
        return self.params_type.get_params(self, context=node.inputs[0].type.context)
    @classmethod
    def new(cls, rstate, ndim, dtype, size):
        """Alternate constructor: build the op from `dtype`/`ndim`/`size`.

        When `ndim` is None it is inferred from the length of `size`.
        """
        v_size = as_tensor_variable(size)
        if ndim is None:
            ndim = get_vector_length(v_size)
        op = cls(GpuArrayType(dtype, (False,) * ndim))
        return op(rstate, v_size)
    def c_headers(self, **kwargs):
        # numpy_compat.h is needed in addition to the base kernel headers.
        return super().c_headers(**kwargs) + ["numpy_compat.h"]
    def gpu_kernels(self, node, name):
        """Return the cluda kernel implementing the MRG31k3p sampler.

        The kernel advances 6-word per-stream states and writes uniform
        samples in ``[0, 1)`` scaled by a dtype-specific normalizer.
        """
        write = write_w(self.output_type.dtype)
        if self.output_type.dtype == "float16":
            otype = "ga_half"
            # limit the values of the state that we use.
            mask = "& 0x7fff"
            offset = "+ 1"
            NORM = "3.0458e-05f"  # numpy.float16(1.0/(2**15+33))
            # this was determined by finding the biggest number such that
            # numpy.float16(number * ((M1 & 0x7fff) + 1)) < 1.0
        elif self.output_type.dtype == "float32":
            otype = "float"
            mask = ""
            offset = ""
            NORM = "4.6566126e-10f"  # numpy.float32(1.0/(2**31+65))
            # this was determined by finding the biggest number such that
            # numpy.float32(number * M1) < 1.0
        elif self.output_type.dtype == "float64":
            otype = "double"
            mask = ""
            offset = ""
            NORM = "4.656612873077392578125e-10"
        else:
            raise ValueError("Unsupported data type for output", self.output_type.dtype)
        code = (
            """#include "cluda.h"

KERNEL void mrg_uniform(
        GLOBAL_MEM %(otype)s *sample_data,
        ga_size sample_offset,
        GLOBAL_MEM ga_int *state_data,
        ga_size state_offset,
        const ga_uint Nsamples,
        const ga_uint Nstreams_used)
{
    sample_data = (GLOBAL_MEM %(otype)s *)(((GLOBAL_MEM char *)sample_data) + sample_offset);
    state_data = (GLOBAL_MEM ga_int *)(((GLOBAL_MEM char *)state_data) + state_offset);
    /*
     * The cluda backend makes sure that ga_int corresponds to
     * a 32 bit signed type on the target device.  It is not a
     * variable width type.
     */
    const ga_int i7 = 7;
    const ga_int i9 = 9;
    const ga_int i15 = 15;
    const ga_int i16 = 16;
    const ga_int i22 = 22;
    const ga_int i24 = 24;

    const ga_int M1 = 2147483647;      //2^31 - 1
    const ga_int M2 = 2147462579;      //2^31 - 21069
    const ga_int MASK12 = 511;         //2^9 - 1
    const ga_int MASK13 = 16777215;    //2^24 - 1
    const ga_int MASK2 = 65535;        //2^16 - 1
    const ga_int MULT2 = 21069;

    const ga_uint idx = GID_0 * LDIM_0 + LID_0;
    ga_int y1, y2, x11, x12, x13, x21, x22, x23;

    if (idx < Nstreams_used)
    {
        x11 = state_data[idx*6+0];
        x12 = state_data[idx*6+1];
        x13 = state_data[idx*6+2];
        x21 = state_data[idx*6+3];
        x22 = state_data[idx*6+4];
        x23 = state_data[idx*6+5];

        for (ga_uint i = idx; i < Nsamples; i += Nstreams_used)
        {
            y1 = ((x12 & MASK12) << i22) + (x12 >> i9) + ((x13 & MASK13) << i7) + (x13 >> i24);
            y1 -= (y1 < 0 || y1 >= M1) ? M1 : 0;
            y1 += x13;
            y1 -= (y1 < 0 || y1 >= M1) ? M1 : 0;
            x13 = x12;
            x12 = x11;
            x11 = y1;

            y1 = ((x21 & MASK2) << i15) + (MULT2 * (x21 >> i16));
            y1 -= (y1 < 0 || y1 >= M2) ? M2 : 0;
            y2 = ((x23 & MASK2) << i15) + (MULT2 * (x23 >> i16));
            y2 -= (y2 < 0 || y2 >= M2) ? M2 : 0;
            y2 += x23;
            y2 -= (y2 < 0 || y2 >= M2) ? M2 : 0;
            y2 += y1;
            y2 -= (y2 < 0 || y2 >= M2) ? M2 : 0;

            x23 = x22;
            x22 = x21;
            x21 = y2;

            if (x11 <= x21) {
                sample_data[i] = %(write)s((((x11 - x21 + M1) %(mask)s) %(offset)s) * %(NORM)s);
            }
            else
            {
                sample_data[i] = %(write)s((((x11 - x21) %(mask)s) %(offset)s) * %(NORM)s);
            }
        }

        state_data[idx*6+0]= x11;
        state_data[idx*6+1]= x12;
        state_data[idx*6+2]= x13;
        state_data[idx*6+3]= x21;
        state_data[idx*6+4]= x22;
        state_data[idx*6+5]= x23;
    }
}
"""
            % locals()
        )
        # we shouldn't get to this line if it's about to fail
        from pygpu import gpuarray

        return [
            Kernel(
                code=code,
                name="mrg_uniform",
                params=[
                    gpuarray.GpuArray,
                    gpuarray.SIZE,
                    gpuarray.GpuArray,
                    gpuarray.SIZE,
                    "uint32",
                    "uint32",
                ],
                flags=Kernel.get_flags(self.output_type.dtype, "int32"),
            )
        ]
    def c_code(self, node, nodename, inp, out, sub):
        """Emit C code that validates inputs, allocates outputs and launches
        the ``mrg_uniform`` kernel.

        ``inp`` is ``(rstate, size)``, ``out`` is ``(o_rstate, o_sample)``.
        The state is reused in place when ``params->inplace`` is set,
        otherwise it is copied first.  Two failure macros are used:
        ``just_fail`` for failures before ``odims`` could leak, and
        ``fail`` which frees ``odims`` before bailing out.
        """
        return """
        npy_int64 M1 = 2147483647;      //2^31 - 1
        size_t n_elements = 1;
        unsigned int n_streams;
        int must_alloc_sample = ((NULL == %(o_sample)s)
                                 || !pygpu_GpuArray_Check((PyObject*)%(o_sample)s)
                                 || !(%(o_sample)s->ga.flags & GA_C_CONTIGUOUS)
                                 || (PyGpuArray_NDIM(%(o_sample)s) != %(params)s->ndim));
        size_t* odims = (size_t*)malloc(%(params)s->ndim * sizeof(size_t));
        if (odims == NULL) {
          PyErr_NoMemory();
          %(just_fail)s
        }
        if (PyArray_NDIM(%(size)s) != 1)
        {
            PyErr_SetString(PyExc_ValueError, "size must be vector");
            %(fail)s
        }
        if (PyArray_DIMS(%(size)s)[0] != %(params)s->ndim)
        {
            PyErr_Format(PyExc_ValueError, "size must have length %%i (not %%li)",
                %(params)s->ndim, PyArray_DIMS(%(size)s)[0]);
            %(fail)s
        }
        for (int i = 0; i < %(params)s->ndim; ++i)
        {
            odims[i] = *(dtype_%(size)s *)PyArray_GETPTR1(%(size)s, i);
            n_elements *= odims[i];
            must_alloc_sample = (must_alloc_sample
                                 || PyGpuArray_DIMS(%(o_sample)s)[i] != odims[i]);
        }
        if (n_elements > M1)
        {
            PyErr_SetString(
                PyExc_ValueError,
                "rng_mrg gpu implementation does not support more than (2**31 -1) samples");
            %(fail)s
        }
        if (must_alloc_sample)
        {
            Py_XDECREF(%(o_sample)s);
            %(o_sample)s = pygpu_empty(%(params)s->ndim, odims, %(params)s->otypecode, GA_C_ORDER,
                                       %(params)s->context, Py_None);
            if(!%(o_sample)s)
            {
                %(fail)s;
            }
        }
        if (!pygpu_GpuArray_Check((PyObject*)%(rstate)s))
        {
            PyErr_Format(PyExc_ValueError, "rstate must be gpuarray");
            %(fail)s;
        }
        Py_XDECREF(%(o_rstate)s);
        if (%(params)s->inplace)
        {
            Py_INCREF(%(rstate)s);
            %(o_rstate)s = %(rstate)s;
        }
        else
        {
            %(o_rstate)s = pygpu_copy(%(rstate)s, GA_ANY_ORDER);
            if (!%(o_rstate)s) {
                %(fail)s
            }
        }
        if (PyGpuArray_NDIM(%(o_rstate)s) != 2)
        {
            PyErr_SetString(PyExc_ValueError, "rstate must be a matrix");
            %(fail)s
        }
        if (PyGpuArray_DIMS(%(o_rstate)s)[1] != 6)
        {
            PyErr_Format(PyExc_ValueError, "rstate must have 6 columns");
            %(fail)s
        }
        if (%(o_rstate)s->ga.typecode != GA_INT) {
            PyErr_Format(PyExc_ValueError, "rstate must be int32");
            %(fail)s
        }
        if (!GpuArray_CHKFLAGS(&%(o_rstate)s->ga, GA_C_CONTIGUOUS)) {
            PyErr_Format(PyExc_ValueError, "rstate must be C contiguous");
            %(fail)s
        }
        n_streams = PyGpuArray_DIMS(%(o_rstate)s)[0];
        if (n_streams > n_elements)
          n_streams = n_elements;
        if (n_elements > 0){
            size_t ls = 0, gs = 0;
            int err = GpuKernel_sched(&%(kname)s, n_streams, &ls, &gs);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError, "GpuKernel_sched: %%s\\n",
                             GpuKernel_error(&%(kname)s, err));
                %(fail)s
            }
            // Make sure we run as many blocks as we need to cover the whole n_streams
            gs = (n_streams + ls - 1)/ls;
            err = mrg_uniform_call(1, &ls, &gs, 0, %(o_sample)s->ga.data, %(o_sample)s->ga.offset, %(o_rstate)s->ga.data, %(o_rstate)s->ga.offset, n_elements, n_streams);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError, "mrg_uniform_call: %%s\\n",
                             GpuKernel_error(&%(kname)s, err));
                %(fail)s
            }
        }
        free(odims);
        """ % dict(
            rstate=inp[0],
            size=inp[1],
            o_rstate=out[0],
            o_sample=out[1],
            kname=self.gpu_kernels(node, nodename)[0].objvar,
            params=sub["params"],
            just_fail=sub["fail"],
            fail="""
        {
            free(odims);
            %(fail)s
        }
        """
            % dict(fail=sub["fail"]),
        )
def c_code_cache_version(self):
return (17,)
@register_opt2([mrg_uniform], "fast_compile")
def local_gpua_mrg_graph(fgraph, op, context_name, inputs, outputs):
    """Rewrite a CPU ``mrg_uniform`` whose state already lives on the GPU
    into its GPU counterpart, transferring the sample back to the host."""
    if not isinstance(op, mrg_uniform):
        return
    state = inputs[0]
    if not isinstance(state.type, GpuArrayType):
        return
    # Skip states that were just transferred from the host; the rewrite
    # only applies to genuinely GPU-resident state.
    if state.owner is not None and isinstance(state.owner.op, GpuFromHost):
        return
    new_outs = GPUA_mrg_uniform.new(
        state, op.output_type.ndim, op.output_type.dtype, inputs[1]
    )
    # Keep the state on the GPU, bring the sample back to the host.
    return [new_outs[0], host_from_gpu(new_outs[1])]
@register_opt("fast_compile")
@local_optimizer([mrg_uniform])
def local_gpua_mrg(fgraph, node):
    """Node-level wrapper: infer the GPU context from the node's inputs and
    delegate to :func:`local_gpua_mrg_graph`."""
    ctx = infer_context_name(*node.inputs)
    return local_gpua_mrg_graph(fgraph, node.op, ctx, node.inputs, node.outputs)
import os
from string import Template
import numpy as np
import aesara
from aesara.graph.basic import Apply
from aesara.tensor import as_tensor_variable
from aesara.tensor.sort import TopKOp
from .basic_ops import (
GpuKernelBase,
Kernel,
as_gpuarray_variable,
gpuarray_helper_inc_dir,
infer_context_name,
)
from .opt import op_lifter, register_opt, register_opt2
from .type import GpuArrayType
try:
import pygpu
import pygpu.gpuarray as ga
except ImportError:
# To make sure aesara is importable
pass
# TODO GPU sort / argsort
class GpuTopKOp(GpuKernelBase, TopKOp):
    """Implements TopKOp on gpu
    Currently the output seem sorted, but we do not test it. So as on
    the CPU, we only support sorted=False for now.
    """
    __props__ = TopKOp.__props__
    _f16_ok = True
    def __init__(
        self,
        axis=-1,
        sorted=True,
        idx_dtype="int64",
        return_values=True,
        return_indices=True,
    ):
        # Sorted output is explicitly refused: the kernels may produce
        # sorted-looking output but this is not guaranteed or tested.
        if sorted:
            raise NotImplementedError(
                "GpuTopK currently is not sure to give sorted output even if they look sorted.."
            )
        GpuKernelBase.__init__(self)
        TopKOp.__init__(
            self,
            axis=axis,
            sorted=sorted,
            idx_dtype=idx_dtype,
            return_values=return_values,
            return_indices=return_indices,
        )
    def perform(self, node, inputs, output_storage, params):
        # No Python fallback; only the C/CUDA path is implemented.
        raise NotImplementedError()
    def c_headers(self, **kwargs):
        return ["gpuarray_api.h", "gpuarray_helper.h", "numpy_compat.h"]
    def c_header_dirs(self, **kwargs):
        return [
            os.path.dirname(__file__),
            gpuarray_helper_inc_dir(),
            pygpu.get_include(),
        ]
    def c_code_cache_version(self):
        return (4,)
    def gpu_kernels(self, node, nodename):
        """Compile the three top-k kernels (dense, large, xlarge) from the
        template sources in ``c_code/``, specialized for this node's
        dtypes, rank and return flags."""
        # load kernel source
        device_type = node.inputs[0].type.context.kind
        kernel_ext = {b"cuda": ".cu", b"opencl": ".cl"}[device_type]
        common_ext = {b"cuda": ".cuh", b"opencl": ".h"}[device_type]
        # prepare "$" macros
        if device_type == b"cuda":
            ndim = node.inputs[0].ndim
            dstv_strides_code = "".join(
                f"ssize_t dstv_strides_{i}, " for i in range(ndim)
            )
            dsti_strides_code = "".join(
                f"ssize_t dsti_strides_{i}, " for i in range(ndim)
            )
            src_strides_code = "".join(
                f"ssize_t src_strides_{i}, " for i in range(ndim)
            )
            # Per-dimension pointer-adjustment snippet; dstv/dsti parts are
            # dropped when the corresponding output is not requested.
            set_slice_code = """
        gidx = gid %% dims_%(i)d;
        gid /= dims_%(i)d;
        {dstv};
        {dsti};
        src = ptr_add(src, gidx*src_strides_%(i)d);\n""".format(
                dstv="dstv = ptr_add(dstv, gidx*dstv_strides_%(i)d)"
                if self.return_values
                else "",
                dsti="dsti = ptr_add(dsti, gidx*dsti_strides_%(i)d)"
                if self.return_indices
                else "",
            )
            # Dimension 0 is the top-k axis (axes are reordered in c_code),
            # so only dims 1..ndim-1 get the slice-selection code.
            set_slice_code = "".join(set_slice_code % dict(i=j) for j in range(1, ndim))
            if self.return_values:
                set_slice_code += """
        dstv = ptr_add(dstv, dstv_offset);
        """
            if self.return_indices:
                set_slice_code += """
        dsti = ptr_add(dsti, dsti_offset);
        """
            set_slice_code += """
        src = ptr_add(src, src_offset);
        """
            flags = Kernel.get_flags(node.inputs[0].dtype)
            subs = dict(
                inp_t=ga.dtype_to_ctype(node.inputs[0].dtype),
                out_t=ga.dtype_to_ctype(self.idx_dtype),
                dims="".join(f"size_t dims_{i}, " for i in range(1, ndim)),
                dstv="INPUT_TYPE *dstv," if self.return_values else "",
                dstv_offset="size_t dstv_offset," if self.return_values else "",
                dsti="INDEX_TYPE *dsti," if self.return_indices else "",
                dsti_offset="size_t dsti_offset," if self.return_indices else "",
                dstv_strides=dstv_strides_code if self.return_values else "",
                dsti_strides=dsti_strides_code if self.return_indices else "",
                src_strides=src_strides_code,
                set_slice=set_slice_code,
                write_value=int(self.return_values),
                write_index=int(self.return_indices),
                ndim=str(ndim),
            )
        elif device_type == b"opencl":
            raise NotImplementedError()
        # setup parameters
        param_types = [ga.SIZE] * (ndim - 1)  # dims
        for _ in range(self.return_values + self.return_indices):
            param_types.append(ga.GpuArray)  # dst*
            param_types.append(ga.SIZE)  # offset
            param_types.extend([ga.SSIZE] * ndim)  # dst*_strides
        param_types.append(ga.SIZE)  # k
        param_types.append(ga.GpuArray)  # src
        param_types.append(ga.SIZE)  # offset
        param_types.extend([ga.SSIZE] * ndim)  # src_strides
        param_types.append(ga.SIZE)  # size
        # load and compile kernels
        with open(
            os.path.join(
                os.path.dirname(__file__), "c_code", "topk_common" + common_ext
            )
        ) as f:
            common_src = f.read()
        kernels = []
        def build_kernel(fname, kname, subs):
            # Substitute the "$" template variables into common + kernel
            # source and wrap the result in a Kernel object.
            with open(os.path.join(os.path.dirname(__file__), "c_code", fname)) as f:
                kernel_src = f.read()
            ker = Kernel(
                code=(
                    "#include <cluda.h>\n"
                    + Template(common_src + kernel_src).substitute(**subs)
                ),
                name=kname,
                params=param_types,
                flags=flags,
                objvar=kname + nodename,
            )
            return ker
        subs["count_t"] = "int"
        kernels.append(build_kernel("topk_dense" + kernel_ext, "k_topk_dense", subs))
        subs["kname"] = "k_topk_dense_large"
        kernels.append(
            build_kernel("topk_dense_large" + kernel_ext, "k_topk_dense_large", subs)
        )
        # The xlarge variant uses a 64-bit counter for very long axes.
        subs["count_t"] = "long long"
        subs["kname"] = "k_topk_dense_xlarge"
        kernels.append(
            build_kernel("topk_dense_large" + kernel_ext, "k_topk_dense_xlarge", subs)
        )
        return kernels
    def c_code(self, node, nodename, inps, outs, sub):
        """Emit C code that validates k, allocates the requested outputs and
        dispatches to one of the three kernels depending on axis length."""
        context = node.inputs[0].type.context
        if context.kind != b"cuda":
            raise NotImplementedError(
                f"{self.__class__.__name__}: We only have CUDA implementation so far."
            )
        x, k = inps
        inp_dtc = ga.dtype_to_typecode(node.inputs[0].dtype)
        # Unpack outputs according to the return flags set on the op.
        if not self.return_indices:
            (yv,) = outs
        elif self.return_values:
            yv, yi = outs
        else:
            (yi,) = outs
        out_dtype_s = self.idx_dtype
        out_dtc = ga.dtype_to_typecode(out_dtype_s)
        fail = sub["fail"]
        ctx = sub["params"]
        k_dtype = node.inputs[1].type.dtype_specs()[1]
        # max threads per block
        MAX_TPB = context.maxlsize0
        # max blocks per grid
        MAX_BPG = context.maxgsize0
        WARP_SIZE = 32
        ndim = node.inputs[0].ndim
        # Move the top-k axis to the front; the kernels assume dimension 0
        # is the one being reduced.
        reordered_axes = list(range(ndim))
        axis = self.axis % ndim
        del reordered_axes[axis]
        reordered_axes = [axis] + reordered_axes
        dims = "".join(f"dims[{i}], " for i in reordered_axes[1:])
        prep_output = ""
        if self.return_values:
            def_dvstrides = f"const ssize_t *dvstrides = PyGpuArray_STRIDES({yv})"
            params_dv = f"{yv}->ga.data, {yv}->ga.offset,\n"
            params_dv += "".join(f"dvstrides[{i}], " for i in reordered_axes)
            prep_output += (
                """
    if (0 != aesara_prep_output(
        &%(yv)s, %(ndim)d, odims,
        %(inp_dtc)s, GA_C_ORDER, %(ctx)s)) {
        %(fail)s;
    }\n"""
                % locals()
            )
        else:
            def_dvstrides = params_dv = ""
        if self.return_indices:
            def_distrides = f"const ssize_t *distrides = PyGpuArray_STRIDES({yi})"
            params_di = f"{yi}->ga.data, {yi}->ga.offset,\n"
            params_di += "".join(f"distrides[{i}], " for i in reordered_axes)
            prep_output += (
                """
    if (0 != aesara_prep_output(
        &%(yi)s, %(ndim)d, odims,
        %(out_dtc)s, GA_C_ORDER, %(ctx)s)) {
        %(fail)s;
    }\n"""
                % locals()
            )
        else:
            def_distrides = params_di = ""
        sstrides = ", ".join(f"sstrides[{i}]" for i in reordered_axes)
        code = """
{
    const ssize_t k_ = ((%(k_dtype)s*)(PyArray_DATA(%(k)s)))[0];
    const size_t *dims = PyGpuArray_DIMS(%(x)s);
    size_t odims[%(ndim)d];
    for (int i=0; i<%(ndim)d; i++)
        odims[i] = dims[i];
    odims[%(axis)d] = k_>=0 ? k_ : -k_;
    if (0 == odims[%(axis)d]) {
        PyErr_SetString(
            PyExc_ValueError,
            "topk: kth must not be zero");
        %(fail)s;
    } else if (dims[%(axis)d] < odims[%(axis)d]) {
        PyErr_SetString(
            PyExc_ValueError,
            "topk: kth cannot be larger than the size of specified axis %(axis)d");
        %(fail)s;
    }
    %(prep_output)s
    size_t grid_size=1, block_size=1;
    for (int i=0; i<%(ndim)d; ++i) {
        if (i!=%(axis)d)
            grid_size *= dims[i];
        else
            block_size = dims[i];
    }
    // round up to multiples of warp size
    block_size = ((block_size + %(WARP_SIZE)d - 1) / %(WARP_SIZE)d) * %(WARP_SIZE)d;
    if (grid_size > %(MAX_BPG)d) {
        PyErr_SetString(
            PyExc_ValueError,
            "topk: too many slices to work with, expected <= %(MAX_BPG)d");
        %(fail)s;
    }
    %(def_dvstrides)s;
    %(def_distrides)s;
    const ssize_t *sstrides = PyGpuArray_STRIDES(%(x)s);
    int err;
    if (dims[%(axis)d] > (1u << 31)) {
        block_size = %(MAX_TPB)d;
        err = k_topk_dense_xlarge_call(
                1, &grid_size, &block_size, 0,
                %(dims)s
                %(params_dv)s
                %(params_di)s
                k_,
                %(x)s->ga.data,
                %(x)s->ga.offset,
                %(sstrides)s,
                dims[%(axis)d]
        );
    } else if (block_size > %(MAX_TPB)d) {
        block_size = %(MAX_TPB)d;
        err = k_topk_dense_large_call(
                1, &grid_size, &block_size, 0,
                %(dims)s
                %(params_dv)s
                %(params_di)s
                k_,
                %(x)s->ga.data,
                %(x)s->ga.offset,
                %(sstrides)s,
                dims[%(axis)d]
        );
    } else {
        err = k_topk_dense_call(
                1, &grid_size, &block_size, 0,
                %(dims)s
                %(params_dv)s
                %(params_di)s
                k_,
                %(x)s->ga.data,
                %(x)s->ga.offset,
                %(sstrides)s,
                dims[%(axis)d]
        );
    }
    if (err != GA_NO_ERROR) {
        PyErr_SetString(
            PyExc_RuntimeError,
            "topk: gpu kernel failed to execute");
        %(fail)s;
    }
}
        """
        return code % locals()
    def make_node(self, inp, kth):
        """Build the apply node; outputs depend on the op's return flags."""
        ctx_name = infer_context_name(inp)
        inp = as_gpuarray_variable(inp, ctx_name)
        kth = as_tensor_variable(kth)
        bcast = inp.type.broadcastable
        outs = []
        if self.return_values:
            outs.append(inp.type())
        if self.return_indices:
            outs.append(
                GpuArrayType(
                    dtype=self.idx_dtype, broadcastable=bcast, context_name=ctx_name
                )()
            )
        return Apply(self, [inp, kth], outs)
    def get_params(self, node):
        # The GPU context is the only runtime parameter the C code needs.
        return node.inputs[0].type.context
class ValuesEqApproxNoOrder:
    """
    Comparator that ignores the ordering of elements along one axis.
    Both operands are sorted along ``axis`` before the approximate
    equality check, so two arrays that differ only by a permutation
    along that axis compare equal.
    """
    def __init__(self, axis):
        self.axis = axis
    def __call__(self, val1, val2):
        sorted_a = np.sort(val1, axis=self.axis)
        sorted_b = np.sort(val2, axis=self.axis)
        return aesara.tensor.type.values_eq_approx(sorted_a, sorted_b)
@register_opt("fast_compile")
@op_lifter([TopKOp], cuda_only=True)
@register_opt2([TopKOp], "fast_compile")
def local_gpua_topkop(op, ctx_name, inputs, outputs):
    """Lift a CPU ``TopKOp`` to :class:`GpuTopKOp` (unsorted variant only)."""
    x, k = inputs
    x = as_gpuarray_variable(x, ctx_name)
    # The GPU op refuses sorted output, so leave sorted nodes on the CPU.
    if op.sorted:
        return
    lifted = GpuTopKOp(
        axis=op.axis,
        sorted=op.sorted,
        idx_dtype=op.idx_dtype,
        return_values=op.return_values,
        return_indices=op.return_indices,
    )
    results = lifted(x, k, return_list=True)
    # GPU top-k output order is not guaranteed; compare ignoring order.
    comparator = ValuesEqApproxNoOrder(op.axis)
    for res in results:
        res.tag.values_eq_approx = comparator
    return results
from io import StringIO
import numpy as np
import aesara.tensor as at
from aesara.gradient import grad_not_implemented
from aesara.graph.basic import Apply
from aesara.graph.op import Op
from aesara.link.c.interface import HideC
from aesara.link.c.op import COp
from aesara.link.c.params_type import ParamsType
from aesara.link.c.type import CType
from aesara.scalar import bool as bool_t
from aesara.scalar import int32 as int_t
from aesara.scalar import uint32 as size_t
from aesara.tensor.basic import AllocDiag
from aesara.tensor.math import clip, minimum
from aesara.tensor.subtensor import (
AdvancedIncSubtensor,
AdvancedSubtensor,
AdvancedSubtensor1,
IncSubtensor,
Subtensor,
get_idx_list,
)
from aesara.tensor.type import integer_dtypes
try:
import pygpu
from pygpu import gpuarray
except ImportError:
pass
from aesara.gpuarray.basic_ops import (
GpuKernelBase,
Kernel,
as_gpuarray_variable,
gpu_contiguous,
gpuarray_helper_inc_dir,
infer_context_name,
)
from aesara.gpuarray.type import GpuArrayType, gpu_context_type
# Cache of compiled in-place-add GpuElemwise objects, keyed by the operand
# dtypes and the GPU context.
iadd_reg = {}
def get_iadd(a, b):
    """Return (building and memoizing on first use) a ``GpuElemwise``
    computing ``a = a + b`` for the dtypes/context of *a* and *b*."""
    key = (a.type.dtype, b.type.dtype, a.type.context)
    cached = iadd_reg.get(key)
    if cached is None:
        elemwise_args = [
            pygpu.elemwise.arg("a", a.type.dtype, read=True, write=True),
            pygpu.elemwise.arg("b", b.type.dtype, read=True),
        ]
        cached = pygpu.elemwise.GpuElemwise(
            a.type.context, "a = a + b", elemwise_args, convert_f16=True
        )
        iadd_reg[key] = cached
    return cached
class GpuSubtensor(HideC, Subtensor):
    """
    Subtensor on the GPU.
    """
    _f16_ok = True
    def make_node(self, x, *inputs):
        # Reuse the CPU op's node construction for index validation, then
        # retype the output as a GpuArray in x's context.
        ctx_name = infer_context_name(x)
        rval = Subtensor.make_node(self, x, *inputs)
        otype = GpuArrayType(
            dtype=rval.outputs[0].type.dtype,
            broadcastable=rval.outputs[0].type.broadcastable,
            context_name=ctx_name,
        )
        x = as_gpuarray_variable(x, ctx_name)
        return Apply(self, [x] + rval.inputs[1:], [otype()])
    def perform(self, node, inputs, out_):
        # Python path: delegate the indexing to the array's __getitem__
        # (x is expected to be a GPU array here — pygpu does the slicing).
        (out,) = out_
        x = inputs[0]
        cdata = get_idx_list(inputs, self.idx_list)
        if len(cdata) == 1:
            cdata = cdata[0]
        out[0] = x.__getitem__(cdata)
    def c_support_code(self, **kwargs):
        # C helper normalizing one slice the way Python does: apply step
        # defaults, resolve negative indices, and clamp start/stop to the
        # dimension length.  The *_n flags mean "use the default value".
        return """
        static int fix_indices(ssize_t *start, ssize_t *stop, ssize_t *step,
                               int start_n, int stop_n, int step_n,
                               size_t len) {
            if (step_n) *step = 1;
            if (*step == 0) {
                PyErr_SetString(PyExc_ValueError, "slice step cannot be zero");
                return -1;
            }
            if (start_n) *start = (*step < 0) ? len-1 : 0;
            else {
                if (*start < 0) *start += len;
                if (*start < 0) *start = (*step < 0) ? -1 : 0;
                if (*start > -1 && *start >= len) {
                    *start = (*step < 0) ? len-1 : len;
                }
            }
            if (stop_n) *stop = (*step < 0) ? -1 : len;
            else {
                if (*stop < 0) *stop += len;
                if (*stop < 0) *stop = (*step < 0) ? -1 : 0;
                if (*stop > -1 && *stop >= len) {
                    *stop = (*step < 0) ? len-1 : len;
                }
            }
            if (*stop < *start && *step > 0)
                *stop = *start;
            return 0;
        }
        """
    def c_code(self, node, name, inputs, outputs, sub):
        """Generate C code that fills starts/stops/steps arrays from the
        (static or symbolic) index list and calls ``pygpu_index``."""
        inp_ndim = node.inputs[0].ndim
        inp = inputs[0]
        indices = inputs[1:]
        # pad out the index list to the same dimension as the input
        idx_list = self.idx_list + ((slice(None),) * (inp_ndim - len(self.idx_list)))
        # This case fails when we use pygpu_index(), so here is some
        # special code
        if len(idx_list) == 0:
            return """
        Py_XDECREF(%(out)s);
        %(out)s = pygpu_copy(%(inp)s, GA_ANY_ORDER);
        if (!%(out)s) {
            // Exception already set
            %(fail)s
        }
        """ % dict(
                out=outputs[0], inp=inp, fail=sub["fail"]
            )
        sio = StringIO()
        print(
            """
        ssize_t starts[%(sz)s];
        ssize_t stops[%(sz)s];
        ssize_t steps[%(sz)s];
        ssize_t cur;
        int err;
        if (%(inp)s->ga.nd != %(sz)s) {
            PyErr_SetString(PyExc_IndexError, "invalid index");
            %(fail)s
        }
        """
            % dict(sz=len(idx_list), inp=inp, fail=sub["fail"]),
            file=sio,
        )
        def fix_idx(idx):
            # Map an index-list entry to (C expression, is-default flag);
            # CType entries consume the next symbolic input.
            if idx is None:
                return "0", 1
            elif isinstance(idx, (np.integer, int)):
                return str(idx), 0
            elif isinstance(idx, CType):
                return indices.pop(0), 0
            else:
                assert 0, idx
        for i, idx in enumerate(idx_list):
            if isinstance(idx, slice):
                start, start_n = fix_idx(idx.start)
                stop, stop_n = fix_idx(idx.stop)
                step, step_n = fix_idx(idx.step)
                print(
                    """
        starts[%(i)s] = %(start)s;
        stops[%(i)s] = %(stop)s;
        steps[%(i)s] = %(step)s;
        if (fix_indices(&starts[%(i)s], &stops[%(i)s], &steps[%(i)s],
                        %(start_n)s, %(stop_n)s, %(step_n)s,
                        %(inp)s->ga.dimensions[%(i)s]) == -1) {
            %(fail)s
        }
        """
                    % dict(
                        i=i,
                        start=start,
                        stop=stop,
                        step=step,
                        start_n=start_n,
                        stop_n=stop_n,
                        step_n=step_n,
                        fail=sub["fail"],
                        inp=inp,
                    ),
                    file=sio,
                )
            else:
                # Scalar index: step == 0 signals pygpu_index to drop
                # the dimension.
                if isinstance(idx, CType):
                    start = indices.pop(0)
                elif isinstance(idx, (np.integer, int)):
                    start = idx
                else:
                    assert 0, idx
                print(
                    """
        cur = %(start)s;
        if (cur < 0)
            cur += %(inp)s->ga.dimensions[%(i)s];
        starts[%(i)s] = cur;
        steps[%(i)s] = 0;
        """
                    % dict(i=i, start=start, fail=sub["fail"], inp=inp),
                    file=sio,
                )
        print(
            """
        Py_XDECREF(%(out)s);
        %(out)s = pygpu_index(%(inp)s, starts, stops, steps);
        if (!%(out)s) { %(fail)s }
        """
            % dict(name=name, fail=sub["fail"], inp=inp, out=outputs[0]),
            file=sio,
        )
        return sio.getvalue()
    def c_code_cache_version(self):
        return (8,)
class GpuIncSubtensor(IncSubtensor):
    """
    Implement IncSubtensor on the gpu.
    Notes
    -----
    The optimization to make this inplace is in tensor/opt.
    The same optimization handles IncSubtensor and GpuIncSubtensor.
    This Op has c_code too; it inherits IncSubtensor's c_code.
    The helper methods like :meth:`do_type_checking`,
    :meth:`copy_of_x`, etc. specialize the c_code for this Op.
    """
    _f16_ok = True
    params_type = gpu_context_type
    def make_node(self, x, y, *inputs):
        # Move both operands to the same GPU context, reuse the parent's
        # node construction, and retype the output after x.
        ctx_name = infer_context_name(x, y)
        x = as_gpuarray_variable(x, ctx_name)
        y = as_gpuarray_variable(y, ctx_name)
        rval = IncSubtensor.make_node(self, x, y, *inputs)
        ret = Apply(self, [x, y] + rval.inputs[2:], [x.type()])
        return ret
    def get_params(self, node):
        return node.outputs[0].type.context
    def perform(self, node, inputs, out_, ctx):
        """Python implementation: slice x with the (partly symbolic) index
        list, then either set or in-place-add y into the view."""
        (out,) = out_
        x, y = inputs[:2]
        indices = list(reversed(inputs[2:]))
        def convert(entry):
            # Replace CType placeholders in idx_list with the runtime
            # index values, recursing into slices.
            if isinstance(entry, CType):
                rval = indices.pop()
                return rval
            elif isinstance(entry, slice):
                return slice(
                    convert(entry.start), convert(entry.stop), convert(entry.step)
                )
            else:
                return entry
        cdata = tuple(map(convert, self.idx_list))
        if len(cdata) == 1:
            cdata = cdata[0]
        if not self.inplace:
            x = x.copy()
        sub_x = x.__getitem__(cdata)
        if sub_x.shape:
            # we've sliced out an N-D tensor with N > 0
            if not self.set_instead_of_inc:
                # sub_x += y
                iadd = get_iadd(node.inputs[0], node.inputs[1])
                iadd(sub_x, y)
            else:
                # sub_x[...] = y
                x.__setitem__(cdata, y)
        else:
            # scalar case
            if not self.set_instead_of_inc:
                # x.__setitem__(cdata, sub_x + y)
                tmp = pygpu.elemwise.elemwise2(sub_x, "+", y, sub_x, broadcast=False)
                x.__setitem__(cdata, tmp)
            else:
                x.__setitem__(cdata, y)
        out[0] = x
    def do_type_checking(self, node):
        """
        Should raise NotImplementedError if c_code does not support
        the types involved in this node.
        """
        if not isinstance(node.inputs[0].type, GpuArrayType):
            raise NotImplementedError()
    def copy_of_x(self, x):
        """
        Parameters
        ----------
        x
            A string giving the name of a C variable pointing to an array.
        Returns
        -------
        str
            C code expression to make a copy of x.
        Notes
        -----
        Base class uses `PyArrayObject *`, subclasses may override for
        different types of arrays.
        """
        return f"""pygpu_copy({x}, GA_ANY_ORDER)"""
    def decl_view(self):
        # C declaration for the view variable used by the parent c_code.
        return "PyGpuArrayObject* zview = NULL;"
    def make_view_array(self, x, view_ndim):
        """
        //TODO
        Parameters
        ----------
        x
            A string identifying an array to be viewed.
        view_ndim
            A string specifying the number of dimensions to have in the view.
            This doesn't need to actually set up the view with the
            right indexing; we'll do that manually later.
        """
        # Builds a zero-copy view over x's buffer; xview_dims/xview_offset/
        # xview_strides are provided by the parent class's helper C code.
        ret = f"""
        size_t dims[{view_ndim}];
        for(int i=0; i<{view_ndim}; i++)
            dims[i] = xview_dims[i];
        zview = pygpu_fromgpudata({x}->ga.data,
                                  {x}->ga.offset + xview_offset,
                                  {x}->ga.typecode,
                                  {view_ndim},
                                  dims,
                                  xview_strides,
                                  {x}->context,
                                  1,
                                  (PyObject *){x},
                                  (PyObject *)&PyGpuArrayType);
        """
        return ret
    def get_helper_c_code_args(self):
        """
        Return a dictionary of arguments to use with helper_c_code.
        """
        return {"c_prefix": "PyGpuArray", "strides_mul": 1}
    def copy_into(self, view, source):
        """
        Parameters
        ----------
        view : string
            C code expression for an array.
        source : string
            C code expression for an array.
        Returns
        -------
        str
            C code expression to copy source into view, and 0 on success.
        """
        return f"""sub_setarray(&{view}->ga, &{source}->ga)"""
    def c_headers(self, **kwargs):
        return [
            "<numpy_compat.h>",
            "<gpuarray/error.h>",
            "<gpuarray/array.h>",
            "<gpuarray/elemwise.h>",
        ]
    def c_support_code(self, **kwargs):
        # Thin wrapper around GpuArray_setarray that converts errors into
        # Python exceptions.
        return """
        int sub_setarray(GpuArray *dst, GpuArray *src) {
          int err;
          err = GpuArray_setarray(dst, src);
          if (err != GA_NO_ERROR)
            PyErr_SetString(PyExc_RuntimeError, GpuArray_error(src, err));
          return err;
        }
        """
    def c_support_code_struct(self, node, nodename):
        # Per-node storage for the compiled in-place-add elemwise.
        return "\nGpuElemwise *iadd;\n"
    def c_init_code_struct(self, node, name, sub):
        # Compile the "a += b" elemwise once per node at struct init time.
        return """
        gpuelemwise_arg args[2] = {{0}};
        args[0].name = "a";
        args[0].typecode = %(type1)s;
        args[0].flags = GE_READ|GE_WRITE;
        args[1].name = "b";
        args[1].typecode = %(type2)s;
        args[1].flags = GE_READ;
        iadd = GpuElemwise_new(%(ctx)s->ctx, "", "a += b",
                               2, args, %(nd)s, GE_CONVERT_F16);
        if (iadd == NULL) {
          PyErr_SetString(PyExc_RuntimeError, "Could not initialize inplace add support");
          %(fail)s
        }
        """ % dict(
            ctx=sub["params"],
            fail=sub["fail"],
            type1=node.inputs[0].type.typecode,
            type2=node.inputs[1].type.typecode,
            nd=node.inputs[1].ndim,
        )
    def add_to_zview(self, nodename, x, fail):
        # C snippet adding x into the zview view via the compiled iadd,
        # with broadcasting and shape padding enabled.
        return (
            """
        {
          void *args[2];
          args[0] = &zview->ga;
          args[1] = &%(x)s->ga;
          if (GpuElemwise_call(iadd, args, GE_BROADCAST | GE_PADSHAPE) != GA_NO_ERROR) {
            PyErr_SetString(PyExc_RuntimeError, "Error doing inplace add");
            Py_DECREF(zview);
            %(fail)s
          }
        }
        """
            % locals()
        )
    def c_code_cache_version(self):
        # Extend the parent's cache version; disable caching if the parent
        # does not version its code.
        parent_version = super().c_code_cache_version()
        if not parent_version:
            return
        return parent_version + (10,)
class GpuAdvancedSubtensor1(HideC, AdvancedSubtensor1):
    """
    AdvancedSubrensor1 on the GPU.
    """
    _f16_ok = True
    def make_node(self, x, ilist):
        """Build the apply node; the index vector is cast to a contiguous
        int64 GPU array and the output keeps x's trailing dimensions."""
        ctx_name = infer_context_name(x, ilist)
        x_ = as_gpuarray_variable(x, ctx_name)
        ilist__ = at.as_tensor_variable(ilist)
        if ilist__.type.dtype not in integer_dtypes:
            raise TypeError("index must be integers")
        if ilist__.type.dtype != "int64":
            ilist__ = at.cast(ilist__, "int64")
        ilist_ = gpu_contiguous(as_gpuarray_variable(ilist__, ctx_name))
        if ilist_.type.dtype != "int64":
            raise TypeError("index must be int64")
        if ilist_.type.ndim != 1:
            raise TypeError("index must be a vector")
        if x_.type.ndim == 0:
            raise TypeError("cannot index into a scalar")
        bcast = ilist_.broadcastable + x_.broadcastable[1:]
        return Apply(
            self,
            [x_, ilist_],
            [GpuArrayType(dtype=x.dtype, context_name=ctx_name, broadcastable=bcast)()],
        )
    def perform(self, node, inp, out_):
        # No Python fallback; only the C path is implemented.
        raise NotImplementedError()
    def c_support_code(self, **kwargs):
        # Returns 1 iff a and v have the same rank and the same shape on
        # every dimension except the first (which the take replaces).
        return """
        int take1_match_dims(GpuArray *a, GpuArray *v) {
          if (a->nd != v->nd) return 0;
          for (unsigned int i = 1; i < v->nd; i++) {
            if (a->dimensions[i] != v->dimensions[i]) return 0;
          }
          return 1;
        }
        """
    def c_code(self, node, name, inputs, outputs, sub):
        """Emit C code that (re)allocates the output if needed and calls
        ``GpuArray_take1`` with bounds checking enabled."""
        return """
        int err;
        if (%(out)s == NULL || !GpuArray_IS_C_CONTIGUOUS(&%(out)s->ga) ||
            %(out)s->ga.dimensions[0] != %(idx)s->ga.dimensions[0] ||
            !take1_match_dims(&%(out)s->ga, &%(v)s->ga)) {
          size_t tmp;
          Py_XDECREF(%(out)s);
          /* This is a dirty hack to avoid an extra alloc */
          tmp = %(v)s->ga.dimensions[0];
          %(v)s->ga.dimensions[0] = %(idx)s->ga.dimensions[0];
          %(out)s = pygpu_empty(%(v)s->ga.nd, %(v)s->ga.dimensions, %(v)s->ga.typecode,
                                GA_C_ORDER, %(v)s->context, Py_None);
          if (%(out)s == NULL) {
            %(fail)s;
          }
          %(v)s->ga.dimensions[0] = tmp; // Don't remove this line
        }
        err = GpuArray_take1(&%(out)s->ga, &%(v)s->ga, &%(idx)s->ga, 1);
        if (err != GA_NO_ERROR) {
          if (err == GA_VALUE_ERROR) {
            PyErr_SetString(PyExc_IndexError, "Index out of bounds.");
          } else {
            PyErr_SetString(PyExc_RuntimeError, GpuArray_error(&%(v)s->ga, err));
          }
          %(fail)s
        }
        """ % dict(
            out=outputs[0], v=inputs[0], idx=inputs[1], fail=sub["fail"]
        )
    def c_code_cache_version(self):
        return (1,)
def check_and_convert_boolean_masks(input, idx_list):
    """
    Validate boolean masks in an index list and expand them into integer
    index arrays.
    Every boolean mask must match the shape of ``input`` over the
    dimensions it covers (NumPy 0.13.0+ semantics); a mismatch raises an
    ``IndexError``.  Masks are expanded with ``nonzero()``; every other
    entry is passed through unchanged.
    """
    converted = []
    dim_pos = 0  # input dimension consumed so far
    for entry in idx_list:
        if entry is np.newaxis:
            # np.newaxis does not consume an input dimension.
            converted.append(entry)
        elif isinstance(entry, np.ndarray) and entry.dtype == "bool":
            for axis in range(entry.ndim):
                if entry.shape[axis] != input.shape[dim_pos + axis]:
                    raise IndexError(
                        "boolean index did not match indexed array "
                        f"along dimension {int(dim_pos + axis)}; dimension is {int(input.shape[dim_pos + axis])} but "
                        f"corresponding boolean dimension is {int(entry.shape[axis])}"
                    )
            dim_pos += entry.ndim
            # An N-d mask expands to N integer index arrays.
            converted.extend(entry.nonzero())
        else:
            dim_pos += 1
            converted.append(entry)
    return converted
class BaseGpuAdvancedSubtensor:
    """Shared Python implementation of advanced indexing on GPU arrays.

    The strategy: bring all array-indexed axes to the front, flatten them,
    translate the index arrays into a single flat ``take1`` lookup, then
    reshape/transpose the result back into NumPy's output layout.
    """
    def perform(self, node, inputs, out_):
        (out,) = out_
        x = inputs[0]
        idx = inputs[1:]
        # convert boolean masks to index arrays
        idx = check_and_convert_boolean_masks(x, idx)
        # detect and transpose array indices
        nidx = []
        nshp = list(x.shape)
        for k, i in enumerate(idx):
            if i is None:
                # None (newaxis) becomes a length-1 dimension plus a
                # full slice over it.
                nidx.append(slice(None))
                nshp.insert(k, 1)
            else:
                nidx.append(i)
        x = x.reshape(nshp)
        transp = list(range(x.ndim))
        # number of array-indexed dimensions
        p = 0
        # ap represents the axis in the resulting array where the
        # dimensions indexed by arrays and ints will be inserted.
        # For instance, if all such dimensions are grouped together,
        # it corresponds to the index of the first such dimension in the
        # initial array. If these dimensions are split (with slices
        # between), then the resulting dimensions will be moved to the
        # beginning, and ap will be 0.
        # If no such dimension has been encountered, ap is None.
        ap = None
        # Indicates whether we have already encountered an index (array
        # or number), and then a slice.
        slice_after_idx = False
        for k, i in enumerate(list(nidx)):
            if isinstance(i, np.ndarray) and i.ndim != 0:
                transp.remove(k)
                transp.insert(p, k)
                i = nidx.pop(k)
                nidx.insert(p, i)
                p += 1
                if ap is None:
                    # first non-slice index
                    ap = k
                elif slice_after_idx:
                    # We already encountered at least an array or int, and then
                    # a slice. Array-indexed axes are not grouped,
                    # moving to the beginning
                    ap = 0
            else:
                try:
                    i.__index__()
                    if ap is None:
                        ap = k
                    # indices do not break the contiguity of
                    # array-indexed axes
                except Exception:
                    # If we already encountered an array/int index, it
                    # means future ones will not be grouped.
                    if ap is not None:
                        slice_after_idx = True
        x = x.transpose(*transp)
        idx_ = [slice(None)] * p + nidx[p:]
        x = x.__getitem__(idx_)
        if p == 0:
            assert ap is None
            # The only indexing was through slices and indices.
            # This can happen with symbolic slices for instance.
            # Since no view_map is set, we need to copy the returned value
            out[0] = x.copy()
            return
        # At this point, we should have encountered at least one array
        assert ap is not None
        # flatten the array-indexed dimensions
        shape = (np.prod(x.shape[0:p]),) + x.shape[p:]
        input_flat = x.reshape(shape)
        # build the strides
        strides = [1]
        for i in range(p - 1, 0, -1):
            stride = x.shape[i] * strides[0]
            strides.insert(0, stride)
        # build the indices and use it
        take_idx = sum((i * s for i, s in zip(nidx, strides)))
        out_flat = input_flat.take1(
            pygpu.asarray(take_idx.flatten(), context=x.context)
        )
        # finish up
        out_flat_shp = take_idx.shape + x.shape[p:]
        o = out_flat.reshape(out_flat_shp)
        if ap != 0:
            # Put the resulting indexing at the place that NumPy
            # decided was the right one.
            ntransp = list(range(take_idx.ndim, o.ndim))
            ntransp[ap:ap] = list(range(take_idx.ndim))
            o = o.transpose(*ntransp)
        out[0] = o
class GpuAdvancedSubtensor(HideC, BaseGpuAdvancedSubtensor, AdvancedSubtensor):
    """
    AdvancedSubtensor on the GPU.
    """
    def make_node(self, x, *inputs):
        # Let the CPU op validate and type the node, then retype the
        # output as a GpuArray in x's context.
        context_name = infer_context_name(x)
        base_node = AdvancedSubtensor.make_node(self, x, *inputs)
        out_type = GpuArrayType(
            dtype=base_node.outputs[0].type.dtype,
            broadcastable=base_node.outputs[0].type.broadcastable,
            context_name=context_name,
        )
        x = as_gpuarray_variable(x, context_name)
        return Apply(self, [x] + base_node.inputs[1:], [out_type()])
class BaseGpuAdvancedIncSubtensor:
    # Python fallback implementing advanced set/inc-subtensor on the GPU:
    # array-indexed axes are moved to the front and flattened, then `y` is
    # added (via a GpuElemwise in-place add) or assigned row by row.
    def perform(self, node, inp, out_):
        (out,) = out_
        x = inp[0]
        y = inp[1]
        idx = inp[2:]
        # Never mutate the input buffer; callers may still reference it.
        x = x.copy()
        # Get a handle to the GpuElemwise object that will be called.
        # It is not necessary to have the right number of dimensions,
        # so we just pass symbolic x and y.
        iadd = get_iadd(node.inputs[0], node.inputs[1])
        # convert all indices to np.array
        for i in range(len(idx)):
            if isinstance(idx[i], gpuarray.GpuArray):
                idx[i] = np.asarray(idx[i])
        # convert boolean masks to index arrays
        idx = check_and_convert_boolean_masks(x, idx)
        # Insert axes for None indexing
        nidx = []
        nshp = list(x.shape)
        for k, i in enumerate(idx):
            if i is None:
                nidx.append(slice(None))
                nshp.insert(k, 1)
            else:
                nidx.append(i)
        x_ = x.reshape(nshp)
        # Bring array indices to front
        transp = []
        nidx_ = []
        p = 0
        for k, i in enumerate(list(nidx)):
            if isinstance(i, np.ndarray) and i.ndim != 0:
                transp.append(k)
                nidx_.append(i)
                p += 1
        for k, i in enumerate(list(nidx)):
            if not (isinstance(i, np.ndarray) and i.ndim != 0):
                transp.append(k)
                nidx_.append(i)
        transp = transp + list(range(len(transp), x_.ndim))
        # rtransp is the inverse permutation of transp; applied at the end
        # so the result has x's original axis order.
        rtransp = [i for i, _ in sorted(enumerate(transp), key=lambda x: x[1])]
        nidx = nidx_
        # transp: order to shuffle axes of x so that single dimension
        # subarrays are extracted first
        # p: number of axes with array indexing
        x_ = x_.transpose(*transp)
        idx_ = [slice(None)] * p + nidx[p:]
        # flatten the array-indexed dimensions
        x_flat = x_.reshape((np.prod(x_.shape[0:p]),) + x_.shape[p:])
        # process y so that last axes are the same
        if y.shape != (1,):
            # Find the longest suffix of y's shape that broadcasts against
            # x_flat's trailing axes; collapse the remaining leading axes.
            y_shape_reverse = []
            for x_s, y_s in zip(x_flat.shape[::-1], y.shape[::-1]):
                if x_s == y_s or y_s == 1:
                    y_shape_reverse.append(y_s)
                else:
                    break
            if np.prod(y_shape_reverse) < np.prod(y.shape):
                if len(y_shape_reverse) > 0:
                    y_shape_reverse.append(
                        int(np.prod(y.shape[0 : -len(y_shape_reverse)]))
                    )
                else:
                    y_shape_reverse.append(int(np.prod(y.shape)))
            y_shape = y_shape_reverse[::-1]
            y_flat = y.reshape(y_shape)
        else:
            y_flat = y[0]
        # build the strides
        strides = [1]
        for i in range(p - 1, 0, -1):
            stride = x_.shape[i] * strides[0]
            strides.insert(0, stride)
        # build the indices and use it
        index = idx_[p:] + [slice(None)] * (len(x_flat.shape) - len(idx_[p:]) - 1)
        take_idx = sum(i * s for i, s in zip(nidx, strides))
        if index == []:
            for j, i in enumerate(take_idx.flatten()):
                if y_flat.shape == ():
                    val = y_flat
                else:
                    val = y_flat[j]
                iadd(x_flat[i], val, broadcast=True)
        else:
            if x_flat.shape[-len(y_flat.shape) :] == y_flat.shape or y_flat.shape == ():
                # y_flat has to be broadcast over axes of x_flat[i]
                for i in take_idx.flatten():
                    if len(idx_[p:]) > 0:
                        x_flat_sub = x_flat[i].__getitem__(index)
                    else:
                        x_flat_sub = x_flat[i]
                    iadd(x_flat_sub, y_flat, broadcast=True)
            else:
                # y_flat's first axis corresponds to the first axis of x_flat
                for j, i in enumerate(take_idx.flatten()):
                    if len(idx_[p:]) > 0:
                        x_flat_sub = x_flat[i].__getitem__(index)
                    else:
                        x_flat_sub = x_flat[i]
                    iadd(x_flat_sub, y_flat[j % y_flat.shape[0]], broadcast=True)
        # Undo the flattening and the axis shuffle before returning.
        x_ = x_flat.reshape(x_.shape).transpose(*rtransp)
        out[0] = x_
class GpuAdvancedIncSubtensor(HideC, BaseGpuAdvancedIncSubtensor, AdvancedIncSubtensor):
    """
    `AdvancedIncSubtensor` lifted to the GPU.

    Node construction is delegated to the CPU Op; the output type is then
    rewrapped as a `GpuArrayType` on the context inferred from `x` and `y`.
    """

    def make_node(self, x, y, *inputs):
        context_name = infer_context_name(x, y)
        cpu_node = AdvancedIncSubtensor.make_node(self, x, y, *inputs)
        cpu_out_type = cpu_node.outputs[0].type
        gpu_out_type = GpuArrayType(
            dtype=cpu_out_type.dtype,
            broadcastable=cpu_out_type.broadcastable,
            context_name=context_name,
        )
        gpu_x = as_gpuarray_variable(x, context_name)
        gpu_y = as_gpuarray_variable(y, context_name)
        return Apply(self, [gpu_x, gpu_y] + cpu_node.inputs[2:], [gpu_out_type()])
class GpuAdvancedIncSubtensor1(COp):
    """
    Implement AdvancedIncSubtensor1 on the gpu.

    Performs ``x[ilist] += y`` (or ``x[ilist] = y`` when
    ``set_instead_of_inc`` is True) where ``ilist`` indexes the first
    dimension of ``x``.
    """

    _f16_ok = True
    __props__ = ("inplace", "set_instead_of_inc")
    params_type = ParamsType(
        inplace=bool_t,
        set_instead_of_inc=bool_t,
        context=gpu_context_type,
        # following params are used into c_init_code_struct(),
        # as inputs are not available in that function.
        ndim_input_0=size_t,
        ndim_input_1=size_t,
        typecode_input_0=int_t,
        typecode_input_1=int_t,
    )

    def __init__(self, inplace=False, set_instead_of_inc=False):
        # inplace: destroy the first input instead of copying it.
        # set_instead_of_inc: assign rather than accumulate.
        self.inplace = inplace
        self.set_instead_of_inc = set_instead_of_inc
        if inplace:
            self.destroy_map = {0: [0]}

    def clone_inplace(self):
        # Same Op but operating destructively on its first input.
        return self.__class__(inplace=True, set_instead_of_inc=self.set_instead_of_inc)

    def make_node(self, x, y, ilist):
        # x and y are moved to the GPU; ilist stays a host tensor
        # (a 1-d vector of integer indices into x's first dimension).
        ctx_name = infer_context_name(x, y)
        x_ = as_gpuarray_variable(x, ctx_name)
        y_ = as_gpuarray_variable(y, ctx_name)
        ilist_ = at.as_tensor_variable(ilist)
        assert x_.type.ndim >= y_.type.ndim
        if ilist_.type.dtype not in integer_dtypes:
            raise TypeError("index must be integers")
        if ilist_.type.ndim != 1:
            raise TypeError("index must be vector")
        if x_.type.ndim == 0:
            raise TypeError("cannot index into a scalar")
        if y_.type.ndim > x_.type.ndim:
            if self.set_instead_of_inc:
                opname = "set"
            else:
                opname = "increment"
            raise TypeError(
                "cannot %s x subtensor with ndim=%s by y with ndim=%s "
                % (opname, x_.type.ndim, y_.type.ndim)
            )
        return Apply(self, [x_, y_, ilist_], [x_.type()])

    def get_params(self, node):
        return self.params_type.get_params(
            self,
            context=node.outputs[0].type.context,
            # following params are used into c_init_code_struct().
            ndim_input_0=node.inputs[0].ndim,
            ndim_input_1=node.inputs[1].ndim,
            typecode_input_0=node.inputs[0].type.typecode,
            typecode_input_1=node.inputs[1].type.typecode,
        )

    # We can't use the parent version that loops on each index
    # as we also need to loop when set_instead_of_inc is True and the
    # parent doesn't loop in that case.
    def perform(self, node, inp, out_, params=None):
        # TODO opt to make this inplace
        x, y, idx = inp
        (out,) = out_
        if not self.inplace:
            x = x.copy()
        out[0] = x
        if len(idx) == 0:
            return
        # Make sure idx is not a GpuArray otherwise we cannot use its
        # content to index x and y (This is because we serve as
        # fallback for _dev20).
        if isinstance(idx, gpuarray.GpuArray):
            idx = np.asarray(idx)
        # If `y` has as many dimensions as `x`, then we want to iterate
        # jointly on `x` and `y`. Otherwise, it means `y` should be
        # broadcasted to fill all relevant rows of `x`.
        if y.ndim == x.ndim and y.shape[0] != 1:
            assert len(y) == len(idx)
            if self.set_instead_of_inc:
                for (j, i) in enumerate(idx):
                    x[i] = y[j]
            else:
                k = get_iadd(node.inputs[0], node.inputs[1])
                for (j, i) in enumerate(idx):
                    k(x[i], y[j], broadcast=True)
        else:
            if y.ndim == x.ndim:
                # First dim is always 1 in this case.
                reshaped_y = y.reshape(y.shape[1:])
            else:
                nb_dims_to_add = (x.ndim - 1) - y.ndim
                reshaped_y = y.reshape((1,) * nb_dims_to_add + y.shape)
            if self.set_instead_of_inc:
                for i in idx:
                    x[i] = reshaped_y
            else:
                k = get_iadd(node.inputs[0], node.inputs[1])
                for i in idx:
                    k(x[i], reshaped_y, broadcast=True)

    def c_headers(self, **kwargs):
        return [
            "<numpy_compat.h>",
            "<gpuarray/error.h>",
            "<gpuarray/array.h>",
            "<gpuarray/elemwise.h>",
            "gpuarray_helper.h",
        ]

    def c_header_dirs(self, **kwargs):
        return [gpuarray_helper_inc_dir()]

    def c_support_code_struct(self, node, nodename):
        # Per-node handle on the GpuElemwise implementing "a += b".
        return "\nGpuElemwise *iadd;\n"

    def c_init_code_struct(self, node, name, sub):
        # Build the "a += b" GpuElemwise once, at struct initialization.
        return """
        gpuelemwise_arg args[2] = {{0}};
        args[0].name = "a";
        args[0].typecode = %(params)s->typecode_input_0;
        args[0].flags = GE_READ|GE_WRITE;
        args[1].name = "b";
        args[1].typecode = %(params)s->typecode_input_1;
        args[1].flags = GE_READ;
        iadd = GpuElemwise_new(%(params)s->context->ctx, "", "a += b",
                               2, args, %(params)s->ndim_input_1, GE_CONVERT_F16);
        if (iadd == NULL) {
          PyErr_SetString(PyExc_RuntimeError, "Could not initialize inplace add support");
          %(fail)s
        }
        """ % dict(
            params=sub["params"], fail=sub["fail"]
        )

    def c_code(self, node, name, inputs, outputs, sub):
        if node.inputs[0].ndim != node.inputs[1].ndim:
            raise NotImplementedError("This case does not have C code yet.")

        # Row-by-row loop on the host; each row pair is sliced with
        # pygpu_index and then set (GpuArray_setarray) or accumulated
        # (GpuElemwise_call on iadd).  The custom %(fail)s frees the
        # malloc'd start/step buffers before propagating the error.
        return """
        PyGpuArrayObject *row_x, *row_y;
        size_t nd = %(params)s->ndim_input_0;
        ssize_t *start = NULL, *step = NULL;
        size_t num_indices, j;
        int ret;
        int broadcast_y;

        start = (ssize_t*)malloc(nd * sizeof(ssize_t));
        step = (ssize_t*)malloc(nd * sizeof(ssize_t));
        if (start == NULL || step == NULL) {
            PyErr_NoMemory();
            %(fail)s
        }

        for (j = 0; j < nd; ++j) {
          start[j] = 0;
          step[j] = 1;
        }
        step[0] = 0;
        num_indices = PyArray_SIZE(%(ind)s);
        if (!%(params)s->inplace) {
          %(out)s = aesara_try_copy(%(out)s, %(x)s);
          if (%(out)s == NULL) {
            // Exception already set
            %(fail)s
          }
        } else {
          Py_XDECREF(%(out)s);
          %(out)s = %(x)s;
          Py_INCREF(%(out)s);
        }
        if (num_indices != 0) {
          if ((num_indices - 1) > LONG_MAX) {
            PyErr_Format(PyExc_AssertionError,
                         "num_indices %%lld exceeds LONG_MAX + 1", (long long)num_indices);
            %(fail)s
          }
          broadcast_y = PyGpuArray_DIM(%(y)s, 0) == 1;
          for (j = 0; j < num_indices; j++) {
            start[0] = *(dtype_%(ind)s *)PyArray_GETPTR1(%(ind)s, j);
            if (start[0] < 0)
              start[0] += PyGpuArray_DIM(%(out)s, 0);
            if (start[0] < 0 || start[0] >= PyGpuArray_DIM(%(out)s, 0)) {
              PyErr_SetString(PyExc_IndexError, "index out of bounds");
              %(fail)s;
            }
            row_x = pygpu_index(%(out)s, start, (ssize_t *)PyGpuArray_DIMS(%(out)s), step);
            if (row_x == NULL)
              %(fail)s;
            if (broadcast_y)
              start[0] = 0;
            else
              start[0] = j;
            row_y = pygpu_index(%(y)s, start, (ssize_t *)PyGpuArray_DIMS(%(y)s), step);
            if (row_y == NULL) {
              Py_DECREF(row_x);
              %(fail)s;
            }
            if (%(params)s->set_instead_of_inc) {
              ret = GpuArray_setarray(&row_x->ga, &row_y->ga);
            } else {
              void *args[2];
              args[0] = (void *)&row_x->ga;
              args[1] = (void *)&row_y->ga;
              ret = GpuElemwise_call(iadd, args, GE_BROADCAST | GE_PADSHAPE);
            }
            Py_DECREF(row_x);
            Py_DECREF(row_y);
            if (ret != GA_NO_ERROR)
              PyErr_SetString(PyExc_RuntimeError, "Failed to set/inc elements");
          }
        }
        free(start);
        free(step);
        """ % dict(
            x=inputs[0],
            y=inputs[1],
            ind=inputs[2],
            out=outputs[0],
            params=sub["params"],
            fail="""
            {
              free(start);
              free(step);
              %(fail)s
            }
            """
            % dict(fail=sub["fail"]),
        )

    def c_code_cache_version(self):
        return (5,)
class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC, GpuAdvancedIncSubtensor1):
    """
    Implement AdvancedIncSubtensor1 on the gpu with atomics

    Uses a single kernel with atomic add/exchange instead of a host-side
    row loop; the C path only supports the 2-d case (see c_code).
    """

    _f16_ok = True
    params_type = GpuAdvancedIncSubtensor1.params_type
    get_params = GpuAdvancedIncSubtensor1.get_params

    def make_node(self, x, y, ilist):
        """
        It differs from GpuAdvancedIncSubtensor1 in that it makes sure
        the indexes are of type long.
        """
        # Unlike the parent, ilist is also moved to the GPU, and y is
        # cast to x's dtype before the transfer.
        ctx_name = infer_context_name(x, y, ilist)
        x_ = as_gpuarray_variable(x, ctx_name)
        y_ = as_gpuarray_variable(y.astype(x.dtype), ctx_name)
        ilist_ = as_gpuarray_variable(ilist, ctx_name)
        assert x_.type.ndim >= y_.type.ndim
        if ilist_.type.dtype not in integer_dtypes:
            raise TypeError("index must be integers")
        if ilist_.type.ndim != 1:
            raise TypeError("index must be vector")
        if x_.type.ndim == 0:
            raise TypeError("cannot index into a scalar")
        if y_.type.ndim > x_.type.ndim:
            if self.set_instead_of_inc:
                opname = "set"
            else:
                opname = "increment"
            raise TypeError(
                "cannot %s x subtensor with ndim=%s by y with ndim=%s "
                % (opname, x_.type.ndim, y_.type.ndim)
            )
        return Apply(self, [x_, y_, ilist_], [x_.type()])

    def perform(self, node, inp, out, params):
        # Python fallback: reuse the parent's loop-based implementation.
        return super().perform(node, inp, out)

    def c_code_cache_version(self):
        return (14,)

    def c_headers(self, **kwargs):
        return ["<numpy_compat.h>", "<gpuarray_helper.h>", "<gpuarray/types.h>"]

    def c_header_dirs(self, **kwargs):
        return [gpuarray_helper_inc_dir()]

    def c_code(self, node, name, inputs, outputs, sub):
        # The kernel path only handles matrices (x and y both 2-d).
        if node.inputs[0].ndim != node.inputs[1].ndim or node.inputs[0].ndim != 2:
            raise NotImplementedError("This case does not have C code yet.")

        return """
        int err;
        if (%(params)s->inplace) {
          Py_XDECREF(%(out)s);
          %(out)s = %(x)s;
          Py_INCREF(%(out)s);
        } else {
          %(out)s = aesara_try_copy(%(out)s, %(x)s);
        }
        if (!%(out)s) {
          // Exception already set
          %(fail)s
        }
        if (GpuArray_vector_add_fast(%(out)s, %(y)s, %(ind)s, %(params)s->set_instead_of_inc)) {
          %(fail)s
        }
        """ % dict(
            x=inputs[0],
            y=inputs[1],
            ind=inputs[2],
            out=outputs[0],
            fail=sub["fail"],
            params=sub["params"],
        )

    def gpu_kernels(self, node, nodename):
        # We can't rely on numpy for this, it changes with the OS
        CHARMAP = dict(
            int32="i",
            uint32="I",
            int64="l",
            uint64="L",
            float16="e",
            float32="f",
            float64="d",
        )
        dtype_x = node.inputs[0].dtype
        dtype_y = node.inputs[1].dtype
        dtype_ind = node.inputs[2].dtype
        type_x = gpuarray.dtype_to_ctype(dtype_x)
        type_y = gpuarray.dtype_to_ctype(dtype_y)
        type_ind = gpuarray.dtype_to_ctype(dtype_ind)
        flags = Kernel.get_flags(dtype_x, dtype_y, dtype_ind)
        kname = "k_vector_add_fast"
        k_var = "k_vector_add_fast_" + nodename
        # One block per index (grid dim 0), one thread per column
        # (local dim 0); out-of-bounds rows raise through *err.
        code = """#include "cluda.h"
        KERNEL void k_vector_add_fast(const ga_size numRowsX,
                                      const ga_size numColsX,
                                      const ga_ssize stridesX0,
                                      const ga_ssize stridesX1,
                                      GLOBAL_MEM %(type_x)s *X,
                                      const ga_size offset_X,
                                      const ga_size numRowsY,
                                      const ga_size numColsY,
                                      const ga_ssize stridesY0,
                                      const ga_ssize stridesY1,
                                      GLOBAL_MEM %(type_y)s *Y,
                                      const ga_size offset_Y,
                                      const ga_size numIndices,
                                      const ga_ssize stridesIndices,
                                      GLOBAL_MEM %(type_ind)s *indices_arr,
                                      const ga_size offset_indices_arr,
                                      const ga_int set_instead_of_inc,
                                      GLOBAL_MEM ga_int *err)
        {
             X = (GLOBAL_MEM %(type_x)s *)(((GLOBAL_MEM char *)X)+offset_X);
             Y = (GLOBAL_MEM %(type_y)s *)(((GLOBAL_MEM char *)Y)+offset_Y);
             indices_arr = (GLOBAL_MEM %(type_ind)s *)(((GLOBAL_MEM char *)indices_arr)+offset_indices_arr);
             for (ga_int i = GID_0; i < numIndices; i += GDIM_0)
             {
                  for (ga_int j = LID_0; j < numColsX; j += LDIM_0)
                  {
                      ga_ssize x_row = indices_arr[i * stridesIndices];
                      if (x_row < 0)
                          x_row += numRowsX;
                      ga_ssize y_row = i;
                      if (x_row < numRowsX && x_row >= 0) {
                        if (set_instead_of_inc) {
                          atom_xchg_%(tc)sg(&X[(x_row * stridesX0) + (j * stridesX1)],
                                            Y[(y_row * stridesY0) + (j * stridesY1)]);
                        } else {
                          atom_add_%(tc)sg(&X[(x_row * stridesX0) + (j * stridesX1)],
                                           Y[(y_row * stridesY0) + (j * stridesY1)]);
                        }
                      } else {
                        *err = 1;
                      }
                  }
             }
             return;
        }
        """ % dict(
            type_x=type_x, type_y=type_y, type_ind=type_ind, tc=CHARMAP[dtype_x]
        )
        from pygpu.gpuarray import SIZE, SSIZE

        # Parameter signature, in the same order as the kernel arguments.
        params = [
            SIZE,
            SIZE,
            SSIZE,
            SSIZE,
            gpuarray.GpuArray,
            SIZE,
            SIZE,
            SIZE,
            SSIZE,
            SSIZE,
            gpuarray.GpuArray,
            SIZE,
            SIZE,
            SSIZE,
            gpuarray.GpuArray,
            SIZE,
            "int32",
            gpuarray.GpuArray,
        ]
        return [Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)]

    def c_support_code_struct(self, node, nodename):
        # Host-side launcher: computes strides in elements, launches the
        # kernel and converts the device-side err flag into IndexError.
        return (
            super().c_support_code_struct(node, nodename)
            + """
        int GpuArray_vector_add_fast(PyGpuArrayObject* py_self,
                                     PyGpuArrayObject* py_other,
                                     PyGpuArrayObject* indices_arr,
                                     const int set_instead_of_inc)
        {
          size_t threads_per_block = std::min(PyGpuArray_DIMS(py_self)[1], (size_t)256);
          size_t n_blocks = std::min(PyGpuArray_SIZE(indices_arr), (size_t)4096);
          gpudata *errbuf;
          int err, kerr = 0;
          size_t itemsize_x = GpuArray_ITEMSIZE(&py_self->ga);
          size_t itemsize_y = GpuArray_ITEMSIZE(&py_other->ga);
          size_t itemsize_ind = GpuArray_ITEMSIZE(&indices_arr->ga);
          if (threads_per_block > 0 && n_blocks > 0) {
            err = gpudata_property(py_self->ga.data,
                                   GA_CTX_PROP_ERRBUF, &errbuf);
            if (err != GA_NO_ERROR) {
              PyErr_SetString(PyExc_RuntimeError, "Can't fetch error buffer");
              return 1;
            }
            err = k_vector_add_fast_call(
              1, &n_blocks, &threads_per_block, 0,
              PyGpuArray_DIMS(py_self)[0],
              PyGpuArray_DIMS(py_self)[1],
              PyGpuArray_STRIDES(py_self)[0] / itemsize_x,
              PyGpuArray_STRIDES(py_self)[1] / itemsize_x,
              py_self->ga.data,
              py_self->ga.offset,
              PyGpuArray_DIMS(py_other)[0],
              PyGpuArray_DIMS(py_other)[1],
              PyGpuArray_DIMS(py_other)[0] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[0] / itemsize_y,
              PyGpuArray_DIMS(py_other)[1] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[1] / itemsize_y,
              py_other->ga.data,
              py_other->ga.offset,
              PyGpuArray_DIMS(indices_arr)[0],
              PyGpuArray_STRIDES(indices_arr)[0] / itemsize_ind,
              indices_arr->ga.data,
              indices_arr->ga.offset,
              set_instead_of_inc,
              errbuf);
            if (err != GA_NO_ERROR) {
              PyErr_Format(PyExc_RuntimeError,
                           "gpuarray error: %(k_var)s: %%s.",
                           GpuKernel_error(&%(k_var)s, err));
              return 1;
            }
            err = gpudata_read(&kerr, errbuf, 0, sizeof(int));
            if (err != GA_NO_ERROR) {
              PyErr_SetString(PyExc_RuntimeError, "Can't read error buffer");
              return 1;
            }
            if (kerr != 0) {
              PyErr_SetString(PyExc_IndexError, "Index out of bounds");
              kerr = 0;
              gpudata_write(errbuf, 0, &kerr, sizeof(int));
              return 1;
            }
          }
          return 0;
        }
        """
            % dict(k_var="k_vector_add_fast_" + nodename)
        )
class GpuExtractDiag(Op):
    # Extract diagonals of a GPU array along (axis1, axis2); the
    # diagonal becomes the last axis of the output, matching the
    # interface of numpy.diagonal.  With view=True, returns a strided
    # view on the input instead of a copy.
    __props__ = ("offset", "axis1", "axis2", "view")
    _f16_ok = True

    def __init__(self, offset=0, axis1=0, axis2=1, view=False):
        self.view = view
        if self.view:
            self.view_map = {0: [0]}
        self.offset = offset
        self.axis1 = axis1
        self.axis2 = axis2

    def make_node(self, _x):
        ctx_name = infer_context_name(_x)
        x = as_gpuarray_variable(_x, ctx_name)
        if x.ndim < 2:
            raise ValueError("Diagonal needs an input with 2 or more " "dimensions", x)
        # The two diagonal axes are removed and a new (non-broadcastable)
        # diagonal axis is appended at the end.
        axis_small, axis_large = sorted((self.axis1, self.axis2))
        broadcastable = (
            x.broadcastable[:axis_small]
            + x.broadcastable[axis_small + 1 : axis_large]
            + x.broadcastable[axis_large + 1 :]
            + (False,)
        )
        return Apply(self, [x], [x.type.clone(broadcastable=broadcastable)()])

    def perform(self, node, inputs, outputs):
        (x,) = inputs
        (z,) = outputs
        # zero-dimensional matrices ...
        if x.size == 0:
            out_shape = [
                d for i, d in enumerate(x.shape) if i not in (self.axis1, self.axis2)
            ]
            diag_size = np.min((x.shape[self.axis1], x.shape[self.axis2]))
            out_shape.append(diag_size)
            z[0] = node.outputs[0].type.value_zeros(tuple(out_shape))
            return
        # step 1) slicing on axis1 and axis2.
        # For a non-negative offset the diagonal starts at column
        # `offset`, so axis2 is the sliced axis; reversed otherwise.
        if self.offset >= 0:
            stride_axis, slice_axis = self.axis1, self.axis2
        else:
            slice_axis, stride_axis = self.axis1, self.axis2
        small_axis, large_axis = sorted((x.shape[self.axis1], x.shape[self.axis2]))
        if x.shape[stride_axis] < x.shape[slice_axis]:
            # in the bigger triangle
            numstride = small_axis - np.max(
                (0, small_axis + np.abs(self.offset) - large_axis)
            )
        else:
            # in the smaller triangle
            numstride = small_axis - np.abs(self.offset)
        slicer = [
            np.s_[:],
        ] * x.ndim
        slicer[stride_axis] = np.s_[:numstride]
        slicer[slice_axis] = np.abs(self.offset)
        slicer = tuple(slicer)
        # step 2) Swap stride_axis to the last dim because we want the dim on
        # which the diags extracted be listed as the last dim of the tensor.
        # This is also in consistence with the interface of numpy.diagonal.
        if slice_axis < stride_axis:
            stride_axis -= 1
        new_dim_order = list(range(x[slicer].ndim))
        new_dim_order = tuple(
            new_dim_order[:stride_axis]
            + new_dim_order[stride_axis + 1 :]
            + [
                stride_axis,
            ]
        )
        rval = x[slicer].transpose(new_dim_order)
        # step 3) modify the strides in the last axis, such that rval becomes
        # a view on the diagonal.
        other_strides = tuple(
            [d for i, d in enumerate(x.strides) if i not in (self.axis1, self.axis2)]
        )
        # Stepping by (stride1 + stride2) walks down the diagonal.
        rval.strides = other_strides + (x.strides[self.axis1] + x.strides[self.axis2],)
        if self.view:
            z[0] = rval
        else:
            z[0] = rval.copy()

    def grad(self, inputs, gout):
        # Gradient is not implemented for this GPU Op.
        (input_x,) = inputs
        return [grad_not_implemented(self, 0, input_x)]

    def infer_shape(self, fgraph, node, shapes):
        (in_shape,) = shapes
        dim1 = in_shape[self.axis1]
        dim2 = in_shape[self.axis2]
        out_shape = [
            d for i, d in enumerate(in_shape) if i not in (self.axis1, self.axis2)
        ]
        # The following logic is inspired by C code of PyArray_Diagonal().
        offset = self.offset
        if offset > 0:
            diag_size = clip(dim2 - offset, 0, dim1)
        elif offset < 0:
            diag_size = clip(dim1 + offset, 0, dim2)
        else:
            diag_size = minimum(dim1, dim2)
        out_shape.append(diag_size)
        return [tuple(out_shape)]
class GpuAllocDiag(AllocDiag):
    # GPU counterpart of AllocDiag: build a (larger) zero array with the
    # input written along the (offset) diagonal of axes (axis1, axis2).
    __props__ = ("offset", "axis1", "axis2")

    def make_node(self, diag):
        ctx_name = infer_context_name(diag)
        diag = as_gpuarray_variable(diag, ctx_name)
        if diag.type.ndim < 1:
            raise ValueError(
                "AllocDiag needs an input with 1 or more " "dimensions", diag.type
            )
        # Output gains one dimension; nothing is broadcastable.
        return Apply(
            self,
            [diag],
            [
                diag.type.__class__(
                    dtype=diag.dtype, broadcastable=[False] * (diag.ndim + 1)
                )()
            ],
        )

    def perform(self, node, inputs, outputs):
        (x,) = inputs
        (z,) = outputs
        axis1 = np.minimum(self.axis1, self.axis2)
        axis2 = np.maximum(self.axis1, self.axis2)
        offset = self.offset
        # Initialise a buffer the same size as the output
        result_shape = x.shape[:-1] + (x.shape[-1] + abs(offset),) * 2
        result_buffer_shape = (np.prod(x.shape[:-1]).astype(np.int64),) + (
            x.shape[-1] + abs(offset),
        ) * 2
        result_buffer = gpuarray.zeros(
            result_buffer_shape, dtype=x.dtype, context=x.context
        )
        # Slice out a view of the diagonals
        if offset < 0:  # diag in the lower triangle
            diag_view = result_buffer[:, abs(offset) :, 0]
        else:  # diag in the upper triangle
            diag_view = result_buffer[:, : x.shape[-1], abs(offset)]
        # Adding itemsize to the column stride makes the view walk the
        # diagonal of each matrix in the buffer.
        diag_view.strides = (
            diag_view.strides[0],
            diag_view.strides[1] + x.dtype.itemsize,
        )
        # Fill view with flattened array of diagonals
        diag_view[:] = x.reshape(diag_view.shape)[:]
        # Unflatten buffer into output size
        result = result_buffer.reshape(result_shape)
        if len(x.shape) > 1:
            # Re-order axes so they correspond to diagonals at axis1, axis2
            axes = list(range(len(x.shape[:-1])))
            last_idx = axes[-1]
            axes = axes[:axis1] + [last_idx + 1] + axes[axis1:]
            axes = axes[:axis2] + [last_idx + 2] + axes[axis2:]
            result = result.transpose(axes)
        z[0] = result

    def grad(self, inputs, gout):
        # The gradient of alloc-diag is extracting the same diagonal.
        (gz,) = gout
        return [
            GpuExtractDiag(offset=self.offset, axis1=self.axis1, axis2=self.axis2)(gz)
        ]
import copyreg
import os
import sys
import warnings
import numpy as np
import aesara
import aesara.scalar as aes
import aesara.tensor as at
import aesara.tensor.basic
from aesara.compile import SharedVariable
from aesara.configdefaults import config
from aesara.graph.basic import Constant, Variable
from aesara.link.c.type import CType
from aesara.misc.safe_asarray import _asarray
from aesara.tensor.shape import (
register_shape_c_code,
register_shape_i_c_code,
register_specify_shape_c_code,
)
from aesara.tensor.type import TensorType, complex_dtypes, discrete_dtypes
from aesara.tensor.type import values_eq_approx as tensor_values_eq_approx
from aesara.tensor.type import (
values_eq_approx_remove_inf as tensor_values_eq_approx_remove_inf,
)
from aesara.tensor.type import (
values_eq_approx_remove_inf_nan as tensor_values_eq_approx_remove_inf_nan,
)
from aesara.tensor.type import (
values_eq_approx_remove_nan as tensor_values_eq_approx_remove_nan,
)
from aesara.tensor.var import TensorConstantSignature, _tensor_py_operators
# Make sure this is importable even if pygpu is absent
# (it will not work though)
try:
import pygpu
from pygpu import gpuarray
from pygpu.elemwise import compare, elemwise2
except ImportError:
pygpu = None
# Registry mapping context names to GpuContext objects; populated by
# reg_context() and queried by get_context().
_context_reg = {}
def gpu_supported(data):
    """
    Tell whether ``data`` can be stored on the GPU.

    Complex dtypes are the only unsupported ones at the moment.

    Parameters
    ----------
    data : numpy.ndarray or TensorVariable
        Any object exposing ``dtype`` and ``ndim`` attributes.
    """
    dtype_name = str(data.dtype)
    return dtype_name not in complex_dtypes
def move_to_gpu(data):
    """
    Decide whether this computation should be moved to the GPU.

    Complex data has no GPU support and scalars are not worth the
    transfer overhead, so both stay on the CPU.

    Parameters
    ----------
    data : numpy.ndarray or TensorVariable
        Any object exposing ``dtype`` and ``ndim`` attributes.
    """
    return gpu_supported(data) and data.ndim != 0
class ContextNotDefined(ValueError):
    """Raised when looking up a context name that was never registered."""
def reg_context(name, ctx):
    """
    Register a context by mapping it to a name.

    Only one context may be registered per name; registering the same
    name twice raises ``ValueError``.

    Parameters
    ----------
    name : hashable object
        Name to associate the context with (usually a string).
    ctx : GpuContext
        Context instance to register.
    """
    if name in _context_reg:
        raise ValueError(f"context name {name} is already defined")
    if not isinstance(ctx, gpuarray.GpuContext):
        raise TypeError("context is not GpuContext")
    _context_reg[name] = ctx
def get_context(name):
    """
    Retrieve the context associated with a name.

    Returns the context previously registered under ``name`` through
    :func:`reg_context`; raises :class:`ContextNotDefined` otherwise.

    Parameters
    ----------
    name : hashable object
        Name associated with the context we want (usually a string).
    """
    if name in _context_reg:
        return _context_reg[name]
    raise ContextNotDefined(f"context name {name} not defined")
def list_contexts():
    """Return a view over every context name currently registered."""
    return _context_reg.keys()
# Private method
def _name_for_ctx(ctx):
    """Reverse lookup: return the name under which ``ctx`` was registered."""
    for registered_name, registered_ctx in _context_reg.items():
        if registered_ctx == ctx:
            return registered_name
    raise ContextNotDefined("context is not registered")
# This is a private method for use by the tests only
def _unreg_context(name):
    """Drop ``name`` from the context registry (testing helper)."""
    del _context_reg[name]
class GpuArrayType(CType):
"""
The type that represents an array on a gpu.
The `dtype` indicates what scalar data type the elements of
variables of this type will be.
`broadcastable` indicates whether each dimension is broadcastable
or not (to be broadcastable a dimension must always be of length
1).
The `context_name` is the name of the context on will values of
variables of this type will be stored.
Parameters
----------
dtype : str
The name of a numpy dtype
broadcastable : tuple of bools
A tuple that indicates both the number of dimensions (by its
length) and whether those dimensions are broadcastable or not
(by the boolean values).
context_name : str
The name of the context the that this type is attached to
(default: None, which is the context specified by
config.device).
name : string, optional
A name for the type that will be used in printouts.
Attributes
----------
dtype : str
Data type used for scalar elements of variables.
broadcastable : tuple of bools
Indicates whether the dimensions are broadcastable or not.
ndim : int
The number of dimensions
context_name : str
The name of a gpu context on which variables will have their values.
name : str
A string used to print the type if given.
typecode : int
The gpuarray typecode for `dtype`
See Also
--------
aesara.graph.type.Type
"""
def __init__(self, dtype, broadcastable, context_name=None, name=None):
# In case this was not provided and no global value is available
self.dtype = str(dtype)
self.broadcastable = tuple(bool(b) for b in broadcastable)
self.ndim = len(self.broadcastable)
self.name = name
self.context_name = context_name
# This will check that the passed context name is valid and registered.
get_context(self.context_name)
try:
self.typecode = gpuarray.dtype_to_typecode(self.dtype)
except gpuarray.GpuArrayException:
raise TypeError(
f"Unsupported dtype for {self.__class__.__name__}: {self.dtype}"
)
def clone(self, dtype=None, broadcastable=None):
if dtype is None:
dtype = self.dtype
if broadcastable is None:
broadcastable = self.broadcastable
return self.__class__(
dtype=dtype,
broadcastable=broadcastable,
context_name=self.context_name,
name=self.name,
)
    # This is a property to keep the type pickleable
    @property
    def context(self):
        """
        The context object mapped to the type's :attr:`context_name`.
        This is a property (looked up lazily, so only the name needs to
        be pickled).
        """
        return get_context(self.context_name)
def __repr__(self):
# Inspired from TensorType.
if self.name:
return self.name
else:
b = self.broadcastable
named_broadcastable = {
tuple(): "scalar",
(False,): "vector",
(False, True): "col",
(True, False): "row",
(False, False): "matrix",
}
if b in named_broadcastable:
bcast = named_broadcastable[b]
elif any(b):
bcast = str(b)
else:
bcast = f"{len(b)}D"
return f"GpuArrayType<{self.context_name}>({self.dtype}, {bcast})"
    def filter(self, data, strict=False, allow_downcast=None):
        # Validate/convert `data` to this type; same as filter_inplace
        # but without a previous buffer to reuse.
        return self.filter_inplace(
            data, None, strict=strict, allow_downcast=allow_downcast
        )
    def filter_inplace(self, data, old_data, strict=False, allow_downcast=None):
        """
        Convert ``data`` to a GpuArray of this type, reusing the device
        buffer of ``old_data`` when shape/contiguity allow it.

        Parameters
        ----------
        data
            The value to filter; host data is transferred to the GPU.
        old_data : gpuarray.GpuArray or None
            Previous value whose buffer may be overwritten in place.
        strict : bool
            If True, ``data`` must already be a GpuArray with this
            type's exact typecode and context; no conversion is done.
        allow_downcast : bool or None
            Whether a lossy dtype conversion is permitted.

        Raises
        ------
        TypeError
            On dtype/ndim/broadcast mismatch or precision-losing casts.
        """
        if isinstance(data, gpuarray.GpuArray) and data.typecode == self.typecode:
            # This is just to make this condition not enter the
            # following branches
            pass
        elif strict:
            if not isinstance(data, gpuarray.GpuArray):
                raise TypeError(f"{self} expected a GpuArray object.", data, type(data))
            if self.typecode != data.typecode:
                raise TypeError(
                    f"{self} expected typecode {int(self.typecode)} (dtype {self.dtype}), "
                    f"got {int(data.typecode)} (dtype {data.dtype})."
                )
            if self.context != data.context:
                raise TypeError("data context does not match type context")
            # fallthrough to ndim check
        elif allow_downcast or (
            allow_downcast is None
            and isinstance(data, float)
            and self.dtype == config.floatX
        ):
            # Downcast permitted: force-convert to this type's dtype.
            if not isinstance(data, gpuarray.GpuArray):
                data = np.array(
                    data, dtype=self.dtype, copy=False, ndmin=len(self.broadcastable)
                )
            else:
                data = gpuarray.array(
                    data,
                    dtype=self.typecode,
                    copy=False,
                    ndmin=len(self.broadcastable),
                    context=self.context,
                )
        else:
            if not hasattr(data, "dtype"):
                converted_data = _asarray(data, self.dtype)
                # We use the `values_eq` static function from TensorType
                # to handle NaN values.
                if TensorType.values_eq(
                    np.asarray(data), converted_data, force_same_dtype=False
                ):
                    data = converted_data
            # Only accept the conversion when it cannot lose precision.
            up_dtype = aes.upcast(self.dtype, data.dtype)
            if up_dtype == self.dtype:
                if not isinstance(data, gpuarray.GpuArray):
                    data = np.array(data, dtype=self.dtype, copy=False)
                else:
                    data = gpuarray.array(data, dtype=self.dtype, copy=False)
            else:
                raise TypeError(
                    f"{self} cannot store a value of dtype {data.dtype} "
                    "without risking loss of precision."
                )
        if self.ndim != data.ndim:
            raise TypeError(
                f"Wrong number of dimensions: expected {self.ndim}, "
                f"got {data.ndim} with shape {data.shape}.",
                data,
            )
        shp = data.shape
        for i, b in enumerate(self.broadcastable):
            if b and shp[i] != 1:
                raise TypeError(
                    "Non-unit value on shape on a broadcastable" " dimension.",
                    shp,
                    self.broadcastable,
                )
        if not isinstance(data, gpuarray.GpuArray):
            if (
                old_data is not None
                and old_data.shape == data.shape
                and (
                    # write() only work if the destination is contiguous.
                    old_data.flags["C_CONTIGUOUS"]
                    or old_data.flags["F_CONTIGUOUS"]
                )
            ):
                # Reuse the existing device buffer instead of allocating.
                old_data.write(data)
                data = old_data
            else:
                data = pygpu.array(data, context=self.context)
        return data
    def filter_variable(self, other, allow_convert=True):
        """
        Convert the symbolic variable ``other`` into a variable of this
        type (same dtype, ndim, broadcast pattern and context), raising
        ``TypeError`` when no safe conversion exists.
        """
        if hasattr(other, "_as_GpuArrayVariable"):
            other = other._as_GpuArrayVariable(self.context_name)
        if not isinstance(other, Variable):
            # Wrap raw data as a constant of this type.
            other = self.constant_type(type=self, data=other)
        if other.type == self:
            return other
        if not isinstance(other.type, (TensorType, GpuArrayType)):
            raise TypeError("Incompatible type", (self, other.type))
        if other.type.dtype != self.dtype:
            raise TypeError("Incompatible dtype", (self.dtype, other.type.dtype))
        if other.type.ndim != self.ndim:
            raise TypeError(
                "Incompatible number of dimensions."
                f" Expected {int(self.ndim)}, got {int(other.ndim)}."
            )
        if other.type.broadcastable != self.broadcastable:
            if allow_convert:
                # Try rebroadcasting to this type's pattern.
                type2 = other.type.clone(broadcastable=self.broadcastable)
                other2 = type2.convert_variable(other)
            else:
                other2 = None
            if other2 is None:
                raise TypeError(
                    "Incompatible broadcastable dimensions."
                    f" Expected {other.type.broadcastable}, got {self.broadcastable}."
                )
            other = other2
        # Finally move the variable onto this type's context.
        return other.transfer(self.context_name)
    @staticmethod
    def values_eq(a, b, force_same_dtype=True):
        # Element-wise equality of two GpuArrays, treating NaNs in the
        # same positions as equal (NaN != NaN element-wise).
        if a.shape != b.shape:
            return False
        if force_same_dtype and a.typecode != b.typecode:
            return False
        a_eq_b = np.asarray(compare(a, "==", b))
        if a_eq_b.all():
            return True
        # maybe the trouble is that there are NaNs
        a = np.asarray(a)
        b = np.asarray(b)
        a_missing = np.isnan(a)
        if a_missing.any():
            b_missing = np.isnan(b)
            # Positions must be equal OR be NaN in both arrays alike.
            return np.all(a_eq_b + (a_missing == b_missing))
        else:
            return False
    @staticmethod
    def values_eq_approx(
        a, b, allow_remove_inf=False, allow_remove_nan=False, rtol=None, atol=None
    ):
        # Delegate to the module-level helper of the same name.
        return values_eq_approx(a, b, allow_remove_inf, allow_remove_nan, rtol, atol)

    @staticmethod
    def may_share_memory(a, b):
        # Only two GpuArrays can possibly overlap in device memory.
        if not isinstance(a, gpuarray.GpuArray) or not isinstance(b, gpuarray.GpuArray):
            return False
        return pygpu.gpuarray.may_share_memory(a, b)

    def value_zeros(self, shape):
        # Allocate a zero-filled GpuArray of this type's dtype on its context.
        return pygpu.gpuarray.zeros(shape, dtype=self.typecode, context=self.context)
def __eq__(self, other):
return (
type(self) == type(other)
and self.typecode == other.typecode
and self.broadcastable == other.broadcastable
and self.context_name == other.context_name
)
    def convert_variable(self, var):
        # Return `var` rebroadcast to this type's pattern when compatible
        # (same class, typecode, ndim, context and each non-broadcastable
        # dim of self matches var); implicitly returns None otherwise.
        vt = var.type
        if (
            isinstance(vt, type(self))
            and self.typecode == vt.typecode
            and self.ndim == vt.ndim
            and self.context_name == vt.context_name
            and all(
                sb == ob or ob for sb, ob in zip(self.broadcastable, vt.broadcastable)
            )
        ):
            return at.patternbroadcast(var, self.broadcastable)
def __hash__(self):
return hash((type(self), self.typecode, self.broadcastable, self.context_name))
def dtype_specs(self):
    """
    Return a tuple (python type, c type, numpy typenum) that corresponds
    to ``self.dtype``.

    This function is used internally as part of C code generation.

    Raises
    ------
    TypeError
        If ``self.dtype`` has no registered specification (e.g. the
        complex dtypes, which are not supported here).
    """
    specs = {
        "float16": (float, "npy_float16", "NPY_FLOAT16"),
        "float32": (float, "npy_float32", "NPY_FLOAT32"),
        "float64": (float, "npy_float64", "NPY_FLOAT64"),
        "bool": (int, "npy_bool", "NPY_BOOL"),
        "uint8": (int, "npy_uint8", "NPY_UINT8"),
        "int8": (int, "npy_int8", "NPY_INT8"),
        "uint16": (int, "npy_uint16", "NPY_UINT16"),
        "int16": (int, "npy_int16", "NPY_INT16"),
        "uint32": (int, "npy_uint32", "NPY_UINT32"),
        "int32": (int, "npy_int32", "NPY_INT32"),
        "uint64": (int, "npy_uint64", "NPY_UINT64"),
        "int64": (int, "npy_int64", "NPY_INT64"),
        # complex64 / complex128 are intentionally unsupported.
    }
    try:
        return specs[self.dtype]
    except KeyError:
        raise TypeError(
            f"Unsupported dtype for {self.__class__.__name__}: {self.dtype}"
        )
def get_shape_info(self, obj):
    """Return the shape of ``obj``; this is the value later consumed by
    :meth:`get_size`."""
    return obj.shape
def get_size(self, shape_info):
    """Return the size in bytes of an array whose shape is ``shape_info``.

    An empty/falsy shape (a scalar) counts as one element.
    """
    itemsize = np.dtype(self.dtype).itemsize
    if not shape_info:
        return itemsize
    return np.prod(shape_info) * itemsize
def c_element_type(self):
    """Return the C scalar type name for ``self.dtype``, as reported by
    pygpu."""
    return pygpu.gpuarray.dtype_to_ctype(self.dtype)
def c_declare(self, name, sub, check_input=True):
    """Return C code declaring the storage variable for this type."""
    return f"""
    PyGpuArrayObject *{name};
    """
def c_init(self, name, sub):
    """Return C code initializing the storage variable to NULL."""
    return f"{name} = NULL;"
def c_extract(self, name, sub, check_input=True, **kwargs):
    """Return C code extracting ``py_<name>`` into a PyGpuArrayObject*.

    Rejects ``None`` and non-GpuArray objects with a ValueError; on
    success takes a new reference to the object.
    """
    # TODO I don't check broadcast stuff for now.
    return """
    %(name)s = NULL;
    if (py_%(name)s == Py_None) {
        PyErr_SetString(PyExc_ValueError, "expected a GpuArray, not None");
        %(fail)s
    }
    /* First check if we are the base type exactly (the most common case),
       then do the full subclass check if needed. */
    if (py_%(name)s->ob_type != &PyGpuArrayType &&
        !PyObject_TypeCheck(py_%(name)s, &PyGpuArrayType)) {
        PyErr_SetString(PyExc_ValueError, "expected a GpuArray");
        %(fail)s
    }
    %(name)s = (PyGpuArrayObject *)py_%(name)s;
    Py_INCREF(%(name)s);
    """ % {
        "name": name,
        "fail": sub["fail"],
    }
def c_cleanup(self, name, sub):
    """Return C code releasing the storage variable's reference."""
    return f"Py_XDECREF({name}); {name} = NULL;"
def c_sync(self, name, sub):
    """Return C code writing the computed value back to ``py_<name>``.

    A NULL storage pointer is synced as ``Py_None``; otherwise the
    Python-level reference is swapped to the new GpuArray.
    """
    return """
    if (!%(name)s) {
        Py_XDECREF(py_%(name)s);
        Py_INCREF(Py_None);
        py_%(name)s = Py_None;
    } else if ((void *)py_%(name)s != (void *)%(name)s) {
        Py_XDECREF(py_%(name)s);
        py_%(name)s = (PyObject *)%(name)s;
        Py_INCREF(py_%(name)s);
    }
    """ % {
        "name": name
    }
def c_init_code(self, **kwargs):
    """Return C snippets executed once at module initialization."""
    # We don't actually need the numpy API except in
    # HostFromGpu and GpuFromHost; those cases will be covered
    # by the TensorType parameter.
    return ["import_pygpu__gpuarray();"]
def c_headers(self, **kwargs):
    """Return the C headers needed by generated code for this type."""
    # We need arrayobject for the PyArrayDescr struct def
    # (even if we just use a pointer to it in a function def).
    return [
        "<gpuarray/array.h>",
        "<gpuarray/kernel.h>",
        "<gpuarray/error.h>",
        "<gpuarray/buffer.h>",
        "<gpuarray/buffer_blas.h>",
        "<numpy/arrayobject.h>",
        "<gpuarray_api.h>",
    ]
def c_header_dirs(self, **kwargs):
    """Return include directories: pygpu's and numpy's, plus any existing
    ``Library/include``/``include`` directories under ``sys.exec_prefix``."""
    candidates = (
        os.path.abspath(os.path.normpath(sys.exec_prefix + "/" + sub_dir))
        for sub_dir in ("Library/include", "include")
    )
    extra_dirs = [d for d in candidates if os.path.exists(d) and os.path.isdir(d)]
    return [pygpu.get_include(), np.get_include()] + extra_dirs
def c_lib_dirs(self, **kwargs):
    """Return existing library directories under ``sys.exec_prefix``
    (``Library/lib`` and ``lib``), if any."""
    candidates = (
        os.path.abspath(os.path.normpath(sys.exec_prefix + "/" + sub_dir))
        for sub_dir in ("Library/lib", "lib")
    )
    return [d for d in candidates if os.path.exists(d) and os.path.isdir(d)]
def c_libraries(self, **kwargs):
    """Return the libraries to link against (libgpuarray)."""
    return ["gpuarray"]
def c_code_cache_version(self):
    """Return the C code cache key: (2, libgpuarray major ABI version)."""
    ver = pygpu.gpuarray.abi_version()
    # We only use the major version since the minor revisions are compatible.
    return (2, ver[0])
def values_eq_approx(
    a, b, allow_remove_inf=False, allow_remove_nan=False, rtol=None, atol=None
):
    """Approximate equality for GpuArrays.

    Discrete dtypes are compared exactly; float dtypes within
    ``atol``/``rtol`` tolerances.  When neither ``allow_remove_inf`` nor
    ``allow_remove_nan`` is requested, a fast GPU-side elementwise check
    is attempted first; otherwise (or if that check fails) the arrays
    are copied to the host and compared with
    ``TensorType.values_eq_approx``.
    """
    if a.shape != b.shape or a.dtype != b.dtype:
        return False
    if str(a.dtype) in discrete_dtypes:
        # Integer/bool data: exact comparison.
        return GpuArrayType.values_eq(a, b)
    else:
        if not (allow_remove_inf or allow_remove_nan):
            atol_, rtol_ = aesara.tensor.math._get_atol_rtol(a, b)
            if rtol is not None:
                rtol_ = rtol
            if atol is not None:
                atol_ = atol
            # The tolerance values atol_/rtol_ are interpolated into the
            # kernel template below via ``% locals()``.
            res = elemwise2(
                a,
                "",
                b,
                a,
                odtype=np.dtype("bool"),
                op_tmpl="res = (fabs(a - b) <"
                "(%(atol_)s + %(rtol_)s * fabs(b)))" % locals(),
            )
            ret = np.asarray(res).all()
            if ret:
                return True
        # Slow path: compare host-side copies, which also supports
        # allow_remove_inf / allow_remove_nan.
        an = np.asarray(a)
        bn = np.asarray(b)
        return TensorType.values_eq_approx(
            an,
            bn,
            allow_remove_inf=allow_remove_inf,
            allow_remove_nan=allow_remove_nan,
            rtol=rtol,
            atol=atol,
        )
def values_eq_approx_remove_inf(a, b):
    """Approximate equality, ignoring infinities."""
    return values_eq_approx(a, b, allow_remove_inf=True)
def values_eq_approx_remove_nan(a, b):
    """Approximate equality, ignoring NaNs."""
    return values_eq_approx(a, b, allow_remove_inf=False, allow_remove_nan=True)
def values_eq_approx_remove_inf_nan(a, b):
    """Approximate equality, ignoring both infinities and NaNs."""
    return values_eq_approx(a, b, allow_remove_inf=True, allow_remove_nan=True)
# This is to map ndarray-specific versions of these functions to the GPU.
EQ_MAP = {
    tensor_values_eq_approx: values_eq_approx,
    tensor_values_eq_approx_remove_inf: values_eq_approx_remove_inf,
    tensor_values_eq_approx_remove_nan: values_eq_approx_remove_nan,
    tensor_values_eq_approx_remove_inf_nan: values_eq_approx_remove_inf_nan,
}
# Add the reverse direction too.  The inverse mapping is built up-front as
# a dict comprehension (not a throwaway list of tuples) so EQ_MAP is never
# mutated while being iterated.
EQ_MAP.update({v: k for k, v in EQ_MAP.items()})
class _operators(_tensor_py_operators):
    """Mixin giving GPU variables the full tensor operator set."""

    def _as_GpuArrayVariable(self, context_name):
        # Already on the requested context: return self unchanged.
        if self.type.context_name == context_name:
            return self
        from .basic_ops import GpuToGpu

        return GpuToGpu(context_name)(self)
@at._as_tensor_variable.register(_operators)
def _as_tensor_operators(x, **kwargs):
    """Dispatch hook: convert a GPU variable to a host tensor variable."""
    from aesara.gpuarray.basic_ops import host_from_gpu

    return host_from_gpu(x)
class GpuArrayVariable(_operators, Variable):
    """
    A variable representing a computation on a certain GPU.

    This supports all the operations that :class:`TensorType` supports.

    See Also
    --------
    Variable

    """

    def __repr_test_value__(self):
        # Override the default: render the test value as a host ndarray.
        test_value = aesara.graph.op.get_test_value(self)
        return repr(np.array(test_value))
# Make GpuArrayType produce GpuArrayVariable instances.
GpuArrayType.variable_type = GpuArrayVariable
class GpuArraySignature(TensorConstantSignature):
    """Signature used to hash/compare GpuArrayConstant values."""

    # might do something better if we can run the sum on the GPU, but
    # for now this will suffice.
    pass
class GpuArrayConstant(_operators, Constant):
    """
    A constant representing a value on a certain GPU.

    This supports all the operations that :class:`TensorType` supports.

    See Also
    --------
    Constant

    """

    def signature(self):
        # Hash/compare through a host-side copy of the data.
        return GpuArraySignature((self.type, np.asarray(self.data)))

    def __str__(self):
        if self.name is not None:
            return self.name
        try:
            shown = np.asarray(self.data)
        except gpuarray.GpuArrayException:
            # The device data may be unreadable; fall back to repr-ish text.
            try:
                shown = str(self.data)
            except Exception:
                shown = "Unknown"
        return "GpuArrayConstant{%s}" % shown
# Make GpuArrayType produce GpuArrayConstant instances.
GpuArrayType.constant_type = GpuArrayConstant
class GpuArraySharedVariable(_operators, SharedVariable):
    """
    A variable representing a shared value on a certain GPU.

    This supports all the operations that :class:`TensorType` supports.

    See Also
    --------
    SharedVariable

    """

    def get_value(self, borrow=False, return_internal_type=False):
        # Host-side ndarray copy unless the caller asked for the GpuArray.
        if not return_internal_type:
            return np.asarray(self.container.value)
        if borrow:
            return self.container.value
        return self.container.value.copy()

    def set_value(self, value, borrow=False):
        if isinstance(value, pygpu.gpuarray.GpuArray):
            # Re-wrap on this variable's context, copying unless borrowed.
            value = pygpu.gpuarray.array(
                value, copy=(not borrow), context=self.type.context
            )
        self.container.value = value

    def __getitem__(self, *args):
        # Defer indexing to the shared operator mixin.
        return _operators.__getitem__(self, *args)
GpuArrayType.SharedVariable = GpuArraySharedVariable
# Sentinel distinguishing "target not passed" from an explicit None.
notset = object()
def gpuarray_shared_constructor(
    value,
    name=None,
    strict=False,
    allow_downcast=None,
    borrow=False,
    broadcastable=None,
    target=notset,
):
    """
    SharedVariable constructor for GpuArrayType.

    See :func:`aesara.shared`.

    Parameters
    ----------
    target : optional
        The device target.  As ``None`` is a valid value and we need to
        distinguish it from "parameter not passed", the default is the
        module-level ``notset`` sentinel.
    """
    if target == "cpu":
        raise TypeError("not for me")
    if not isinstance(value, (np.ndarray, pygpu.gpuarray.GpuArray)):
        raise TypeError("ndarray or GpuArray required")
    if target is notset:
        target = None
    if not gpu_supported(value):
        raise TypeError("The GPU do not support that value.")
    if not move_to_gpu(value):
        raise TypeError("We do not move that data by default to the GPU")
    try:
        get_context(target)
    except ContextNotDefined:
        # Don't make this a hard error if we attempt to make a shared
        # variable while there is no default context.
        if target is None:
            raise TypeError("No default context and no context specified")
        raise
    if broadcastable is None:
        broadcastable = (False,) * value.ndim
    # Renamed from `type`, which shadowed the builtin.
    gpua_type = GpuArrayType(value.dtype, broadcastable, context_name=target)
    deviceval = pygpu.gpuarray.array(
        value, copy=(not borrow), context=gpua_type.context
    )
    return GpuArraySharedVariable(
        type=gpua_type, value=deviceval, name=name, strict=strict
    )
aesara.compile.register_view_op_c_code(
GpuArrayType,
"""
Py_XDECREF(%(oname)s);
%(oname)s = %(iname)s;
Py_XINCREF(%(oname)s);
""",
version=(0,),
)
# Register GpuArrayType C code for Shape Op.
register_shape_c_code(
GpuArrayType,
"""
npy_intp shape[] = {%(iname)s->ga.nd};
if(%(oname)s == NULL || (PyArray_DIMS(%(oname)s)[0] != shape[0]))
{
Py_XDECREF(%(oname)s);
%(oname)s = (PyArrayObject*) PyArray_SimpleNew(1, shape, NPY_INT64);
}
for(int i=0;i<shape[0];i++)
{
((npy_int64*)PyArray_GETPTR1(%(oname)s, i))[0] = %(iname)s->ga.dimensions[i];
}
""",
version=1,
)
register_shape_i_c_code(
GpuArrayType,
"""
if(!%(oname)s)
%(oname)s=(PyArrayObject*)PyArray_ZEROS(0, NULL, NPY_INT64, 0);
((npy_int64*)PyArray_DATA(%(oname)s))[0] =
%(iname)s->ga.dimensions[%(i)s];
""",
"""
if (%(i)s>=%(iname)s->ga.nd){
PyErr_SetString(PyExc_TypeError,
"Number of dimensions lower than expected");
%(fail)s
}
""",
version=(1,),
)
aesara.compile.register_deep_copy_op_c_code(
GpuArrayType,
"""
Py_XDECREF(%(oname)s);
%(oname)s = pygpu_copy(%(iname)s, GA_ANY_ORDER);
if (!%(oname)s) { %(fail)s }
""",
version=(5,),
)
aesara.tensor.basic.register_rebroadcast_c_code(
GpuArrayType,
"""
if(%(iname)s->ga.dimensions[%(axis)s] != 1){
PyErr_Format(PyExc_ValueError,
"Dimension %(axis)s in Rebroadcast's input was"
" supposed to be 1 (got %%d instead)",
%(iname)s->ga.dimensions[%(axis)s]);
%(fail)s
}
""",
version=1,
)
register_specify_shape_c_code(
GpuArrayType,
"""
if (PyGpuArray_NDIM(%(iname)s) != PyArray_DIMS(%(shape)s)[0]) {
PyErr_Format(PyExc_AssertionError,
"SpecifyShape: vector of shape has %%d elements,"
" but the input has %%d dimensions.",
PyGpuArray_NDIM(%(iname)s),
PyArray_DIMS(%(shape)s)[0]);
%(fail)s;
}
for(int i = 0; i < PyGpuArray_NDIM(%(iname)s); i++){
dtype_%(shape)s shp = ((dtype_%(shape)s*)PyArray_GETPTR1(%(shape)s,
i))[0];
if (PyGpuArray_DIMS(%(iname)s)[i] != shp) {
PyErr_Format(PyExc_AssertionError,
"SpecifyShape: dim %%d of input has shape %%d,"
" expected %%d.",
i, PyGpuArray_DIMS(%(iname)s)[i],
shp);
%(fail)s;
}
}
Py_XDECREF(%(oname)s);
%(oname)s = %(iname)s;
Py_XINCREF(%(oname)s);
""",
version=1,
c_support_code_apply="#include <numpy_compat.h>",
)
class GpuContextType(CType):
    """
    Minimal type used for passing contexts to nodes.

    This Type is not a complete type and should never be used for
    regular graph operations.
    """

    def filter(self, data, strict=False, allow_downcast=None):
        """Accept only genuine GpuContext objects; no conversion."""
        if not isinstance(data, gpuarray.GpuContext):
            raise TypeError("context is not a GpuContext")
        return data

    def __eq__(self, other):
        # All instances of this type are interchangeable.
        return type(self) == type(other)

    def __hash__(self):
        return hash(type(self))

    @staticmethod
    def values_eq(a, b):
        return a == b

    def c_declare(self, name, sub, check_input=True):
        """Return C code declaring the context storage variable."""
        return f"PyGpuContextObject *{name};"

    def c_init(self, name, sub):
        return f"{name} = NULL;"

    def c_extract(self, name, sub, check_input=True, **kwargs):
        """Return C code extracting a context pointer, optionally
        type-checking the Python object first."""
        if check_input:
            res = """
        if (!PyObject_TypeCheck(py_%(name)s, &PyGpuContextType)) {
          PyErr_SetString(PyExc_TypeError, "expected a GpuContext");
          %(fail)s
        }
        """ % dict(
                name=name, fail=sub["fail"]
            )
        else:
            res = ""
        return (
            res
            + """
        %(name)s = (PyGpuContextObject *)py_%(name)s;
        Py_INCREF(%(name)s);
        """
            % dict(name=name)
        )

    def c_cleanup(self, name, sub):
        return f"Py_XDECREF({name}); {name} = NULL;"

    def c_sync(self, name, sub):
        # c_sync is intentionally not provided to prevent normal usage.
        raise NotImplementedError("Variables of this type cannot be graph outputs")

    def c_init_code(self, **kwargs):
        return ["import_pygpu__gpuarray();"]

    def c_headers(self, **kwargs):
        return ["<gpuarray_api.h>"]

    def c_header_dirs(self, **kwargs):
        return [pygpu.get_include()]

    def c_code_cache_version(self):
        # Key on the major API version of libgpuarray.
        ver = pygpu.gpuarray.api_version()
        return (0, ver[0])
# Variable, Constant, ... not declared

#: Instance of :class:`GpuContextType` to use for the ``context_type``
#: declaration of an operation.
# (This note used to be a free-floating string literal placed BEFORE the
# assignment, which is a runtime no-op and is not associated with the
# variable by documentation tools; ``#:`` comments above the assignment
# are the form Sphinx recognizes.)
gpu_context_type: GpuContextType = GpuContextType()
# THIS WORKS But GpuArray instances don't compare equal to one
# another, and what about __hash__ ? So the unpickled version doesn't
# equal the pickled version, and the cmodule cache is not happy with
# the situation. The old back-end have this same comment and use the
# same mechanism.
def GpuArray_unpickler(npa, ctx_name):
    """Rebuild a GpuArray from a host ndarray and a context name.

    When ``config.experimental__unpickle_gpu_on_cpu`` is set, the host
    ndarray is returned unchanged instead of being uploaded.
    """
    if config.experimental__unpickle_gpu_on_cpu:
        # directly return numpy array
        warnings.warn(
            "config.experimental__unpickle_gpu_on_cpu is set to True. "
            "Unpickling GpuArray as numpy.ndarray"
        )
        return npa
    if pygpu is None:
        raise ImportError("pygpu not found. Cannot unpickle GpuArray")
    return pygpu.gpuarray.array(npa, copy=True, context=get_context(ctx_name))
# Register the reconstructor so pickles may reference it.
copyreg.constructor(GpuArray_unpickler)
def GpuArray_pickler(cnda):
    """Reduce a GpuArray to ``(unpickler, (host ndarray, context name))``."""
    host_copy = np.asarray(cnda)
    ctx_name = _name_for_ctx(cnda.context)
    return (GpuArray_unpickler, (host_copy, ctx_name))
# Only register the GpuArray pickler when pygpu was actually imported.
if pygpu is not None:
    copyreg.pickle(pygpu.gpuarray.GpuArray, GpuArray_pickler, GpuArray_unpickler)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论