提交 cc584d6c authored 作者: Maxim Kochurov's avatar Maxim Kochurov 提交者: Brandon T. Willard

Remove GPU references in docstrings and comments

上级 a4ed0e85
......@@ -183,7 +183,7 @@ def rebuild_collect_shared(
(store_into, update_d[store_into]),
)
# filter_variable ensure smooth conversion of cpu/gpu Types
# filter_variable ensures smooth conversion of cpu Types
try:
update_val = store_into.type.filter_variable(
update_val, allow_convert=False
......
......@@ -1097,7 +1097,7 @@ class Function:
return [i.variable for i in self.maker.inputs if i.implicit]
def sync_shared(self):
# sync was needed on old gpu backend
# NOTE: sync was needed on old gpu backend
pass
......
......@@ -508,8 +508,8 @@ class ProfileStats:
tot += t
ftot = tot * 100 / local_time
# Remove the useless start and end of the class name:
# "<class 'aesara.backend.blas.GpuDot22'>" ->
# "aesara.backend.blas.GpuDot22"
# "<class 'aesara.backend.blas.Dot22'>" ->
# "aesara.backend.blas.Dot22"
class_name = str(a)[8:-2][:maxlen]
print(
format_str
......@@ -887,6 +887,7 @@ class ProfileStats:
new allocation.
"""
# TODO: GPU is not supported for now, needs to be refactored later
# Initial Mem info values [CPU, GPU]
node_memory_size = [0, 0]
running_memory_size = [0, 0]
......@@ -1241,6 +1242,7 @@ class ProfileStats:
max_running_max_memory_size[0], sum(running_memory[2])
)
# NOTE: we do not have GPU right now, this has to be reconsidered later
# Separate CPU and GPU
max_node_memory_size[1] = max(
max_node_memory_size[1], running_memory[0][0]
......@@ -1624,12 +1626,6 @@ class ProfileStats:
"experimental, but seems to work correctly.",
file=file,
)
if config.device.startswith("gpu"):
print(
" - MRG_RandomStream is the only random number"
" generator supported on the GPU.",
file=file,
)
break
# tip 6
......
......@@ -120,28 +120,6 @@ class SharedVariable(Variable):
Changes to this value will be visible to all functions using
this SharedVariable.
Notes
-----
Set_value will work in-place on the GPU, if
the following conditions are met:
* The destination on the GPU must be c_contiguous.
* The source is on the CPU.
* The old value must have the same dtype as the new value
(which is a given for now, since only float32 is
supported).
* The old and new value must have the same shape.
* The old value is being completely replaced by the new
value (not partially modified, e.g. by replacing some
subtensor of it).
It is also worth mentioning that, for efficient transfer to the GPU,
Aesara will make the new data ``c_contiguous``. This can require an
extra copy of the data on the host.
The inplace on gpu memory work when borrow is either True or False.
"""
if borrow:
self.container.value = new_value
......
......@@ -50,9 +50,7 @@ def d3viz(fct, outfile, copy_deps=True, *args, **kwargs):
edited by selecting Edit from the context menu.
Input nodes are colored in green, output nodes in blue. Apply nodes are
ellipses, and colored depending on the type of operation they perform. Red
ellipses are transfers from/to the GPU (ops with names GpuFromHost,
HostFromGpu).
ellipses, and colored depending on the type of operation they perform.
Edges are black by default. If a node returns a view of an
input, the input edge will be blue. If it returns a destroyed input, the
......
......@@ -52,8 +52,6 @@ class PyDotFormatter:
"unused": "lightgrey",
}
self.apply_colors = {
"GpuFromHost": "red",
"HostFromGpu": "red",
"Scan": "yellow",
"Shape": "cyan",
"IfElse": "magenta",
......
......@@ -237,7 +237,7 @@ def Rop(f, wrt, eval_points, disconnected_outputs="raise", return_disconnected="
)
except AttributeError:
# wrt_elem and eval_point don't always have ndim like random type
# Tensor, Sparse and GpuArray have the ndim attribute
# Tensor and Sparse have the ndim attribute
pass
seen_nodes = OrderedDict()
......
......@@ -107,14 +107,13 @@ class Type(MetaObject):
This method allows one to reuse old allocated memory. If this method
is implemented, it will be called instead of `Type.filter`.
As of now, this method is only used when we transfer new data to a
shared variable on a GPU.
As of now, this method is not implemented; it was previously used to transfer memory to and from the GPU.
Parameters
----------
value: array-like
storage: array-like
The old value (e.g. the old NumPy array, CudaNdarray, etc.)
The old value (e.g. the old NumPy array)
strict: bool
allow_downcast: bool (optional)
......
......@@ -1189,7 +1189,7 @@ class ModuleCache:
# 2) If other repo that import Aesara have Aesara ops defined,
# we need to refresh the cache here. Otherwise, there are import
# order problems.
# When device=gpu, we compile during Aesara
# (Outdated) When device=gpu, we compile during Aesara
# import. This triggers the loading of the cache. But
# unpickling the cache asks that the external Ops are
# completely loaded, which isn't always the case!
......
......@@ -67,10 +67,8 @@ def execute(execute=True, verbose=True, M=2000, N=2000, K=2000, iters=10, order=
impl = "CPU (with direct Aesara binding to blas)"
else:
impl = "CPU (without direct Aesara binding to blas but with numpy/scipy binding to blas)"
elif any(x.op.__class__.__name__ == "GpuGemm" for x in f.maker.fgraph.toposort()):
impl = "GPU"
else:
impl = "ERROR, unable to tell if Aesara used the cpu or the gpu:\n"
impl = "ERROR, unable to tell if Aesara used the cpu:\n"
impl += str(f.maker.fgraph.toposort())
t0 = 0
......@@ -78,7 +76,7 @@ def execute(execute=True, verbose=True, M=2000, N=2000, K=2000, iters=10, order=
f() # Ignore first function call to get representative time.
if execute:
# sync was needed for gpu
# NOTE: sync was needed for gpu
sync = False
if sync:
......
......@@ -1014,8 +1014,6 @@ Print to the terminal a math-like expression.
# colors not used: orange, amber#FFBF00, purple, pink,
# used by default: green, blue, grey, red
default_colorCodes = {
"GpuFromHost": "red",
"HostFromGpu": "red",
"Scan": "yellow",
"Shape": "brown",
"IfElse": "magenta",
......
......@@ -19,8 +19,7 @@ from aesara.tensor.type import zmatrix
message = (
"The module aesara.sandbox.fourier will soon be deprecated."
" Please use aesara.tensor.fft, which supports gradients and "
"automatic optimization transfers to the GPU ops."
" Please use aesara.tensor.fft, which supports gradients."
)
warnings.warn(message)
......
......@@ -394,13 +394,13 @@ class mrg_uniform(COp, mrg_uniform_base):
for s in size:
n_elements *= s
if n_elements > M1:
# The limit is on the C and GPU code. This perform don't
# The limit is on the C code. This `perform` method does not
# have this limit. But to have all of them behave the
# same (and have DebugMode don't use too much memory for
# some rng_mrg tests) I also add this limit here.
raise ValueError("rng_mrg does not support more then (2**31 -1) samples")
rstate = np.asarray(rstate) # bring state from GPU if necessary
rstate = np.asarray(rstate) # make sure the state is a NumPy ndarray
if not self.inplace:
rstate = rstate.copy()
......@@ -527,8 +527,7 @@ class mrg_uniform(COp, mrg_uniform_base):
def c_code(self, node, name, inp, out, sub):
# If we try to use the C code here with something else than a
# TensorType, something is wrong (likely one of the GPU ops
# not defining C code correctly).
# TensorType, something is wrong.
assert isinstance(node.inputs[0].type, TensorType)
if self.output_type.dtype == "float16":
# C code is not tested, fall back to Python
......
......@@ -26,8 +26,6 @@ of using ``scan`` over `for` loops in python (among others) are:
* it allows computing gradients through the for loop
* there exist a bunch of optimizations that help re-write your loop
such that less memory is used and that it runs faster
* it ensures that data is not copied from host to gpu and gpu to
host at each step
The Scan Op should typically be used by calling any of the following
functions: ``scan()``, ``map()``, ``reduce()``, ``foldl()``,
......
......@@ -277,10 +277,6 @@ def scan(
allocations are freed at the end of all iterations; this is what the
flag `aesara.config.allow_gc` means.
If you use pre-allocation and this `Scan` is on GPU, the speed up from
`allow_gc` is small. If you are missing memory, disabling `allow_gc`
could help you run graph that request much memory.
strict
If ``True``, all the shared variables used in `fn` must be provided as a
part of `non_sequences` or `sequences`.
......
......@@ -1714,7 +1714,9 @@ class Scan(Op, ScanMethodsMixin, HasInnerGraph):
elif isinstance(self.fn.maker.fgraph.outputs[idx], TensorVariable):
old_inner_output_data[idx] = var.data
else:
raise RuntimeError("old_inner_output_data[idx] = var.gpudata")
raise RuntimeError(
"FIXME: old_inner_output_data[idx] = var.gpudata"
)
# 4.6. Keep a reference to the variables (ndarrays,
# etc) associated with mitmot inputs currently in the
......@@ -1849,7 +1851,7 @@ class Scan(Op, ScanMethodsMixin, HasInnerGraph):
output_reused = new_var.data == old_data
else:
raise RuntimeError(
"output_reused = new_var.gpudata == old_data"
"FIXME: output_reused = new_var.gpudata == old_data"
)
else:
output_reused = False
......@@ -1915,7 +1917,7 @@ class Scan(Op, ScanMethodsMixin, HasInnerGraph):
output_reused = new_var.data == old_data
else:
raise RuntimeError(
"output_reused = new_var.gpudata == old_data"
"FIXME: output_reused = new_var.gpudata == old_data"
)
else:
output_reused = False
......
......@@ -1649,11 +1649,6 @@ class Alloc(COp):
)
):
return False
# If the clients is a transfer to the GPU, we don't want to
# fold. We let the Alloc being moved to the GPU, then we
# let the GPU algo decide if it need to fold it or not.
elif client[0].op.__class__.__name__.lower().startswith("gpu"):
return False
return True
......@@ -2215,8 +2210,7 @@ def addbroadcast(x, *axes):
x broadcastable. When performing the function, if the length of
x along that dimension is not 1, a ValueError will be raised.
We apply the opt here not to pollute the graph especially during
the gpu optimization
We apply the opt here so as not to pollute the graph.
Parameters
----------
......@@ -2252,8 +2246,7 @@ def unbroadcast(x, *axes):
of x broadcastable. When performing the function, if the length
of x along that dimension is not 1, a ValueError will be raised.
We apply the opt here not to pollute the graph especially during
the gpu optimization
We apply the opt here so as not to pollute the graph.
Parameters
----------
......
......@@ -169,7 +169,7 @@ def broadcast_like(value, template, fgraph, dtype=None):
class InplaceElemwiseOptimizer(GlobalOptimizer):
r"""
This is parameterized so that it works for `Elemwise` and `GpuElemwise` `Op`\s.
This is parameterized so that it works for `Elemwise` `Op`\s.
"""
def __init__(self, OP):
......@@ -1343,8 +1343,7 @@ class ShapeFeature(features.Feature):
if repl.owner is shpnode:
# This mean the replacement shape object is
# exactly the same as the current shape object. So
# no need for replacement. This happen for example
# with the InputToGpuOptimizer optimizer.
# no need for replacement.
continue
if (
repl.owner
......@@ -1841,9 +1840,7 @@ def local_alloc_empty_to_zeros(fgraph, node):
This help investigate NaN with NanGuardMode. Not registered by
default. To activate it, use the Aesara flag
optimizer_including=alloc_empty_to_zeros. This also enable
the GPU version of this optimizations.
optimizer_including=alloc_empty_to_zeros.
"""
if isinstance(node.op, AllocEmpty):
return [zeros(node.inputs, dtype=node.outputs[0].dtype)]
......@@ -3000,19 +2997,15 @@ def local_elemwise_fusion_op(op_class, max_input_fct=lambda node: 32, maker=None
and each `Elemwise`'s scalar `Op`, and use the composite scalar `Op` in a
new "fused" `Elemwise`.
It's parameterized in order to work for `Elemwise` and `GpuElemwise` `Op`\s.
It's parameterized in order to work for `Elemwise` `Op`\s.
Parameters
----------
op_class : type
`GpuElemwise` or `Elemwise` class (the one that we want to fuse)
`Elemwise` class (the one that we want to fuse)
max_input_fct : callable
A function that returns the maximum number of inputs that this `Elemwise`
can take (useful for `GpuElemwise`). The GPU kernel currently has a
limit of 256 bytes for the size of all parameters passed to it. As
currently we pass a lot of information only by parameter, we must limit how
many `Op`\s we fuse together to avoid busting that 256 limit.
can take.
On the CPU we limit to 32 input variables since that is the maximum
NumPy support.
......
......@@ -1192,7 +1192,7 @@ def to_one_hot(y, nb_class, dtype=None):
class Unique(Op):
"""
Wraps `numpy.unique`. This `Op` is not implemented on the GPU.
Wraps `numpy.unique`.
Examples
--------
......
......@@ -22,8 +22,7 @@ from aesara.tensor.var import TensorConstant
class Fourier(Op):
"""
WARNING: for officially supported FFTs, use aesara.tensor.fft, which
provides real-input FFTs. Gradients are supported, as well as optimization
transfers to GPU ops.
provides real-input FFTs. Gradients are supported.
An instance of this class returns a finite fourier transform calculated
along one dimension of an input array.
......
......@@ -1550,11 +1550,6 @@ def mean(input, axis=None, dtype=None, op=False, keepdims=False, acc_dtype=None)
necessarily be the dtype of the output (in particular
if it is a discrete (int/uint) dtype, the output will
be in a float type). If None, then we use the same rules as `sum()`.
Notes
-----
For gpu, if you specify dtype=float32, everything will be done on the gpu.
"""
input = as_tensor_variable(input)
if op:
......
......@@ -1673,8 +1673,7 @@ def local_reduce_broadcastable(fgraph, node):
axis = list(node.op.axis)
cuttable = [a for a in axis if reduced.broadcastable[a]]
if cuttable:
# -- we can remove some axes of summation,
# which simplifies the codegen for sum, especially on GPU
# -- we can remove some axes of summation.
new_axis = []
pattern = []
ii = 0
......@@ -1857,10 +1856,6 @@ def local_pow_canonicalize(fgraph, node):
def local_mul_to_sqr(fgraph, node):
"""
x*x -> sqr(x)
This is faster on the GPU when memory fetching is a big part of
the computation time.
"""
if node.op == mul:
if len(node.inputs) == 2:
......
......@@ -193,11 +193,6 @@ def matrix_dot(*args):
def trace(X):
"""
Returns the sum of diagonal elements of matrix X.
Notes
-----
Works on GPU since 0.6rc4.
"""
return extract_diag(X).sum()
......@@ -729,7 +724,6 @@ class TensorInv(Op):
def tensorinv(a, ind=2):
"""
Does not run on GPU;
Aesara utilization of numpy.linalg.tensorinv;
Compute the 'inverse' of an N-dimensional array.
......@@ -791,7 +785,7 @@ class TensorSolve(Op):
def tensorsolve(a, b, axes=None):
"""
Aesara utilization of numpy.linalg.tensorsolve. Does not run on GPU!
Aesara utilization of numpy.linalg.tensorsolve.
Solve the tensor equation ``a x = b`` for x.
It is assumed that all indices of `x` are summed over in the product,
......
......@@ -1048,16 +1048,6 @@ def conv3d(
Set of feature maps generated by convolutional layer. Tensor is
is of shape (batch size, output channels, output depth,
output rows, output columns)
Notes
-----
If cuDNN is available, it will be used on the
GPU. Otherwise, it is the *Corr3dMM* convolution that will be used
"caffe style convolution".
This is only supported in Aesara 0.8 or the development
version until it is released.
"""
input = as_tensor_variable(input)
filters = as_tensor_variable(filters)
......@@ -1184,17 +1174,6 @@ def conv2d_grad_wrt_inputs(
set of feature maps generated by convolutional layer. Tensor
is of shape (batch size, output channels, output rows, output
columns)
Notes
-----
:note: If cuDNN is available, it will be used on the
GPU. Otherwise, it is the *CorrMM* convolution that will be used
"caffe style convolution".
:note: This is only supported in Aesara 0.8 or the development
version until it is released.
"""
filters = as_tensor_variable(filters)
......@@ -1347,17 +1326,6 @@ def conv3d_grad_wrt_inputs(
set of feature maps generated by convolutional layer. Tensor
is of shape (batch size, output channels, output depth,
output rows, output columns)
Notes
-----
:note: If cuDNN is available, it will be used on the
GPU. Otherwise, it is the *Corr3dMM* convolution that will be used
"caffe style convolution".
:note: This is only supported in Aesara 0.8 or the development
version until it is released.
"""
filters = as_tensor_variable(filters)
......@@ -1500,17 +1468,6 @@ def conv2d_grad_wrt_weights(
columns) for normal convolution and
(output channels, output rows, output columns, input channels,
filter rows, filter columns) for unshared convolution
Notes
-----
:note: If cuDNN is available, it will be used on the
GPU. Otherwise, it is the *CorrMM* convolution that will be used
"caffe style convolution".
:note: This is only supported in Aesara 0.8 or the development
version until it is released.
"""
input = as_tensor_variable(input)
......@@ -1644,17 +1601,6 @@ def conv3d_grad_wrt_weights(
set of feature maps generated by convolutional layer. Tensor
is of shape (batch size, output channels, output time, output
rows, output columns)
Notes
-----
:note: If cuDNN is available, it will be used on the
GPU. Otherwise, it is the *Corr3dMM* convolution that will be used
"caffe style convolution".
:note: This is only supported in Aesara 0.8 or the development
version until it is released.
"""
input = as_tensor_variable(input)
......@@ -3685,19 +3631,6 @@ def conv2d(
Symbolic 4D tensor
Set of feature maps generated by convolutional layer. Tensor is
of shape (batch size, output channels, output rows, output columns)
Notes
-----
If cuDNN is available, it will be used on the
GPU. Otherwise, it is the *CorrMM* convolution that will be used
"caffe style convolution".
This is only supported in Aesara 0.8 or the development
version until it is released.
The parameter filter_dilation is an implementation of `dilated
convolution <https://arxiv.org/pdf/1511.07122v3.pdf>`_.
"""
if "imshp_logical" in kwargs or "kshp_logical" in kwargs:
......@@ -3822,18 +3755,6 @@ def conv2d_transpose(
Symbolic 4D tensor
Set of feature maps generated by the transposed convolution. Tensor is
of shape (batch size, output channels, output rows, output columns)
Notes
-----
If cuDNN is available, it will be used on the
GPU. Otherwise, it is the *CorrMM* convolution that will be used
"caffe style convolution".
This operation is also sometimes called "deconvolution".
The parameter filter_dilation is an implementation of `dilated
convolution <https://arxiv.org/pdf/1511.07122v3.pdf>`_.
"""
return conv2d_grad_wrt_inputs(
......
......@@ -4,13 +4,6 @@ Provides neural-network specific Ops.
Notes
-----
TODO: factor this out into a neural-network toolbox.
We register all optimization with the gpu tag as we don't
implement all the intermediate case on the GPU (in particular
AdvancedSubtensor). So to make sure it run well on the gpu with
fast_compile, we register them as needed for the GPU. This can be
revisited later when all the intermediate part are on the GPU.
"""
import warnings
......
......@@ -47,7 +47,6 @@ def batch_normalization(inputs, gamma, beta, mean, std, mode="low_mem"):
"""
This function will build the symbolic graph for applying batch normalization
to a set of activations.
Also works on GPUs, but is not optimized using cuDNN.
.. versionadded:: 0.7.1
......
......@@ -44,9 +44,6 @@ class DiagonalSubtensor(Op):
i1
Axis index in x
Notes
-----
Work on the GPU.
Extended summary
----------------
......@@ -204,8 +201,6 @@ def conv3d(
Another way to define signals: (batch, time, in channel, row, column)
Another way to define filters: (out channel,time,in channel, row, column)
For the GPU, use nnet.conv3d.
See Also
--------
Someone made a script that shows how to swap the axes between
......
......@@ -194,7 +194,7 @@ class Images2Neibs(COp):
def perform(self, node, inp, out_, params):
ten4, neib_shape, neib_step = inp
(z,) = out_
# GpuImages2Neibs should not run this perform in DebugMode
# XXX: GpuImages2Neibs should not run this perform in DebugMode
if not isinstance(self, Images2Neibs):
raise aesara.graph.utils.MethodNotDefined()
......
......@@ -591,9 +591,7 @@ def local_abstractconv_check(fgraph, node):
):
raise LocalMetaOptimizerSkipAssertionError(
f"{node.op.__class__.__name__} Aesara optimization failed: there is no implementation "
"available supporting the requested options. Did you exclude "
'both "conv_dnn" and "conv_gemm" from the optimizer? If on GPU, '
"is cuDNN available and does the GPU support it? If on CPU, "
"available supporting the requested options. If on CPU, "
"do you have a BLAS library installed Aesara can link against? "
"On the CPU we do not support float16."
)
......
......@@ -146,12 +146,7 @@ def pool_2d(
"pool_2d() will have the parameter ignore_border"
" default value changed to True (currently"
" False). To have consistent behavior with all Aesara"
" version, explicitly add the parameter ignore_border=True."
" On the GPU, using ignore_border=True is needed to use cuDNN."
" When using ignore_border=False and not using cuDNN, the only"
" GPU combination supported is when"
" `ws == stride and pad == (0, 0) and mode == 'max'`."
" Otherwise, the convolution will be executed on CPU.",
" version, explicitly add the parameter ignore_border=True.",
category=DeprecationWarning,
stacklevel=2,
)
......@@ -267,12 +262,7 @@ def pool_3d(
"pool_3d() will have the parameter ignore_border"
" default value changed to True (currently"
" False). To have consistent behavior with all Aesara"
" version, explicitly add the parameter ignore_border=True."
" On the GPU, using ignore_border=True is needed to use cuDNN."
" When using ignore_border=False and not using cuDNN, the only"
" GPU combination supported is when"
" `ws == stride and pad == (0, 0, 0) and mode == 'max'`."
" Otherwise, the convolution will be executed on CPU.",
" version, explicitly add the parameter ignore_border=True.",
category=DeprecationWarning,
stacklevel=2,
)
......
......@@ -429,8 +429,6 @@ def solve_triangular(
class Solve(SolveBase):
"""
Solve a system of linear equations.
For on CPU and GPU.
"""
__props__ = (
......
......@@ -328,12 +328,10 @@ class TopKOp(Op):
Notes
-----
- CPU and GPU ops don't produce same output order. This is expected.
- The output order is not guaranteed. On the CPU, we use
``np.partition`` and ``np.argpartition`` that only make sure the
k-th element is the correct one and that the other
elements are on the correct side. On the GPU, they
look sorted, but we do not test the correctness of this behavior.
elements are on the correct side.
- By default, this Op gives two outputs: values and indices. However
optimizers may remove a certain output if not needed.
- Computing the gradient requests the computation of the indices in
......
......@@ -50,15 +50,10 @@ class OrderedUpdates(OrderedDict):
# TODO: consider doing error-checking on value.
# insist that it is an Aesara variable? Have the right type?
# This could have weird consequences - for example a
# GPU SharedVariable is customarily associated with a TensorType
# value. Should it be cast to a GPU value right away? Should
# literals be transformed into constants immediately?
return super().__setitem__(key, value)
else:
raise TypeError(
"OrderedUpdates keys must inherit from " "SharedVariable", key
)
raise TypeError("OrderedUpdates keys must inherit from SharedVariable", key)
def update(self, other=None):
if other is None:
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论