提交 0e3182d1 authored 作者: Maxim Kochurov's avatar Maxim Kochurov 提交者: Brandon T. Willard

Remove gpuarray dependencies throughout the codebase

上级 2a5fc594
......@@ -17,7 +17,6 @@ repos:
aesara/compile/nanguardmode\.py|
aesara/graph/opt\.py|
aesara/tensor/var\.py|
aesara/gpuarray/opt\.py
)$
- id: check-merge-conflict
- repo: https://github.com/psf/black
......
Aesara is a Python library that allows you to define, optimize, and efficiently evaluate mathematical expressions involving multi-dimensional arrays. It is built on top of NumPy_. Aesara features:
* **tight integration with NumPy:** a similar interface to NumPy's. numpy.ndarrays are also used internally in Aesara-compiled functions.
* **transparent use of a GPU:** perform data-intensive computations up to 140x faster than on a CPU (support for float32 only).
* **efficient symbolic differentiation:** Aesara can compute derivatives for functions of one or many inputs.
* **speed and stability optimizations:** avoid nasty bugs when computing expressions such as log(1 + exp(x)) for large values of x.
* **dynamic C code generation:** evaluate expressions faster.
......
......@@ -144,16 +144,6 @@ from aesara.updates import OrderedUpdates
# isort: on
if (
config.device.startswith("cuda")
or config.device.startswith("opencl")
or config.init_gpu_device.startswith("cuda")
or config.init_gpu_device.startswith("opencl")
or config.contexts != ""
):
import aesara.gpuarray
def get_scalar_constant_value(v):
"""Return the constant scalar (i.e. 0-D) value underlying variable `v`.
......
......@@ -752,16 +752,6 @@ def _get_preallocated_maps(
Preallocate outputs in different memory layouts.
"""
# To avoid circular imports
from aesara.gpuarray import GpuArrayType
from aesara.tensor.type import TensorType
try:
import pygpu
except ImportError:
pass
# TODO: Sparse? Scalar does not really make sense.
# Do not preallocate memory for outputs that actually work inplace
......@@ -795,11 +785,12 @@ def _get_preallocated_maps(
# I'm not sure why it is legitimate, but there are tests about it.
# So, we cannot fill r_vals[r] with def_val yet, we have to wait
# until all output values are deepcopied.
from aesara.tensor import TensorType
for r in considered_outputs:
# There is no risk to overwrite inputs, since r does not work
# inplace.
if isinstance(r.type, (TensorType, GpuArrayType)):
if isinstance(r.type, TensorType):
reuse_outputs[r][...] = np.asarray(def_val).astype(r.type.dtype)
if reuse_outputs:
......@@ -812,7 +803,7 @@ def _get_preallocated_maps(
if "c_contiguous" in prealloc_modes or "ALL" in prealloc_modes:
c_cont_outputs = {}
for r in considered_outputs:
if isinstance(r.type, (TensorType, GpuArrayType)):
if isinstance(r.type, TensorType):
# Build a C-contiguous buffer
new_buf = r.type.value_zeros(r_vals[r].shape)
assert new_buf.flags["C_CONTIGUOUS"]
......@@ -829,13 +820,11 @@ def _get_preallocated_maps(
if "f_contiguous" in prealloc_modes or "ALL" in prealloc_modes:
f_cont_outputs = {}
for r in considered_outputs:
if isinstance(r.type, (TensorType, GpuArrayType)):
if isinstance(r.type, TensorType):
new_buf = np.zeros(
shape=r_vals[r].shape, dtype=r_vals[r].dtype, order="F"
)
new_buf[...] = def_val
if isinstance(r.type, GpuArrayType):
new_buf = pygpu.array(new_buf)
f_cont_outputs[r] = new_buf
......@@ -859,7 +848,7 @@ def _get_preallocated_maps(
max_ndim = 0
rev_out_broadcastable = []
for r in considered_outputs:
if isinstance(r.type, (TensorType, GpuArrayType)):
if isinstance(r.type, TensorType):
if max_ndim < r.ndim:
rev_out_broadcastable += [True] * (r.ndim - max_ndim)
max_ndim = r.ndim
......@@ -874,7 +863,7 @@ def _get_preallocated_maps(
# Initial allocation
init_strided = {}
for r in considered_outputs:
if isinstance(r.type, (TensorType, GpuArrayType)):
if isinstance(r.type, TensorType):
# Create a buffer twice as large in every dimension,
# except if broadcastable, or for dimensions above
# config.DebugMode__check_preallocated_output_ndim
......@@ -953,7 +942,7 @@ def _get_preallocated_maps(
name = f"wrong_size{tuple(shape_diff)}"
for r in considered_outputs:
if isinstance(r.type, (TensorType, GpuArrayType)):
if isinstance(r.type, TensorType):
r_shape_diff = shape_diff[: r.ndim]
out_shape = [
max((s + sd), 0)
......
......@@ -1097,13 +1097,8 @@ class Function:
return [i.variable for i in self.maker.inputs if i.implicit]
def sync_shared(self):
if hasattr(aesara, "gpuarray") and aesara.gpuarray.pygpu_activated:
import pygpu
for i in self.maker.fgraph.update_mapping.values():
inp = self.input_storage[i]
if isinstance(inp.data, pygpu.gpuarray.GpuArray):
inp.data.sync()
# sync was needed on old gpu backend
pass
# pickling/deepcopy support for Function
......
......@@ -5,24 +5,11 @@ from io import StringIO
import numpy as np
import aesara
from aesara.compile.mode import Mode, get_mode
from aesara.compile.mode import Mode
from aesara.configdefaults import config
from aesara.tensor.math import abs as at_abs
from aesara.tensor.math import max as at_max
from aesara.tensor.math import min as at_min
from aesara.tensor.type import discrete_dtypes
try:
from pygpu.gpuarray import GpuArray
from aesara.gpuarray.type import GpuArrayType, _name_for_ctx
pygpu_available = True
except ImportError:
pygpu_available = False
logger = logging.getLogger("aesara.compile.nanguardmode")
......@@ -114,9 +101,6 @@ def contains_nan(arr, node=None, var=None):
return False
elif getattr(arr, "dtype", "") in discrete_dtypes:
return False
elif pygpu_available and isinstance(arr, GpuArray):
return np.isnan(f_gpua_min(arr.reshape(arr.size)))
return np.isnan(np.min(arr))
......@@ -149,36 +133,9 @@ def contains_inf(arr, node=None, var=None):
return False
elif getattr(arr, "dtype", "") in discrete_dtypes:
return False
elif pygpu_available and isinstance(arr, GpuArray):
return np.isinf(f_gpua_min(arr.reshape(arr.size))) or np.isinf(
f_gpua_max(arr.reshape(arr.size))
)
return np.isinf(np.nanmax(arr)) or np.isinf(np.nanmin(arr))
def f_compute(op):
def result(inp):
dtype = inp.dtype
ctx_name = _name_for_ctx(inp.context)
key = (dtype, ctx_name)
f = result.cache.get(key, None)
if f is None:
guard_in = GpuArrayType(str(dtype), (False,), context_name=ctx_name)()
mode = get_mode("FAST_RUN").including("gpuarray")
f = aesara.function([guard_in], op(guard_in), mode=mode, profile=False)
result.cache[key] = f
return f(inp)
result.cache = dict()
return result
f_gpua_min = f_compute(at_min)
f_gpua_max = f_compute(at_max)
f_gpua_absmax = f_compute(lambda x: at_max(at_abs(x)))
class NanGuardMode(Mode):
"""
An Aesara compilation Mode that makes the compiled function automatically
......@@ -252,8 +209,6 @@ class NanGuardMode(Mode):
err = False
if not _is_numeric_value(value, var):
err = False
elif pygpu_available and isinstance(value, GpuArray):
err = f_gpua_absmax(value.reshape(value.size)) > 1e10
else:
err = np.abs(value).max() > 1e10
if err:
......
......@@ -12,10 +12,8 @@ import atexit
import copy
import logging
import operator
import os
import sys
import time
import warnings
from collections import defaultdict
from typing import Dict, List
......@@ -279,40 +277,7 @@ class ProfileStats:
# param is called flag_time_thunks because most other attributes with time
# in the name are times *of* something, rather than configuration flags.
def __init__(
self, atexit_print=True, flag_time_thunks=None, gpu_checks=True, **kwargs
):
if (
gpu_checks
and (hasattr(aesara, "gpuarray") and aesara.gpuarray.pygpu_activated)
and os.environ.get("CUDA_LAUNCH_BLOCKING", "0") != "1"
):
msg = (
"You are running the Aesara profiler with CUDA enabled."
" Aesara GPU ops execution is asynchronous by default."
" So by default, the profile is useless."
" You must set the environment variable"
" CUDA_LAUNCH_BLOCKING to 1 to tell the CUDA driver to"
" synchronize the execution to get a meaningful profile."
)
if config.profile:
raise Exception(msg)
else:
warnings.warn(msg)
if (
config.profile
and gpu_checks
and hasattr(aesara, "gpuarray")
and aesara.gpuarray.pygpu_activated
and not config.profiling__ignore_first_call
):
warnings.warn(
"Aesara flag profiling__ignore_first_call is False. "
"This cause bad profiling result in the gpu "
"back-end, as sometimes we compile at the first call."
)
def __init__(self, atexit_print=True, flag_time_thunks=None, **kwargs):
self.apply_callcount = {}
self.output_size = {}
# Keys are `(FunctionGraph, Variable)`
......@@ -543,8 +508,8 @@ class ProfileStats:
tot += t
ftot = tot * 100 / local_time
# Remove the useless start and end of the class name:
# "<class 'aesara.gpuarray.blas.GpuDot22'>" ->
# "aesara.gpuarray.blas.GpuDot22"
# "<class 'aesara.backend.blas.GpuDot22'>" ->
# "aesara.backend.blas.GpuDot22"
class_name = str(a)[8:-2][:maxlen]
print(
format_str
......@@ -922,8 +887,6 @@ class ProfileStats:
new allocation.
"""
from aesara.gpuarray import GpuArrayType
# Initial Mem info values [CPU, GPU]
node_memory_size = [0, 0]
running_memory_size = [0, 0]
......@@ -973,10 +936,8 @@ class ProfileStats:
# allocated by the node
idx2 = 0
for out in node.outputs:
if isinstance(out.type, GpuArrayType):
cg = 1
else:
cg = 0
# NOTE: cg=1 was used for GPU
cg = 0
ins = None
if dmap and idx2 in dmap:
vidx = dmap[idx2]
......@@ -1021,10 +982,8 @@ class ProfileStats:
for ins in set(node.inputs):
assert not (ins in view_of and viewed_by[ins])
# we track the original variable, so this shouldn't happen
if isinstance(ins.type, GpuArrayType):
cg = 1
else:
cg = 0
# NOTE: cg=1 was used for GPU
cg = 0
if (
dependencies[ins]
and ins not in fgraph.outputs
......@@ -1687,27 +1646,7 @@ class ProfileStats:
)
printed_tip = True
# tip 7
import aesara.gpuarray
import aesara.tensor.signal.pool as pool
from aesara.tensor.nnet.basic import LogSoftmax
for (fgraph, a) in self.apply_time:
node = a
if isinstance(node.op, pool.Pool):
if not aesara.gpuarray.dnn.dnn_present():
print(
"Install CuDNN to do pooling faster"
"this allows the operation to run on GPU"
)
printed_tip = True
if isinstance(node.op, LogSoftmax):
if not aesara.gpuarray.dnn.dnn_present():
print(
"Install CuDNN to do LogSoftmax faster"
"this allows the operation to run on GPU"
)
printed_tip = True
# tip 7 was about pool and log softmax on gpu using cudnn
if not printed_tip:
print(" Sorry, no tip for today.", file=file)
......
......@@ -292,9 +292,7 @@ def add_basic_configvars():
config.add(
"warn_float64",
"Do an action when a tensor variable with float64 dtype is"
" created. They can't be run on the GPU with the current(old)"
" gpu back-end and are slow with gamer GPUs.",
"Do an action when a tensor variable with float64 dtype is created.",
EnumStr("ignore", ["warn", "raise", "pdb"]),
in_c_key=False,
)
......@@ -326,10 +324,7 @@ def add_basic_configvars():
config.add(
"deterministic",
"If `more`, sometimes we will select some implementation that "
"are more deterministic, but slower. In particular, on the GPU, "
"we will avoid using AtomicAdd. Sometimes we will still use "
"non-deterministic implementation, e.g. when we do not have a GPU "
"implementation that is deterministic. Also see "
"are more deterministic, but slower. Also see "
"the dnn.conv.algo* flags to cover more cases.",
EnumStr("default", ["more"]),
in_c_key=False,
......@@ -405,56 +400,56 @@ def add_basic_configvars():
in_c_key=False,
)
config.add(
"gpuarray__preallocate",
"""If negative it disables the allocation cache. If
between 0 and 1 it enables the allocation cache and
preallocates that fraction of the total GPU memory. If 1
or greater it will preallocate that amount of memory (in
megabytes).""",
FloatParam(0, mutable=False),
in_c_key=False,
)
config.add(
"gpuarray__sched",
"""The sched parameter passed for context creation to pygpu.
With CUDA, using "multi" is equivalent to using the parameter
cudaDeviceScheduleBlockingSync. This is useful to lower the
CPU overhead when waiting for GPU. One user found that it
speeds up his other processes that was doing data augmentation.
""",
EnumStr("default", ["multi", "single"]),
)
config.add(
"gpuarray__single_stream",
"""
If your computations are mostly lots of small elements,
using single-stream will avoid the synchronization
overhead and usually be faster. For larger elements it
does not make a difference yet. In the future when true
multi-stream is enabled in libgpuarray, this may change.
If you want to make sure to have optimal performance,
check both options.
""",
BoolParam(True),
in_c_key=False,
)
config.add(
"cuda__root",
"Location of the cuda installation",
StrParam(get_cuda_root),
in_c_key=False,
)
config.add(
"cuda__include_path",
"Location of the cuda includes",
StrParam(default_cuda_include),
in_c_key=False,
)
# config.add(
# "gpuarray__preallocate",
# """If negative it disables the allocation cache. If
# between 0 and 1 it enables the allocation cache and
# preallocates that fraction of the total GPU memory. If 1
# or greater it will preallocate that amount of memory (in
# megabytes).""",
# FloatParam(0, mutable=False),
# in_c_key=False,
# )
# config.add(
# "gpuarray__sched",
# """The sched parameter passed for context creation to pygpu.
# With CUDA, using "multi" is equivalent to using the parameter
# cudaDeviceScheduleBlockingSync. This is useful to lower the
# CPU overhead when waiting for GPU. One user found that it
# speeds up his other processes that was doing data augmentation.
# """,
# EnumStr("default", ["multi", "single"]),
# )
# config.add(
# "gpuarray__single_stream",
# """
# If your computations are mostly lots of small elements,
# using single-stream will avoid the synchronization
# overhead and usually be faster. For larger elements it
# does not make a difference yet. In the future when true
# multi-stream is enabled in libgpuarray, this may change.
# If you want to make sure to have optimal performance,
# check both options.
# """,
# BoolParam(True),
# in_c_key=False,
# )
# config.add(
# "cuda__root",
# "Location of the cuda installation",
# StrParam(get_cuda_root),
# in_c_key=False,
# )
# config.add(
# "cuda__include_path",
# "Location of the cuda includes",
# StrParam(default_cuda_include),
# in_c_key=False,
# )
# This flag determines whether or not to raise error/warning message if
# there is a CPU Op in the computational graph.
......@@ -483,103 +478,103 @@ def add_basic_configvars():
)
def add_dnn_configvars():
config.add(
"dnn__conv__algo_fwd",
"Default implementation to use for cuDNN forward convolution.",
EnumStr("small", SUPPORTED_DNN_CONV_ALGO_FWD),
in_c_key=False,
)
config.add(
"dnn__conv__algo_bwd_data",
"Default implementation to use for cuDNN backward convolution to "
"get the gradients of the convolution with regard to the inputs.",
EnumStr("none", SUPPORTED_DNN_CONV_ALGO_BWD_DATA),
in_c_key=False,
)
config.add(
"dnn__conv__algo_bwd_filter",
"Default implementation to use for cuDNN backward convolution to "
"get the gradients of the convolution with regard to the "
"filters.",
EnumStr("none", SUPPORTED_DNN_CONV_ALGO_BWD_FILTER),
in_c_key=False,
)
config.add(
"dnn__conv__precision",
"Default data precision to use for the computation in cuDNN "
"convolutions (defaults to the same dtype as the inputs of the "
"convolutions, or float32 if inputs are float16).",
EnumStr("as_input_f32", SUPPORTED_DNN_CONV_PRECISION),
in_c_key=False,
)
config.add(
"dnn__base_path",
"Install location of cuDNN.",
StrParam(default_dnn_base_path),
in_c_key=False,
)
config.add(
"dnn__include_path",
"Location of the cudnn header",
StrParam(default_dnn_inc_path),
in_c_key=False,
)
config.add(
"dnn__library_path",
"Location of the cudnn link library.",
StrParam(default_dnn_lib_path),
in_c_key=False,
)
config.add(
"dnn__bin_path",
"Location of the cuDNN load library "
"(on non-windows platforms, "
"this is the same as dnn__library_path)",
StrParam(default_dnn_bin_path),
in_c_key=False,
)
config.add(
"dnn__enabled",
"'auto', use cuDNN if available, but silently fall back"
" to not using it if not present."
" If True and cuDNN can not be used, raise an error."
" If False, disable cudnn even if present."
" If no_check, assume present and the version between header and library match (so less compilation at context init)",
EnumStr("auto", ["True", "False", "no_check"]),
in_c_key=False,
)
def add_magma_configvars():
config.add(
"magma__include_path",
"Location of the magma header",
StrParam(""),
in_c_key=False,
)
config.add(
"magma__library_path",
"Location of the magma library",
StrParam(""),
in_c_key=False,
)
config.add(
"magma__enabled",
" If True, use magma for matrix computation." " If False, disable magma",
BoolParam(False),
in_c_key=False,
)
# def add_dnn_configvars():
# config.add(
# "dnn__conv__algo_fwd",
# "Default implementation to use for cuDNN forward convolution.",
# EnumStr("small", SUPPORTED_DNN_CONV_ALGO_FWD),
# in_c_key=False,
# )
# config.add(
# "dnn__conv__algo_bwd_data",
# "Default implementation to use for cuDNN backward convolution to "
# "get the gradients of the convolution with regard to the inputs.",
# EnumStr("none", SUPPORTED_DNN_CONV_ALGO_BWD_DATA),
# in_c_key=False,
# )
# config.add(
# "dnn__conv__algo_bwd_filter",
# "Default implementation to use for cuDNN backward convolution to "
# "get the gradients of the convolution with regard to the "
# "filters.",
# EnumStr("none", SUPPORTED_DNN_CONV_ALGO_BWD_FILTER),
# in_c_key=False,
# )
# config.add(
# "dnn__conv__precision",
# "Default data precision to use for the computation in cuDNN "
# "convolutions (defaults to the same dtype as the inputs of the "
# "convolutions, or float32 if inputs are float16).",
# EnumStr("as_input_f32", SUPPORTED_DNN_CONV_PRECISION),
# in_c_key=False,
# )
# config.add(
# "dnn__base_path",
# "Install location of cuDNN.",
# StrParam(default_dnn_base_path),
# in_c_key=False,
# )
# config.add(
# "dnn__include_path",
# "Location of the cudnn header",
# StrParam(default_dnn_inc_path),
# in_c_key=False,
# )
# config.add(
# "dnn__library_path",
# "Location of the cudnn link library.",
# StrParam(default_dnn_lib_path),
# in_c_key=False,
# )
# config.add(
# "dnn__bin_path",
# "Location of the cuDNN load library "
# "(on non-windows platforms, "
# "this is the same as dnn__library_path)",
# StrParam(default_dnn_bin_path),
# in_c_key=False,
# )
# config.add(
# "dnn__enabled",
# "'auto', use cuDNN if available, but silently fall back"
# " to not using it if not present."
# " If True and cuDNN can not be used, raise an error."
# " If False, disable cudnn even if present."
# " If no_check, assume present and the version between header and library match (so less compilation at context init)",
# EnumStr("auto", ["True", "False", "no_check"]),
# in_c_key=False,
# )
# def add_magma_configvars():
# config.add(
# "magma__include_path",
# "Location of the magma header",
# StrParam(""),
# in_c_key=False,
# )
# config.add(
# "magma__library_path",
# "Location of the magma library",
# StrParam(""),
# in_c_key=False,
# )
# config.add(
# "magma__enabled",
# " If True, use magma for matrix computation." " If False, disable magma",
# BoolParam(False),
# in_c_key=False,
# )
def _is_gt_0(x):
......@@ -682,11 +677,10 @@ def add_compile_configvars():
if type(config).cxx.is_default:
# If the user provided an empty value for cxx, do not warn.
_logger.warning(
"g++ not detected ! Aesara will be unable to execute "
"optimized C-implementations (for both CPU and GPU) and will "
"default to Python implementations. Performance will be severely "
"degraded. To remove this warning, set Aesara flags cxx to an "
"empty string."
"g++ not detected! Aesara will be unable to compile "
"C-implementations and will default to Python. "
"Performance may be severely degraded. "
"To remove this warning, set Aesara flags cxx to an empty string."
)
# Keep the default value the same as the one for the mode FAST_RUN
......@@ -899,20 +893,20 @@ def add_traceback_configvars():
def add_experimental_configvars():
config.add(
"experimental__unpickle_gpu_on_cpu",
"Allow unpickling of pickled GpuArrays as numpy.ndarrays."
"This is useful, if you want to open a GpuArray without "
"having cuda installed."
"If you have cuda installed, this will force unpickling to"
"be done on the cpu to numpy.ndarray."
"Please be aware that this may get you access to the data,"
"however, trying to unpicke gpu functions will not succeed."
"This flag is experimental and may be removed any time, when"
"gpu<>cpu transparency is solved.",
BoolParam(default=False),
in_c_key=False,
)
# config.add(
# "experimental__unpickle_gpu_on_cpu",
# "Allow unpickling of pickled GpuArrays as numpy.ndarrays."
# "This is useful, if you want to open a GpuArray without "
# "having cuda installed."
# "If you have cuda installed, this will force unpickling to"
# "be done on the cpu to numpy.ndarray."
# "Please be aware that this may get you access to the data,"
# "however, trying to unpicke gpu functions will not succeed."
# "This flag is experimental and may be removed any time, when"
# "gpu<>cpu transparency is solved.",
# BoolParam(default=False),
# in_c_key=False,
# )
config.add(
"experimental__local_alloc_elemwise",
......@@ -1473,10 +1467,6 @@ def add_numba_configvars():
)
def _get_default_gpuarray__cache_path():
return os.path.join(config.compiledir, "gpuarray_kernels")
def _default_compiledirname():
formatted = config.compiledir_format % _compiledir_format_dict
safe = re.sub(r"[\(\)\s,]+", "_", formatted)
......@@ -1618,16 +1608,16 @@ def add_caching_dir_configvars():
in_c_key=False,
)
config.add(
"gpuarray__cache_path",
"Directory to cache pre-compiled kernels for the gpuarray backend.",
ConfigParam(
_get_default_gpuarray__cache_path,
apply=_filter_base_compiledir,
mutable=False,
),
in_c_key=False,
)
# config.add(
# "gpuarray__cache_path",
# "Directory to cache pre-compiled kernels for the gpuarray backend.",
# ConfigParam(
# _get_default_gpuarray__cache_path,
# apply=_filter_base_compiledir,
# mutable=False,
# ),
# in_c_key=False,
# )
# Those are the options provided by Aesara to choose algorithms at runtime.
......@@ -1686,10 +1676,9 @@ config = aesara.configparser._config
# The functions below register config variables into the config instance above.
add_basic_configvars()
add_dnn_configvars()
add_magma_configvars()
# add_dnn_configvars()
# add_magma_configvars()
add_compile_configvars()
# TODO: "tensor", "gpuarray" and compilation options are closely related.. Grouping is not great.
add_tensor_configvars()
add_traceback_configvars()
add_experimental_configvars()
......
......@@ -456,15 +456,13 @@ class DeviceParam(ConfigParam):
)
def _apply(self, val):
if val == self.default or val.startswith("opencl") or val.startswith("cuda"):
return val
elif val.startswith("gpu"):
if val.startswith("opencl") or val.startswith("cuda") or val.startswith("gpu"):
raise ValueError(
"You are trying to use the old GPU back-end. "
"It was removed from Aesara. Use device=cuda* now. "
"See https://github.com/aesara-devs/aesara/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29 "
"for more information."
"It was removed from Aesara."
)
elif val == self.default:
return val
else:
raise ValueError(
'Invalid value ("{val}") for configuration '
......
......@@ -229,8 +229,8 @@ class Apply(Node):
List of `Variable` instances to use as inputs.
strict : bool
If ``True``, the type fields of all the inputs must be equal
to the current ones (or compatible, for instance `Tensor` /
`GpuArray` of the same dtype and broadcastable patterns,
to the current ones (or compatible, for instance `TensorType`
of the same dtype and broadcastable patterns,
in which case they will be converted into current `Type`), and
returned outputs are guaranteed to have the same types as
``self.outputs``. If ``False``, then there's no guarantee that the
......@@ -328,9 +328,6 @@ class Variable(Node):
- `SparseVariable`: a subclass of `Variable` that represents
a ``scipy.sparse.{csc,csr}_matrix`` object.
- `GpuArrayVariable`: a subclass of `Variable` that represents our object on
the GPU that is a subset of ``numpy.ndarray``.
- `RandomVariable`.
A `Variable` which is the output of a symbolic computation will have an owner
......
......@@ -70,9 +70,9 @@ class IfElse(_NoPythonOp):
"""
__props__ = ("as_view", "gpu", "n_outs")
__props__ = ("as_view", "n_outs")
def __init__(self, n_outs, as_view=False, gpu=False, name=None):
def __init__(self, n_outs, as_view=False, name=None):
if as_view:
# check destroyhandler and others to ensure that a view_map with
# multiple inputs can work
......@@ -81,7 +81,6 @@ class IfElse(_NoPythonOp):
view_map[idx] = [idx + 1]
self.view_map = view_map
self.as_view = as_view
self.gpu = gpu
self.n_outs = n_outs
self.name = name
......@@ -90,14 +89,12 @@ class IfElse(_NoPythonOp):
return False
if self.as_view != other.as_view:
return False
if self.gpu != other.gpu:
return False
if self.n_outs != other.n_outs:
return False
return True
def __hash__(self):
return hash((type(self), self.as_view, self.gpu, self.n_outs))
return hash((type(self), self.as_view, self.n_outs))
def __str__(self):
args = []
......@@ -105,8 +102,6 @@ class IfElse(_NoPythonOp):
args.append(self.name)
if self.as_view:
args.append("inplace")
if self.gpu:
args.append("gpu")
return f"if{{{','.join(args)}}}"
def infer_shape(self, fgraph, node, inputs_shapes):
......@@ -143,7 +138,6 @@ class IfElse(_NoPythonOp):
new_ifelse = IfElse(
n_outs=len(new_ts_inputs),
as_view=False,
gpu=False,
name="_".join(name_tokens),
)
new_outs = new_ifelse(
......@@ -172,16 +166,13 @@ class IfElse(_NoPythonOp):
f"{int(2 * self.n_outs)}, got {len(args)}"
)
c = at.basic.as_tensor_variable(c)
if not self.gpu:
# When gpu is true, we are given only gpuarrays, and we want
# to keep them as gpuarrays
nw_args = []
for x in args:
if isinstance(x, Variable):
nw_args.append(x)
else:
nw_args.append(at.as_tensor_variable(x))
args = nw_args
nw_args = []
for x in args:
if isinstance(x, Variable):
nw_args.append(x)
else:
nw_args.append(at.as_tensor_variable(x))
args = nw_args
aes = args[: self.n_outs]
fs = args[self.n_outs :]
......@@ -214,13 +205,9 @@ class IfElse(_NoPythonOp):
else:
nw_name_t = None
nw_name_f = None
if_true_op = IfElse(
n_outs=self.n_outs, as_view=self.as_view, gpu=self.gpu, name=nw_name_t
)
if_true_op = IfElse(n_outs=self.n_outs, as_view=self.as_view, name=nw_name_t)
if_false_op = IfElse(
n_outs=self.n_outs, as_view=self.as_view, gpu=self.gpu, name=nw_name_f
)
if_false_op = IfElse(n_outs=self.n_outs, as_view=self.as_view, name=nw_name_f)
# The grads can have a different dtype then the inputs.
# As inputs true/false pair must have the same dtype,
......@@ -384,7 +371,7 @@ def ifelse(
f"{len(else_branch)})"
)
new_ifelse = IfElse(n_outs=len(then_branch), as_view=False, gpu=False, name=name)
new_ifelse = IfElse(n_outs=len(then_branch), as_view=False, name=name)
ins = [condition] + list(new_then_branch) + list(new_else_branch)
rval = new_ifelse(*ins, return_list=True)
......@@ -411,7 +398,7 @@ def cond_make_inplace(fgraph, node):
or not all(getattr(o.type, "ndim", -1) == 0 for o in node.outputs)
)
):
return IfElse(n_outs=op.n_outs, as_view=True, gpu=op.gpu, name=op.name)(
return IfElse(n_outs=op.n_outs, as_view=True, name=op.name)(
*node.inputs, return_list=True
)
return False
......@@ -611,7 +598,6 @@ class CondMerge(GlobalOptimizer):
new_ifelse = IfElse(
n_outs=len(mn_ts + pl_ts),
as_view=False,
gpu=False,
name=mn_name + "&" + pl_name,
)
new_outs = new_ifelse(*new_ins, return_list=True)
......@@ -660,7 +646,7 @@ def cond_remove_identical(fgraph, node):
nw_ts.append(aes[idx])
nw_fs.append(fs[idx])
new_ifelse = IfElse(n_outs=len(nw_ts), as_view=op.as_view, gpu=op.gpu, name=op.name)
new_ifelse = IfElse(n_outs=len(nw_ts), as_view=op.as_view, name=op.name)
new_ins = [node.inputs[0]] + nw_ts + nw_fs
new_outs = new_ifelse(*new_ins, return_list=True)
......@@ -712,7 +698,6 @@ def cond_merge_random_op(fgraph, main_node):
new_ifelse = IfElse(
n_outs=len(mn_ts + pl_ts),
as_view=False,
gpu=False,
name=mn_name + "&" + pl_name,
)
new_outs = new_ifelse(*new_ins, return_list=True)
......
......@@ -790,9 +790,6 @@ class ModuleCache:
if subdirs_elem == "lock_dir":
continue
root = os.path.join(self.dirname, subdirs_elem)
# Don't delete the gpuarray kernel cache
if root == config.gpuarray__cache_path:
continue
key_pkl = os.path.join(root, "key.pkl")
if key_pkl in self.loaded_key_pkl:
continue
......
......@@ -496,8 +496,6 @@ class CLinkerType(CLinkerObject):
e.g:
- For ``TensorType(dtype='int64', ...)``: should return ``"npy_int64"``.
- For ``GpuArrayType(dtype='int32', ...)``: should return ``"ga_int"``.
"""
return ""
......
......@@ -7,7 +7,7 @@ used to create a Params object that is compatible with the ParamsType defined.
The Params object will be available in both Python code (as a standard Python object) and C code
(as a specific struct with parameters as struct fields). To be fully-available in C code, Aesara
types wrapped into a ParamsType must provide a C interface (e.g. TensorType, ScalarType, GpuArrayType,
types wrapped into a ParamsType must provide a C interface (e.g. TensorType, ScalarType,
or your own type. See :ref:`extending_op_params` for more details).
Example of usage
......@@ -318,9 +318,8 @@ class Params(dict):
class ParamsType(CType):
"""
This class can create a struct of Aesara types (like `TensorType`,
`GpuArrayType`, etc.) to be used as a convenience op parameter wrapping
many data.
This class can create a struct of Aesara types (like `TensorType`, etc.)
to be used as a convenience `Op` parameter wrapping many data.
`ParamsType` constructor takes key-value args. Key will be the name of the
attribute in the struct. Value is the Aesara type of this attribute,
......
"""This script trigger convolution operation. We think it cause more
GPU power consumption then gemm call.
"""
import numpy as np
import aesara
from aesara.configdefaults import config
from aesara.gpuarray import dnn
from aesara.tensor.nnet.abstract_conv import get_conv_output_shape
from aesara.tensor.type import tensor4
def burn():
    """Repeatedly run a cuDNN convolution to stress the GPU."""
    size = 128
    image_shape = [size, size, size, size]
    kernel_shape = [size // 2, size, 3, 3]
    output_shape = get_conv_output_shape(image_shape, kernel_shape, "valid", (1, 1))

    # NOTE(review): these symbolic variables are immediately rebound to shared
    # variables below, so the assignments look dead — kept for exact parity.
    img = tensor4("img")
    kern = tensor4("kern")
    out = tensor4("out")

    def random_values(shape):
        # Random data in the configured floating-point precision.
        return np.random.rand(*shape).astype(config.floatX)

    img = aesara.shared(random_values(image_shape))
    kern = aesara.shared(random_values(kernel_shape))
    out = aesara.shared(random_values(output_shape))

    # beta=1 is needed to force the reuse of `out`; otherwise it is
    # replaced by a GpuAllocEmpty.
    conv_out = dnn._dnn_conv(img, kern, conv_mode="conv", out=out, beta=1.0)
    mode = aesara.compile.get_default_mode().including("local_remove_all_assert")
    compiled = aesara.function([], [conv_out], mode=mode)
    aesara.printing.debugprint(compiled)

    print("Start computation")
    for _ in range(10000):
        compiled.fn()
    print("Computation stopped")
if __name__ == "__main__":
burn()
......@@ -78,12 +78,8 @@ def execute(execute=True, verbose=True, M=2000, N=2000, K=2000, iters=10, order=
f() # Ignore first function call to get representative time.
if execute:
try:
from aesara.gpuarray import GpuArraySharedVariable
sync = isinstance(c, GpuArraySharedVariable)
except ImportError:
sync = False
# sync was needed for gpu
sync = False
if sync:
# Make sure we don't include the time from the first call
......
#! /usr/bin/env python
"""
This file compares the runtime of two independent dot products on one
and two GPUs to measure the speedup.
This should be 2x if the GPUs are equivalent.
"""
import threading
import time
import numpy as np
import aesara
from aesara.gpuarray import init_dev
from aesara.gpuarray.blas import gpu_dot22
def main(dev1, dev2):
    """Benchmark two independent ``gpu_dot22`` products on one vs. two GPUs.

    ``dev1`` and ``dev2`` are GPU device names passed on the command
    line. Prints the elapsed time of several variants; a two-context
    time close to half of the one-context time indicates the two GPUs
    really compute in parallel.
    """
    # One context per device so work can be dispatched to each device
    # independently.
    init_dev(dev1, "ctx1")
    init_dev(dev2, "ctx2")
    size = 1024 * 16
    data = np.random.randn(size, size).astype("float32")
    # Four operands on ctx1 (two independent products on one device) and
    # two operands on ctx2 (for the cross-device cases).
    val1a = aesara.shared(data, target="ctx1")
    val1b = aesara.shared(data, target="ctx1")
    val1c = aesara.shared(data, target="ctx1")
    val1d = aesara.shared(data, target="ctx1")
    val2a = aesara.shared(data, target="ctx2")
    val2b = aesara.shared(data, target="ctx2")
    # f1: two products on ctx1; f2: one product on each context;
    # f3/f4: a single product on ctx1/ctx2 respectively;
    # f5/f6: like f3/f4 but transfer one element back to the CPU, which
    # implicitly waits for the GPU result (no explicit sync needed).
    f1 = aesara.function([], [gpu_dot22(val1a, val1b), gpu_dot22(val1c, val1d)])
    f2 = aesara.function([], [gpu_dot22(val1a, val1b), gpu_dot22(val2a, val2b)])
    f3 = aesara.function([], [gpu_dot22(val1a, val1b)])
    f4 = aesara.function([], [gpu_dot22(val2a, val2b)])
    f5 = aesara.function([], [gpu_dot22(val1a, val1b)[0, 0].transfer("cpu")])
    f6 = aesara.function([], [gpu_dot22(val2a, val2b)[0, 0].transfer("cpu")])
    # pre-execute to load code to GPU.
    # NOTE(review): .sync() presumably blocks until the asynchronous GPU
    # kernel producing that array has finished -- confirm against the
    # pygpu GpuArray API.
    r = f1.fn()
    r[0].sync(), r[1].sync()
    r = f2.fn()
    r[0].sync(), r[1].sync()
    r = f3.fn()
    r[0].sync()
    r = f4.fn()
    r[0].sync()
    r = f5.fn()
    r = f6.fn()
    r = None
    # Time two products launched from one function on a single context.
    t = time.time()
    r = f1.fn()
    r[0].sync(), r[1].sync()
    t2 = time.time()
    r = None
    print(f"one ctx async {t2 - t:f}")
    # Time one product per context, still launched from one function.
    t = time.time()
    r = f2.fn()
    r[0].sync(), r[1].sync()
    t2 = time.time()
    r = None
    print(f"two ctx async {t2 - t:f}")
    # Time two separate functions, one per context, called back to back
    # from the same thread.
    t = time.time()
    r = f3.fn()
    r2 = f4.fn()
    r[0].sync()
    r2[0].sync()
    t2 = time.time()
    r = None
    print(f"two ctx, 2 fct async {t2 - t:f}")
    # Same, but the CPU transfer inside f5/f6 stands in for the explicit
    # sync calls.
    t = time.time()
    r = f5.fn()
    r2 = f6.fn()
    t2 = time.time()
    r = None
    print(f"two ctx, 2 fct with transfer {t2 - t:f}")

    # Multi-thread version
    class myThread(threading.Thread):
        # Runs one compiled function in its own thread, optionally
        # syncing on the first output before finishing.
        def __init__(self, name, f, sync):
            threading.Thread.__init__(self)
            self.f = f
            self.name = name
            self.sync = sync

        def run(self):
            # print "Starting " + self.name
            # r = self.f.fn(n_calls=10)
            r = self.f()
            # print "End " + self.name
            if self.sync:
                r[0].sync()
            # Keep the result alive on the instance so the timing below
            # measures completed work, not garbage-collected handles.
            self.r = r
            # print "Exiting " + self.name

    # One thread per context, syncing explicitly inside each thread.
    thread1 = myThread("Thread-3", f3, True)
    thread2 = myThread("Thread-4", f4, True)
    t = time.time()
    thread1.start()
    thread2.start()
    thread1.join()
    thread2.join()
    t2 = time.time()
    print(f"two ctx, 2 fct async, 2 threads {t2 - t:f}")
    # One thread per context, relying on the CPU transfer to wait.
    thread1 = myThread("Thread-5", f5, False)
    thread2 = myThread("Thread-6", f6, False)
    t = time.time()
    thread1.start()
    thread2.start()
    thread1.join()
    thread2.join()
    t2 = time.time()
    print(f"two ctx, 2 fct with transfer, 2 threads {t2 - t:f}")


if __name__ == "__main__":
    import sys

    if len(sys.argv) != 3:
        raise ValueError("This script require two device names.")
    main(sys.argv[1], sys.argv[2])
"""
Function to detect memory sharing for ndarray AND sparse type AND GpuArray.
Function to detect memory sharing for ndarray AND sparse type.
numpy version support only ndarray.
"""
......@@ -18,48 +18,22 @@ try:
return scipy.sparse.issparse(a)
except ImportError:
# scipy not imported; there can be only ndarray
def _is_sparse(a):
return False
from aesara import gpuarray
if gpuarray.pygpu:
def _is_gpua(a):
return isinstance(a, gpuarray.pygpu.gpuarray.GpuArray)
else:
def _is_gpua(a):
def _is_sparse(a):
return False
__docformat__ = "restructuredtext en"
def may_share_memory(a, b, raise_other_type=True):
a_ndarray = isinstance(a, np.ndarray)
b_ndarray = isinstance(b, np.ndarray)
if a_ndarray and b_ndarray:
return TensorType.may_share_memory(a, b)
a_gpua = _is_gpua(a)
b_gpua = _is_gpua(b)
if a_gpua and b_gpua:
return gpuarray.pygpu.gpuarray.may_share_memory(a, b)
a_sparse = _is_sparse(a)
b_sparse = _is_sparse(b)
if not (a_ndarray or a_sparse or a_gpua) or not (b_ndarray or b_sparse or b_gpua):
if not (a_ndarray or a_sparse) or not (b_ndarray or b_sparse):
if raise_other_type:
raise TypeError(
"may_share_memory support only ndarray"
" and scipy.sparse or GpuArray type"
)
raise TypeError("may_share_memory support only ndarray" " and scipy.sparse")
return False
if a_gpua or b_gpua:
return False
return SparseTensorType.may_share_memory(a, b)
......@@ -9,7 +9,6 @@ import os
import pickle
import sys
import tempfile
import warnings
import zipfile
from collections import defaultdict
from contextlib import closing
......@@ -27,7 +26,6 @@ except ImportError:
DEFAULT_PROTOCOL = HIGHEST_PROTOCOL
from aesara.compile.sharedvalue import SharedVariable
from aesara.configdefaults import config
__docformat__ = "restructuredtext en"
......@@ -121,30 +119,7 @@ class PersistentNdarrayID:
return self.seen[id(obj)]
class PersistentGpuArrayID(PersistentNdarrayID):
def __call__(self, obj):
from aesara.gpuarray.type import _name_for_ctx
try:
import pygpu
except ImportError:
pygpu = None
if pygpu and isinstance(obj, pygpu.gpuarray.GpuArray):
if id(obj) not in self.seen:
def write_array(f):
pickle.dump(_name_for_ctx(obj.context), f, 2)
np.lib.format.write_array(f, np.asarray(obj))
name = self._resolve_name(obj)
zipadd(write_array, self.zip_file, name)
self.seen[id(obj)] = f"gpuarray.{name}"
return self.seen[id(obj)]
return super().__call__(obj)
class PersistentSharedVariableID(PersistentGpuArrayID):
class PersistentSharedVariableID(PersistentNdarrayID):
"""Uses shared variable names when persisting to zip file.
If a shared variable has a name, this name is used as the name of the
......@@ -213,32 +188,16 @@ class PersistentNdarrayLoad:
self.cache = {}
def __call__(self, persid):
from aesara.gpuarray import pygpu
from aesara.gpuarray.type import get_context
array_type, name = persid.split(".")
del array_type
# array_type was used for switching gpu/cpu arrays
# it is better to put these into sublclasses properly
# this is more work but better logic
if name in self.cache:
return self.cache[name]
ret = None
if array_type == "gpuarray":
with self.zip_file.open(name) as f:
ctx_name = pickle.load(f)
array = np.lib.format.read_array(f)
if config.experimental__unpickle_gpu_on_cpu:
# directly return numpy array
warnings.warn(
"config.experimental__unpickle_gpu_on_cpu is set "
"to True. Unpickling GpuArray as numpy.ndarray"
)
ret = array
elif pygpu:
ret = pygpu.array(array, context=get_context(ctx_name))
else:
raise ImportError("pygpu not found. Cannot unpickle GpuArray")
else:
with self.zip_file.open(name) as f:
ret = np.lib.format.read_array(f)
with self.zip_file.open(name) as f:
ret = np.lib.format.read_array(f)
self.cache[name] = ret
return ret
......
......@@ -12,7 +12,7 @@ from aesara.graph.op import get_test_value
from aesara.graph.utils import MissingInputError, TestValueError
from aesara.scan import utils
from aesara.scan.op import Scan, ScanInfo
from aesara.scan.utils import safe_new, traverse
from aesara.scan.utils import safe_new
from aesara.tensor.exceptions import NotScalarConstantError
from aesara.tensor.math import minimum
from aesara.tensor.shape import shape_padleft
......@@ -968,29 +968,8 @@ def scan(
)
if condition is not None:
inner_outs.append(condition)
# gpuarray is imported here, instead of being imported on top of
# the file because that would force on the user some dependencies that we
# might do not want to. Currently we are working on removing the
# dependencies on sandbox code completely.
from aesara import gpuarray
if gpuarray.pygpu_activated:
# very often we end up in this situation when we want to
# replace w with w_copy, where w is a GPU variable
# and w_copy is TensorType. This is caused because shared
# variables are put on GPU right away >:| ,
new_givens = OrderedDict()
for w, w_copy in givens.items():
if isinstance(w.type, gpuarray.GpuArrayType) and isinstance(
w_copy.type, TensorType
):
for o in inner_outs:
new_givens = traverse(o, w, w_copy, new_givens)
else:
new_givens[w] = w_copy
else:
new_givens = givens
# NOTE: legacy code traversed GPU types
new_givens = givens
new_outs = clone_replace(inner_outs, replace=new_givens)
......@@ -1023,7 +1002,6 @@ def scan(
mode=mode,
truncate_gradient=truncate_gradient,
name=name,
gpua=False,
as_while=as_while,
profile=profile,
allow_gc=allow_gc,
......
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -180,8 +180,7 @@ def check_broadcast(v1, v2):
def copy_var_format(var, as_var):
"""
This functions ensures that ``var`` has the same dtype as ``as_var`` as
well as calling `filter_variable` to make sure they are both `TensorType`
or `GpuArrayType`.
well as calling `filter_variable` to make sure they are both `TensorType`.
It internally deals with the corner case where ``inp.ndim + 1 = out.ndim``.
......@@ -549,32 +548,6 @@ class ScanMethodsMixin:
f"type '{type_input}' and '{type_output}' respectively."
)
# If scan has the flag 'gpua' set to false (meaning that is shouldn't
# use the gpuarray gpu backend ), ensure that is has no input and no
# output with type GpuArrayType
from aesara.gpuarray import GpuArrayType
if not self.gpua:
for inp in self.inputs:
if isinstance(inp.type, GpuArrayType):
raise TypeError(
"Inconsistency in the inner graph of "
f"scan '{self.name}' : one of the inputs to the "
"inner graph is of type GpuArrayType but "
"the attributes of the scan op indicate "
"that it shouldn't be the case"
)
for out in self.outputs:
if isinstance(out.type, GpuArrayType):
raise TypeError(
"Inconsistency in the inner graph of "
f"scan '{self.name}' : one of the outputs to the "
"inner graph is of type GpuArrayType but "
"the attributes of the scan op indicate "
"that it shouldn't be the case"
)
class Scan(Op, ScanMethodsMixin, HasInnerGraph):
r"""An `Op` implementing `for` and `while` loops.
......@@ -616,7 +589,6 @@ class Scan(Op, ScanMethodsMixin, HasInnerGraph):
typeConstructor: Optional[TensorConstructorType] = None,
truncate_gradient: bool = False,
name: Optional[str] = None,
gpua: bool = False,
as_while: bool = False,
profile: Optional[Union[str, bool]] = None,
allow_gc: bool = True,
......@@ -666,8 +638,6 @@ class Scan(Op, ScanMethodsMixin, HasInnerGraph):
as well as profiles for the computation of one step of each instance of
`Scan`. The `name` of the instance appears in those profiles and can
greatly help to disambiguate information.
gpua
If ``True``, this `Op` should run on a GPU.
as_while
Whether or not the `Scan` is a ``while``-loop.
profile
......@@ -690,34 +660,15 @@ class Scan(Op, ScanMethodsMixin, HasInnerGraph):
speed up allocation of the subsequent iterations. All those temporary
allocations are freed at the end of all iterations; this is what the
flag `aesara.config.allow_gc` means.
If you use pre-allocation and this `Scan` is on GPU, the speed up from
`allow_gc` is small. If you are missing memory, disabling `allow_gc`
could help you run graph that request much memory.
strict
If ``True``, all the shared variables used in the inner-graph must be provided.
Notes
-----
`typeConstructor` had been added to refactor how Aesara deals with the
GPU. If it runs on the GPU, `Scan` needs to construct certain outputs
(those that reside in GPU memory) as the GPU-specific `Type`. Since we
cannot import GPU code here, the GPU optimizations pass the constructor
of this class a function that is able to construct a GPU `Type`. This
way the class `Scan` does not need to be aware of the GPU details--it
simply constructs tensors using this function (which by default
constructs normal tensors).
TODO: Clean up this approach and everything else related to GPUs; it's
all currently a very leaky set of abstractions.
"""
self.inputs = inputs
self.outputs = outputs
self.info = info
self.truncate_gradient = truncate_gradient
self.name = name
self.gpua = gpua
self.as_while = as_while
self.profile = profile
self.allow_gc = allow_gc
......@@ -789,17 +740,14 @@ class Scan(Op, ScanMethodsMixin, HasInnerGraph):
self.n_outs = self.n_mit_mot + self.n_mit_sot + self.n_sit_sot
self.n_tap_outs = self.n_mit_mot + self.n_mit_sot
if self.gpua:
self._hash_inner_graph = self.gpu_hash
else:
# Do the missing inputs check here to have the error early.
for var in graph_inputs(self.outputs, self.inputs):
if var not in self.inputs and not isinstance(var, Constant):
raise MissingInputError(f"ScanOp is missing an input: {repr(var)}")
self._cmodule_key = CLinker().cmodule_key_variables(
self.inputs, self.outputs, []
)
self._hash_inner_graph = hash(self._cmodule_key)
# Do the missing inputs check here to have the error early.
for var in graph_inputs(self.outputs, self.inputs):
if var not in self.inputs and not isinstance(var, Constant):
raise MissingInputError(f"ScanOp is missing an input: {repr(var)}")
self._cmodule_key = CLinker().cmodule_key_variables(
self.inputs, self.outputs, []
)
self._hash_inner_graph = hash(self._cmodule_key)
(
self.preallocated_mitmot_outs,
......@@ -1185,9 +1133,6 @@ class Scan(Op, ScanMethodsMixin, HasInnerGraph):
if self.info != other.info:
return False
if self.gpua != other.gpua:
return False
if self.as_while != other.as_while:
return False
......@@ -1220,10 +1165,7 @@ class Scan(Op, ScanMethodsMixin, HasInnerGraph):
)
def __str__(self):
if self.gpua:
gpu_str = "gpu"
else:
gpu_str = "cpu"
device_str = "cpu"
if self.as_while:
name = "do_while"
else:
......@@ -1242,7 +1184,7 @@ class Scan(Op, ScanMethodsMixin, HasInnerGraph):
aux_txt += "},%s,%s}"
else:
aux_txt += "{%s,%s}"
aux_txt = aux_txt % (name, gpu_str, str(self.name))
aux_txt = aux_txt % (name, device_str, str(self.name))
return aux_txt
def __hash__(self):
......@@ -1251,7 +1193,6 @@ class Scan(Op, ScanMethodsMixin, HasInnerGraph):
type(self),
self._hash_inner_graph,
self.info,
self.gpua,
self.as_while,
self.profile,
self.truncate_gradient,
......@@ -1418,9 +1359,6 @@ class Scan(Op, ScanMethodsMixin, HasInnerGraph):
# Analyse the compile inner function to determine which inputs and
# outputs are on the gpu and speed up some checks during the execution
inps_is_tensor = [
isinstance(out, TensorVariable) for out in self.fn.maker.fgraph.inputs
]
outs_is_tensor = [
isinstance(out, TensorVariable) for out in self.fn.maker.fgraph.outputs
]
......@@ -1441,7 +1379,6 @@ class Scan(Op, ScanMethodsMixin, HasInnerGraph):
self.mitmots_preallocated, dtype="int32"
)
cython_inps_is_tensor = np.asarray(inps_is_tensor, dtype="int32")
cython_outs_is_tensor = np.asarray(outs_is_tensor, dtype="int32")
if self.destroy_map:
......@@ -1499,7 +1436,6 @@ class Scan(Op, ScanMethodsMixin, HasInnerGraph):
cython_vector_outs,
self.mit_mot_out_slices,
cython_mitmots_preallocated,
cython_inps_is_tensor,
cython_outs_is_tensor,
inner_input_storage,
inner_output_storage,
......@@ -1762,7 +1698,7 @@ class Scan(Op, ScanMethodsMixin, HasInnerGraph):
pdx = offset + self.n_shared_outs
inner_output_storage[pdx].storage[0] = None
# 4.5. Keep a reference to the variables (ndarrays, GpuArrays,
# 4.5. Keep a reference to the variables (ndarrays,
# etc) currently in the output_storage to be able to compare them
# with the actual outputs of the inner function after its
# execution. Also keep pointers to their data to be able to detect
......@@ -1778,9 +1714,9 @@ class Scan(Op, ScanMethodsMixin, HasInnerGraph):
elif isinstance(self.fn.maker.fgraph.outputs[idx], TensorVariable):
old_inner_output_data[idx] = var.data
else:
old_inner_output_data[idx] = var.gpudata
raise RuntimeError("old_inner_output_data[idx] = var.gpudata")
# 4.6. Keep a reference to the variables (ndarrays, GpuArrays,
# 4.6. Keep a reference to the variables (ndarrays,
# etc) associated with mitmot inputs currently in the
# input_storage to be able to compare them with the content of the
# input_storage after the execution of the function. Also keep
......@@ -1793,12 +1729,8 @@ class Scan(Op, ScanMethodsMixin, HasInnerGraph):
if var is None:
old_mitmot_input_data[idx] = None
elif isinstance(
self.fn.maker.fgraph.inputs[idx + self.n_seqs], TensorVariable
):
old_mitmot_input_data[idx] = var.data
else:
old_mitmot_input_data[idx] = var.gpudata
old_mitmot_input_data[idx] = var.data
# 5.1 compute outputs
t0_fn = time.time()
......@@ -1865,13 +1797,7 @@ class Scan(Op, ScanMethodsMixin, HasInnerGraph):
new_var = inner_input_storage[self.n_seqs + inp_idx].storage[0]
if old_var is new_var:
old_data = old_mitmot_input_data[inp_idx]
if isinstance(
self.fn.maker.fgraph.inputs[self.n_seqs + inp_idx],
TensorVariable,
):
same_data = new_var.data == old_data
else:
same_data = new_var.gpudata == old_data
same_data = new_var.data == old_data
else:
same_data = False
......@@ -1922,7 +1848,9 @@ class Scan(Op, ScanMethodsMixin, HasInnerGraph):
):
output_reused = new_var.data == old_data
else:
output_reused = new_var.gpudata == old_data
raise RuntimeError(
"output_reused = new_var.gpudata == old_data"
)
else:
output_reused = False
......@@ -1986,7 +1914,9 @@ class Scan(Op, ScanMethodsMixin, HasInnerGraph):
):
output_reused = new_var.data == old_data
else:
output_reused = new_var.gpudata == old_data
raise RuntimeError(
"output_reused = new_var.gpudata == old_data"
)
else:
output_reused = False
......@@ -2888,7 +2818,6 @@ class Scan(Op, ScanMethodsMixin, HasInnerGraph):
info,
mode=self.mode,
truncate_gradient=self.truncate_gradient,
gpua=False,
as_while=False,
profile=self.profile,
name=f"grad_of_{self.name}" if self.name else None,
......@@ -3219,7 +3148,6 @@ class Scan(Op, ScanMethodsMixin, HasInnerGraph):
inner_outs,
info,
mode=self.mode,
gpua=False,
as_while=self.as_while,
profile=self.profile,
truncate_gradient=self.truncate_gradient,
......
......@@ -176,7 +176,6 @@ def remove_constants_and_unused_inputs_scan(fgraph, node):
op_outs,
nw_info,
mode=op.mode,
gpua=op.gpua,
as_while=op.as_while,
profile=op.profile,
truncate_gradient=op.truncate_gradient,
......@@ -341,7 +340,6 @@ def push_out_non_seq_scan(fgraph, node):
op_outs,
op.info,
mode=op.mode,
gpua=op.gpua,
as_while=op.as_while,
profile=op.profile,
truncate_gradient=op.truncate_gradient,
......@@ -591,7 +589,6 @@ def push_out_seq_scan(fgraph, node):
op_outs,
nw_info,
mode=op.mode,
gpua=op.gpua,
as_while=op.as_while,
profile=op.profile,
truncate_gradient=op.truncate_gradient,
......@@ -758,7 +755,6 @@ def add_nitsot_outputs(
new_scan_args.inner_outputs,
new_scan_args.info,
mode=old_scan_node.op.mode,
gpua=old_scan_node.op.gpua,
as_while=old_scan_node.op.as_while,
profile=old_scan_node.op.profile,
truncate_gradient=old_scan_node.op.truncate_gradient,
......@@ -909,10 +905,9 @@ class ScanInplaceOptimizer(GlobalOptimizer):
"""
def __init__(self, typeInfer=None, gpua_flag=False):
def __init__(self, typeInfer=None):
super().__init__()
self.typeInfer = typeInfer
self.gpua_flag = gpua_flag
def add_requirements(self, fgraph):
fgraph.attach_feature(ReplaceValidate())
......@@ -984,7 +979,6 @@ class ScanInplaceOptimizer(GlobalOptimizer):
op.info,
mode=op.mode,
typeConstructor=typeConstructor,
gpua=op.gpua,
as_while=op.as_while,
profile=op.profile,
truncate_gradient=op.truncate_gradient,
......@@ -1016,9 +1010,7 @@ class ScanInplaceOptimizer(GlobalOptimizer):
alloc_ops = (Alloc, AllocEmpty)
nodes = fgraph.toposort()[::-1]
scan_nodes = [
x for x in nodes if (isinstance(x.op, Scan) and x.op.gpua == self.gpua_flag)
]
scan_nodes = [x for x in nodes if (isinstance(x.op, Scan))]
for scan_idx in range(len(scan_nodes)):
# First attempt to make the Scan compute inplace every recurrent
......@@ -1515,7 +1507,6 @@ def save_mem_new_scan(fgraph, node):
outs,
info,
mode=op.mode,
gpua=op.gpua,
as_while=op.as_while,
profile=op.profile,
truncate_gradient=op.truncate_gradient,
......@@ -1812,7 +1803,6 @@ class ScanMerge(GlobalOptimizer):
truncate_gradient=old_op.truncate_gradient,
allow_gc=old_op.allow_gc,
name="&".join([nd.op.name for nd in nodes]),
gpua=False,
as_while=as_while,
)
new_outs = new_op(*outer_ins)
......@@ -1989,7 +1979,6 @@ def scan_merge_inouts(fgraph, node):
inner_outputs,
info,
mode=node.op.mode,
gpua=node.op.gpua,
as_while=node.op.as_while,
profile=node.op.profile,
truncate_gradient=node.op.truncate_gradient,
......@@ -2255,7 +2244,6 @@ def push_out_dot1_scan(fgraph, node):
new_inner_outs,
new_info,
mode=op.mode,
gpua=op.gpua,
as_while=op.as_while,
profile=op.profile,
truncate_gradient=op.truncate_gradient,
......
......@@ -78,7 +78,6 @@ def perform(
numpy.ndarray[numpy.int32_t,ndim=1] vector_outs,
tuple mit_mot_out_slices,
numpy.ndarray[numpy.int32_t,ndim=1] mitmots_preallocated,
numpy.ndarray[numpy.int32_t,ndim=1] inps_is_tensor,
numpy.ndarray[numpy.int32_t,ndim=1] outs_is_tensor,
list inner_input_storage,
list inner_output_storage,
......@@ -132,9 +131,6 @@ def perform(
tensor, 0 otherwise.
mit_mot_out_slices
Same as tap_array, but for the output taps of mit_mot sequences
inps_is_tensor : int32 ndarray (Can be replaced by a list)
Array of boolean indicating, for every input, whether it is a tensor
or not
outs_is_tensor : int32 ndarray (Can be replaced by a list)
Array of boolean indicating, for every output, whether it is a tensor
or not
......@@ -359,7 +355,7 @@ def perform(
pdx = offset + n_shared_outs
inner_output_storage[<unsigned int>pdx][0] = None
# 4.5. Keep a reference to the variables (ndarrays, GpuArrays,
# 4.5. Keep a reference to the variables (ndarrays,
# etc) currently in the inner_output_storage to be able to compare them
# with the actual outputs of the inner function after its
# execution. Also keep pointers to their data to be able to detect
......@@ -372,12 +368,10 @@ def perform(
if var is None:
old_output_data[idx] = None
elif outs_is_tensor[idx]:
old_output_data[idx] = var.data
else:
old_output_data[idx] = var.gpudata
old_output_data[idx] = var.data
# 4.6. Keep a reference to the variables (ndarrays, GpuArrays,
# 4.6. Keep a reference to the variables (ndarrays,
# etc) associated with mitmot inputs currently in the inner_input_storage to
# be able to compare them with the content of the inner_input_storage after
# the execution of the function. Also keep pointers to their data to
......@@ -389,10 +383,8 @@ def perform(
if var is None:
old_mitmot_input_data[idx] = None
elif inps_is_tensor[idx + n_seqs]:
old_mitmot_input_data[idx] = var.data
else:
old_mitmot_input_data[idx] = var.gpudata
old_mitmot_input_data[idx] = var.data
# 5.1 compute outputs
t0_fn = time.time()
......@@ -436,10 +428,7 @@ def perform(
new_var = inner_input_storage[n_seqs + inp_idx][0]
if old_var is new_var:
old_data = old_mitmot_input_data[inp_idx]
if inps_is_tensor[n_seqs + inp_idx]:
same_data = (new_var.data == old_data)
else:
same_data = (new_var.gpudata == old_data)
same_data = (new_var.data == old_data)
else:
same_data = False
......@@ -480,10 +469,8 @@ def perform(
if old_var is new_var:
if old_data is None:
output_reused = False
elif outs_is_tensor[offset_out + j]:
output_reused = (new_var.data == old_data)
else:
output_reused = (new_var.gpudata == old_data)
output_reused = (new_var.data == old_data)
else:
output_reused = False
......@@ -520,10 +507,8 @@ def perform(
if old_var is new_var:
if old_data is None:
output_reused = False
elif outs_is_tensor[offset_out + j]:
output_reused = (new_var.data == old_data)
else:
output_reused = (new_var.gpudata == old_data)
output_reused = (new_var.data == old_data)
else:
output_reused = False
......
......@@ -192,11 +192,6 @@ def traverse(out, x, x_copy, d, visited=None):
There are two options :
1) x and x_copy or on host, then you would replace x with x_copy
2) x is on gpu, x_copy on host, then you need to replace
host_from_gpu(x) with x_copy
This happens because initially shared variables are on GPU... which is
fine for the main computational graph but confuses things a bit for the
inner graph of scan.
"""
# ``visited`` is a set of nodes that are already known and don't need to be
......@@ -208,19 +203,14 @@ def traverse(out, x, x_copy, d, visited=None):
if out in visited:
return d
visited.add(out)
from aesara.gpuarray import pygpu_activated
from aesara.gpuarray.basic_ops import GpuFromHost, host_from_gpu
from aesara.gpuarray.type import GpuArrayType
if out == x:
assert isinstance(x.type, GpuArrayType)
d[out] = GpuFromHost(x.type.context_name)(x_copy)
return d
# assert isinstance(x.type, GpuArrayType)
# d[out] = GpuFromHost(x.type.context_name)(x_copy)
# return d
raise RuntimeError("Not supported")
elif out.owner is None:
return d
elif pygpu_activated and out.owner.op == host_from_gpu and out.owner.inputs == [x]:
d[out] = at.as_tensor_variable(x_copy)
return d
else:
for inp in out.owner.inputs:
d = traverse(inp, x, x_copy, d, visited)
......
......@@ -15,7 +15,6 @@ There are four kinds of BLAS Ops in Aesara:
- Python implementations (this file)
- SciPy-based (blas_scipy)
- C-based (blas_c)
- GPU-based (aesara.gpuarray)
Notes
-----
......
......@@ -865,7 +865,7 @@ class Subtensor(COp):
):
"""
The parameters c_prefix are there to allow reusing this
function on PyArray and GpuArray object.
function on PyArray object.
This fct take as input the x.
......@@ -1581,9 +1581,7 @@ class IncSubtensor(COp):
# This method delegates much of the work to helper
# methods. This method implements the main logic
# but subclasses may override the helper methods
# to change the particulars, e.g. GpuIncSubtensor
# turns the view/copy operations on numpy arrays
# into the same operations on gpu arrays.
# to change the particulars.
self.do_type_checking(node)
......
......@@ -23,9 +23,6 @@ dependencies:
# numba backend
- numba>=0.55
- numba-scipy
# GPU
- libgpuarray
- pygpu
# For testing
- coveralls
- diff-cover
......
......@@ -17,12 +17,6 @@ per-file-ignores =
tests/sparse/test_utils.py:E402,F401
tests/sparse/sandbox/test_sp.py:E402,F401
tests/scalar/test_basic_sympy.py:E402
tests/gpuarray/test_type.py:E402
tests/gpuarray/test_abstractconv.py:E402
tests/gpuarray/test_dnn.py:E402
tests/gpuarray/test_elemwise.py:E402
tests/gpuarray/test_others.py:E402
tests/gpuarray/test_basic_ops.py:E402
aesara/graph/unify.py:F811
exclude =
versioneer.py
......@@ -32,7 +26,6 @@ exclude =
[coverage:run]
omit =
aesara/_version.py
aesara/gpuarray/*
tests/*
aesara/assert_op.py
aesara/link/jax/jax_linker.py
......@@ -45,7 +38,6 @@ relative_files = true
[coverage:report]
omit =
aesara/_version.py
aesara/gpuarray/*
tests/*
exclude_lines =
pragma: no cover
......@@ -111,10 +103,6 @@ check_untyped_defs = False
ignore_errors = True
check_untyped_defs = False
[mypy-aesara.gpuarray.*]
ignore_errors = True
check_untyped_defs = False
[mypy-aesara.compile.mode]
ignore_errors = True
check_untyped_defs = False
......
......@@ -120,7 +120,6 @@ if __name__ == "__main__":
"symbolic",
"blas",
"numpy",
"gpu",
"autodiff",
"differentiation",
]
......
import copy
import pickle
import time
import numpy as np
import pytest
import aesara.gpuarray
import aesara.tensor as at
from aesara.compile import shared
from aesara.compile.debugmode import DebugMode, InvalidValueError
......@@ -14,8 +12,6 @@ from aesara.compile.function.types import UnusedInputError
from aesara.compile.io import In, Out
from aesara.compile.mode import Mode, get_default_mode
from aesara.configdefaults import config
from aesara.gpuarray import gpuarray_shared_constructor
from aesara.gpuarray.blas import GpuGemm
from aesara.graph.basic import Constant
from aesara.graph.opt import OpKeyOptimizer, PatternSub
from aesara.graph.utils import MissingInputError
......@@ -1146,76 +1142,3 @@ def test_empty_givens_updates():
y = x * 2
function([In(x)], y, givens={})
function([In(x)], y, updates={})
@pytest.mark.skipif(
not aesara.gpuarray.pygpu_activated or config.mode == "DEBUG_MODE",
reason="DEBUG_MODE forces synchronous behaviour which breaks this test",
)
def test_sync_update():
# This test if sync_update work. This can only be tested when
# there is a GPU. To test if we really sync, we compare a case we
# can run in parallel GPU and CPU computation. Then we sync to
# disable that parallel computation. Then we assert the time is
# higher.
# this import needs to go first because it generates the
# local 'aesara' variable. You get an UnboundLocalError otherwise.
import tests.gpuarray.config
sizes = [100, 500, 1000, 2000, 5000, 10000, 20000, 40000]
size = sizes[0]
w = gpuarray_shared_constructor(
np.random.rand(size, size).astype("float32"),
"w",
target=tests.gpuarray.config.test_ctx_name,
)
x = gpuarray_shared_constructor(
np.random.rand(size, size).astype("float32"),
"x",
target=tests.gpuarray.config.test_ctx_name,
)
updates = [(w, w + np.asarray(0.001, "float32") * dot(x, x))]
f = function([], updates=updates, mode=tests.gpuarray.config.mode_with_gpu)
assert len(f.maker.fgraph.apply_nodes) == 1
assert any(isinstance(n.op, GpuGemm) for n in f.maker.fgraph.apply_nodes)
# Make sure libgpuarray have compile all kernels
f()
f.sync_shared()
# Find a good size that will take about .5s.
# This is to make the test more stable across different GPUs.
size = sizes[-1]
for i in sizes:
data = np.random.rand(i, i).astype("float32")
w.set_value(data)
x.set_value(data)
t0 = time.time()
f()
f.sync_shared()
t1 = time.time()
if (t1 - t0) < 0.5:
continue
size = i
break
# sync to make sure all computation are done
f.sync_shared()
t_0 = time.time()
for i in range(3):
f()
# Sync after each call to see the slowdown from sync.
f.sync_shared()
time.sleep(0.5)
t_1 = time.time()
for i in range(3):
f()
time.sleep(0.5)
f.sync_shared()
# Sync to make sure all computation are finished.
t_2 = time.time()
d1 = t_1 - t_0
d2 = t_2 - t_1
assert d1 > d2, (d1, d2)
......@@ -243,7 +243,6 @@ def makeSharedTester(
assert x is not get_x
assert np.allclose(self.ref_fct(np.asarray(x_orig) / 0.5), self.ref_fct(x))
# test optimized get set value on the gpu(don't pass data to the cpu)
get_x = x_shared.get_value(borrow=True, return_internal_type=True)
assert get_x is not x_orig # borrow=False to shared_constructor
assert self.check_internal_type(get_x)
......@@ -325,8 +324,6 @@ def makeSharedTester(
if x.__class__.__name__ != "csr_matrix":
# sparse matrix don't support inplace affectation
nd += 1
# THIS DOESN'T DO WHAT WE EXPECT the content of a is
# not updated for GpuArray, but it is for ndarray
x_shared.get_value(borrow=True)[:] = nd
assert may_share_memory(old_data, x_shared.container.storage[0])
x_shared.get_value(borrow=True)
......@@ -345,7 +342,6 @@ def makeSharedTester(
)
# Test by set_value with borrow=False when new data cast.
# specifically useful for gpu data
nd += 1
old_data = x_shared.container.storage[0]
x_shared.set_value(self.cast_value(nd), borrow=False)
......@@ -522,8 +518,7 @@ def makeSharedTester(
assert (
sum(
[
node.op.__class__.__name__
in ["Gemm", "GpuGemm", "StructuredDot"]
node.op.__class__.__name__ in ["Gemm", "StructuredDot"]
for node in topo
]
)
......@@ -534,11 +529,6 @@ def makeSharedTester(
for node in topo
if isinstance(node.op, aesara.tensor.blas.Gemm)
)
assert all(
node.op.inplace
for node in topo
if node.op.__class__.__name__ == "GpuGemm"
)
# Their is no inplace gemm for sparse
# assert all(node.op.inplace for node in topo if node.op.__class__.__name__ == "StructuredDot")
s_shared_specify = specify_shape(
......@@ -560,8 +550,7 @@ def makeSharedTester(
assert (
sum(
[
node.op.__class__.__name__
in ["Gemm", "GpuGemm", "StructuredDot"]
node.op.__class__.__name__ in ["Gemm", "StructuredDot"]
for node in topo
]
)
......@@ -572,11 +561,7 @@ def makeSharedTester(
for node in topo
if isinstance(node.op, aesara.tensor.blas.Gemm)
)
assert all(
node.op.inplace
for node in topo
if node.op.__class__.__name__ == "GpuGemm"
)
# now test with the specify shape op in the inputs and outputs
a_shared = specify_shape(a_shared, a_shared.get_value(borrow=True).shape)
b_shared = specify_shape(b_shared, b_shared.get_value(borrow=True).shape)
......@@ -595,8 +580,7 @@ def makeSharedTester(
assert (
sum(
[
node.op.__class__.__name__
in ["Gemm", "GpuGemm", "StructuredDot"]
node.op.__class__.__name__ in ["Gemm", "StructuredDot"]
for node in topo
]
)
......@@ -607,11 +591,6 @@ def makeSharedTester(
for node in topo
if isinstance(node.op, aesara.tensor.blas.Gemm)
)
assert all(
node.op.inplace
for node in topo
if node.op.__class__.__name__ == "GpuGemm"
)
if (
aesara.config.cycle_detection == "fast"
......
......@@ -150,8 +150,6 @@ class TestIfelse(utt.OptimizationTestMixin):
f = function(
[c, x, y], [self.cast_output(gx), self.cast_output(gy)], mode=self.mode
)
# There is only 2 of the 3 ifelse that are moved on the GPU.
# The one that stay on the CPU is for the shape.
self.assertFunctionContains(f, self.get_ifelse(1), min=2, max=3)
rng = np.random.default_rng(utt.fetch_seed())
......@@ -173,7 +171,6 @@ class TestIfelse(utt.OptimizationTestMixin):
assert np.all(np.asarray(gy0) == 1.0)
def test_grad_cast_input(self):
# Tests the gradient when both inputs are on the GPU.
x = vector("x", dtype=self.dtype)
y = vector("y", dtype=self.dtype)
c = iscalar("c")
......@@ -528,8 +525,7 @@ class TestIfelse(utt.OptimizationTestMixin):
assert str(res.owner).startswith("if{}")
res.owner.op.name = "name"
res.owner.op.as_view = True
res.owner.op.gpu = True
assert str(res.owner).startswith("if{name,inplace,gpu}")
assert str(res.owner).startswith("if{name,inplace}")
class IfElseIfElseIf(Op):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论