Remove gpuarray dependencies throughout the codebase

0e3182d1 · Maxim Kochurov · Brandon T. Willard · 2a5fc594 · 0e3182d1 · 0e3182d1
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -17,7 +17,6 @@ repos:
              aesara/compile/nanguardmode\.py|
              aesara/graph/opt\.py|
              aesara/tensor/var\.py|
-              aesara/gpuarray/opt\.py
          )$
      - id: check-merge-conflict
  - repo: https://github.com/psf/black

--- a/DESCRIPTION.txt
+++ b/DESCRIPTION.txt
 Aesara is a Python library that allows you to define, optimize, and efficiently evaluate mathematical expressions involving multi-dimensional arrays. It is built on top of NumPy_. Aesara features:
 * **tight integration with NumPy:** a similar interface to NumPy's. numpy.ndarrays are also used internally in Aesara-compiled functions.
- * **transparent use of a GPU:** perform data-intensive computations up to 140x faster than on a CPU (support for float32 only).
 * **efficient symbolic differentiation:** Aesara can compute derivatives for functions of one or many inputs.
 * **speed and stability optimizations:** avoid nasty bugs when computing expressions such as log(1 + exp(x)) for large values of x.
 * **dynamic C code generation:** evaluate expressions faster.

--- a/aesara/__init__.py
+++ b/aesara/__init__.py
@@ -144,16 +144,6 @@ from aesara.updates import OrderedUpdates
 # isort: on
-if (
-    config.device.startswith("cuda")
-    or config.device.startswith("opencl")
-    or config.init_gpu_device.startswith("cuda")
-    or config.init_gpu_device.startswith("opencl")
-    or config.contexts != ""
-):
-    import aesara.gpuarray
 def get_scalar_constant_value(v):
    """Return the constant scalar (i.e. 0-D) value underlying variable `v`.

--- a/aesara/compile/debugmode.py
+++ b/aesara/compile/debugmode.py
@@ -752,16 +752,6 @@ def _get_preallocated_maps(
    Preallocate outputs in different memory layouts.
    """
-    # To avoid circular imports
-    from aesara.gpuarray import GpuArrayType
-    from aesara.tensor.type import TensorType
-    try:
-        import pygpu
-    except ImportError:
-        pass
    # TODO: Sparse? Scalar does not really make sense.
    # Do not preallocate memory for outputs that actually work inplace
@@ -795,11 +785,12 @@ def _get_preallocated_maps(
            # I'm not sure why it is legitimate, but there are tests about it.
            # So, we cannot fill r_vals[r] with def_val yet, we have to wait
            # until all output values are deepcopied.
+        from aesara.tensor import TensorType
        for r in considered_outputs:
            # There is no risk to overwrite inputs, since r does not work
            # inplace.
-            if isinstance(r.type, (TensorType, GpuArrayType)):
+            if isinstance(r.type, TensorType):
                reuse_outputs[r][...] = np.asarray(def_val).astype(r.type.dtype)
        if reuse_outputs:
@@ -812,7 +803,7 @@ def _get_preallocated_maps(
    if "c_contiguous" in prealloc_modes or "ALL" in prealloc_modes:
        c_cont_outputs = {}
        for r in considered_outputs:
-            if isinstance(r.type, (TensorType, GpuArrayType)):
+            if isinstance(r.type, TensorType):
                # Build a C-contiguous buffer
                new_buf = r.type.value_zeros(r_vals[r].shape)
                assert new_buf.flags["C_CONTIGUOUS"]
@@ -829,13 +820,11 @@ def _get_preallocated_maps(
    if "f_contiguous" in prealloc_modes or "ALL" in prealloc_modes:
        f_cont_outputs = {}
        for r in considered_outputs:
-            if isinstance(r.type, (TensorType, GpuArrayType)):
+            if isinstance(r.type, TensorType):
                new_buf = np.zeros(
                    shape=r_vals[r].shape, dtype=r_vals[r].dtype, order="F"
                )
                new_buf[...] = def_val
-                if isinstance(r.type, GpuArrayType):
-                    new_buf = pygpu.array(new_buf)
                f_cont_outputs[r] = new_buf
@@ -859,7 +848,7 @@ def _get_preallocated_maps(
        max_ndim = 0
        rev_out_broadcastable = []
        for r in considered_outputs:
-            if isinstance(r.type, (TensorType, GpuArrayType)):
+            if isinstance(r.type, TensorType):
                if max_ndim < r.ndim:
                    rev_out_broadcastable += [True] * (r.ndim - max_ndim)
                    max_ndim = r.ndim
@@ -874,7 +863,7 @@ def _get_preallocated_maps(
        # Initial allocation
        init_strided = {}
        for r in considered_outputs:
-            if isinstance(r.type, (TensorType, GpuArrayType)):
+            if isinstance(r.type, TensorType):
                # Create a buffer twice as large in every dimension,
                # except if broadcastable, or for dimensions above
                # config.DebugMode__check_preallocated_output_ndim
@@ -953,7 +942,7 @@ def _get_preallocated_maps(
                name = f"wrong_size{tuple(shape_diff)}"
                for r in considered_outputs:
-                    if isinstance(r.type, (TensorType, GpuArrayType)):
+                    if isinstance(r.type, TensorType):
                        r_shape_diff = shape_diff[: r.ndim]
                        out_shape = [
                            max((s + sd), 0)

--- a/aesara/compile/function/types.py
+++ b/aesara/compile/function/types.py
@@ -1097,13 +1097,8 @@ class Function:
        return [i.variable for i in self.maker.inputs if i.implicit]
    def sync_shared(self):
-        if hasattr(aesara, "gpuarray") and aesara.gpuarray.pygpu_activated:
+        # Explicit syncing was only needed by the removed GPU backend; no-op now.
-            import pygpu
+        pass
-            for i in self.maker.fgraph.update_mapping.values():
-                inp = self.input_storage[i]
-                if isinstance(inp.data, pygpu.gpuarray.GpuArray):
-                    inp.data.sync()
 # pickling/deepcopy support for Function

--- a/aesara/compile/nanguardmode.py
+++ b/aesara/compile/nanguardmode.py
@@ -5,24 +5,11 @@ from io import StringIO
 import numpy as np
 import aesara
-from aesara.compile.mode import Mode, get_mode
+from aesara.compile.mode import Mode
 from aesara.configdefaults import config
-from aesara.tensor.math import abs as at_abs
-from aesara.tensor.math import max as at_max
-from aesara.tensor.math import min as at_min
 from aesara.tensor.type import discrete_dtypes
-try:
-    from pygpu.gpuarray import GpuArray
-    from aesara.gpuarray.type import GpuArrayType, _name_for_ctx
-    pygpu_available = True
-except ImportError:
-    pygpu_available = False
 logger = logging.getLogger("aesara.compile.nanguardmode")
@@ -114,9 +101,6 @@ def contains_nan(arr, node=None, var=None):
        return False
    elif getattr(arr, "dtype", "") in discrete_dtypes:
        return False
-    elif pygpu_available and isinstance(arr, GpuArray):
-        return np.isnan(f_gpua_min(arr.reshape(arr.size)))
    return np.isnan(np.min(arr))
@@ -149,36 +133,9 @@ def contains_inf(arr, node=None, var=None):
        return False
    elif getattr(arr, "dtype", "") in discrete_dtypes:
        return False
-    elif pygpu_available and isinstance(arr, GpuArray):
-        return np.isinf(f_gpua_min(arr.reshape(arr.size))) or np.isinf(
-            f_gpua_max(arr.reshape(arr.size))
-        )
    return np.isinf(np.nanmax(arr)) or np.isinf(np.nanmin(arr))
-def f_compute(op):
-    def result(inp):
-        dtype = inp.dtype
-        ctx_name = _name_for_ctx(inp.context)
-        key = (dtype, ctx_name)
-        f = result.cache.get(key, None)
-        if f is None:
-            guard_in = GpuArrayType(str(dtype), (False,), context_name=ctx_name)()
-            mode = get_mode("FAST_RUN").including("gpuarray")
-            f = aesara.function([guard_in], op(guard_in), mode=mode, profile=False)
-            result.cache[key] = f
-        return f(inp)
-    result.cache = dict()
-    return result
-f_gpua_min = f_compute(at_min)
-f_gpua_max = f_compute(at_max)
-f_gpua_absmax = f_compute(lambda x: at_max(at_abs(x)))
 class NanGuardMode(Mode):
    """
    A Aesara compilation Mode that makes the compiled function automatically
@@ -252,8 +209,6 @@ class NanGuardMode(Mode):
                err = False
                if not _is_numeric_value(value, var):
                    err = False
-                elif pygpu_available and isinstance(value, GpuArray):
-                    err = f_gpua_absmax(value.reshape(value.size)) > 1e10
                else:
                    err = np.abs(value).max() > 1e10
                if err:

--- a/aesara/compile/profiling.py
+++ b/aesara/compile/profiling.py
@@ -12,10 +12,8 @@ import atexit
 import copy
 import logging
 import operator
-import os
 import sys
 import time
-import warnings
 from collections import defaultdict
 from typing import Dict, List
@@ -279,40 +277,7 @@ class ProfileStats:
    # param is called flag_time_thunks because most other attributes with time
    # in the name are times *of* something, rather than configuration flags.
-    def __init__(
+    def __init__(self, atexit_print=True, flag_time_thunks=None, **kwargs):
-        self, atexit_print=True, flag_time_thunks=None, gpu_checks=True, **kwargs
-    ):
-        if (
-            gpu_checks
-            and (hasattr(aesara, "gpuarray") and aesara.gpuarray.pygpu_activated)
-            and os.environ.get("CUDA_LAUNCH_BLOCKING", "0") != "1"
-        ):
-            msg = (
-                "You are running the Aesara profiler with CUDA enabled."
-                " Aesara GPU ops execution is asynchronous by default."
-                " So by default, the profile is useless."
-                " You must set the environment variable"
-                " CUDA_LAUNCH_BLOCKING to 1 to tell the CUDA driver to"
-                " synchronize the execution to get a meaningful profile."
-            )
-            if config.profile:
-                raise Exception(msg)
-            else:
-                warnings.warn(msg)
-        if (
-            config.profile
-            and gpu_checks
-            and hasattr(aesara, "gpuarray")
-            and aesara.gpuarray.pygpu_activated
-            and not config.profiling__ignore_first_call
-        ):
-            warnings.warn(
-                "Aesara flag profiling__ignore_first_call is False. "
-                "This cause bad profiling result in the gpu "
-                "back-end, as sometimes we compile at the first call."
-            )
        self.apply_callcount = {}
        self.output_size = {}
        # Keys are `(FunctionGraph, Variable)`
@@ -543,8 +508,8 @@ class ProfileStats:
            tot += t
            ftot = tot * 100 / local_time
            # Remove the useless start and end of the class name:
-            # "<class 'aesara.gpuarray.blas.GpuDot22'>" ->
+            # "<class 'aesara.tensor.blas.Dot22'>" ->
-            #  "aesara.gpuarray.blas.GpuDot22"
+            #  "aesara.tensor.blas.Dot22"
            class_name = str(a)[8:-2][:maxlen]
            print(
                format_str
@@ -922,8 +887,6 @@ class ProfileStats:
                new allocation.
            """
-            from aesara.gpuarray import GpuArrayType
            # Initial Mem info values [CPU, GPU]
            node_memory_size = [0, 0]
            running_memory_size = [0, 0]
@@ -973,10 +936,8 @@ class ProfileStats:
                # allocated by the node
                idx2 = 0
                for out in node.outputs:
-                    if isinstance(out.type, GpuArrayType):
+                    # NOTE: cg=1 was used for GPU
-                        cg = 1
+                    cg = 0
-                    else:
-                        cg = 0
                    ins = None
                    if dmap and idx2 in dmap:
                        vidx = dmap[idx2]
@@ -1021,10 +982,8 @@ class ProfileStats:
                for ins in set(node.inputs):
                    assert not (ins in view_of and viewed_by[ins])
                    # we trac the original var, so this shouldn't happen
-                    if isinstance(ins.type, GpuArrayType):
+                    # NOTE: cg=1 was used for GPU
-                        cg = 1
+                    cg = 0
-                    else:
-                        cg = 0
                    if (
                        dependencies[ins]
                        and ins not in fgraph.outputs
@@ -1687,27 +1646,7 @@ class ProfileStats:
                )
                printed_tip = True
-        # tip 7
+        # tip 7 was about pool and log softmax on gpu using cudnn
-        import aesara.gpuarray
-        import aesara.tensor.signal.pool as pool
-        from aesara.tensor.nnet.basic import LogSoftmax
-        for (fgraph, a) in self.apply_time:
-            node = a
-            if isinstance(node.op, pool.Pool):
-                if not aesara.gpuarray.dnn.dnn_present():
-                    print(
-                        "Install CuDNN to do pooling faster"
-                        "this allows the operation to run on GPU"
-                    )
-                    printed_tip = True
-            if isinstance(node.op, LogSoftmax):
-                if not aesara.gpuarray.dnn.dnn_present():
-                    print(
-                        "Install CuDNN to do LogSoftmax faster"
-                        "this allows the operation to run on GPU"
-                    )
-                    printed_tip = True
        if not printed_tip:
            print("  Sorry, no tip for today.", file=file)

--- a/aesara/configdefaults.py
+++ b/aesara/configdefaults.py
--- a/aesara/configparser.py
+++ b/aesara/configparser.py
@@ -456,15 +456,13 @@ class DeviceParam(ConfigParam):
        )
    def _apply(self, val):
-        if val == self.default or val.startswith("opencl") or val.startswith("cuda"):
+        if val.startswith("opencl") or val.startswith("cuda") or val.startswith("gpu"):
-            return val
-        elif val.startswith("gpu"):
            raise ValueError(
                "You are trying to use the old GPU back-end. "
-                "It was removed from Aesara. Use device=cuda* now. "
+                "It was removed from Aesara."
-                "See https://github.com/aesara-devs/aesara/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29 "
-                "for more information."
            )
+        elif val == self.default:
+            return val
        else:
            raise ValueError(
                'Invalid value ("{val}") for configuration '

--- a/aesara/graph/basic.py
+++ b/aesara/graph/basic.py
@@ -229,8 +229,8 @@ class Apply(Node):
            List of `Variable` instances to use as inputs.
        strict : bool
            If ``True``, the type fields of all the inputs must be equal
-            to the current ones (or compatible, for instance `Tensor` /
+            to the current ones (or compatible, for instance `TensorType`
-            `GpuArray` of the same dtype and broadcastable patterns,
+            of the same dtype and broadcastable patterns,
            in which case they will be converted into current `Type`), and
            returned outputs are guaranteed to have the same types as
            ``self.outputs``.  If ``False``, then there's no guarantee that the
@@ -328,9 +328,6 @@ class Variable(Node):
    - `SparseVariable`: a subclass of `Variable` that represents
      a ``scipy.sparse.{csc,csr}_matrix`` object.
-    - `GpuArrayVariable`: a subclass of `Variable` that represents our object on
-      the GPU that is a subset of ``numpy.ndarray``.
    - `RandomVariable`.
    A `Variable` which is the output of a symbolic computation will have an owner

--- a/aesara/ifelse.py
+++ b/aesara/ifelse.py
@@ -70,9 +70,9 @@ class IfElse(_NoPythonOp):
    """
-    __props__ = ("as_view", "gpu", "n_outs")
+    __props__ = ("as_view", "n_outs")
-    def __init__(self, n_outs, as_view=False, gpu=False, name=None):
+    def __init__(self, n_outs, as_view=False, name=None):
        if as_view:
            # check destroyhandler and others to ensure that a view_map with
            # multiple inputs can work
@@ -81,7 +81,6 @@ class IfElse(_NoPythonOp):
                view_map[idx] = [idx + 1]
            self.view_map = view_map
        self.as_view = as_view
-        self.gpu = gpu
        self.n_outs = n_outs
        self.name = name
@@ -90,14 +89,12 @@ class IfElse(_NoPythonOp):
            return False
        if self.as_view != other.as_view:
            return False
-        if self.gpu != other.gpu:
-            return False
        if self.n_outs != other.n_outs:
            return False
        return True
    def __hash__(self):
-        return hash((type(self), self.as_view, self.gpu, self.n_outs))
+        return hash((type(self), self.as_view, self.n_outs))
    def __str__(self):
        args = []
@@ -105,8 +102,6 @@ class IfElse(_NoPythonOp):
            args.append(self.name)
        if self.as_view:
            args.append("inplace")
-        if self.gpu:
-            args.append("gpu")
        return f"if{{{','.join(args)}}}"
    def infer_shape(self, fgraph, node, inputs_shapes):
@@ -143,7 +138,6 @@ class IfElse(_NoPythonOp):
            new_ifelse = IfElse(
                n_outs=len(new_ts_inputs),
                as_view=False,
-                gpu=False,
                name="_".join(name_tokens),
            )
            new_outs = new_ifelse(
@@ -172,16 +166,13 @@ class IfElse(_NoPythonOp):
                f"{int(2 * self.n_outs)}, got {len(args)}"
            )
        c = at.basic.as_tensor_variable(c)
-        if not self.gpu:
+        nw_args = []
-            # When gpu is true, we are given only gpuarrays, and we want
+        for x in args:
-            # to keep them as gpuarrays
+            if isinstance(x, Variable):
-            nw_args = []
+                nw_args.append(x)
-            for x in args:
+            else:
-                if isinstance(x, Variable):
+                nw_args.append(at.as_tensor_variable(x))
-                    nw_args.append(x)
+        args = nw_args
-                else:
-                    nw_args.append(at.as_tensor_variable(x))
-            args = nw_args
        aes = args[: self.n_outs]
        fs = args[self.n_outs :]
@@ -214,13 +205,9 @@ class IfElse(_NoPythonOp):
        else:
            nw_name_t = None
            nw_name_f = None
-        if_true_op = IfElse(
+        if_true_op = IfElse(n_outs=self.n_outs, as_view=self.as_view, name=nw_name_t)
-            n_outs=self.n_outs, as_view=self.as_view, gpu=self.gpu, name=nw_name_t
-        )
-        if_false_op = IfElse(
+        if_false_op = IfElse(n_outs=self.n_outs, as_view=self.as_view, name=nw_name_f)
-            n_outs=self.n_outs, as_view=self.as_view, gpu=self.gpu, name=nw_name_f
-        )
        # The grads can have a different dtype then the inputs.
        # As inputs true/false pair must have the same dtype,
@@ -384,7 +371,7 @@ def ifelse(
            f"{len(else_branch)})"
        )
-    new_ifelse = IfElse(n_outs=len(then_branch), as_view=False, gpu=False, name=name)
+    new_ifelse = IfElse(n_outs=len(then_branch), as_view=False, name=name)
    ins = [condition] + list(new_then_branch) + list(new_else_branch)
    rval = new_ifelse(*ins, return_list=True)
@@ -411,7 +398,7 @@ def cond_make_inplace(fgraph, node):
            or not all(getattr(o.type, "ndim", -1) == 0 for o in node.outputs)
        )
    ):
-        return IfElse(n_outs=op.n_outs, as_view=True, gpu=op.gpu, name=op.name)(
+        return IfElse(n_outs=op.n_outs, as_view=True, name=op.name)(
            *node.inputs, return_list=True
        )
    return False
@@ -611,7 +598,6 @@ class CondMerge(GlobalOptimizer):
                new_ifelse = IfElse(
                    n_outs=len(mn_ts + pl_ts),
                    as_view=False,
-                    gpu=False,
                    name=mn_name + "&" + pl_name,
                )
                new_outs = new_ifelse(*new_ins, return_list=True)
@@ -660,7 +646,7 @@ def cond_remove_identical(fgraph, node):
            nw_ts.append(aes[idx])
            nw_fs.append(fs[idx])
-    new_ifelse = IfElse(n_outs=len(nw_ts), as_view=op.as_view, gpu=op.gpu, name=op.name)
+    new_ifelse = IfElse(n_outs=len(nw_ts), as_view=op.as_view, name=op.name)
    new_ins = [node.inputs[0]] + nw_ts + nw_fs
    new_outs = new_ifelse(*new_ins, return_list=True)
@@ -712,7 +698,6 @@ def cond_merge_random_op(fgraph, main_node):
            new_ifelse = IfElse(
                n_outs=len(mn_ts + pl_ts),
                as_view=False,
-                gpu=False,
                name=mn_name + "&" + pl_name,
            )
            new_outs = new_ifelse(*new_ins, return_list=True)

--- a/aesara/link/c/cmodule.py
+++ b/aesara/link/c/cmodule.py
@@ -790,9 +790,6 @@ class ModuleCache:
            if subdirs_elem == "lock_dir":
                continue
            root = os.path.join(self.dirname, subdirs_elem)
-            # Don't delete the gpuarray kernel cache
-            if root == config.gpuarray__cache_path:
-                continue
            key_pkl = os.path.join(root, "key.pkl")
            if key_pkl in self.loaded_key_pkl:
                continue

--- a/aesara/link/c/interface.py
+++ b/aesara/link/c/interface.py
@@ -496,8 +496,6 @@ class CLinkerType(CLinkerObject):
        e.g:
         - For ``TensorType(dtype='int64', ...)``: should return ``"npy_int64"``.
-         - For ``GpuArrayType(dtype='int32', ...)``: should return ``"ga_int"``.
        """
        return ""

--- a/aesara/link/c/params_type.py
+++ b/aesara/link/c/params_type.py
@@ -7,7 +7,7 @@ used to create a Params object that is compatible with the ParamsType defined.
 The Params object will be available in both Python code (as a standard Python object) and C code
 (as a specific struct with parameters as struct fields). To be fully-available in C code, Aesara
-types wrapped into a ParamsType must provide a C interface (e.g. TensorType, ScalarType, GpuArrayType,
+types wrapped into a ParamsType must provide a C interface (e.g. TensorType, ScalarType,
 or your own type. See :ref:`extending_op_params` for more details).
 Example of usage
@@ -318,9 +318,8 @@ class Params(dict):
 class ParamsType(CType):
    """
-    This class can create a struct of Aesara types (like `TensorType`,
+    This class can create a struct of Aesara types (like `TensorType`, etc.)
-    `GpuArrayType`, etc.)  to be used as a convenience op parameter wrapping
+    to be used as a convenience `Op` parameter wrapping many data.
-    many data.
    `ParamsType` constructor takes key-value args.  Key will be the name of the
    attribute in the struct.  Value is the Aesara type of this attribute,

--- a/aesara/misc/burn_gpu.py
+++ b/aesara/misc/burn_gpu.py
-"""This script trigger convolution operation. We think it cause more
-GPU power consumption then gemm call.
-"""
-import numpy as np
-import aesara
-from aesara.configdefaults import config
-from aesara.gpuarray import dnn
-from aesara.tensor.nnet.abstract_conv import get_conv_output_shape
-from aesara.tensor.type import tensor4
-def burn():
-    sz = 128
-    img_shp = [sz, sz, sz, sz]
-    kern_shp = [sz // 2, sz, 3, 3]
-    out_shp = get_conv_output_shape(img_shp, kern_shp, "valid", (1, 1))
-    img = tensor4("img")
-    kern = tensor4("kern")
-    out = tensor4("out")
-    def rand(shp):
-        return np.random.rand(*shp).astype(config.floatX)
-    img = aesara.shared(rand(img_shp))
-    kern = aesara.shared(rand(kern_shp))
-    out = aesara.shared(rand(out_shp))
-    # beta 1 is needed to force the reuse of out, otherwise, it is
-    # replaced by a GpuAllocEmpty
-    o1 = dnn._dnn_conv(img, kern, conv_mode="conv", out=out, beta=1.0)
-    mode = aesara.compile.get_default_mode().including("local_remove_all_assert")
-    f = aesara.function([], [o1], mode=mode)
-    aesara.printing.debugprint(f)
-    print("Start computation")
-    for i in range(10000):
-        f.fn()
-    print("Computation stopped")
-if __name__ == "__main__":
-    burn()
--- a/aesara/misc/check_blas.py
+++ b/aesara/misc/check_blas.py
@@ -78,12 +78,8 @@ def execute(execute=True, verbose=True, M=2000, N=2000, K=2000, iters=10, order=
    f()  # Ignore first function call to get representative time.
    if execute:
-        try:
+        # Syncing was only needed for GPU shared variables, which were removed.
-            from aesara.gpuarray import GpuArraySharedVariable
+        sync = False
-            sync = isinstance(c, GpuArraySharedVariable)
-        except ImportError:
-            sync = False
        if sync:
            # Make sure we don't include the time from the first call

--- a/aesara/misc/check_multi_gpu.py
+++ b/aesara/misc/check_multi_gpu.py
-#! /usr/bin/env python
-"""
-This file compare the runtime of two independent dot products on one
-and two GPU to measure the speedup.
-This should be 2x if the GPUs are equivalent.
-"""
-import threading
-import time
-import numpy as np
-import aesara
-from aesara.gpuarray import init_dev
-from aesara.gpuarray.blas import gpu_dot22
-def main(dev1, dev2):
-    init_dev(dev1, "ctx1")
-    init_dev(dev2, "ctx2")
-    size = 1024 * 16
-    data = np.random.randn(size, size).astype("float32")
-    val1a = aesara.shared(data, target="ctx1")
-    val1b = aesara.shared(data, target="ctx1")
-    val1c = aesara.shared(data, target="ctx1")
-    val1d = aesara.shared(data, target="ctx1")
-    val2a = aesara.shared(data, target="ctx2")
-    val2b = aesara.shared(data, target="ctx2")
-    f1 = aesara.function([], [gpu_dot22(val1a, val1b), gpu_dot22(val1c, val1d)])
-    f2 = aesara.function([], [gpu_dot22(val1a, val1b), gpu_dot22(val2a, val2b)])
-    f3 = aesara.function([], [gpu_dot22(val1a, val1b)])
-    f4 = aesara.function([], [gpu_dot22(val2a, val2b)])
-    f5 = aesara.function([], [gpu_dot22(val1a, val1b)[0, 0].transfer("cpu")])
-    f6 = aesara.function([], [gpu_dot22(val2a, val2b)[0, 0].transfer("cpu")])
-    # pre-execute to load code to GPU.
-    r = f1.fn()
-    r[0].sync(), r[1].sync()
-    r = f2.fn()
-    r[0].sync(), r[1].sync()
-    r = f3.fn()
-    r[0].sync()
-    r = f4.fn()
-    r[0].sync()
-    r = f5.fn()
-    r = f6.fn()
-    r = None
-    t = time.time()
-    r = f1.fn()
-    r[0].sync(), r[1].sync()
-    t2 = time.time()
-    r = None
-    print(f"one ctx async {t2 - t:f}")
-    t = time.time()
-    r = f2.fn()
-    r[0].sync(), r[1].sync()
-    t2 = time.time()
-    r = None
-    print(f"two ctx async {t2 - t:f}")
-    t = time.time()
-    r = f3.fn()
-    r2 = f4.fn()
-    r[0].sync()
-    r2[0].sync()
-    t2 = time.time()
-    r = None
-    print(f"two ctx, 2 fct async {t2 - t:f}")
-    t = time.time()
-    r = f5.fn()
-    r2 = f6.fn()
-    t2 = time.time()
-    r = None
-    print(f"two ctx, 2 fct with transfer {t2 - t:f}")
-    # Multi-thread version
-    class myThread(threading.Thread):
-        def __init__(self, name, f, sync):
-            threading.Thread.__init__(self)
-            self.f = f
-            self.name = name
-            self.sync = sync
-        def run(self):
-            # print "Starting " + self.name
-            # r = self.f.fn(n_calls=10)
-            r = self.f()
-            # print "End " + self.name
-            if self.sync:
-                r[0].sync()
-            self.r = r
-            # print "Exiting " + self.name
-    thread1 = myThread("Thread-3", f3, True)
-    thread2 = myThread("Thread-4", f4, True)
-    t = time.time()
-    thread1.start()
-    thread2.start()
-    thread1.join()
-    thread2.join()
-    t2 = time.time()
-    print(f"two ctx, 2 fct async, 2 threads {t2 - t:f}")
-    thread1 = myThread("Thread-5", f5, False)
-    thread2 = myThread("Thread-6", f6, False)
-    t = time.time()
-    thread1.start()
-    thread2.start()
-    thread1.join()
-    thread2.join()
-    t2 = time.time()
-    print(f"two ctx, 2 fct with transfer, 2 threads {t2 - t:f}")
-if __name__ == "__main__":
-    import sys
-    if len(sys.argv) != 3:
-        raise ValueError("This script require two device names.")
-    main(sys.argv[1], sys.argv[2])
--- a/aesara/misc/may_share_memory.py
+++ b/aesara/misc/may_share_memory.py
 """
-Function to detect memory sharing for ndarray AND sparse type AND GpuArray.
+Function to detect memory sharing for ndarray AND sparse type.
 numpy version support only ndarray.
 """
@@ -18,48 +18,22 @@ try:
        return scipy.sparse.issparse(a)
 except ImportError:
-    # scipy not imported, their can be only ndarray and gpuarray
-    def _is_sparse(a):
-        return False
-from aesara import gpuarray
-if gpuarray.pygpu:
-    def _is_gpua(a):
+    def _is_sparse(a):
-        return isinstance(a, gpuarray.pygpu.gpuarray.GpuArray)
-else:
-    def _is_gpua(a):
        return False
-__docformat__ = "restructuredtext en"
 def may_share_memory(a, b, raise_other_type=True):
    a_ndarray = isinstance(a, np.ndarray)
    b_ndarray = isinstance(b, np.ndarray)
    if a_ndarray and b_ndarray:
        return TensorType.may_share_memory(a, b)
-    a_gpua = _is_gpua(a)
-    b_gpua = _is_gpua(b)
-    if a_gpua and b_gpua:
-        return gpuarray.pygpu.gpuarray.may_share_memory(a, b)
    a_sparse = _is_sparse(a)
    b_sparse = _is_sparse(b)
-    if not (a_ndarray or a_sparse or a_gpua) or not (b_ndarray or b_sparse or b_gpua):
+    if not (a_ndarray or a_sparse) or not (b_ndarray or b_sparse):
        if raise_other_type:
-            raise TypeError(
+            raise TypeError("may_share_memory support only ndarray" " and scipy.sparse")
-                "may_share_memory support only ndarray"
-                " and scipy.sparse or GpuArray type"
-            )
        return False
-    if a_gpua or b_gpua:
-        return False
    return SparseTensorType.may_share_memory(a, b)
--- a/aesara/misc/pkl_utils.py
+++ b/aesara/misc/pkl_utils.py
@@ -9,7 +9,6 @@ import os
 import pickle
 import sys
 import tempfile
-import warnings
 import zipfile
 from collections import defaultdict
 from contextlib import closing
@@ -27,7 +26,6 @@ except ImportError:
    DEFAULT_PROTOCOL = HIGHEST_PROTOCOL
 from aesara.compile.sharedvalue import SharedVariable
-from aesara.configdefaults import config
 __docformat__ = "restructuredtext en"
@@ -121,30 +119,7 @@ class PersistentNdarrayID:
            return self.seen[id(obj)]
-class PersistentGpuArrayID(PersistentNdarrayID):
+class PersistentSharedVariableID(PersistentNdarrayID):
-    def __call__(self, obj):
-        from aesara.gpuarray.type import _name_for_ctx
-        try:
-            import pygpu
-        except ImportError:
-            pygpu = None
-        if pygpu and isinstance(obj, pygpu.gpuarray.GpuArray):
-            if id(obj) not in self.seen:
-                def write_array(f):
-                    pickle.dump(_name_for_ctx(obj.context), f, 2)
-                    np.lib.format.write_array(f, np.asarray(obj))
-                name = self._resolve_name(obj)
-                zipadd(write_array, self.zip_file, name)
-                self.seen[id(obj)] = f"gpuarray.{name}"
-            return self.seen[id(obj)]
-        return super().__call__(obj)
-class PersistentSharedVariableID(PersistentGpuArrayID):
    """Uses shared variable names when persisting to zip file.
    If a shared variable has a name, this name is used as the name of the
@@ -213,32 +188,16 @@ class PersistentNdarrayLoad:
        self.cache = {}
    def __call__(self, persid):
-        from aesara.gpuarray import pygpu
-        from aesara.gpuarray.type import get_context
        array_type, name = persid.split(".")
+        del array_type
+        # `array_type` used to select between GPU and CPU arrays; only NumPy
+        # arrays remain, so it is ignored.  Dispatching on it would be better
+        # done in proper subclasses: more work, but cleaner logic.
        if name in self.cache:
            return self.cache[name]
        ret = None
-        if array_type == "gpuarray":
+        with self.zip_file.open(name) as f:
-            with self.zip_file.open(name) as f:
+            ret = np.lib.format.read_array(f)
-                ctx_name = pickle.load(f)
-                array = np.lib.format.read_array(f)
-            if config.experimental__unpickle_gpu_on_cpu:
-                # directly return numpy array
-                warnings.warn(
-                    "config.experimental__unpickle_gpu_on_cpu is set "
-                    "to True. Unpickling GpuArray as numpy.ndarray"
-                )
-                ret = array
-            elif pygpu:
-                ret = pygpu.array(array, context=get_context(ctx_name))
-            else:
-                raise ImportError("pygpu not found. Cannot unpickle GpuArray")
-        else:
-            with self.zip_file.open(name) as f:
-                ret = np.lib.format.read_array(f)
        self.cache[name] = ret
        return ret

--- a/aesara/scan/basic.py
+++ b/aesara/scan/basic.py
@@ -12,7 +12,7 @@ from aesara.graph.op import get_test_value
 from aesara.graph.utils import MissingInputError, TestValueError
 from aesara.scan import utils
 from aesara.scan.op import Scan, ScanInfo
-from aesara.scan.utils import safe_new, traverse
+from aesara.scan.utils import safe_new
 from aesara.tensor.exceptions import NotScalarConstantError
 from aesara.tensor.math import minimum
 from aesara.tensor.shape import shape_padleft
@@ -968,29 +968,8 @@ def scan(
    )
    if condition is not None:
        inner_outs.append(condition)
-    # gpuarray is imported here, instead of being imported on top of
+    # NOTE: the removed GPU backend rewrote `givens` here; they are now used as-is
-    # the file because that would force on the user some dependencies that we
+    new_givens = givens
-    # might do not want to. Currently we are working on removing the
-    # dependencies on sandbox code completely.
-    from aesara import gpuarray
-    if gpuarray.pygpu_activated:
-        # very often we end up in this situation when we want to
-        # replace w with w_copy, where w is a GPU variable
-        # and w_copy is TensorType. This is caused because shared
-        # variables are put on GPU right away >:| ,
-        new_givens = OrderedDict()
-        for w, w_copy in givens.items():
-            if isinstance(w.type, gpuarray.GpuArrayType) and isinstance(
-                w_copy.type, TensorType
-            ):
-                for o in inner_outs:
-                    new_givens = traverse(o, w, w_copy, new_givens)
-            else:
-                new_givens[w] = w_copy
-    else:
-        new_givens = givens
    new_outs = clone_replace(inner_outs, replace=new_givens)
@@ -1023,7 +1002,6 @@ def scan(
        mode=mode,
        truncate_gradient=truncate_gradient,
        name=name,
-        gpua=False,
        as_while=as_while,
        profile=profile,
        allow_gc=allow_gc,

--- a/aesara/scan/c_code/scan_perform.c
+++ b/aesara/scan/c_code/scan_perform.c
--- a/aesara/scan/op.py
+++ b/aesara/scan/op.py
--- a/aesara/scan/opt.py
+++ b/aesara/scan/opt.py
--- a/aesara/scan/scan_perform.pyx
+++ b/aesara/scan/scan_perform.pyx
--- a/aesara/scan/utils.py
+++ b/aesara/scan/utils.py
--- a/aesara/tensor/blas.py
+++ b/aesara/tensor/blas.py
--- a/aesara/tensor/subtensor.py
+++ b/aesara/tensor/subtensor.py
--- a/environment.yml
+++ b/environment.yml
--- a/setup.cfg
+++ b/setup.cfg
--- a/setup.py
+++ b/setup.py
--- a/tests/compile/function/test_types.py
+++ b/tests/compile/function/test_types.py
--- a/tests/tensor/test_sharedvar.py
+++ b/tests/tensor/test_sharedvar.py
--- a/tests/test_ifelse.py
+++ b/tests/test_ifelse.py