提交 0e3182d1 authored 作者: Maxim Kochurov's avatar Maxim Kochurov 提交者: Brandon T. Willard

Remove gpuarray dependencies throughout the codebase

上级 2a5fc594
......@@ -17,7 +17,6 @@ repos:
aesara/compile/nanguardmode\.py|
aesara/graph/opt\.py|
aesara/tensor/var\.py|
aesara/gpuarray/opt\.py
)$
- id: check-merge-conflict
- repo: https://github.com/psf/black
......
Aesara is a Python library that allows you to define, optimize, and efficiently evaluate mathematical expressions involving multi-dimensional arrays. It is built on top of NumPy_. Aesara features:
* **tight integration with NumPy:** a similar interface to NumPy's. numpy.ndarrays are also used internally in Aesara-compiled functions.
* **transparent use of a GPU:** perform data-intensive computations up to 140x faster than on a CPU (support for float32 only).
* **efficient symbolic differentiation:** Aesara can compute derivatives for functions of one or many inputs.
* **speed and stability optimizations:** avoid nasty bugs when computing expressions such as log(1 + exp(x)) for large values of x.
* **dynamic C code generation:** evaluate expressions faster.
......
......@@ -144,16 +144,6 @@ from aesara.updates import OrderedUpdates
# isort: on
if (
config.device.startswith("cuda")
or config.device.startswith("opencl")
or config.init_gpu_device.startswith("cuda")
or config.init_gpu_device.startswith("opencl")
or config.contexts != ""
):
import aesara.gpuarray
def get_scalar_constant_value(v):
"""Return the constant scalar (i.e. 0-D) value underlying variable `v`.
......
......@@ -752,16 +752,6 @@ def _get_preallocated_maps(
Preallocate outputs in different memory layouts.
"""
# To avoid circular imports
from aesara.gpuarray import GpuArrayType
from aesara.tensor.type import TensorType
try:
import pygpu
except ImportError:
pass
# TODO: Sparse? Scalar does not really make sense.
# Do not preallocate memory for outputs that actually work inplace
......@@ -795,11 +785,12 @@ def _get_preallocated_maps(
# I'm not sure why it is legitimate, but there are tests about it.
# So, we cannot fill r_vals[r] with def_val yet, we have to wait
# until all output values are deepcopied.
from aesara.tensor import TensorType
for r in considered_outputs:
# There is no risk to overwrite inputs, since r does not work
# inplace.
if isinstance(r.type, (TensorType, GpuArrayType)):
if isinstance(r.type, TensorType):
reuse_outputs[r][...] = np.asarray(def_val).astype(r.type.dtype)
if reuse_outputs:
......@@ -812,7 +803,7 @@ def _get_preallocated_maps(
if "c_contiguous" in prealloc_modes or "ALL" in prealloc_modes:
c_cont_outputs = {}
for r in considered_outputs:
if isinstance(r.type, (TensorType, GpuArrayType)):
if isinstance(r.type, TensorType):
# Build a C-contiguous buffer
new_buf = r.type.value_zeros(r_vals[r].shape)
assert new_buf.flags["C_CONTIGUOUS"]
......@@ -829,13 +820,11 @@ def _get_preallocated_maps(
if "f_contiguous" in prealloc_modes or "ALL" in prealloc_modes:
f_cont_outputs = {}
for r in considered_outputs:
if isinstance(r.type, (TensorType, GpuArrayType)):
if isinstance(r.type, TensorType):
new_buf = np.zeros(
shape=r_vals[r].shape, dtype=r_vals[r].dtype, order="F"
)
new_buf[...] = def_val
if isinstance(r.type, GpuArrayType):
new_buf = pygpu.array(new_buf)
f_cont_outputs[r] = new_buf
......@@ -859,7 +848,7 @@ def _get_preallocated_maps(
max_ndim = 0
rev_out_broadcastable = []
for r in considered_outputs:
if isinstance(r.type, (TensorType, GpuArrayType)):
if isinstance(r.type, TensorType):
if max_ndim < r.ndim:
rev_out_broadcastable += [True] * (r.ndim - max_ndim)
max_ndim = r.ndim
......@@ -874,7 +863,7 @@ def _get_preallocated_maps(
# Initial allocation
init_strided = {}
for r in considered_outputs:
if isinstance(r.type, (TensorType, GpuArrayType)):
if isinstance(r.type, TensorType):
# Create a buffer twice as large in every dimension,
# except if broadcastable, or for dimensions above
# config.DebugMode__check_preallocated_output_ndim
......@@ -953,7 +942,7 @@ def _get_preallocated_maps(
name = f"wrong_size{tuple(shape_diff)}"
for r in considered_outputs:
if isinstance(r.type, (TensorType, GpuArrayType)):
if isinstance(r.type, TensorType):
r_shape_diff = shape_diff[: r.ndim]
out_shape = [
max((s + sd), 0)
......
......@@ -1097,13 +1097,8 @@ class Function:
return [i.variable for i in self.maker.inputs if i.implicit]
def sync_shared(self):
if hasattr(aesara, "gpuarray") and aesara.gpuarray.pygpu_activated:
import pygpu
for i in self.maker.fgraph.update_mapping.values():
inp = self.input_storage[i]
if isinstance(inp.data, pygpu.gpuarray.GpuArray):
inp.data.sync()
# sync was needed on old gpu backend
pass
# pickling/deepcopy support for Function
......
......@@ -5,24 +5,11 @@ from io import StringIO
import numpy as np
import aesara
from aesara.compile.mode import Mode, get_mode
from aesara.compile.mode import Mode
from aesara.configdefaults import config
from aesara.tensor.math import abs as at_abs
from aesara.tensor.math import max as at_max
from aesara.tensor.math import min as at_min
from aesara.tensor.type import discrete_dtypes
try:
from pygpu.gpuarray import GpuArray
from aesara.gpuarray.type import GpuArrayType, _name_for_ctx
pygpu_available = True
except ImportError:
pygpu_available = False
logger = logging.getLogger("aesara.compile.nanguardmode")
......@@ -114,9 +101,6 @@ def contains_nan(arr, node=None, var=None):
return False
elif getattr(arr, "dtype", "") in discrete_dtypes:
return False
elif pygpu_available and isinstance(arr, GpuArray):
return np.isnan(f_gpua_min(arr.reshape(arr.size)))
return np.isnan(np.min(arr))
......@@ -149,36 +133,9 @@ def contains_inf(arr, node=None, var=None):
return False
elif getattr(arr, "dtype", "") in discrete_dtypes:
return False
elif pygpu_available and isinstance(arr, GpuArray):
return np.isinf(f_gpua_min(arr.reshape(arr.size))) or np.isinf(
f_gpua_max(arr.reshape(arr.size))
)
return np.isinf(np.nanmax(arr)) or np.isinf(np.nanmin(arr))
def f_compute(op):
def result(inp):
dtype = inp.dtype
ctx_name = _name_for_ctx(inp.context)
key = (dtype, ctx_name)
f = result.cache.get(key, None)
if f is None:
guard_in = GpuArrayType(str(dtype), (False,), context_name=ctx_name)()
mode = get_mode("FAST_RUN").including("gpuarray")
f = aesara.function([guard_in], op(guard_in), mode=mode, profile=False)
result.cache[key] = f
return f(inp)
result.cache = dict()
return result
f_gpua_min = f_compute(at_min)
f_gpua_max = f_compute(at_max)
f_gpua_absmax = f_compute(lambda x: at_max(at_abs(x)))
class NanGuardMode(Mode):
"""
An Aesara compilation Mode that makes the compiled function automatically
......@@ -252,8 +209,6 @@ class NanGuardMode(Mode):
err = False
if not _is_numeric_value(value, var):
err = False
elif pygpu_available and isinstance(value, GpuArray):
err = f_gpua_absmax(value.reshape(value.size)) > 1e10
else:
err = np.abs(value).max() > 1e10
if err:
......
......@@ -12,10 +12,8 @@ import atexit
import copy
import logging
import operator
import os
import sys
import time
import warnings
from collections import defaultdict
from typing import Dict, List
......@@ -279,40 +277,7 @@ class ProfileStats:
# param is called flag_time_thunks because most other attributes with time
# in the name are times *of* something, rather than configuration flags.
def __init__(
self, atexit_print=True, flag_time_thunks=None, gpu_checks=True, **kwargs
):
if (
gpu_checks
and (hasattr(aesara, "gpuarray") and aesara.gpuarray.pygpu_activated)
and os.environ.get("CUDA_LAUNCH_BLOCKING", "0") != "1"
):
msg = (
"You are running the Aesara profiler with CUDA enabled."
" Aesara GPU ops execution is asynchronous by default."
" So by default, the profile is useless."
" You must set the environment variable"
" CUDA_LAUNCH_BLOCKING to 1 to tell the CUDA driver to"
" synchronize the execution to get a meaningful profile."
)
if config.profile:
raise Exception(msg)
else:
warnings.warn(msg)
if (
config.profile
and gpu_checks
and hasattr(aesara, "gpuarray")
and aesara.gpuarray.pygpu_activated
and not config.profiling__ignore_first_call
):
warnings.warn(
"Aesara flag profiling__ignore_first_call is False. "
"This cause bad profiling result in the gpu "
"back-end, as sometimes we compile at the first call."
)
def __init__(self, atexit_print=True, flag_time_thunks=None, **kwargs):
self.apply_callcount = {}
self.output_size = {}
# Keys are `(FunctionGraph, Variable)`
......@@ -543,8 +508,8 @@ class ProfileStats:
tot += t
ftot = tot * 100 / local_time
# Remove the useless start and end of the class name:
# "<class 'aesara.gpuarray.blas.GpuDot22'>" ->
# "aesara.gpuarray.blas.GpuDot22"
# "<class 'aesara.backend.blas.GpuDot22'>" ->
# "aesara.backend.blas.GpuDot22"
class_name = str(a)[8:-2][:maxlen]
print(
format_str
......@@ -922,8 +887,6 @@ class ProfileStats:
new allocation.
"""
from aesara.gpuarray import GpuArrayType
# Initial Mem info values [CPU, GPU]
node_memory_size = [0, 0]
running_memory_size = [0, 0]
......@@ -973,10 +936,8 @@ class ProfileStats:
# allocated by the node
idx2 = 0
for out in node.outputs:
if isinstance(out.type, GpuArrayType):
cg = 1
else:
cg = 0
# NOTE: cg=1 was used for GPU
cg = 0
ins = None
if dmap and idx2 in dmap:
vidx = dmap[idx2]
......@@ -1021,10 +982,8 @@ class ProfileStats:
for ins in set(node.inputs):
assert not (ins in view_of and viewed_by[ins])
# we track the original variable, so this shouldn't happen
if isinstance(ins.type, GpuArrayType):
cg = 1
else:
cg = 0
# NOTE: cg=1 was used for GPU
cg = 0
if (
dependencies[ins]
and ins not in fgraph.outputs
......@@ -1687,27 +1646,7 @@ class ProfileStats:
)
printed_tip = True
# tip 7
import aesara.gpuarray
import aesara.tensor.signal.pool as pool
from aesara.tensor.nnet.basic import LogSoftmax
for (fgraph, a) in self.apply_time:
node = a
if isinstance(node.op, pool.Pool):
if not aesara.gpuarray.dnn.dnn_present():
print(
"Install CuDNN to do pooling faster"
"this allows the operation to run on GPU"
)
printed_tip = True
if isinstance(node.op, LogSoftmax):
if not aesara.gpuarray.dnn.dnn_present():
print(
"Install CuDNN to do LogSoftmax faster"
"this allows the operation to run on GPU"
)
printed_tip = True
# tip 7 was about pool and log softmax on gpu using cudnn
if not printed_tip:
print(" Sorry, no tip for today.", file=file)
......
......@@ -292,9 +292,7 @@ def add_basic_configvars():
config.add(
"warn_float64",
"Do an action when a tensor variable with float64 dtype is"
" created. They can't be run on the GPU with the current(old)"
" gpu back-end and are slow with gamer GPUs.",
"Do an action when a tensor variable with float64 dtype is created.",
EnumStr("ignore", ["warn", "raise", "pdb"]),
in_c_key=False,
)
......@@ -326,10 +324,7 @@ def add_basic_configvars():
config.add(
"deterministic",
"If `more`, sometimes we will select some implementation that "
"are more deterministic, but slower. In particular, on the GPU, "
"we will avoid using AtomicAdd. Sometimes we will still use "
"non-deterministic implementation, e.g. when we do not have a GPU "
"implementation that is deterministic. Also see "
"are more deterministic, but slower. Also see "
"the dnn.conv.algo* flags to cover more cases.",
EnumStr("default", ["more"]),
in_c_key=False,
......@@ -405,56 +400,56 @@ def add_basic_configvars():
in_c_key=False,
)
config.add(
"gpuarray__preallocate",
"""If negative it disables the allocation cache. If
between 0 and 1 it enables the allocation cache and
preallocates that fraction of the total GPU memory. If 1
or greater it will preallocate that amount of memory (in
megabytes).""",
FloatParam(0, mutable=False),
in_c_key=False,
)
config.add(
"gpuarray__sched",
"""The sched parameter passed for context creation to pygpu.
With CUDA, using "multi" is equivalent to using the parameter
cudaDeviceScheduleBlockingSync. This is useful to lower the
CPU overhead when waiting for GPU. One user found that it
speeds up his other processes that was doing data augmentation.
""",
EnumStr("default", ["multi", "single"]),
)
config.add(
"gpuarray__single_stream",
"""
If your computations are mostly lots of small elements,
using single-stream will avoid the synchronization
overhead and usually be faster. For larger elements it
does not make a difference yet. In the future when true
multi-stream is enabled in libgpuarray, this may change.
If you want to make sure to have optimal performance,
check both options.
""",
BoolParam(True),
in_c_key=False,
)
config.add(
"cuda__root",
"Location of the cuda installation",
StrParam(get_cuda_root),
in_c_key=False,
)
config.add(
"cuda__include_path",
"Location of the cuda includes",
StrParam(default_cuda_include),
in_c_key=False,
)
# config.add(
# "gpuarray__preallocate",
# """If negative it disables the allocation cache. If
# between 0 and 1 it enables the allocation cache and
# preallocates that fraction of the total GPU memory. If 1
# or greater it will preallocate that amount of memory (in
# megabytes).""",
# FloatParam(0, mutable=False),
# in_c_key=False,
# )
# config.add(
# "gpuarray__sched",
# """The sched parameter passed for context creation to pygpu.
# With CUDA, using "multi" is equivalent to using the parameter
# cudaDeviceScheduleBlockingSync. This is useful to lower the
# CPU overhead when waiting for GPU. One user found that it
# speeds up his other processes that was doing data augmentation.
# """,
# EnumStr("default", ["multi", "single"]),
# )
# config.add(
# "gpuarray__single_stream",
# """
# If your computations are mostly lots of small elements,
# using single-stream will avoid the synchronization
# overhead and usually be faster. For larger elements it
# does not make a difference yet. In the future when true
# multi-stream is enabled in libgpuarray, this may change.
# If you want to make sure to have optimal performance,
# check both options.
# """,
# BoolParam(True),
# in_c_key=False,
# )
# config.add(
# "cuda__root",
# "Location of the cuda installation",
# StrParam(get_cuda_root),
# in_c_key=False,
# )
# config.add(
# "cuda__include_path",
# "Location of the cuda includes",
# StrParam(default_cuda_include),
# in_c_key=False,
# )
# This flag determines whether or not to raise error/warning message if
# there is a CPU Op in the computational graph.
......@@ -483,103 +478,103 @@ def add_basic_configvars():
)
def add_dnn_configvars():
config.add(
"dnn__conv__algo_fwd",
"Default implementation to use for cuDNN forward convolution.",
EnumStr("small", SUPPORTED_DNN_CONV_ALGO_FWD),
in_c_key=False,
)
config.add(
"dnn__conv__algo_bwd_data",
"Default implementation to use for cuDNN backward convolution to "
"get the gradients of the convolution with regard to the inputs.",
EnumStr("none", SUPPORTED_DNN_CONV_ALGO_BWD_DATA),
in_c_key=False,
)
config.add(
"dnn__conv__algo_bwd_filter",
"Default implementation to use for cuDNN backward convolution to "
"get the gradients of the convolution with regard to the "
"filters.",
EnumStr("none", SUPPORTED_DNN_CONV_ALGO_BWD_FILTER),
in_c_key=False,
)
config.add(
"dnn__conv__precision",
"Default data precision to use for the computation in cuDNN "
"convolutions (defaults to the same dtype as the inputs of the "
"convolutions, or float32 if inputs are float16).",
EnumStr("as_input_f32", SUPPORTED_DNN_CONV_PRECISION),
in_c_key=False,
)
config.add(
"dnn__base_path",
"Install location of cuDNN.",
StrParam(default_dnn_base_path),
in_c_key=False,
)
config.add(
"dnn__include_path",
"Location of the cudnn header",
StrParam(default_dnn_inc_path),
in_c_key=False,
)
config.add(
"dnn__library_path",
"Location of the cudnn link library.",
StrParam(default_dnn_lib_path),
in_c_key=False,
)
config.add(
"dnn__bin_path",
"Location of the cuDNN load library "
"(on non-windows platforms, "
"this is the same as dnn__library_path)",
StrParam(default_dnn_bin_path),
in_c_key=False,
)
config.add(
"dnn__enabled",
"'auto', use cuDNN if available, but silently fall back"
" to not using it if not present."
" If True and cuDNN can not be used, raise an error."
" If False, disable cudnn even if present."
" If no_check, assume present and the version between header and library match (so less compilation at context init)",
EnumStr("auto", ["True", "False", "no_check"]),
in_c_key=False,
)
def add_magma_configvars():
config.add(
"magma__include_path",
"Location of the magma header",
StrParam(""),
in_c_key=False,
)
config.add(
"magma__library_path",
"Location of the magma library",
StrParam(""),
in_c_key=False,
)
config.add(
"magma__enabled",
" If True, use magma for matrix computation." " If False, disable magma",
BoolParam(False),
in_c_key=False,
)
# def add_dnn_configvars():
# config.add(
# "dnn__conv__algo_fwd",
# "Default implementation to use for cuDNN forward convolution.",
# EnumStr("small", SUPPORTED_DNN_CONV_ALGO_FWD),
# in_c_key=False,
# )
# config.add(
# "dnn__conv__algo_bwd_data",
# "Default implementation to use for cuDNN backward convolution to "
# "get the gradients of the convolution with regard to the inputs.",
# EnumStr("none", SUPPORTED_DNN_CONV_ALGO_BWD_DATA),
# in_c_key=False,
# )
# config.add(
# "dnn__conv__algo_bwd_filter",
# "Default implementation to use for cuDNN backward convolution to "
# "get the gradients of the convolution with regard to the "
# "filters.",
# EnumStr("none", SUPPORTED_DNN_CONV_ALGO_BWD_FILTER),
# in_c_key=False,
# )
# config.add(
# "dnn__conv__precision",
# "Default data precision to use for the computation in cuDNN "
# "convolutions (defaults to the same dtype as the inputs of the "
# "convolutions, or float32 if inputs are float16).",
# EnumStr("as_input_f32", SUPPORTED_DNN_CONV_PRECISION),
# in_c_key=False,
# )
# config.add(
# "dnn__base_path",
# "Install location of cuDNN.",
# StrParam(default_dnn_base_path),
# in_c_key=False,
# )
# config.add(
# "dnn__include_path",
# "Location of the cudnn header",
# StrParam(default_dnn_inc_path),
# in_c_key=False,
# )
# config.add(
# "dnn__library_path",
# "Location of the cudnn link library.",
# StrParam(default_dnn_lib_path),
# in_c_key=False,
# )
# config.add(
# "dnn__bin_path",
# "Location of the cuDNN load library "
# "(on non-windows platforms, "
# "this is the same as dnn__library_path)",
# StrParam(default_dnn_bin_path),
# in_c_key=False,
# )
# config.add(
# "dnn__enabled",
# "'auto', use cuDNN if available, but silently fall back"
# " to not using it if not present."
# " If True and cuDNN can not be used, raise an error."
# " If False, disable cudnn even if present."
# " If no_check, assume present and the version between header and library match (so less compilation at context init)",
# EnumStr("auto", ["True", "False", "no_check"]),
# in_c_key=False,
# )
# def add_magma_configvars():
# config.add(
# "magma__include_path",
# "Location of the magma header",
# StrParam(""),
# in_c_key=False,
# )
# config.add(
# "magma__library_path",
# "Location of the magma library",
# StrParam(""),
# in_c_key=False,
# )
# config.add(
# "magma__enabled",
# " If True, use magma for matrix computation." " If False, disable magma",
# BoolParam(False),
# in_c_key=False,
# )
def _is_gt_0(x):
......@@ -682,11 +677,10 @@ def add_compile_configvars():
if type(config).cxx.is_default:
# If the user provided an empty value for cxx, do not warn.
_logger.warning(
"g++ not detected ! Aesara will be unable to execute "
"optimized C-implementations (for both CPU and GPU) and will "
"default to Python implementations. Performance will be severely "
"degraded. To remove this warning, set Aesara flags cxx to an "
"empty string."
"g++ not detected! Aesara will be unable to compile "
"C-implementations and will default to Python. "
"Performance may be severely degraded. "
"To remove this warning, set Aesara flags cxx to an empty string."
)
# Keep the default value the same as the one for the mode FAST_RUN
......@@ -899,20 +893,20 @@ def add_traceback_configvars():
def add_experimental_configvars():
config.add(
"experimental__unpickle_gpu_on_cpu",
"Allow unpickling of pickled GpuArrays as numpy.ndarrays."
"This is useful, if you want to open a GpuArray without "
"having cuda installed."
"If you have cuda installed, this will force unpickling to"
"be done on the cpu to numpy.ndarray."
"Please be aware that this may get you access to the data,"
"however, trying to unpicke gpu functions will not succeed."
"This flag is experimental and may be removed any time, when"
"gpu<>cpu transparency is solved.",
BoolParam(default=False),
in_c_key=False,
)
# config.add(
# "experimental__unpickle_gpu_on_cpu",
# "Allow unpickling of pickled GpuArrays as numpy.ndarrays."
# "This is useful, if you want to open a GpuArray without "
# "having cuda installed."
# "If you have cuda installed, this will force unpickling to"
# "be done on the cpu to numpy.ndarray."
# "Please be aware that this may get you access to the data,"
# "however, trying to unpicke gpu functions will not succeed."
# "This flag is experimental and may be removed any time, when"
# "gpu<>cpu transparency is solved.",
# BoolParam(default=False),
# in_c_key=False,
# )
config.add(
"experimental__local_alloc_elemwise",
......@@ -1473,10 +1467,6 @@ def add_numba_configvars():
)
def _get_default_gpuarray__cache_path():
return os.path.join(config.compiledir, "gpuarray_kernels")
def _default_compiledirname():
formatted = config.compiledir_format % _compiledir_format_dict
safe = re.sub(r"[\(\)\s,]+", "_", formatted)
......@@ -1618,16 +1608,16 @@ def add_caching_dir_configvars():
in_c_key=False,
)
config.add(
"gpuarray__cache_path",
"Directory to cache pre-compiled kernels for the gpuarray backend.",
ConfigParam(
_get_default_gpuarray__cache_path,
apply=_filter_base_compiledir,
mutable=False,
),
in_c_key=False,
)
# config.add(
# "gpuarray__cache_path",
# "Directory to cache pre-compiled kernels for the gpuarray backend.",
# ConfigParam(
# _get_default_gpuarray__cache_path,
# apply=_filter_base_compiledir,
# mutable=False,
# ),
# in_c_key=False,
# )
# Those are the options provided by Aesara to choose algorithms at runtime.
......@@ -1686,10 +1676,9 @@ config = aesara.configparser._config
# The functions below register config variables into the config instance above.
add_basic_configvars()
add_dnn_configvars()
add_magma_configvars()
# add_dnn_configvars()
# add_magma_configvars()
add_compile_configvars()
# TODO: "tensor", "gpuarray" and compilation options are closely related.. Grouping is not great.
add_tensor_configvars()
add_traceback_configvars()
add_experimental_configvars()
......
......@@ -456,15 +456,13 @@ class DeviceParam(ConfigParam):
)
def _apply(self, val):
if val == self.default or val.startswith("opencl") or val.startswith("cuda"):
return val
elif val.startswith("gpu"):
if val.startswith("opencl") or val.startswith("cuda") or val.startswith("gpu"):
raise ValueError(
"You are trying to use the old GPU back-end. "
"It was removed from Aesara. Use device=cuda* now. "
"See https://github.com/aesara-devs/aesara/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29 "
"for more information."
"It was removed from Aesara."
)
elif val == self.default:
return val
else:
raise ValueError(
'Invalid value ("{val}") for configuration '
......
......@@ -229,8 +229,8 @@ class Apply(Node):
List of `Variable` instances to use as inputs.
strict : bool
If ``True``, the type fields of all the inputs must be equal
to the current ones (or compatible, for instance `Tensor` /
`GpuArray` of the same dtype and broadcastable patterns,
to the current ones (or compatible, for instance `TensorType`
of the same dtype and broadcastable patterns,
in which case they will be converted into current `Type`), and
returned outputs are guaranteed to have the same types as
``self.outputs``. If ``False``, then there's no guarantee that the
......@@ -328,9 +328,6 @@ class Variable(Node):
- `SparseVariable`: a subclass of `Variable` that represents
a ``scipy.sparse.{csc,csr}_matrix`` object.
- `GpuArrayVariable`: a subclass of `Variable` that represents our object on
the GPU that is a subset of ``numpy.ndarray``.
- `RandomVariable`.
A `Variable` which is the output of a symbolic computation will have an owner
......
......@@ -70,9 +70,9 @@ class IfElse(_NoPythonOp):
"""
__props__ = ("as_view", "gpu", "n_outs")
__props__ = ("as_view", "n_outs")
def __init__(self, n_outs, as_view=False, gpu=False, name=None):
def __init__(self, n_outs, as_view=False, name=None):
if as_view:
# check destroyhandler and others to ensure that a view_map with
# multiple inputs can work
......@@ -81,7 +81,6 @@ class IfElse(_NoPythonOp):
view_map[idx] = [idx + 1]
self.view_map = view_map
self.as_view = as_view
self.gpu = gpu
self.n_outs = n_outs
self.name = name
......@@ -90,14 +89,12 @@ class IfElse(_NoPythonOp):
return False
if self.as_view != other.as_view:
return False
if self.gpu != other.gpu:
return False
if self.n_outs != other.n_outs:
return False
return True
def __hash__(self):
return hash((type(self), self.as_view, self.gpu, self.n_outs))
return hash((type(self), self.as_view, self.n_outs))
def __str__(self):
args = []
......@@ -105,8 +102,6 @@ class IfElse(_NoPythonOp):
args.append(self.name)
if self.as_view:
args.append("inplace")
if self.gpu:
args.append("gpu")
return f"if{{{','.join(args)}}}"
def infer_shape(self, fgraph, node, inputs_shapes):
......@@ -143,7 +138,6 @@ class IfElse(_NoPythonOp):
new_ifelse = IfElse(
n_outs=len(new_ts_inputs),
as_view=False,
gpu=False,
name="_".join(name_tokens),
)
new_outs = new_ifelse(
......@@ -172,16 +166,13 @@ class IfElse(_NoPythonOp):
f"{int(2 * self.n_outs)}, got {len(args)}"
)
c = at.basic.as_tensor_variable(c)
if not self.gpu:
# When gpu is true, we are given only gpuarrays, and we want
# to keep them as gpuarrays
nw_args = []
for x in args:
if isinstance(x, Variable):
nw_args.append(x)
else:
nw_args.append(at.as_tensor_variable(x))
args = nw_args
nw_args = []
for x in args:
if isinstance(x, Variable):
nw_args.append(x)
else:
nw_args.append(at.as_tensor_variable(x))
args = nw_args
aes = args[: self.n_outs]
fs = args[self.n_outs :]
......@@ -214,13 +205,9 @@ class IfElse(_NoPythonOp):
else:
nw_name_t = None
nw_name_f = None
if_true_op = IfElse(
n_outs=self.n_outs, as_view=self.as_view, gpu=self.gpu, name=nw_name_t
)
if_true_op = IfElse(n_outs=self.n_outs, as_view=self.as_view, name=nw_name_t)
if_false_op = IfElse(
n_outs=self.n_outs, as_view=self.as_view, gpu=self.gpu, name=nw_name_f
)
if_false_op = IfElse(n_outs=self.n_outs, as_view=self.as_view, name=nw_name_f)
# The grads can have a different dtype then the inputs.
# As inputs true/false pair must have the same dtype,
......@@ -384,7 +371,7 @@ def ifelse(
f"{len(else_branch)})"
)
new_ifelse = IfElse(n_outs=len(then_branch), as_view=False, gpu=False, name=name)
new_ifelse = IfElse(n_outs=len(then_branch), as_view=False, name=name)
ins = [condition] + list(new_then_branch) + list(new_else_branch)
rval = new_ifelse(*ins, return_list=True)
......@@ -411,7 +398,7 @@ def cond_make_inplace(fgraph, node):
or not all(getattr(o.type, "ndim", -1) == 0 for o in node.outputs)
)
):
return IfElse(n_outs=op.n_outs, as_view=True, gpu=op.gpu, name=op.name)(
return IfElse(n_outs=op.n_outs, as_view=True, name=op.name)(
*node.inputs, return_list=True
)
return False
......@@ -611,7 +598,6 @@ class CondMerge(GlobalOptimizer):
new_ifelse = IfElse(
n_outs=len(mn_ts + pl_ts),
as_view=False,
gpu=False,
name=mn_name + "&" + pl_name,
)
new_outs = new_ifelse(*new_ins, return_list=True)
......@@ -660,7 +646,7 @@ def cond_remove_identical(fgraph, node):
nw_ts.append(aes[idx])
nw_fs.append(fs[idx])
new_ifelse = IfElse(n_outs=len(nw_ts), as_view=op.as_view, gpu=op.gpu, name=op.name)
new_ifelse = IfElse(n_outs=len(nw_ts), as_view=op.as_view, name=op.name)
new_ins = [node.inputs[0]] + nw_ts + nw_fs
new_outs = new_ifelse(*new_ins, return_list=True)
......@@ -712,7 +698,6 @@ def cond_merge_random_op(fgraph, main_node):
new_ifelse = IfElse(
n_outs=len(mn_ts + pl_ts),
as_view=False,
gpu=False,
name=mn_name + "&" + pl_name,
)
new_outs = new_ifelse(*new_ins, return_list=True)
......
......@@ -790,9 +790,6 @@ class ModuleCache:
if subdirs_elem == "lock_dir":
continue
root = os.path.join(self.dirname, subdirs_elem)
# Don't delete the gpuarray kernel cache
if root == config.gpuarray__cache_path:
continue
key_pkl = os.path.join(root, "key.pkl")
if key_pkl in self.loaded_key_pkl:
continue
......
......@@ -496,8 +496,6 @@ class CLinkerType(CLinkerObject):
e.g:
- For ``TensorType(dtype='int64', ...)``: should return ``"npy_int64"``.
- For ``GpuArrayType(dtype='int32', ...)``: should return ``"ga_int"``.
"""
return ""
......
......@@ -7,7 +7,7 @@ used to create a Params object that is compatible with the ParamsType defined.
The Params object will be available in both Python code (as a standard Python object) and C code
(as a specific struct with parameters as struct fields). To be fully-available in C code, Aesara
types wrapped into a ParamsType must provide a C interface (e.g. TensorType, ScalarType, GpuArrayType,
types wrapped into a ParamsType must provide a C interface (e.g. TensorType, ScalarType,
or your own type. See :ref:`extending_op_params` for more details).
Example of usage
......@@ -318,9 +318,8 @@ class Params(dict):
class ParamsType(CType):
"""
This class can create a struct of Aesara types (like `TensorType`,
`GpuArrayType`, etc.) to be used as a convenience op parameter wrapping
many data.
This class can create a struct of Aesara types (like `TensorType`, etc.)
to be used as a convenience `Op` parameter wrapping many data.
`ParamsType` constructor takes key-value args. Key will be the name of the
attribute in the struct. Value is the Aesara type of this attribute,
......
"""This script trigger convolution operation. We think it cause more
GPU power consumption then gemm call.
"""
import numpy as np
import aesara
from aesara.configdefaults import config
from aesara.gpuarray import dnn
from aesara.tensor.nnet.abstract_conv import get_conv_output_shape
from aesara.tensor.type import tensor4
def burn():
    """Repeatedly run a cuDNN convolution to stress the GPU."""
    size = 128
    image_shape = [size, size, size, size]
    kernel_shape = [size // 2, size, 3, 3]
    output_shape = get_conv_output_shape(image_shape, kernel_shape, "valid", (1, 1))

    # NOTE(review): these symbolic variables are immediately rebound to shared
    # variables below, so the assignments look dead — kept for exact parity.
    img = tensor4("img")
    kern = tensor4("kern")
    out = tensor4("out")

    def random_values(shape):
        # Random data in the configured floating-point precision.
        return np.random.rand(*shape).astype(config.floatX)

    img = aesara.shared(random_values(image_shape))
    kern = aesara.shared(random_values(kernel_shape))
    out = aesara.shared(random_values(output_shape))

    # beta=1 is needed to force the reuse of `out`; otherwise it is
    # replaced by a GpuAllocEmpty.
    conv_out = dnn._dnn_conv(img, kern, conv_mode="conv", out=out, beta=1.0)
    mode = aesara.compile.get_default_mode().including("local_remove_all_assert")
    compiled = aesara.function([], [conv_out], mode=mode)
    aesara.printing.debugprint(compiled)

    print("Start computation")
    for _ in range(10000):
        compiled.fn()
    print("Computation stopped")
if __name__ == "__main__":
burn()
......@@ -78,12 +78,8 @@ def execute(execute=True, verbose=True, M=2000, N=2000, K=2000, iters=10, order=
f() # Ignore first function call to get representative time.
if execute:
try:
from aesara.gpuarray import GpuArraySharedVariable
sync = isinstance(c, GpuArraySharedVariable)
except ImportError:
sync = False
# sync was needed for gpu
sync = False
if sync:
# Make sure we don't include the time from the first call
......
#! /usr/bin/env python
"""
This file compares the runtime of two independent dot products on one
and two GPUs to measure the speedup.
This should be 2x if the GPUs are equivalent.
"""
import threading
import time
import numpy as np
import aesara
from aesara.gpuarray import init_dev
from aesara.gpuarray.blas import gpu_dot22
def main(dev1, dev2):
    """Benchmark two independent ``gpu_dot22`` products on one vs. two GPUs.

    ``dev1`` and ``dev2`` are GPU device names passed on the command
    line. Prints the elapsed time of several variants; a two-context
    time close to half of the one-context time indicates the two GPUs
    really compute in parallel.
    """
    # One context per device so work can be dispatched to each device
    # independently.
    init_dev(dev1, "ctx1")
    init_dev(dev2, "ctx2")
    size = 1024 * 16
    data = np.random.randn(size, size).astype("float32")
    # Four operands on ctx1 (two independent products on one device) and
    # two operands on ctx2 (for the cross-device cases).
    val1a = aesara.shared(data, target="ctx1")
    val1b = aesara.shared(data, target="ctx1")
    val1c = aesara.shared(data, target="ctx1")
    val1d = aesara.shared(data, target="ctx1")
    val2a = aesara.shared(data, target="ctx2")
    val2b = aesara.shared(data, target="ctx2")
    # f1: two products on ctx1; f2: one product on each context;
    # f3/f4: a single product on ctx1/ctx2 respectively;
    # f5/f6: like f3/f4 but transfer one element back to the CPU, which
    # implicitly waits for the GPU result (no explicit sync needed).
    f1 = aesara.function([], [gpu_dot22(val1a, val1b), gpu_dot22(val1c, val1d)])
    f2 = aesara.function([], [gpu_dot22(val1a, val1b), gpu_dot22(val2a, val2b)])
    f3 = aesara.function([], [gpu_dot22(val1a, val1b)])
    f4 = aesara.function([], [gpu_dot22(val2a, val2b)])
    f5 = aesara.function([], [gpu_dot22(val1a, val1b)[0, 0].transfer("cpu")])
    f6 = aesara.function([], [gpu_dot22(val2a, val2b)[0, 0].transfer("cpu")])
    # pre-execute to load code to GPU.
    # NOTE(review): .sync() presumably blocks until the asynchronous GPU
    # kernel producing that array has finished -- confirm against the
    # pygpu GpuArray API.
    r = f1.fn()
    r[0].sync(), r[1].sync()
    r = f2.fn()
    r[0].sync(), r[1].sync()
    r = f3.fn()
    r[0].sync()
    r = f4.fn()
    r[0].sync()
    r = f5.fn()
    r = f6.fn()
    r = None
    # Time two products launched from one function on a single context.
    t = time.time()
    r = f1.fn()
    r[0].sync(), r[1].sync()
    t2 = time.time()
    r = None
    print(f"one ctx async {t2 - t:f}")
    # Time one product per context, still launched from one function.
    t = time.time()
    r = f2.fn()
    r[0].sync(), r[1].sync()
    t2 = time.time()
    r = None
    print(f"two ctx async {t2 - t:f}")
    # Time two separate functions, one per context, called back to back
    # from the same thread.
    t = time.time()
    r = f3.fn()
    r2 = f4.fn()
    r[0].sync()
    r2[0].sync()
    t2 = time.time()
    r = None
    print(f"two ctx, 2 fct async {t2 - t:f}")
    # Same, but the CPU transfer inside f5/f6 stands in for the explicit
    # sync calls.
    t = time.time()
    r = f5.fn()
    r2 = f6.fn()
    t2 = time.time()
    r = None
    print(f"two ctx, 2 fct with transfer {t2 - t:f}")

    # Multi-thread version
    class myThread(threading.Thread):
        # Runs one compiled function in its own thread, optionally
        # syncing on the first output before finishing.
        def __init__(self, name, f, sync):
            threading.Thread.__init__(self)
            self.f = f
            self.name = name
            self.sync = sync

        def run(self):
            # print "Starting " + self.name
            # r = self.f.fn(n_calls=10)
            r = self.f()
            # print "End " + self.name
            if self.sync:
                r[0].sync()
            # Keep the result alive on the instance so the timing below
            # measures completed work, not garbage-collected handles.
            self.r = r
            # print "Exiting " + self.name

    # One thread per context, syncing explicitly inside each thread.
    thread1 = myThread("Thread-3", f3, True)
    thread2 = myThread("Thread-4", f4, True)
    t = time.time()
    thread1.start()
    thread2.start()
    thread1.join()
    thread2.join()
    t2 = time.time()
    print(f"two ctx, 2 fct async, 2 threads {t2 - t:f}")
    # One thread per context, relying on the CPU transfer to wait.
    thread1 = myThread("Thread-5", f5, False)
    thread2 = myThread("Thread-6", f6, False)
    t = time.time()
    thread1.start()
    thread2.start()
    thread1.join()
    thread2.join()
    t2 = time.time()
    print(f"two ctx, 2 fct with transfer, 2 threads {t2 - t:f}")


if __name__ == "__main__":
    import sys

    if len(sys.argv) != 3:
        raise ValueError("This script require two device names.")
    main(sys.argv[1], sys.argv[2])
"""
Function to detect memory sharing for ndarray AND sparse type AND GpuArray.
Function to detect memory sharing for ndarray AND sparse type.
numpy version support only ndarray.
"""
......@@ -18,48 +18,22 @@ try:
return scipy.sparse.issparse(a)
except ImportError:
# scipy not imported; there can be only ndarray
def _is_sparse(a):
return False
from aesara import gpuarray
if gpuarray.pygpu:
def _is_gpua(a):
return isinstance(a, gpuarray.pygpu.gpuarray.GpuArray)
else:
def _is_gpua(a):
def _is_sparse(a):
return False
__docformat__ = "restructuredtext en"
def may_share_memory(a, b, raise_other_type=True):
a_ndarray = isinstance(a, np.ndarray)
b_ndarray = isinstance(b, np.ndarray)
if a_ndarray and b_ndarray:
return TensorType.may_share_memory(a, b)
a_gpua = _is_gpua(a)
b_gpua = _is_gpua(b)
if a_gpua and b_gpua:
return gpuarray.pygpu.gpuarray.may_share_memory(a, b)
a_sparse = _is_sparse(a)
b_sparse = _is_sparse(b)
if not (a_ndarray or a_sparse or a_gpua) or not (b_ndarray or b_sparse or b_gpua):
if not (a_ndarray or a_sparse) or not (b_ndarray or b_sparse):
if raise_other_type:
raise TypeError(
"may_share_memory support only ndarray"
" and scipy.sparse or GpuArray type"
)
raise TypeError("may_share_memory support only ndarray" " and scipy.sparse")
return False
if a_gpua or b_gpua:
return False
return SparseTensorType.may_share_memory(a, b)
......@@ -9,7 +9,6 @@ import os
import pickle
import sys
import tempfile
import warnings
import zipfile
from collections import defaultdict
from contextlib import closing
......@@ -27,7 +26,6 @@ except ImportError:
DEFAULT_PROTOCOL = HIGHEST_PROTOCOL
from aesara.compile.sharedvalue import SharedVariable
from aesara.configdefaults import config
__docformat__ = "restructuredtext en"
......@@ -121,30 +119,7 @@ class PersistentNdarrayID:
return self.seen[id(obj)]
class PersistentGpuArrayID(PersistentNdarrayID):
def __call__(self, obj):
from aesara.gpuarray.type import _name_for_ctx
try:
import pygpu
except ImportError:
pygpu = None
if pygpu and isinstance(obj, pygpu.gpuarray.GpuArray):
if id(obj) not in self.seen:
def write_array(f):
pickle.dump(_name_for_ctx(obj.context), f, 2)
np.lib.format.write_array(f, np.asarray(obj))
name = self._resolve_name(obj)
zipadd(write_array, self.zip_file, name)
self.seen[id(obj)] = f"gpuarray.{name}"
return self.seen[id(obj)]
return super().__call__(obj)
class PersistentSharedVariableID(PersistentGpuArrayID):
class PersistentSharedVariableID(PersistentNdarrayID):
"""Uses shared variable names when persisting to zip file.
If a shared variable has a name, this name is used as the name of the
......@@ -213,32 +188,16 @@ class PersistentNdarrayLoad:
self.cache = {}
def __call__(self, persid):
from aesara.gpuarray import pygpu
from aesara.gpuarray.type import get_context
array_type, name = persid.split(".")
del array_type
# array_type was used for switching gpu/cpu arrays
# it is better to put these into sublclasses properly
# this is more work but better logic
if name in self.cache:
return self.cache[name]
ret = None
if array_type == "gpuarray":
with self.zip_file.open(name) as f:
ctx_name = pickle.load(f)
array = np.lib.format.read_array(f)
if config.experimental__unpickle_gpu_on_cpu:
# directly return numpy array
warnings.warn(
"config.experimental__unpickle_gpu_on_cpu is set "
"to True. Unpickling GpuArray as numpy.ndarray"
)
ret = array
elif pygpu:
ret = pygpu.array(array, context=get_context(ctx_name))
else:
raise ImportError("pygpu not found. Cannot unpickle GpuArray")
else:
with self.zip_file.open(name) as f:
ret = np.lib.format.read_array(f)
with self.zip_file.open(name) as f:
ret = np.lib.format.read_array(f)
self.cache[name] = ret
return ret
......
......@@ -12,7 +12,7 @@ from aesara.graph.op import get_test_value
from aesara.graph.utils import MissingInputError, TestValueError
from aesara.scan import utils
from aesara.scan.op import Scan, ScanInfo
from aesara.scan.utils import safe_new, traverse
from aesara.scan.utils import safe_new
from aesara.tensor.exceptions import NotScalarConstantError
from aesara.tensor.math import minimum
from aesara.tensor.shape import shape_padleft
......@@ -968,29 +968,8 @@ def scan(
)
if condition is not None:
inner_outs.append(condition)
# gpuarray is imported here, instead of being imported on top of
# the file because that would force on the user some dependencies that we
# might do not want to. Currently we are working on removing the
# dependencies on sandbox code completely.
from aesara import gpuarray
if gpuarray.pygpu_activated:
# very often we end up in this situation when we want to
# replace w with w_copy, where w is a GPU variable
# and w_copy is TensorType. This is caused because shared
# variables are put on GPU right away >:| ,
new_givens = OrderedDict()
for w, w_copy in givens.items():
if isinstance(w.type, gpuarray.GpuArrayType) and isinstance(
w_copy.type, TensorType
):
for o in inner_outs:
new_givens = traverse(o, w, w_copy, new_givens)
else:
new_givens[w] = w_copy
else:
new_givens = givens
# NOTE: legacy code traversed GPU types
new_givens = givens
new_outs = clone_replace(inner_outs, replace=new_givens)
......@@ -1023,7 +1002,6 @@ def scan(
mode=mode,
truncate_gradient=truncate_gradient,
name=name,
gpua=False,
as_while=as_while,
profile=profile,
allow_gc=allow_gc,
......
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -180,8 +180,7 @@ def check_broadcast(v1, v2):
def copy_var_format(var, as_var):
"""
This functions ensures that ``var`` has the same dtype as ``as_var`` as
well as calling `filter_variable` to make sure they are both `TensorType`
or `GpuArrayType`.
well as calling `filter_variable` to make sure they are both `TensorType`.
It internally deals with the corner case where ``inp.ndim + 1 = out.ndim``.
......@@ -549,32 +548,6 @@ class ScanMethodsMixin:
f"type '{type_input}' and '{type_output}' respectively."
)
# If scan has the flag 'gpua' set to false (meaning that is shouldn't
# use the gpuarray gpu backend ), ensure that is has no input and no
# output with type GpuArrayType
from aesara.gpuarray import GpuArrayType
if not self.gpua:
for inp in self.inputs:
if isinstance(inp.type, GpuArrayType):
raise TypeError(
"Inconsistency in the inner graph of "
f"scan '{self.name}' : one of the inputs to the "
"inner graph is of type GpuArrayType but "
"the attributes of the scan op indicate "
"that it shouldn't be the case"
)
for out in self.outputs:
if isinstance(out.type, GpuArrayType):
raise TypeError(
"Inconsistency in the inner graph of "
f"scan '{self.name}' : one of the outputs to the "
"inner graph is of type GpuArrayType but "
"the attributes of the scan op indicate "
"that it shouldn't be the case"
)
class Scan(Op, ScanMethodsMixin, HasInnerGraph):
r"""An `Op` implementing `for` and `while` loops.
......@@ -616,7 +589,6 @@ class Scan(Op, ScanMethodsMixin, HasInnerGraph):
typeConstructor: Optional[TensorConstructorType] = None,
truncate_gradient: bool = False,
name: Optional[str] = None,
gpua: bool = False,
as_while: bool = False,
profile: Optional[Union[str, bool]] = None,
allow_gc: bool = True,
......@@ -666,8 +638,6 @@ class Scan(Op, ScanMethodsMixin, HasInnerGraph):
as well as profiles for the computation of one step of each instance of
`Scan`. The `name` of the instance appears in those profiles and can
greatly help to disambiguate information.
gpua
If ``True``, this `Op` should run on a GPU.
as_while
Whether or not the `Scan` is a ``while``-loop.
profile
......@@ -690,34 +660,15 @@ class Scan(Op, ScanMethodsMixin, HasInnerGraph):
speed up allocation of the subsequent iterations. All those temporary
allocations are freed at the end of all iterations; this is what the
flag `aesara.config.allow_gc` means.
If you use pre-allocation and this `Scan` is on GPU, the speed up from
`allow_gc` is small. If you are missing memory, disabling `allow_gc`
could help you run graph that request much memory.
strict
If ``True``, all the shared variables used in the inner-graph must be provided.
Notes
-----
`typeConstructor` had been added to refactor how Aesara deals with the
GPU. If it runs on the GPU, `Scan` needs to construct certain outputs
(those that reside in GPU memory) as the GPU-specific `Type`. Since we
cannot import GPU code here, the GPU optimizations pass the constructor
of this class a function that is able to construct a GPU `Type`. This
way the class `Scan` does not need to be aware of the GPU details--it
simply constructs tensors using this function (which by default
constructs normal tensors).
TODO: Clean up this approach and everything else related to GPUs; it's
all currently a very leaky set of abstractions.
"""
self.inputs = inputs
self.outputs = outputs
self.info = info
self.truncate_gradient = truncate_gradient
self.name = name
self.gpua = gpua
self.as_while = as_while
self.profile = profile
self.allow_gc = allow_gc
......@@ -789,17 +740,14 @@ class Scan(Op, ScanMethodsMixin, HasInnerGraph):
self.n_outs = self.n_mit_mot + self.n_mit_sot + self.n_sit_sot
self.n_tap_outs = self.n_mit_mot + self.n_mit_sot
if self.gpua:
self._hash_inner_graph = self.gpu_hash
else:
# Do the missing inputs check here to have the error early.
for var in graph_inputs(self.outputs, self.inputs):
if var not in self.inputs and not isinstance(var, Constant):
raise MissingInputError(f"ScanOp is missing an input: {repr(var)}")
self._cmodule_key = CLinker().cmodule_key_variables(
self.inputs, self.outputs, []
)
self._hash_inner_graph = hash(self._cmodule_key)
# Do the missing inputs check here to have the error early.
for var in graph_inputs(self.outputs, self.inputs):
if var not in self.inputs and not isinstance(var, Constant):
raise MissingInputError(f"ScanOp is missing an input: {repr(var)}")
self._cmodule_key = CLinker().cmodule_key_variables(
self.inputs, self.outputs, []
)
self._hash_inner_graph = hash(self._cmodule_key)
(
self.preallocated_mitmot_outs,
......@@ -1185,9 +1133,6 @@ class Scan(Op, ScanMethodsMixin, HasInnerGraph):
if self.info != other.info:
return False
if self.gpua != other.gpua:
return False
if self.as_while != other.as_while:
return False
......@@ -1220,10 +1165,7 @@ class Scan(Op, ScanMethodsMixin, HasInnerGraph):
)
def __str__(self):
if self.gpua:
gpu_str = "gpu"
else:
gpu_str = "cpu"
device_str = "cpu"
if self.as_while:
name = "do_while"
else:
......@@ -1242,7 +1184,7 @@ class Scan(Op, ScanMethodsMixin, HasInnerGraph):
aux_txt += "},%s,%s}"
else:
aux_txt += "{%s,%s}"
aux_txt = aux_txt % (name, gpu_str, str(self.name))
aux_txt = aux_txt % (name, device_str, str(self.name))
return aux_txt
def __hash__(self):
......@@ -1251,7 +1193,6 @@ class Scan(Op, ScanMethodsMixin, HasInnerGraph):
type(self),
self._hash_inner_graph,
self.info,
self.gpua,
self.as_while,
self.profile,
self.truncate_gradient,
......@@ -1418,9 +1359,6 @@ class Scan(Op, ScanMethodsMixin, HasInnerGraph):
# Analyse the compile inner function to determine which inputs and
# outputs are on the gpu and speed up some checks during the execution
inps_is_tensor = [
isinstance(out, TensorVariable) for out in self.fn.maker.fgraph.inputs
]
outs_is_tensor = [
isinstance(out, TensorVariable) for out in self.fn.maker.fgraph.outputs
]
......@@ -1441,7 +1379,6 @@ class Scan(Op, ScanMethodsMixin, HasInnerGraph):
self.mitmots_preallocated, dtype="int32"
)
cython_inps_is_tensor = np.asarray(inps_is_tensor, dtype="int32")
cython_outs_is_tensor = np.asarray(outs_is_tensor, dtype="int32")
if self.destroy_map:
......@@ -1499,7 +1436,6 @@ class Scan(Op, ScanMethodsMixin, HasInnerGraph):
cython_vector_outs,
self.mit_mot_out_slices,
cython_mitmots_preallocated,
cython_inps_is_tensor,
cython_outs_is_tensor,
inner_input_storage,
inner_output_storage,
......@@ -1762,7 +1698,7 @@ class Scan(Op, ScanMethodsMixin, HasInnerGraph):
pdx = offset + self.n_shared_outs
inner_output_storage[pdx].storage[0] = None
# 4.5. Keep a reference to the variables (ndarrays, GpuArrays,
# 4.5. Keep a reference to the variables (ndarrays,
# etc) currently in the output_storage to be able to compare them
# with the actual outputs of the inner function after its
# execution. Also keep pointers to their data to be able to detect
......@@ -1778,9 +1714,9 @@ class Scan(Op, ScanMethodsMixin, HasInnerGraph):
elif isinstance(self.fn.maker.fgraph.outputs[idx], TensorVariable):
old_inner_output_data[idx] = var.data
else:
old_inner_output_data[idx] = var.gpudata
raise RuntimeError("old_inner_output_data[idx] = var.gpudata")
# 4.6. Keep a reference to the variables (ndarrays, GpuArrays,
# 4.6. Keep a reference to the variables (ndarrays,
# etc) associated with mitmot inputs currently in the
# input_storage to be able to compare them with the content of the
# input_storage after the execution of the function. Also keep
......@@ -1793,12 +1729,8 @@ class Scan(Op, ScanMethodsMixin, HasInnerGraph):
if var is None:
old_mitmot_input_data[idx] = None
elif isinstance(
self.fn.maker.fgraph.inputs[idx + self.n_seqs], TensorVariable
):
old_mitmot_input_data[idx] = var.data
else:
old_mitmot_input_data[idx] = var.gpudata
old_mitmot_input_data[idx] = var.data
# 5.1 compute outputs
t0_fn = time.time()
......@@ -1865,13 +1797,7 @@ class Scan(Op, ScanMethodsMixin, HasInnerGraph):
new_var = inner_input_storage[self.n_seqs + inp_idx].storage[0]
if old_var is new_var:
old_data = old_mitmot_input_data[inp_idx]
if isinstance(
self.fn.maker.fgraph.inputs[self.n_seqs + inp_idx],
TensorVariable,
):
same_data = new_var.data == old_data
else:
same_data = new_var.gpudata == old_data
same_data = new_var.data == old_data
else:
same_data = False
......@@ -1922,7 +1848,9 @@ class Scan(Op, ScanMethodsMixin, HasInnerGraph):
):
output_reused = new_var.data == old_data
else:
output_reused = new_var.gpudata == old_data
raise RuntimeError(
"output_reused = new_var.gpudata == old_data"
)
else:
output_reused = False
......@@ -1986,7 +1914,9 @@ class Scan(Op, ScanMethodsMixin, HasInnerGraph):
):
output_reused = new_var.data == old_data
else:
output_reused = new_var.gpudata == old_data
raise RuntimeError(
"output_reused = new_var.gpudata == old_data"
)
else:
output_reused = False
......@@ -2888,7 +2818,6 @@ class Scan(Op, ScanMethodsMixin, HasInnerGraph):
info,
mode=self.mode,
truncate_gradient=self.truncate_gradient,
gpua=False,
as_while=False,
profile=self.profile,
name=f"grad_of_{self.name}" if self.name else None,
......@@ -3219,7 +3148,6 @@ class Scan(Op, ScanMethodsMixin, HasInnerGraph):
inner_outs,
info,
mode=self.mode,
gpua=False,
as_while=self.as_while,
profile=self.profile,
truncate_gradient=self.truncate_gradient,
......
......@@ -176,7 +176,6 @@ def remove_constants_and_unused_inputs_scan(fgraph, node):
op_outs,
nw_info,
mode=op.mode,
gpua=op.gpua,
as_while=op.as_while,
profile=op.profile,
truncate_gradient=op.truncate_gradient,
......@@ -341,7 +340,6 @@ def push_out_non_seq_scan(fgraph, node):
op_outs,
op.info,
mode=op.mode,
gpua=op.gpua,
as_while=op.as_while,
profile=op.profile,
truncate_gradient=op.truncate_gradient,
......@@ -591,7 +589,6 @@ def push_out_seq_scan(fgraph, node):
op_outs,
nw_info,
mode=op.mode,
gpua=op.gpua,
as_while=op.as_while,
profile=op.profile,
truncate_gradient=op.truncate_gradient,
......@@ -758,7 +755,6 @@ def add_nitsot_outputs(
new_scan_args.inner_outputs,
new_scan_args.info,
mode=old_scan_node.op.mode,
gpua=old_scan_node.op.gpua,
as_while=old_scan_node.op.as_while,
profile=old_scan_node.op.profile,
truncate_gradient=old_scan_node.op.truncate_gradient,
......@@ -909,10 +905,9 @@ class ScanInplaceOptimizer(GlobalOptimizer):
"""
def __init__(self, typeInfer=None, gpua_flag=False):
def __init__(self, typeInfer=None):
super().__init__()
self.typeInfer = typeInfer
self.gpua_flag = gpua_flag
def add_requirements(self, fgraph):
fgraph.attach_feature(ReplaceValidate())
......@@ -984,7 +979,6 @@ class ScanInplaceOptimizer(GlobalOptimizer):
op.info,
mode=op.mode,
typeConstructor=typeConstructor,
gpua=op.gpua,
as_while=op.as_while,
profile=op.profile,
truncate_gradient=op.truncate_gradient,
......@@ -1016,9 +1010,7 @@ class ScanInplaceOptimizer(GlobalOptimizer):
alloc_ops = (Alloc, AllocEmpty)
nodes = fgraph.toposort()[::-1]
scan_nodes = [
x for x in nodes if (isinstance(x.op, Scan) and x.op.gpua == self.gpua_flag)
]
scan_nodes = [x for x in nodes if (isinstance(x.op, Scan))]
for scan_idx in range(len(scan_nodes)):
# First attempt to make the Scan compute inplace every recurrent
......@@ -1515,7 +1507,6 @@ def save_mem_new_scan(fgraph, node):
outs,
info,
mode=op.mode,
gpua=op.gpua,
as_while=op.as_while,
profile=op.profile,
truncate_gradient=op.truncate_gradient,
......@@ -1812,7 +1803,6 @@ class ScanMerge(GlobalOptimizer):
truncate_gradient=old_op.truncate_gradient,
allow_gc=old_op.allow_gc,
name="&".join([nd.op.name for nd in nodes]),
gpua=False,
as_while=as_while,
)
new_outs = new_op(*outer_ins)
......@@ -1989,7 +1979,6 @@ def scan_merge_inouts(fgraph, node):
inner_outputs,
info,
mode=node.op.mode,
gpua=node.op.gpua,
as_while=node.op.as_while,
profile=node.op.profile,
truncate_gradient=node.op.truncate_gradient,
......@@ -2255,7 +2244,6 @@ def push_out_dot1_scan(fgraph, node):
new_inner_outs,
new_info,
mode=op.mode,
gpua=op.gpua,
as_while=op.as_while,
profile=op.profile,
truncate_gradient=op.truncate_gradient,
......
......@@ -78,7 +78,6 @@ def perform(
numpy.ndarray[numpy.int32_t,ndim=1] vector_outs,
tuple mit_mot_out_slices,
numpy.ndarray[numpy.int32_t,ndim=1] mitmots_preallocated,
numpy.ndarray[numpy.int32_t,ndim=1] inps_is_tensor,
numpy.ndarray[numpy.int32_t,ndim=1] outs_is_tensor,
list inner_input_storage,
list inner_output_storage,
......@@ -132,9 +131,6 @@ def perform(
tensor, 0 otherwise.
mit_mot_out_slices
Same as tap_array, but for the output taps of mit_mot sequences
inps_is_tensor : int32 ndarray (Can be replaced by a list)
Array of boolean indicating, for every input, whether it is a tensor
or not
outs_is_tensor : int32 ndarray (Can be replaced by a list)
Array of boolean indicating, for every output, whether it is a tensor
or not
......@@ -359,7 +355,7 @@ def perform(
pdx = offset + n_shared_outs
inner_output_storage[<unsigned int>pdx][0] = None
# 4.5. Keep a reference to the variables (ndarrays, GpuArrays,
# 4.5. Keep a reference to the variables (ndarrays,
# etc) currently in the inner_output_storage to be able to compare them
# with the actual outputs of the inner function after its
# execution. Also keep pointers to their data to be able to detect
......@@ -372,12 +368,10 @@ def perform(
if var is None:
old_output_data[idx] = None
elif outs_is_tensor[idx]:
old_output_data[idx] = var.data
else:
old_output_data[idx] = var.gpudata
old_output_data[idx] = var.data
# 4.6. Keep a reference to the variables (ndarrays, GpuArrays,
# 4.6. Keep a reference to the variables (ndarrays,
# etc) associated with mitmot inputs currently in the inner_input_storage to
# be able to compare them with the content of the inner_input_storage after
# the execution of the function. Also keep pointers to their data to
......@@ -389,10 +383,8 @@ def perform(
if var is None:
old_mitmot_input_data[idx] = None
elif inps_is_tensor[idx + n_seqs]:
old_mitmot_input_data[idx] = var.data
else:
old_mitmot_input_data[idx] = var.gpudata
old_mitmot_input_data[idx] = var.data
# 5.1 compute outputs
t0_fn = time.time()
......@@ -436,10 +428,7 @@ def perform(
new_var = inner_input_storage[n_seqs + inp_idx][0]
if old_var is new_var:
old_data = old_mitmot_input_data[inp_idx]
if inps_is_tensor[n_seqs + inp_idx]:
same_data = (new_var.data == old_data)
else:
same_data = (new_var.gpudata == old_data)
same_data = (new_var.data == old_data)
else:
same_data = False
......@@ -480,10 +469,8 @@ def perform(
if old_var is new_var:
if old_data is None:
output_reused = False
elif outs_is_tensor[offset_out + j]:
output_reused = (new_var.data == old_data)
else:
output_reused = (new_var.gpudata == old_data)
output_reused = (new_var.data == old_data)
else:
output_reused = False
......@@ -520,10 +507,8 @@ def perform(
if old_var is new_var:
if old_data is None:
output_reused = False
elif outs_is_tensor[offset_out + j]:
output_reused = (new_var.data == old_data)
else:
output_reused = (new_var.gpudata == old_data)
output_reused = (new_var.data == old_data)
else:
output_reused = False
......
......@@ -192,11 +192,6 @@ def traverse(out, x, x_copy, d, visited=None):
There are two options :
1) x and x_copy or on host, then you would replace x with x_copy
2) x is on gpu, x_copy on host, then you need to replace
host_from_gpu(x) with x_copy
This happens because initially shared variables are on GPU... which is
fine for the main computational graph but confuses things a bit for the
inner graph of scan.
"""
# ``visited`` is a set of nodes that are already known and don't need to be
......@@ -208,19 +203,14 @@ def traverse(out, x, x_copy, d, visited=None):
if out in visited:
return d
visited.add(out)
from aesara.gpuarray import pygpu_activated
from aesara.gpuarray.basic_ops import GpuFromHost, host_from_gpu
from aesara.gpuarray.type import GpuArrayType
if out == x:
assert isinstance(x.type, GpuArrayType)
d[out] = GpuFromHost(x.type.context_name)(x_copy)
return d
# assert isinstance(x.type, GpuArrayType)
# d[out] = GpuFromHost(x.type.context_name)(x_copy)
# return d
raise RuntimeError("Not supported")
elif out.owner is None:
return d
elif pygpu_activated and out.owner.op == host_from_gpu and out.owner.inputs == [x]:
d[out] = at.as_tensor_variable(x_copy)
return d
else:
for inp in out.owner.inputs:
d = traverse(inp, x, x_copy, d, visited)
......
......@@ -15,7 +15,6 @@ There are four kinds of BLAS Ops in Aesara:
- Python implementations (this file)
- SciPy-based (blas_scipy)
- C-based (blas_c)
- GPU-based (aesara.gpuarray)
Notes
-----
......
......@@ -865,7 +865,7 @@ class Subtensor(COp):
):
"""
The parameters c_prefix are there to allow reusing this
function on PyArray and GpuArray object.
function on PyArray object.
This fct take as input the x.
......@@ -1581,9 +1581,7 @@ class IncSubtensor(COp):
# This method delegates much of the work to helper
# methods. This method implements the main logic
# but subclasses may override the helper methods
# to change the particulars, e.g. GpuIncSubtensor
# turns the view/copy operations on numpy arrays
# into the same operations on gpu arrays.
# to change the particulars.
self.do_type_checking(node)
......
......@@ -23,9 +23,6 @@ dependencies:
# numba backend
- numba>=0.55
- numba-scipy
# GPU
- libgpuarray
- pygpu
# For testing
- coveralls
- diff-cover
......
......@@ -17,12 +17,6 @@ per-file-ignores =
tests/sparse/test_utils.py:E402,F401
tests/sparse/sandbox/test_sp.py:E402,F401
tests/scalar/test_basic_sympy.py:E402
tests/gpuarray/test_type.py:E402
tests/gpuarray/test_abstractconv.py:E402
tests/gpuarray/test_dnn.py:E402
tests/gpuarray/test_elemwise.py:E402
tests/gpuarray/test_others.py:E402
tests/gpuarray/test_basic_ops.py:E402
aesara/graph/unify.py:F811
exclude =
versioneer.py
......@@ -32,7 +26,6 @@ exclude =
[coverage:run]
omit =
aesara/_version.py
aesara/gpuarray/*
tests/*
aesara/assert_op.py
aesara/link/jax/jax_linker.py
......@@ -45,7 +38,6 @@ relative_files = true
[coverage:report]
omit =
aesara/_version.py
aesara/gpuarray/*
tests/*
exclude_lines =
pragma: no cover
......@@ -111,10 +103,6 @@ check_untyped_defs = False
ignore_errors = True
check_untyped_defs = False
[mypy-aesara.gpuarray.*]
ignore_errors = True
check_untyped_defs = False
[mypy-aesara.compile.mode]
ignore_errors = True
check_untyped_defs = False
......
......@@ -120,7 +120,6 @@ if __name__ == "__main__":
"symbolic",
"blas",
"numpy",
"gpu",
"autodiff",
"differentiation",
]
......
import copy
import pickle
import time
import numpy as np
import pytest
import aesara.gpuarray
import aesara.tensor as at
from aesara.compile import shared
from aesara.compile.debugmode import DebugMode, InvalidValueError
......@@ -14,8 +12,6 @@ from aesara.compile.function.types import UnusedInputError
from aesara.compile.io import In, Out
from aesara.compile.mode import Mode, get_default_mode
from aesara.configdefaults import config
from aesara.gpuarray import gpuarray_shared_constructor
from aesara.gpuarray.blas import GpuGemm
from aesara.graph.basic import Constant
from aesara.graph.opt import OpKeyOptimizer, PatternSub
from aesara.graph.utils import MissingInputError
......@@ -1146,76 +1142,3 @@ def test_empty_givens_updates():
y = x * 2
function([In(x)], y, givens={})
function([In(x)], y, updates={})
@pytest.mark.skipif(
not aesara.gpuarray.pygpu_activated or config.mode == "DEBUG_MODE",
reason="DEBUG_MODE forces synchronous behaviour which breaks this test",
)
def test_sync_update():
# This test if sync_update work. This can only be tested when
# there is a GPU. To test if we really sync, we compare a case we
# can run in parallel GPU and CPU computation. Then we sync to
# disable that parallel computation. Then we assert the time is
# higher.
# this import needs to go first because it generates the
# local 'aesara' variable. You get an UnboundLocalError otherwise.
import tests.gpuarray.config
sizes = [100, 500, 1000, 2000, 5000, 10000, 20000, 40000]
size = sizes[0]
w = gpuarray_shared_constructor(
np.random.rand(size, size).astype("float32"),
"w",
target=tests.gpuarray.config.test_ctx_name,
)
x = gpuarray_shared_constructor(
np.random.rand(size, size).astype("float32"),
"x",
target=tests.gpuarray.config.test_ctx_name,
)
updates = [(w, w + np.asarray(0.001, "float32") * dot(x, x))]
f = function([], updates=updates, mode=tests.gpuarray.config.mode_with_gpu)
assert len(f.maker.fgraph.apply_nodes) == 1
assert any(isinstance(n.op, GpuGemm) for n in f.maker.fgraph.apply_nodes)
# Make sure libgpuarray have compile all kernels
f()
f.sync_shared()
# Find a good size that will take about .5s.
# This is to make the test more stable across different GPUs.
size = sizes[-1]
for i in sizes:
data = np.random.rand(i, i).astype("float32")
w.set_value(data)
x.set_value(data)
t0 = time.time()
f()
f.sync_shared()
t1 = time.time()
if (t1 - t0) < 0.5:
continue
size = i
break
# sync to make sure all computation are done
f.sync_shared()
t_0 = time.time()
for i in range(3):
f()
# Sync after each call to see the slowdown from sync.
f.sync_shared()
time.sleep(0.5)
t_1 = time.time()
for i in range(3):
f()
time.sleep(0.5)
f.sync_shared()
# Sync to make sure all computation are finished.
t_2 = time.time()
d1 = t_1 - t_0
d2 = t_2 - t_1
assert d1 > d2, (d1, d2)
......@@ -243,7 +243,6 @@ def makeSharedTester(
assert x is not get_x
assert np.allclose(self.ref_fct(np.asarray(x_orig) / 0.5), self.ref_fct(x))
# test optimized get set value on the gpu(don't pass data to the cpu)
get_x = x_shared.get_value(borrow=True, return_internal_type=True)
assert get_x is not x_orig # borrow=False to shared_constructor
assert self.check_internal_type(get_x)
......@@ -325,8 +324,6 @@ def makeSharedTester(
if x.__class__.__name__ != "csr_matrix":
# sparse matrix don't support inplace affectation
nd += 1
# THIS DOESN'T DO WHAT WE EXPECT the content of a is
# not updated for GpuArray, but it is for ndarray
x_shared.get_value(borrow=True)[:] = nd
assert may_share_memory(old_data, x_shared.container.storage[0])
x_shared.get_value(borrow=True)
......@@ -345,7 +342,6 @@ def makeSharedTester(
)
# Test by set_value with borrow=False when new data cast.
# specifically useful for gpu data
nd += 1
old_data = x_shared.container.storage[0]
x_shared.set_value(self.cast_value(nd), borrow=False)
......@@ -522,8 +518,7 @@ def makeSharedTester(
assert (
sum(
[
node.op.__class__.__name__
in ["Gemm", "GpuGemm", "StructuredDot"]
node.op.__class__.__name__ in ["Gemm", "StructuredDot"]
for node in topo
]
)
......@@ -534,11 +529,6 @@ def makeSharedTester(
for node in topo
if isinstance(node.op, aesara.tensor.blas.Gemm)
)
assert all(
node.op.inplace
for node in topo
if node.op.__class__.__name__ == "GpuGemm"
)
# Their is no inplace gemm for sparse
# assert all(node.op.inplace for node in topo if node.op.__class__.__name__ == "StructuredDot")
s_shared_specify = specify_shape(
......@@ -560,8 +550,7 @@ def makeSharedTester(
assert (
sum(
[
node.op.__class__.__name__
in ["Gemm", "GpuGemm", "StructuredDot"]
node.op.__class__.__name__ in ["Gemm", "StructuredDot"]
for node in topo
]
)
......@@ -572,11 +561,7 @@ def makeSharedTester(
for node in topo
if isinstance(node.op, aesara.tensor.blas.Gemm)
)
assert all(
node.op.inplace
for node in topo
if node.op.__class__.__name__ == "GpuGemm"
)
# now test with the specify shape op in the inputs and outputs
a_shared = specify_shape(a_shared, a_shared.get_value(borrow=True).shape)
b_shared = specify_shape(b_shared, b_shared.get_value(borrow=True).shape)
......@@ -595,8 +580,7 @@ def makeSharedTester(
assert (
sum(
[
node.op.__class__.__name__
in ["Gemm", "GpuGemm", "StructuredDot"]
node.op.__class__.__name__ in ["Gemm", "StructuredDot"]
for node in topo
]
)
......@@ -607,11 +591,6 @@ def makeSharedTester(
for node in topo
if isinstance(node.op, aesara.tensor.blas.Gemm)
)
assert all(
node.op.inplace
for node in topo
if node.op.__class__.__name__ == "GpuGemm"
)
if (
aesara.config.cycle_detection == "fast"
......
......@@ -150,8 +150,6 @@ class TestIfelse(utt.OptimizationTestMixin):
f = function(
[c, x, y], [self.cast_output(gx), self.cast_output(gy)], mode=self.mode
)
# There is only 2 of the 3 ifelse that are moved on the GPU.
# The one that stay on the CPU is for the shape.
self.assertFunctionContains(f, self.get_ifelse(1), min=2, max=3)
rng = np.random.default_rng(utt.fetch_seed())
......@@ -173,7 +171,6 @@ class TestIfelse(utt.OptimizationTestMixin):
assert np.all(np.asarray(gy0) == 1.0)
def test_grad_cast_input(self):
# Tests the gradient when both inputs are on the GPU.
x = vector("x", dtype=self.dtype)
y = vector("y", dtype=self.dtype)
c = iscalar("c")
......@@ -528,8 +525,7 @@ class TestIfelse(utt.OptimizationTestMixin):
assert str(res.owner).startswith("if{}")
res.owner.op.name = "name"
res.owner.op.as_view = True
res.owner.op.gpu = True
assert str(res.owner).startswith("if{name,inplace,gpu}")
assert str(res.owner).startswith("if{name,inplace}")
class IfElseIfElseIf(Op):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论