提交 0e3182d1 authored 作者: Maxim Kochurov's avatar Maxim Kochurov 提交者: Brandon T. Willard

Remove gpuarray dependencies throughout the codebase

上级 2a5fc594
......@@ -17,7 +17,6 @@ repos:
aesara/compile/nanguardmode\.py|
aesara/graph/opt\.py|
aesara/tensor/var\.py|
aesara/gpuarray/opt\.py
)$
- id: check-merge-conflict
- repo: https://github.com/psf/black
......
Aesara is a Python library that allows you to define, optimize, and efficiently evaluate mathematical expressions involving multi-dimensional arrays. It is built on top of NumPy_. Aesara features:
* **tight integration with NumPy:** a similar interface to NumPy's. numpy.ndarrays are also used internally in Aesara-compiled functions.
* **transparent use of a GPU:** perform data-intensive computations up to 140x faster than on a CPU (support for float32 only).
* **efficient symbolic differentiation:** Aesara can compute derivatives for functions of one or many inputs.
* **speed and stability optimizations:** avoid nasty bugs when computing expressions such as log(1 + exp(x)) for large values of x.
* **dynamic C code generation:** evaluate expressions faster.
......
......@@ -144,16 +144,6 @@ from aesara.updates import OrderedUpdates
# isort: on
if (
config.device.startswith("cuda")
or config.device.startswith("opencl")
or config.init_gpu_device.startswith("cuda")
or config.init_gpu_device.startswith("opencl")
or config.contexts != ""
):
import aesara.gpuarray
def get_scalar_constant_value(v):
"""Return the constant scalar (i.e. 0-D) value underlying variable `v`.
......
......@@ -752,16 +752,6 @@ def _get_preallocated_maps(
Preallocate outputs in different memory layouts.
"""
# To avoid circular imports
from aesara.gpuarray import GpuArrayType
from aesara.tensor.type import TensorType
try:
import pygpu
except ImportError:
pass
# TODO: Sparse? Scalar does not really make sense.
# Do not preallocate memory for outputs that actually work inplace
......@@ -795,11 +785,12 @@ def _get_preallocated_maps(
# I'm not sure why it is legitimate, but there are tests about it.
# So, we cannot fill r_vals[r] with def_val yet, we have to wait
# until all output values are deepcopied.
from aesara.tensor import TensorType
for r in considered_outputs:
# There is no risk to overwrite inputs, since r does not work
# inplace.
if isinstance(r.type, (TensorType, GpuArrayType)):
if isinstance(r.type, TensorType):
reuse_outputs[r][...] = np.asarray(def_val).astype(r.type.dtype)
if reuse_outputs:
......@@ -812,7 +803,7 @@ def _get_preallocated_maps(
if "c_contiguous" in prealloc_modes or "ALL" in prealloc_modes:
c_cont_outputs = {}
for r in considered_outputs:
if isinstance(r.type, (TensorType, GpuArrayType)):
if isinstance(r.type, TensorType):
# Build a C-contiguous buffer
new_buf = r.type.value_zeros(r_vals[r].shape)
assert new_buf.flags["C_CONTIGUOUS"]
......@@ -829,13 +820,11 @@ def _get_preallocated_maps(
if "f_contiguous" in prealloc_modes or "ALL" in prealloc_modes:
f_cont_outputs = {}
for r in considered_outputs:
if isinstance(r.type, (TensorType, GpuArrayType)):
if isinstance(r.type, TensorType):
new_buf = np.zeros(
shape=r_vals[r].shape, dtype=r_vals[r].dtype, order="F"
)
new_buf[...] = def_val
if isinstance(r.type, GpuArrayType):
new_buf = pygpu.array(new_buf)
f_cont_outputs[r] = new_buf
......@@ -859,7 +848,7 @@ def _get_preallocated_maps(
max_ndim = 0
rev_out_broadcastable = []
for r in considered_outputs:
if isinstance(r.type, (TensorType, GpuArrayType)):
if isinstance(r.type, TensorType):
if max_ndim < r.ndim:
rev_out_broadcastable += [True] * (r.ndim - max_ndim)
max_ndim = r.ndim
......@@ -874,7 +863,7 @@ def _get_preallocated_maps(
# Initial allocation
init_strided = {}
for r in considered_outputs:
if isinstance(r.type, (TensorType, GpuArrayType)):
if isinstance(r.type, TensorType):
# Create a buffer twice as large in every dimension,
# except if broadcastable, or for dimensions above
# config.DebugMode__check_preallocated_output_ndim
......@@ -953,7 +942,7 @@ def _get_preallocated_maps(
name = f"wrong_size{tuple(shape_diff)}"
for r in considered_outputs:
if isinstance(r.type, (TensorType, GpuArrayType)):
if isinstance(r.type, TensorType):
r_shape_diff = shape_diff[: r.ndim]
out_shape = [
max((s + sd), 0)
......
......@@ -1097,13 +1097,8 @@ class Function:
return [i.variable for i in self.maker.inputs if i.implicit]
def sync_shared(self):
if hasattr(aesara, "gpuarray") and aesara.gpuarray.pygpu_activated:
import pygpu
for i in self.maker.fgraph.update_mapping.values():
inp = self.input_storage[i]
if isinstance(inp.data, pygpu.gpuarray.GpuArray):
inp.data.sync()
# sync was needed on old gpu backend
pass
# pickling/deepcopy support for Function
......
......@@ -5,24 +5,11 @@ from io import StringIO
import numpy as np
import aesara
from aesara.compile.mode import Mode, get_mode
from aesara.compile.mode import Mode
from aesara.configdefaults import config
from aesara.tensor.math import abs as at_abs
from aesara.tensor.math import max as at_max
from aesara.tensor.math import min as at_min
from aesara.tensor.type import discrete_dtypes
try:
from pygpu.gpuarray import GpuArray
from aesara.gpuarray.type import GpuArrayType, _name_for_ctx
pygpu_available = True
except ImportError:
pygpu_available = False
logger = logging.getLogger("aesara.compile.nanguardmode")
......@@ -114,9 +101,6 @@ def contains_nan(arr, node=None, var=None):
return False
elif getattr(arr, "dtype", "") in discrete_dtypes:
return False
elif pygpu_available and isinstance(arr, GpuArray):
return np.isnan(f_gpua_min(arr.reshape(arr.size)))
return np.isnan(np.min(arr))
......@@ -149,36 +133,9 @@ def contains_inf(arr, node=None, var=None):
return False
elif getattr(arr, "dtype", "") in discrete_dtypes:
return False
elif pygpu_available and isinstance(arr, GpuArray):
return np.isinf(f_gpua_min(arr.reshape(arr.size))) or np.isinf(
f_gpua_max(arr.reshape(arr.size))
)
return np.isinf(np.nanmax(arr)) or np.isinf(np.nanmin(arr))
def f_compute(op):
def result(inp):
dtype = inp.dtype
ctx_name = _name_for_ctx(inp.context)
key = (dtype, ctx_name)
f = result.cache.get(key, None)
if f is None:
guard_in = GpuArrayType(str(dtype), (False,), context_name=ctx_name)()
mode = get_mode("FAST_RUN").including("gpuarray")
f = aesara.function([guard_in], op(guard_in), mode=mode, profile=False)
result.cache[key] = f
return f(inp)
result.cache = dict()
return result
f_gpua_min = f_compute(at_min)
f_gpua_max = f_compute(at_max)
f_gpua_absmax = f_compute(lambda x: at_max(at_abs(x)))
class NanGuardMode(Mode):
"""
An Aesara compilation Mode that makes the compiled function automatically
......@@ -252,8 +209,6 @@ class NanGuardMode(Mode):
err = False
if not _is_numeric_value(value, var):
err = False
elif pygpu_available and isinstance(value, GpuArray):
err = f_gpua_absmax(value.reshape(value.size)) > 1e10
else:
err = np.abs(value).max() > 1e10
if err:
......
......@@ -12,10 +12,8 @@ import atexit
import copy
import logging
import operator
import os
import sys
import time
import warnings
from collections import defaultdict
from typing import Dict, List
......@@ -279,40 +277,7 @@ class ProfileStats:
# param is called flag_time_thunks because most other attributes with time
# in the name are times *of* something, rather than configuration flags.
def __init__(
self, atexit_print=True, flag_time_thunks=None, gpu_checks=True, **kwargs
):
if (
gpu_checks
and (hasattr(aesara, "gpuarray") and aesara.gpuarray.pygpu_activated)
and os.environ.get("CUDA_LAUNCH_BLOCKING", "0") != "1"
):
msg = (
"You are running the Aesara profiler with CUDA enabled."
" Aesara GPU ops execution is asynchronous by default."
" So by default, the profile is useless."
" You must set the environment variable"
" CUDA_LAUNCH_BLOCKING to 1 to tell the CUDA driver to"
" synchronize the execution to get a meaningful profile."
)
if config.profile:
raise Exception(msg)
else:
warnings.warn(msg)
if (
config.profile
and gpu_checks
and hasattr(aesara, "gpuarray")
and aesara.gpuarray.pygpu_activated
and not config.profiling__ignore_first_call
):
warnings.warn(
"Aesara flag profiling__ignore_first_call is False. "
"This cause bad profiling result in the gpu "
"back-end, as sometimes we compile at the first call."
)
def __init__(self, atexit_print=True, flag_time_thunks=None, **kwargs):
self.apply_callcount = {}
self.output_size = {}
# Keys are `(FunctionGraph, Variable)`
......@@ -543,8 +508,8 @@ class ProfileStats:
tot += t
ftot = tot * 100 / local_time
# Remove the useless start and end of the class name:
# "<class 'aesara.gpuarray.blas.GpuDot22'>" ->
# "aesara.gpuarray.blas.GpuDot22"
# "<class 'aesara.backend.blas.GpuDot22'>" ->
# "aesara.backend.blas.GpuDot22"
class_name = str(a)[8:-2][:maxlen]
print(
format_str
......@@ -922,8 +887,6 @@ class ProfileStats:
new allocation.
"""
from aesara.gpuarray import GpuArrayType
# Initial Mem info values [CPU, GPU]
node_memory_size = [0, 0]
running_memory_size = [0, 0]
......@@ -973,10 +936,8 @@ class ProfileStats:
# allocated by the node
idx2 = 0
for out in node.outputs:
if isinstance(out.type, GpuArrayType):
cg = 1
else:
cg = 0
# NOTE: cg=1 was used for GPU
cg = 0
ins = None
if dmap and idx2 in dmap:
vidx = dmap[idx2]
......@@ -1021,10 +982,8 @@ class ProfileStats:
for ins in set(node.inputs):
assert not (ins in view_of and viewed_by[ins])
# we track the original variable, so this shouldn't happen
if isinstance(ins.type, GpuArrayType):
cg = 1
else:
cg = 0
# NOTE: cg=1 was used for GPU
cg = 0
if (
dependencies[ins]
and ins not in fgraph.outputs
......@@ -1687,27 +1646,7 @@ class ProfileStats:
)
printed_tip = True
# tip 7
import aesara.gpuarray
import aesara.tensor.signal.pool as pool
from aesara.tensor.nnet.basic import LogSoftmax
for (fgraph, a) in self.apply_time:
node = a
if isinstance(node.op, pool.Pool):
if not aesara.gpuarray.dnn.dnn_present():
print(
"Install CuDNN to do pooling faster"
"this allows the operation to run on GPU"
)
printed_tip = True
if isinstance(node.op, LogSoftmax):
if not aesara.gpuarray.dnn.dnn_present():
print(
"Install CuDNN to do LogSoftmax faster"
"this allows the operation to run on GPU"
)
printed_tip = True
# tip 7 was about pool and log softmax on gpu using cudnn
if not printed_tip:
print(" Sorry, no tip for today.", file=file)
......
差异被折叠。
......@@ -456,15 +456,13 @@ class DeviceParam(ConfigParam):
)
def _apply(self, val):
if val == self.default or val.startswith("opencl") or val.startswith("cuda"):
return val
elif val.startswith("gpu"):
if val.startswith("opencl") or val.startswith("cuda") or val.startswith("gpu"):
raise ValueError(
"You are trying to use the old GPU back-end. "
"It was removed from Aesara. Use device=cuda* now. "
"See https://github.com/aesara-devs/aesara/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29 "
"for more information."
"It was removed from Aesara."
)
elif val == self.default:
return val
else:
raise ValueError(
'Invalid value ("{val}") for configuration '
......
......@@ -229,8 +229,8 @@ class Apply(Node):
List of `Variable` instances to use as inputs.
strict : bool
If ``True``, the type fields of all the inputs must be equal
to the current ones (or compatible, for instance `Tensor` /
`GpuArray` of the same dtype and broadcastable patterns,
to the current ones (or compatible, for instance `TensorType`
of the same dtype and broadcastable patterns,
in which case they will be converted into current `Type`), and
returned outputs are guaranteed to have the same types as
``self.outputs``. If ``False``, then there's no guarantee that the
......@@ -328,9 +328,6 @@ class Variable(Node):
- `SparseVariable`: a subclass of `Variable` that represents
a ``scipy.sparse.{csc,csr}_matrix`` object.
- `GpuArrayVariable`: a subclass of `Variable` that represents our object on
the GPU that is a subset of ``numpy.ndarray``.
- `RandomVariable`.
A `Variable` which is the output of a symbolic computation will have an owner
......
......@@ -70,9 +70,9 @@ class IfElse(_NoPythonOp):
"""
__props__ = ("as_view", "gpu", "n_outs")
__props__ = ("as_view", "n_outs")
def __init__(self, n_outs, as_view=False, gpu=False, name=None):
def __init__(self, n_outs, as_view=False, name=None):
if as_view:
# check destroyhandler and others to ensure that a view_map with
# multiple inputs can work
......@@ -81,7 +81,6 @@ class IfElse(_NoPythonOp):
view_map[idx] = [idx + 1]
self.view_map = view_map
self.as_view = as_view
self.gpu = gpu
self.n_outs = n_outs
self.name = name
......@@ -90,14 +89,12 @@ class IfElse(_NoPythonOp):
return False
if self.as_view != other.as_view:
return False
if self.gpu != other.gpu:
return False
if self.n_outs != other.n_outs:
return False
return True
def __hash__(self):
return hash((type(self), self.as_view, self.gpu, self.n_outs))
return hash((type(self), self.as_view, self.n_outs))
def __str__(self):
args = []
......@@ -105,8 +102,6 @@ class IfElse(_NoPythonOp):
args.append(self.name)
if self.as_view:
args.append("inplace")
if self.gpu:
args.append("gpu")
return f"if{{{','.join(args)}}}"
def infer_shape(self, fgraph, node, inputs_shapes):
......@@ -143,7 +138,6 @@ class IfElse(_NoPythonOp):
new_ifelse = IfElse(
n_outs=len(new_ts_inputs),
as_view=False,
gpu=False,
name="_".join(name_tokens),
)
new_outs = new_ifelse(
......@@ -172,16 +166,13 @@ class IfElse(_NoPythonOp):
f"{int(2 * self.n_outs)}, got {len(args)}"
)
c = at.basic.as_tensor_variable(c)
if not self.gpu:
# When gpu is true, we are given only gpuarrays, and we want
# to keep them as gpuarrays
nw_args = []
for x in args:
if isinstance(x, Variable):
nw_args.append(x)
else:
nw_args.append(at.as_tensor_variable(x))
args = nw_args
nw_args = []
for x in args:
if isinstance(x, Variable):
nw_args.append(x)
else:
nw_args.append(at.as_tensor_variable(x))
args = nw_args
aes = args[: self.n_outs]
fs = args[self.n_outs :]
......@@ -214,13 +205,9 @@ class IfElse(_NoPythonOp):
else:
nw_name_t = None
nw_name_f = None
if_true_op = IfElse(
n_outs=self.n_outs, as_view=self.as_view, gpu=self.gpu, name=nw_name_t
)
if_true_op = IfElse(n_outs=self.n_outs, as_view=self.as_view, name=nw_name_t)
if_false_op = IfElse(
n_outs=self.n_outs, as_view=self.as_view, gpu=self.gpu, name=nw_name_f
)
if_false_op = IfElse(n_outs=self.n_outs, as_view=self.as_view, name=nw_name_f)
# The grads can have a different dtype then the inputs.
# As inputs true/false pair must have the same dtype,
......@@ -384,7 +371,7 @@ def ifelse(
f"{len(else_branch)})"
)
new_ifelse = IfElse(n_outs=len(then_branch), as_view=False, gpu=False, name=name)
new_ifelse = IfElse(n_outs=len(then_branch), as_view=False, name=name)
ins = [condition] + list(new_then_branch) + list(new_else_branch)
rval = new_ifelse(*ins, return_list=True)
......@@ -411,7 +398,7 @@ def cond_make_inplace(fgraph, node):
or not all(getattr(o.type, "ndim", -1) == 0 for o in node.outputs)
)
):
return IfElse(n_outs=op.n_outs, as_view=True, gpu=op.gpu, name=op.name)(
return IfElse(n_outs=op.n_outs, as_view=True, name=op.name)(
*node.inputs, return_list=True
)
return False
......@@ -611,7 +598,6 @@ class CondMerge(GlobalOptimizer):
new_ifelse = IfElse(
n_outs=len(mn_ts + pl_ts),
as_view=False,
gpu=False,
name=mn_name + "&" + pl_name,
)
new_outs = new_ifelse(*new_ins, return_list=True)
......@@ -660,7 +646,7 @@ def cond_remove_identical(fgraph, node):
nw_ts.append(aes[idx])
nw_fs.append(fs[idx])
new_ifelse = IfElse(n_outs=len(nw_ts), as_view=op.as_view, gpu=op.gpu, name=op.name)
new_ifelse = IfElse(n_outs=len(nw_ts), as_view=op.as_view, name=op.name)
new_ins = [node.inputs[0]] + nw_ts + nw_fs
new_outs = new_ifelse(*new_ins, return_list=True)
......@@ -712,7 +698,6 @@ def cond_merge_random_op(fgraph, main_node):
new_ifelse = IfElse(
n_outs=len(mn_ts + pl_ts),
as_view=False,
gpu=False,
name=mn_name + "&" + pl_name,
)
new_outs = new_ifelse(*new_ins, return_list=True)
......
......@@ -790,9 +790,6 @@ class ModuleCache:
if subdirs_elem == "lock_dir":
continue
root = os.path.join(self.dirname, subdirs_elem)
# Don't delete the gpuarray kernel cache
if root == config.gpuarray__cache_path:
continue
key_pkl = os.path.join(root, "key.pkl")
if key_pkl in self.loaded_key_pkl:
continue
......
......@@ -496,8 +496,6 @@ class CLinkerType(CLinkerObject):
e.g:
- For ``TensorType(dtype='int64', ...)``: should return ``"npy_int64"``.
- For ``GpuArrayType(dtype='int32', ...)``: should return ``"ga_int"``.
"""
return ""
......
......@@ -7,7 +7,7 @@ used to create a Params object that is compatible with the ParamsType defined.
The Params object will be available in both Python code (as a standard Python object) and C code
(as a specific struct with parameters as struct fields). To be fully-available in C code, Aesara
types wrapped into a ParamsType must provide a C interface (e.g. TensorType, ScalarType, GpuArrayType,
types wrapped into a ParamsType must provide a C interface (e.g. TensorType, ScalarType,
or your own type. See :ref:`extending_op_params` for more details).
Example of usage
......@@ -318,9 +318,8 @@ class Params(dict):
class ParamsType(CType):
"""
This class can create a struct of Aesara types (like `TensorType`,
`GpuArrayType`, etc.) to be used as a convenience op parameter wrapping
many data.
This class can create a struct of Aesara types (like `TensorType`, etc.)
to be used as a convenience `Op` parameter wrapping many pieces of data.
`ParamsType` constructor takes key-value args. Key will be the name of the
attribute in the struct. Value is the Aesara type of this attribute,
......
"""This script triggers a convolution operation. We think it causes more
GPU power consumption than a gemm call.
"""
import numpy as np
import aesara
from aesara.configdefaults import config
from aesara.gpuarray import dnn
from aesara.tensor.nnet.abstract_conv import get_conv_output_shape
from aesara.tensor.type import tensor4
def burn():
sz = 128
img_shp = [sz, sz, sz, sz]
kern_shp = [sz // 2, sz, 3, 3]
out_shp = get_conv_output_shape(img_shp, kern_shp, "valid", (1, 1))
img = tensor4("img")
kern = tensor4("kern")
out = tensor4("out")
def rand(shp):
return np.random.rand(*shp).astype(config.floatX)
img = aesara.shared(rand(img_shp))
kern = aesara.shared(rand(kern_shp))
out = aesara.shared(rand(out_shp))
# beta 1 is needed to force the reuse of out, otherwise, it is
# replaced by a GpuAllocEmpty
o1 = dnn._dnn_conv(img, kern, conv_mode="conv", out=out, beta=1.0)
mode = aesara.compile.get_default_mode().including("local_remove_all_assert")
f = aesara.function([], [o1], mode=mode)
aesara.printing.debugprint(f)
print("Start computation")
for i in range(10000):
f.fn()
print("Computation stopped")
if __name__ == "__main__":
burn()
......@@ -78,12 +78,8 @@ def execute(execute=True, verbose=True, M=2000, N=2000, K=2000, iters=10, order=
f() # Ignore first function call to get representative time.
if execute:
try:
from aesara.gpuarray import GpuArraySharedVariable
sync = isinstance(c, GpuArraySharedVariable)
except ImportError:
sync = False
# sync was needed for gpu
sync = False
if sync:
# Make sure we don't include the time from the first call
......
#! /usr/bin/env python
"""
This file compares the runtime of two independent dot products on one
and two GPUs to measure the speedup.
This should be 2x if the GPUs are equivalent.
"""
import threading
import time
import numpy as np
import aesara
from aesara.gpuarray import init_dev
from aesara.gpuarray.blas import gpu_dot22
def main(dev1, dev2):
init_dev(dev1, "ctx1")
init_dev(dev2, "ctx2")
size = 1024 * 16
data = np.random.randn(size, size).astype("float32")
val1a = aesara.shared(data, target="ctx1")
val1b = aesara.shared(data, target="ctx1")
val1c = aesara.shared(data, target="ctx1")
val1d = aesara.shared(data, target="ctx1")
val2a = aesara.shared(data, target="ctx2")
val2b = aesara.shared(data, target="ctx2")
f1 = aesara.function([], [gpu_dot22(val1a, val1b), gpu_dot22(val1c, val1d)])
f2 = aesara.function([], [gpu_dot22(val1a, val1b), gpu_dot22(val2a, val2b)])
f3 = aesara.function([], [gpu_dot22(val1a, val1b)])
f4 = aesara.function([], [gpu_dot22(val2a, val2b)])
f5 = aesara.function([], [gpu_dot22(val1a, val1b)[0, 0].transfer("cpu")])
f6 = aesara.function([], [gpu_dot22(val2a, val2b)[0, 0].transfer("cpu")])
# pre-execute to load code to GPU.
r = f1.fn()
r[0].sync(), r[1].sync()
r = f2.fn()
r[0].sync(), r[1].sync()
r = f3.fn()
r[0].sync()
r = f4.fn()
r[0].sync()
r = f5.fn()
r = f6.fn()
r = None
t = time.time()
r = f1.fn()
r[0].sync(), r[1].sync()
t2 = time.time()
r = None
print(f"one ctx async {t2 - t:f}")
t = time.time()
r = f2.fn()
r[0].sync(), r[1].sync()
t2 = time.time()
r = None
print(f"two ctx async {t2 - t:f}")
t = time.time()
r = f3.fn()
r2 = f4.fn()
r[0].sync()
r2[0].sync()
t2 = time.time()
r = None
print(f"two ctx, 2 fct async {t2 - t:f}")
t = time.time()
r = f5.fn()
r2 = f6.fn()
t2 = time.time()
r = None
print(f"two ctx, 2 fct with transfer {t2 - t:f}")
# Multi-thread version
class myThread(threading.Thread):
def __init__(self, name, f, sync):
threading.Thread.__init__(self)
self.f = f
self.name = name
self.sync = sync
def run(self):
# print "Starting " + self.name
# r = self.f.fn(n_calls=10)
r = self.f()
# print "End " + self.name
if self.sync:
r[0].sync()
self.r = r
# print "Exiting " + self.name
thread1 = myThread("Thread-3", f3, True)
thread2 = myThread("Thread-4", f4, True)
t = time.time()
thread1.start()
thread2.start()
thread1.join()
thread2.join()
t2 = time.time()
print(f"two ctx, 2 fct async, 2 threads {t2 - t:f}")
thread1 = myThread("Thread-5", f5, False)
thread2 = myThread("Thread-6", f6, False)
t = time.time()
thread1.start()
thread2.start()
thread1.join()
thread2.join()
t2 = time.time()
print(f"two ctx, 2 fct with transfer, 2 threads {t2 - t:f}")
if __name__ == "__main__":
import sys
if len(sys.argv) != 3:
raise ValueError("This script require two device names.")
main(sys.argv[1], sys.argv[2])
"""
Function to detect memory sharing for ndarray AND sparse type AND GpuArray.
Function to detect memory sharing for ndarray AND sparse type.
numpy version support only ndarray.
"""
......@@ -18,48 +18,22 @@ try:
return scipy.sparse.issparse(a)
except ImportError:
# scipy not imported, there can be only ndarray and gpuarray
def _is_sparse(a):
return False
from aesara import gpuarray
if gpuarray.pygpu:
def _is_gpua(a):
return isinstance(a, gpuarray.pygpu.gpuarray.GpuArray)
else:
def _is_gpua(a):
def _is_sparse(a):
return False
__docformat__ = "restructuredtext en"
def may_share_memory(a, b, raise_other_type=True):
a_ndarray = isinstance(a, np.ndarray)
b_ndarray = isinstance(b, np.ndarray)
if a_ndarray and b_ndarray:
return TensorType.may_share_memory(a, b)
a_gpua = _is_gpua(a)
b_gpua = _is_gpua(b)
if a_gpua and b_gpua:
return gpuarray.pygpu.gpuarray.may_share_memory(a, b)
a_sparse = _is_sparse(a)
b_sparse = _is_sparse(b)
if not (a_ndarray or a_sparse or a_gpua) or not (b_ndarray or b_sparse or b_gpua):
if not (a_ndarray or a_sparse) or not (b_ndarray or b_sparse):
if raise_other_type:
raise TypeError(
"may_share_memory support only ndarray"
" and scipy.sparse or GpuArray type"
)
raise TypeError("may_share_memory support only ndarray" " and scipy.sparse")
return False
if a_gpua or b_gpua:
return False
return SparseTensorType.may_share_memory(a, b)
......@@ -9,7 +9,6 @@ import os
import pickle
import sys
import tempfile
import warnings
import zipfile
from collections import defaultdict
from contextlib import closing
......@@ -27,7 +26,6 @@ except ImportError:
DEFAULT_PROTOCOL = HIGHEST_PROTOCOL
from aesara.compile.sharedvalue import SharedVariable
from aesara.configdefaults import config
__docformat__ = "restructuredtext en"
......@@ -121,30 +119,7 @@ class PersistentNdarrayID:
return self.seen[id(obj)]
class PersistentGpuArrayID(PersistentNdarrayID):
def __call__(self, obj):
from aesara.gpuarray.type import _name_for_ctx
try:
import pygpu
except ImportError:
pygpu = None
if pygpu and isinstance(obj, pygpu.gpuarray.GpuArray):
if id(obj) not in self.seen:
def write_array(f):
pickle.dump(_name_for_ctx(obj.context), f, 2)
np.lib.format.write_array(f, np.asarray(obj))
name = self._resolve_name(obj)
zipadd(write_array, self.zip_file, name)
self.seen[id(obj)] = f"gpuarray.{name}"
return self.seen[id(obj)]
return super().__call__(obj)
class PersistentSharedVariableID(PersistentGpuArrayID):
class PersistentSharedVariableID(PersistentNdarrayID):
"""Uses shared variable names when persisting to zip file.
If a shared variable has a name, this name is used as the name of the
......@@ -213,32 +188,16 @@ class PersistentNdarrayLoad:
self.cache = {}
def __call__(self, persid):
from aesara.gpuarray import pygpu
from aesara.gpuarray.type import get_context
array_type, name = persid.split(".")
del array_type
# array_type was used for switching gpu/cpu arrays
# it is better to put these into subclasses properly
# this is more work but better logic
if name in self.cache:
return self.cache[name]
ret = None
if array_type == "gpuarray":
with self.zip_file.open(name) as f:
ctx_name = pickle.load(f)
array = np.lib.format.read_array(f)
if config.experimental__unpickle_gpu_on_cpu:
# directly return numpy array
warnings.warn(
"config.experimental__unpickle_gpu_on_cpu is set "
"to True. Unpickling GpuArray as numpy.ndarray"
)
ret = array
elif pygpu:
ret = pygpu.array(array, context=get_context(ctx_name))
else:
raise ImportError("pygpu not found. Cannot unpickle GpuArray")
else:
with self.zip_file.open(name) as f:
ret = np.lib.format.read_array(f)
with self.zip_file.open(name) as f:
ret = np.lib.format.read_array(f)
self.cache[name] = ret
return ret
......
......@@ -12,7 +12,7 @@ from aesara.graph.op import get_test_value
from aesara.graph.utils import MissingInputError, TestValueError
from aesara.scan import utils
from aesara.scan.op import Scan, ScanInfo
from aesara.scan.utils import safe_new, traverse
from aesara.scan.utils import safe_new
from aesara.tensor.exceptions import NotScalarConstantError
from aesara.tensor.math import minimum
from aesara.tensor.shape import shape_padleft
......@@ -968,29 +968,8 @@ def scan(
)
if condition is not None:
inner_outs.append(condition)
# gpuarray is imported here, instead of being imported on top of
# the file because that would force on the user some dependencies that we
might not want. Currently we are working on removing the
# dependencies on sandbox code completely.
from aesara import gpuarray
if gpuarray.pygpu_activated:
# very often we end up in this situation when we want to
# replace w with w_copy, where w is a GPU variable
# and w_copy is TensorType. This is caused because shared
# variables are put on GPU right away >:| ,
new_givens = OrderedDict()
for w, w_copy in givens.items():
if isinstance(w.type, gpuarray.GpuArrayType) and isinstance(
w_copy.type, TensorType
):
for o in inner_outs:
new_givens = traverse(o, w, w_copy, new_givens)
else:
new_givens[w] = w_copy
else:
new_givens = givens
# NOTE: legacy code traversed GPU types
new_givens = givens
new_outs = clone_replace(inner_outs, replace=new_givens)
......@@ -1023,7 +1002,6 @@ def scan(
mode=mode,
truncate_gradient=truncate_gradient,
name=name,
gpua=False,
as_while=as_while,
profile=profile,
allow_gc=allow_gc,
......
This source diff could not be displayed because it is too large. You can view the blob instead.
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论