提交 0e3182d1 authored 作者: Maxim Kochurov's avatar Maxim Kochurov 提交者: Brandon T. Willard

Remove gpuarray dependencies throughout the codebase

上级 2a5fc594
...@@ -17,7 +17,6 @@ repos: ...@@ -17,7 +17,6 @@ repos:
aesara/compile/nanguardmode\.py| aesara/compile/nanguardmode\.py|
aesara/graph/opt\.py| aesara/graph/opt\.py|
aesara/tensor/var\.py| aesara/tensor/var\.py|
aesara/gpuarray/opt\.py
)$ )$
- id: check-merge-conflict - id: check-merge-conflict
- repo: https://github.com/psf/black - repo: https://github.com/psf/black
......
Aesara is a Python library that allows you to define, optimize, and efficiently evaluate mathematical expressions involving multi-dimensional arrays. It is built on top of NumPy_. Aesara features: Aesara is a Python library that allows you to define, optimize, and efficiently evaluate mathematical expressions involving multi-dimensional arrays. It is built on top of NumPy_. Aesara features:
* **tight integration with NumPy:** a similar interface to NumPy's. numpy.ndarrays are also used internally in Aesara-compiled functions. * **tight integration with NumPy:** a similar interface to NumPy's. numpy.ndarrays are also used internally in Aesara-compiled functions.
* **transparent use of a GPU:** perform data-intensive computations up to 140x faster than on a CPU (support for float32 only).
* **efficient symbolic differentiation:** Aesara can compute derivatives for functions of one or many inputs. * **efficient symbolic differentiation:** Aesara can compute derivatives for functions of one or many inputs.
* **speed and stability optimizations:** avoid nasty bugs when computing expressions such as log(1 + exp(x)) for large values of x. * **speed and stability optimizations:** avoid nasty bugs when computing expressions such as log(1 + exp(x)) for large values of x.
* **dynamic C code generation:** evaluate expressions faster. * **dynamic C code generation:** evaluate expressions faster.
......
...@@ -144,16 +144,6 @@ from aesara.updates import OrderedUpdates ...@@ -144,16 +144,6 @@ from aesara.updates import OrderedUpdates
# isort: on # isort: on
if (
config.device.startswith("cuda")
or config.device.startswith("opencl")
or config.init_gpu_device.startswith("cuda")
or config.init_gpu_device.startswith("opencl")
or config.contexts != ""
):
import aesara.gpuarray
def get_scalar_constant_value(v): def get_scalar_constant_value(v):
"""Return the constant scalar (i.e. 0-D) value underlying variable `v`. """Return the constant scalar (i.e. 0-D) value underlying variable `v`.
......
...@@ -752,16 +752,6 @@ def _get_preallocated_maps( ...@@ -752,16 +752,6 @@ def _get_preallocated_maps(
Preallocate outputs in different memory layouts. Preallocate outputs in different memory layouts.
""" """
# To avoid circular imports
from aesara.gpuarray import GpuArrayType
from aesara.tensor.type import TensorType
try:
import pygpu
except ImportError:
pass
# TODO: Sparse? Scalar does not really make sense. # TODO: Sparse? Scalar does not really make sense.
# Do not preallocate memory for outputs that actually work inplace # Do not preallocate memory for outputs that actually work inplace
...@@ -795,11 +785,12 @@ def _get_preallocated_maps( ...@@ -795,11 +785,12 @@ def _get_preallocated_maps(
# I'm not sure why it is legitimate, but there are tests about it. # I'm not sure why it is legitimate, but there are tests about it.
# So, we cannot fill r_vals[r] with def_val yet, we have to wait # So, we cannot fill r_vals[r] with def_val yet, we have to wait
# until all output values are deepcopied. # until all output values are deepcopied.
from aesara.tensor import TensorType
for r in considered_outputs: for r in considered_outputs:
# There is no risk to overwrite inputs, since r does not work # There is no risk to overwrite inputs, since r does not work
# inplace. # inplace.
if isinstance(r.type, (TensorType, GpuArrayType)): if isinstance(r.type, TensorType):
reuse_outputs[r][...] = np.asarray(def_val).astype(r.type.dtype) reuse_outputs[r][...] = np.asarray(def_val).astype(r.type.dtype)
if reuse_outputs: if reuse_outputs:
...@@ -812,7 +803,7 @@ def _get_preallocated_maps( ...@@ -812,7 +803,7 @@ def _get_preallocated_maps(
if "c_contiguous" in prealloc_modes or "ALL" in prealloc_modes: if "c_contiguous" in prealloc_modes or "ALL" in prealloc_modes:
c_cont_outputs = {} c_cont_outputs = {}
for r in considered_outputs: for r in considered_outputs:
if isinstance(r.type, (TensorType, GpuArrayType)): if isinstance(r.type, TensorType):
# Build a C-contiguous buffer # Build a C-contiguous buffer
new_buf = r.type.value_zeros(r_vals[r].shape) new_buf = r.type.value_zeros(r_vals[r].shape)
assert new_buf.flags["C_CONTIGUOUS"] assert new_buf.flags["C_CONTIGUOUS"]
...@@ -829,13 +820,11 @@ def _get_preallocated_maps( ...@@ -829,13 +820,11 @@ def _get_preallocated_maps(
if "f_contiguous" in prealloc_modes or "ALL" in prealloc_modes: if "f_contiguous" in prealloc_modes or "ALL" in prealloc_modes:
f_cont_outputs = {} f_cont_outputs = {}
for r in considered_outputs: for r in considered_outputs:
if isinstance(r.type, (TensorType, GpuArrayType)): if isinstance(r.type, TensorType):
new_buf = np.zeros( new_buf = np.zeros(
shape=r_vals[r].shape, dtype=r_vals[r].dtype, order="F" shape=r_vals[r].shape, dtype=r_vals[r].dtype, order="F"
) )
new_buf[...] = def_val new_buf[...] = def_val
if isinstance(r.type, GpuArrayType):
new_buf = pygpu.array(new_buf)
f_cont_outputs[r] = new_buf f_cont_outputs[r] = new_buf
...@@ -859,7 +848,7 @@ def _get_preallocated_maps( ...@@ -859,7 +848,7 @@ def _get_preallocated_maps(
max_ndim = 0 max_ndim = 0
rev_out_broadcastable = [] rev_out_broadcastable = []
for r in considered_outputs: for r in considered_outputs:
if isinstance(r.type, (TensorType, GpuArrayType)): if isinstance(r.type, TensorType):
if max_ndim < r.ndim: if max_ndim < r.ndim:
rev_out_broadcastable += [True] * (r.ndim - max_ndim) rev_out_broadcastable += [True] * (r.ndim - max_ndim)
max_ndim = r.ndim max_ndim = r.ndim
...@@ -874,7 +863,7 @@ def _get_preallocated_maps( ...@@ -874,7 +863,7 @@ def _get_preallocated_maps(
# Initial allocation # Initial allocation
init_strided = {} init_strided = {}
for r in considered_outputs: for r in considered_outputs:
if isinstance(r.type, (TensorType, GpuArrayType)): if isinstance(r.type, TensorType):
# Create a buffer twice as large in every dimension, # Create a buffer twice as large in every dimension,
# except if broadcastable, or for dimensions above # except if broadcastable, or for dimensions above
# config.DebugMode__check_preallocated_output_ndim # config.DebugMode__check_preallocated_output_ndim
...@@ -953,7 +942,7 @@ def _get_preallocated_maps( ...@@ -953,7 +942,7 @@ def _get_preallocated_maps(
name = f"wrong_size{tuple(shape_diff)}" name = f"wrong_size{tuple(shape_diff)}"
for r in considered_outputs: for r in considered_outputs:
if isinstance(r.type, (TensorType, GpuArrayType)): if isinstance(r.type, TensorType):
r_shape_diff = shape_diff[: r.ndim] r_shape_diff = shape_diff[: r.ndim]
out_shape = [ out_shape = [
max((s + sd), 0) max((s + sd), 0)
......
...@@ -1097,13 +1097,8 @@ class Function: ...@@ -1097,13 +1097,8 @@ class Function:
return [i.variable for i in self.maker.inputs if i.implicit] return [i.variable for i in self.maker.inputs if i.implicit]
def sync_shared(self): def sync_shared(self):
if hasattr(aesara, "gpuarray") and aesara.gpuarray.pygpu_activated: # sync was needed on old gpu backend
import pygpu pass
for i in self.maker.fgraph.update_mapping.values():
inp = self.input_storage[i]
if isinstance(inp.data, pygpu.gpuarray.GpuArray):
inp.data.sync()
# pickling/deepcopy support for Function # pickling/deepcopy support for Function
......
...@@ -5,24 +5,11 @@ from io import StringIO ...@@ -5,24 +5,11 @@ from io import StringIO
import numpy as np import numpy as np
import aesara import aesara
from aesara.compile.mode import Mode, get_mode from aesara.compile.mode import Mode
from aesara.configdefaults import config from aesara.configdefaults import config
from aesara.tensor.math import abs as at_abs
from aesara.tensor.math import max as at_max
from aesara.tensor.math import min as at_min
from aesara.tensor.type import discrete_dtypes from aesara.tensor.type import discrete_dtypes
try:
from pygpu.gpuarray import GpuArray
from aesara.gpuarray.type import GpuArrayType, _name_for_ctx
pygpu_available = True
except ImportError:
pygpu_available = False
logger = logging.getLogger("aesara.compile.nanguardmode") logger = logging.getLogger("aesara.compile.nanguardmode")
...@@ -114,9 +101,6 @@ def contains_nan(arr, node=None, var=None): ...@@ -114,9 +101,6 @@ def contains_nan(arr, node=None, var=None):
return False return False
elif getattr(arr, "dtype", "") in discrete_dtypes: elif getattr(arr, "dtype", "") in discrete_dtypes:
return False return False
elif pygpu_available and isinstance(arr, GpuArray):
return np.isnan(f_gpua_min(arr.reshape(arr.size)))
return np.isnan(np.min(arr)) return np.isnan(np.min(arr))
...@@ -149,36 +133,9 @@ def contains_inf(arr, node=None, var=None): ...@@ -149,36 +133,9 @@ def contains_inf(arr, node=None, var=None):
return False return False
elif getattr(arr, "dtype", "") in discrete_dtypes: elif getattr(arr, "dtype", "") in discrete_dtypes:
return False return False
elif pygpu_available and isinstance(arr, GpuArray):
return np.isinf(f_gpua_min(arr.reshape(arr.size))) or np.isinf(
f_gpua_max(arr.reshape(arr.size))
)
return np.isinf(np.nanmax(arr)) or np.isinf(np.nanmin(arr)) return np.isinf(np.nanmax(arr)) or np.isinf(np.nanmin(arr))
def f_compute(op):
def result(inp):
dtype = inp.dtype
ctx_name = _name_for_ctx(inp.context)
key = (dtype, ctx_name)
f = result.cache.get(key, None)
if f is None:
guard_in = GpuArrayType(str(dtype), (False,), context_name=ctx_name)()
mode = get_mode("FAST_RUN").including("gpuarray")
f = aesara.function([guard_in], op(guard_in), mode=mode, profile=False)
result.cache[key] = f
return f(inp)
result.cache = dict()
return result
f_gpua_min = f_compute(at_min)
f_gpua_max = f_compute(at_max)
f_gpua_absmax = f_compute(lambda x: at_max(at_abs(x)))
class NanGuardMode(Mode): class NanGuardMode(Mode):
""" """
A Aesara compilation Mode that makes the compiled function automatically A Aesara compilation Mode that makes the compiled function automatically
...@@ -252,8 +209,6 @@ class NanGuardMode(Mode): ...@@ -252,8 +209,6 @@ class NanGuardMode(Mode):
err = False err = False
if not _is_numeric_value(value, var): if not _is_numeric_value(value, var):
err = False err = False
elif pygpu_available and isinstance(value, GpuArray):
err = f_gpua_absmax(value.reshape(value.size)) > 1e10
else: else:
err = np.abs(value).max() > 1e10 err = np.abs(value).max() > 1e10
if err: if err:
......
...@@ -12,10 +12,8 @@ import atexit ...@@ -12,10 +12,8 @@ import atexit
import copy import copy
import logging import logging
import operator import operator
import os
import sys import sys
import time import time
import warnings
from collections import defaultdict from collections import defaultdict
from typing import Dict, List from typing import Dict, List
...@@ -279,40 +277,7 @@ class ProfileStats: ...@@ -279,40 +277,7 @@ class ProfileStats:
# param is called flag_time_thunks because most other attributes with time # param is called flag_time_thunks because most other attributes with time
# in the name are times *of* something, rather than configuration flags. # in the name are times *of* something, rather than configuration flags.
def __init__( def __init__(self, atexit_print=True, flag_time_thunks=None, **kwargs):
self, atexit_print=True, flag_time_thunks=None, gpu_checks=True, **kwargs
):
if (
gpu_checks
and (hasattr(aesara, "gpuarray") and aesara.gpuarray.pygpu_activated)
and os.environ.get("CUDA_LAUNCH_BLOCKING", "0") != "1"
):
msg = (
"You are running the Aesara profiler with CUDA enabled."
" Aesara GPU ops execution is asynchronous by default."
" So by default, the profile is useless."
" You must set the environment variable"
" CUDA_LAUNCH_BLOCKING to 1 to tell the CUDA driver to"
" synchronize the execution to get a meaningful profile."
)
if config.profile:
raise Exception(msg)
else:
warnings.warn(msg)
if (
config.profile
and gpu_checks
and hasattr(aesara, "gpuarray")
and aesara.gpuarray.pygpu_activated
and not config.profiling__ignore_first_call
):
warnings.warn(
"Aesara flag profiling__ignore_first_call is False. "
"This cause bad profiling result in the gpu "
"back-end, as sometimes we compile at the first call."
)
self.apply_callcount = {} self.apply_callcount = {}
self.output_size = {} self.output_size = {}
# Keys are `(FunctionGraph, Variable)` # Keys are `(FunctionGraph, Variable)`
...@@ -543,8 +508,8 @@ class ProfileStats: ...@@ -543,8 +508,8 @@ class ProfileStats:
tot += t tot += t
ftot = tot * 100 / local_time ftot = tot * 100 / local_time
# Remove the useless start and end of the class name: # Remove the useless start and end of the class name:
# "<class 'aesara.gpuarray.blas.GpuDot22'>" -> # "<class 'aesara.backend.blas.GpuDot22'>" ->
# "aesara.gpuarray.blas.GpuDot22" # "aesara.backend.blas.GpuDot22"
class_name = str(a)[8:-2][:maxlen] class_name = str(a)[8:-2][:maxlen]
print( print(
format_str format_str
...@@ -922,8 +887,6 @@ class ProfileStats: ...@@ -922,8 +887,6 @@ class ProfileStats:
new allocation. new allocation.
""" """
from aesara.gpuarray import GpuArrayType
# Initial Mem info values [CPU, GPU] # Initial Mem info values [CPU, GPU]
node_memory_size = [0, 0] node_memory_size = [0, 0]
running_memory_size = [0, 0] running_memory_size = [0, 0]
...@@ -973,10 +936,8 @@ class ProfileStats: ...@@ -973,10 +936,8 @@ class ProfileStats:
# allocated by the node # allocated by the node
idx2 = 0 idx2 = 0
for out in node.outputs: for out in node.outputs:
if isinstance(out.type, GpuArrayType): # NOTE: cg=1 was used for GPU
cg = 1 cg = 0
else:
cg = 0
ins = None ins = None
if dmap and idx2 in dmap: if dmap and idx2 in dmap:
vidx = dmap[idx2] vidx = dmap[idx2]
...@@ -1021,10 +982,8 @@ class ProfileStats: ...@@ -1021,10 +982,8 @@ class ProfileStats:
for ins in set(node.inputs): for ins in set(node.inputs):
assert not (ins in view_of and viewed_by[ins]) assert not (ins in view_of and viewed_by[ins])
# we trac the original var, so this shouldn't happen # we trac the original var, so this shouldn't happen
if isinstance(ins.type, GpuArrayType): # NOTE: cg=1 was used for GPU
cg = 1 cg = 0
else:
cg = 0
if ( if (
dependencies[ins] dependencies[ins]
and ins not in fgraph.outputs and ins not in fgraph.outputs
...@@ -1687,27 +1646,7 @@ class ProfileStats: ...@@ -1687,27 +1646,7 @@ class ProfileStats:
) )
printed_tip = True printed_tip = True
# tip 7 # tip 7 was about pool and log softmax on gpu using cudnn
import aesara.gpuarray
import aesara.tensor.signal.pool as pool
from aesara.tensor.nnet.basic import LogSoftmax
for (fgraph, a) in self.apply_time:
node = a
if isinstance(node.op, pool.Pool):
if not aesara.gpuarray.dnn.dnn_present():
print(
"Install CuDNN to do pooling faster"
"this allows the operation to run on GPU"
)
printed_tip = True
if isinstance(node.op, LogSoftmax):
if not aesara.gpuarray.dnn.dnn_present():
print(
"Install CuDNN to do LogSoftmax faster"
"this allows the operation to run on GPU"
)
printed_tip = True
if not printed_tip: if not printed_tip:
print(" Sorry, no tip for today.", file=file) print(" Sorry, no tip for today.", file=file)
......
差异被折叠。
...@@ -456,15 +456,13 @@ class DeviceParam(ConfigParam): ...@@ -456,15 +456,13 @@ class DeviceParam(ConfigParam):
) )
def _apply(self, val): def _apply(self, val):
if val == self.default or val.startswith("opencl") or val.startswith("cuda"): if val.startswith("opencl") or val.startswith("cuda") or val.startswith("gpu"):
return val
elif val.startswith("gpu"):
raise ValueError( raise ValueError(
"You are trying to use the old GPU back-end. " "You are trying to use the old GPU back-end. "
"It was removed from Aesara. Use device=cuda* now. " "It was removed from Aesara."
"See https://github.com/aesara-devs/aesara/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29 "
"for more information."
) )
elif val == self.default:
return val
else: else:
raise ValueError( raise ValueError(
'Invalid value ("{val}") for configuration ' 'Invalid value ("{val}") for configuration '
......
...@@ -229,8 +229,8 @@ class Apply(Node): ...@@ -229,8 +229,8 @@ class Apply(Node):
List of `Variable` instances to use as inputs. List of `Variable` instances to use as inputs.
strict : bool strict : bool
If ``True``, the type fields of all the inputs must be equal If ``True``, the type fields of all the inputs must be equal
to the current ones (or compatible, for instance `Tensor` / to the current ones (or compatible, for instance `TensorType`
`GpuArray` of the same dtype and broadcastable patterns, of the same dtype and broadcastable patterns,
in which case they will be converted into current `Type`), and in which case they will be converted into current `Type`), and
returned outputs are guaranteed to have the same types as returned outputs are guaranteed to have the same types as
``self.outputs``. If ``False``, then there's no guarantee that the ``self.outputs``. If ``False``, then there's no guarantee that the
...@@ -328,9 +328,6 @@ class Variable(Node): ...@@ -328,9 +328,6 @@ class Variable(Node):
- `SparseVariable`: a subclass of `Variable` that represents - `SparseVariable`: a subclass of `Variable` that represents
a ``scipy.sparse.{csc,csr}_matrix`` object. a ``scipy.sparse.{csc,csr}_matrix`` object.
- `GpuArrayVariable`: a subclass of `Variable` that represents our object on
the GPU that is a subset of ``numpy.ndarray``.
- `RandomVariable`. - `RandomVariable`.
A `Variable` which is the output of a symbolic computation will have an owner A `Variable` which is the output of a symbolic computation will have an owner
......
...@@ -70,9 +70,9 @@ class IfElse(_NoPythonOp): ...@@ -70,9 +70,9 @@ class IfElse(_NoPythonOp):
""" """
__props__ = ("as_view", "gpu", "n_outs") __props__ = ("as_view", "n_outs")
def __init__(self, n_outs, as_view=False, gpu=False, name=None): def __init__(self, n_outs, as_view=False, name=None):
if as_view: if as_view:
# check destroyhandler and others to ensure that a view_map with # check destroyhandler and others to ensure that a view_map with
# multiple inputs can work # multiple inputs can work
...@@ -81,7 +81,6 @@ class IfElse(_NoPythonOp): ...@@ -81,7 +81,6 @@ class IfElse(_NoPythonOp):
view_map[idx] = [idx + 1] view_map[idx] = [idx + 1]
self.view_map = view_map self.view_map = view_map
self.as_view = as_view self.as_view = as_view
self.gpu = gpu
self.n_outs = n_outs self.n_outs = n_outs
self.name = name self.name = name
...@@ -90,14 +89,12 @@ class IfElse(_NoPythonOp): ...@@ -90,14 +89,12 @@ class IfElse(_NoPythonOp):
return False return False
if self.as_view != other.as_view: if self.as_view != other.as_view:
return False return False
if self.gpu != other.gpu:
return False
if self.n_outs != other.n_outs: if self.n_outs != other.n_outs:
return False return False
return True return True
def __hash__(self): def __hash__(self):
return hash((type(self), self.as_view, self.gpu, self.n_outs)) return hash((type(self), self.as_view, self.n_outs))
def __str__(self): def __str__(self):
args = [] args = []
...@@ -105,8 +102,6 @@ class IfElse(_NoPythonOp): ...@@ -105,8 +102,6 @@ class IfElse(_NoPythonOp):
args.append(self.name) args.append(self.name)
if self.as_view: if self.as_view:
args.append("inplace") args.append("inplace")
if self.gpu:
args.append("gpu")
return f"if{{{','.join(args)}}}" return f"if{{{','.join(args)}}}"
def infer_shape(self, fgraph, node, inputs_shapes): def infer_shape(self, fgraph, node, inputs_shapes):
...@@ -143,7 +138,6 @@ class IfElse(_NoPythonOp): ...@@ -143,7 +138,6 @@ class IfElse(_NoPythonOp):
new_ifelse = IfElse( new_ifelse = IfElse(
n_outs=len(new_ts_inputs), n_outs=len(new_ts_inputs),
as_view=False, as_view=False,
gpu=False,
name="_".join(name_tokens), name="_".join(name_tokens),
) )
new_outs = new_ifelse( new_outs = new_ifelse(
...@@ -172,16 +166,13 @@ class IfElse(_NoPythonOp): ...@@ -172,16 +166,13 @@ class IfElse(_NoPythonOp):
f"{int(2 * self.n_outs)}, got {len(args)}" f"{int(2 * self.n_outs)}, got {len(args)}"
) )
c = at.basic.as_tensor_variable(c) c = at.basic.as_tensor_variable(c)
if not self.gpu: nw_args = []
# When gpu is true, we are given only gpuarrays, and we want for x in args:
# to keep them as gpuarrays if isinstance(x, Variable):
nw_args = [] nw_args.append(x)
for x in args: else:
if isinstance(x, Variable): nw_args.append(at.as_tensor_variable(x))
nw_args.append(x) args = nw_args
else:
nw_args.append(at.as_tensor_variable(x))
args = nw_args
aes = args[: self.n_outs] aes = args[: self.n_outs]
fs = args[self.n_outs :] fs = args[self.n_outs :]
...@@ -214,13 +205,9 @@ class IfElse(_NoPythonOp): ...@@ -214,13 +205,9 @@ class IfElse(_NoPythonOp):
else: else:
nw_name_t = None nw_name_t = None
nw_name_f = None nw_name_f = None
if_true_op = IfElse( if_true_op = IfElse(n_outs=self.n_outs, as_view=self.as_view, name=nw_name_t)
n_outs=self.n_outs, as_view=self.as_view, gpu=self.gpu, name=nw_name_t
)
if_false_op = IfElse( if_false_op = IfElse(n_outs=self.n_outs, as_view=self.as_view, name=nw_name_f)
n_outs=self.n_outs, as_view=self.as_view, gpu=self.gpu, name=nw_name_f
)
# The grads can have a different dtype then the inputs. # The grads can have a different dtype then the inputs.
# As inputs true/false pair must have the same dtype, # As inputs true/false pair must have the same dtype,
...@@ -384,7 +371,7 @@ def ifelse( ...@@ -384,7 +371,7 @@ def ifelse(
f"{len(else_branch)})" f"{len(else_branch)})"
) )
new_ifelse = IfElse(n_outs=len(then_branch), as_view=False, gpu=False, name=name) new_ifelse = IfElse(n_outs=len(then_branch), as_view=False, name=name)
ins = [condition] + list(new_then_branch) + list(new_else_branch) ins = [condition] + list(new_then_branch) + list(new_else_branch)
rval = new_ifelse(*ins, return_list=True) rval = new_ifelse(*ins, return_list=True)
...@@ -411,7 +398,7 @@ def cond_make_inplace(fgraph, node): ...@@ -411,7 +398,7 @@ def cond_make_inplace(fgraph, node):
or not all(getattr(o.type, "ndim", -1) == 0 for o in node.outputs) or not all(getattr(o.type, "ndim", -1) == 0 for o in node.outputs)
) )
): ):
return IfElse(n_outs=op.n_outs, as_view=True, gpu=op.gpu, name=op.name)( return IfElse(n_outs=op.n_outs, as_view=True, name=op.name)(
*node.inputs, return_list=True *node.inputs, return_list=True
) )
return False return False
...@@ -611,7 +598,6 @@ class CondMerge(GlobalOptimizer): ...@@ -611,7 +598,6 @@ class CondMerge(GlobalOptimizer):
new_ifelse = IfElse( new_ifelse = IfElse(
n_outs=len(mn_ts + pl_ts), n_outs=len(mn_ts + pl_ts),
as_view=False, as_view=False,
gpu=False,
name=mn_name + "&" + pl_name, name=mn_name + "&" + pl_name,
) )
new_outs = new_ifelse(*new_ins, return_list=True) new_outs = new_ifelse(*new_ins, return_list=True)
...@@ -660,7 +646,7 @@ def cond_remove_identical(fgraph, node): ...@@ -660,7 +646,7 @@ def cond_remove_identical(fgraph, node):
nw_ts.append(aes[idx]) nw_ts.append(aes[idx])
nw_fs.append(fs[idx]) nw_fs.append(fs[idx])
new_ifelse = IfElse(n_outs=len(nw_ts), as_view=op.as_view, gpu=op.gpu, name=op.name) new_ifelse = IfElse(n_outs=len(nw_ts), as_view=op.as_view, name=op.name)
new_ins = [node.inputs[0]] + nw_ts + nw_fs new_ins = [node.inputs[0]] + nw_ts + nw_fs
new_outs = new_ifelse(*new_ins, return_list=True) new_outs = new_ifelse(*new_ins, return_list=True)
...@@ -712,7 +698,6 @@ def cond_merge_random_op(fgraph, main_node): ...@@ -712,7 +698,6 @@ def cond_merge_random_op(fgraph, main_node):
new_ifelse = IfElse( new_ifelse = IfElse(
n_outs=len(mn_ts + pl_ts), n_outs=len(mn_ts + pl_ts),
as_view=False, as_view=False,
gpu=False,
name=mn_name + "&" + pl_name, name=mn_name + "&" + pl_name,
) )
new_outs = new_ifelse(*new_ins, return_list=True) new_outs = new_ifelse(*new_ins, return_list=True)
......
...@@ -790,9 +790,6 @@ class ModuleCache: ...@@ -790,9 +790,6 @@ class ModuleCache:
if subdirs_elem == "lock_dir": if subdirs_elem == "lock_dir":
continue continue
root = os.path.join(self.dirname, subdirs_elem) root = os.path.join(self.dirname, subdirs_elem)
# Don't delete the gpuarray kernel cache
if root == config.gpuarray__cache_path:
continue
key_pkl = os.path.join(root, "key.pkl") key_pkl = os.path.join(root, "key.pkl")
if key_pkl in self.loaded_key_pkl: if key_pkl in self.loaded_key_pkl:
continue continue
......
...@@ -496,8 +496,6 @@ class CLinkerType(CLinkerObject): ...@@ -496,8 +496,6 @@ class CLinkerType(CLinkerObject):
e.g: e.g:
- For ``TensorType(dtype='int64', ...)``: should return ``"npy_int64"``. - For ``TensorType(dtype='int64', ...)``: should return ``"npy_int64"``.
- For ``GpuArrayType(dtype='int32', ...)``: should return ``"ga_int"``.
""" """
return "" return ""
......
...@@ -7,7 +7,7 @@ used to create a Params object that is compatible with the ParamsType defined. ...@@ -7,7 +7,7 @@ used to create a Params object that is compatible with the ParamsType defined.
The Params object will be available in both Python code (as a standard Python object) and C code The Params object will be available in both Python code (as a standard Python object) and C code
(as a specific struct with parameters as struct fields). To be fully-available in C code, Aesara (as a specific struct with parameters as struct fields). To be fully-available in C code, Aesara
types wrapped into a ParamsType must provide a C interface (e.g. TensorType, ScalarType, GpuArrayType, types wrapped into a ParamsType must provide a C interface (e.g. TensorType, ScalarType,
or your own type. See :ref:`extending_op_params` for more details). or your own type. See :ref:`extending_op_params` for more details).
Example of usage Example of usage
...@@ -318,9 +318,8 @@ class Params(dict): ...@@ -318,9 +318,8 @@ class Params(dict):
class ParamsType(CType): class ParamsType(CType):
""" """
This class can create a struct of Aesara types (like `TensorType`, This class can create a struct of Aesara types (like `TensorType`, etc.)
`GpuArrayType`, etc.) to be used as a convenience op parameter wrapping to be used as a convenience `Op` parameter wrapping many data.
many data.
`ParamsType` constructor takes key-value args. Key will be the name of the `ParamsType` constructor takes key-value args. Key will be the name of the
attribute in the struct. Value is the Aesara type of this attribute, attribute in the struct. Value is the Aesara type of this attribute,
......
"""This script triggers a convolution operation. We think it causes more
GPU power consumption than a gemm call.
"""
import numpy as np
import aesara
from aesara.configdefaults import config
from aesara.gpuarray import dnn
from aesara.tensor.nnet.abstract_conv import get_conv_output_shape
from aesara.tensor.type import tensor4
def burn():
    """Repeatedly run a ``dnn._dnn_conv`` convolution on random data.

    Random image, kernel and output buffers are stored in shared variables,
    a function computing the convolution is compiled, its graph is printed
    with ``debugprint``, and the compiled ``fn`` is then called 10000 times
    in a row.
    """
    sz = 128
    img_shp = [sz, sz, sz, sz]
    kern_shp = [sz // 2, sz, 3, 3]
    out_shp = get_conv_output_shape(img_shp, kern_shp, "valid", (1, 1))

    def rand(shp):
        # Random array of the requested shape, cast to the configured floatX.
        return np.random.rand(*shp).astype(config.floatX)

    img = aesara.shared(rand(img_shp))
    kern = aesara.shared(rand(kern_shp))
    out = aesara.shared(rand(out_shp))

    # beta 1 is needed to force the reuse of out, otherwise, it is
    # replaced by a GpuAllocEmpty
    o1 = dnn._dnn_conv(img, kern, conv_mode="conv", out=out, beta=1.0)
    mode = aesara.compile.get_default_mode().including("local_remove_all_assert")
    f = aesara.function([], [o1], mode=mode)
    aesara.printing.debugprint(f)

    print("Start computation")
    for _ in range(10000):
        f.fn()
    print("Computation stopped")
# Only start the burn loop when the file is executed as a script.
if __name__ == "__main__":
    burn()
...@@ -78,12 +78,8 @@ def execute(execute=True, verbose=True, M=2000, N=2000, K=2000, iters=10, order= ...@@ -78,12 +78,8 @@ def execute(execute=True, verbose=True, M=2000, N=2000, K=2000, iters=10, order=
f() # Ignore first function call to get representative time. f() # Ignore first function call to get representative time.
if execute: if execute:
try: # sync was needed for gpu
from aesara.gpuarray import GpuArraySharedVariable sync = False
sync = isinstance(c, GpuArraySharedVariable)
except ImportError:
sync = False
if sync: if sync:
# Make sure we don't include the time from the first call # Make sure we don't include the time from the first call
......
#! /usr/bin/env python
"""
This file compares the runtime of two independent dot products on one
and two GPUs to measure the speedup.
This should be 2x if the GPUs are equivalent.
"""
import threading
import time
import numpy as np
import aesara
from aesara.gpuarray import init_dev
from aesara.gpuarray.blas import gpu_dot22
def main(dev1, dev2):
    """Time ``gpu_dot22`` products on one context versus two contexts.

    Parameters
    ----------
    dev1, dev2
        Device names passed to ``init_dev``; they are registered under the
        context names ``"ctx1"`` and ``"ctx2"`` respectively.

    One line is printed per scenario: both products on one context, one
    product per context, two separate functions, two functions that transfer
    a scalar to the CPU, and the last two scenarios again with one thread
    per function.
    """
    init_dev(dev1, "ctx1")
    init_dev(dev2, "ctx2")

    size = 1024 * 16
    data = np.random.randn(size, size).astype("float32")
    val1a = aesara.shared(data, target="ctx1")
    val1b = aesara.shared(data, target="ctx1")
    val1c = aesara.shared(data, target="ctx1")
    val1d = aesara.shared(data, target="ctx1")

    val2a = aesara.shared(data, target="ctx2")
    val2b = aesara.shared(data, target="ctx2")

    f1 = aesara.function([], [gpu_dot22(val1a, val1b), gpu_dot22(val1c, val1d)])
    f2 = aesara.function([], [gpu_dot22(val1a, val1b), gpu_dot22(val2a, val2b)])
    f3 = aesara.function([], [gpu_dot22(val1a, val1b)])
    f4 = aesara.function([], [gpu_dot22(val2a, val2b)])
    f5 = aesara.function([], [gpu_dot22(val1a, val1b)[0, 0].transfer("cpu")])
    f6 = aesara.function([], [gpu_dot22(val2a, val2b)[0, 0].transfer("cpu")])

    # pre-execute to load code to GPU.
    r = f1.fn()
    r[0].sync()
    r[1].sync()
    r = f2.fn()
    r[0].sync()
    r[1].sync()
    r = f3.fn()
    r[0].sync()
    r = f4.fn()
    r[0].sync()
    r = f5.fn()
    r = f6.fn()
    # Drop the reference to the warm-up results before timing starts.
    r = None

    # Intervals are measured with the monotonic high-resolution counter
    # rather than the wall clock.
    t = time.perf_counter()
    r = f1.fn()
    r[0].sync()
    r[1].sync()
    t2 = time.perf_counter()
    r = None

    print(f"one ctx async {t2 - t:f}")

    t = time.perf_counter()
    r = f2.fn()
    r[0].sync()
    r[1].sync()
    t2 = time.perf_counter()
    r = None

    print(f"two ctx async {t2 - t:f}")

    t = time.perf_counter()
    r = f3.fn()
    r2 = f4.fn()
    r[0].sync()
    r2[0].sync()
    t2 = time.perf_counter()
    r = None

    print(f"two ctx, 2 fct async {t2 - t:f}")

    t = time.perf_counter()
    r = f5.fn()
    r2 = f6.fn()
    t2 = time.perf_counter()
    r = None

    print(f"two ctx, 2 fct with transfer {t2 - t:f}")

    # Multi-thread version
    class myThread(threading.Thread):
        """Thread that calls ``f`` once and optionally syncs its first output."""

        def __init__(self, name, f, sync):
            super().__init__()
            self.f = f
            self.name = name
            self.sync = sync

        def run(self):
            r = self.f()
            if self.sync:
                r[0].sync()
            # Keep the result on the thread object after the run completes.
            self.r = r

    thread1 = myThread("Thread-3", f3, True)
    thread2 = myThread("Thread-4", f4, True)
    t = time.perf_counter()
    thread1.start()
    thread2.start()
    thread1.join()
    thread2.join()
    t2 = time.perf_counter()

    print(f"two ctx, 2 fct async, 2 threads {t2 - t:f}")

    thread1 = myThread("Thread-5", f5, False)
    thread2 = myThread("Thread-6", f6, False)
    t = time.perf_counter()
    thread1.start()
    thread2.start()
    thread1.join()
    thread2.join()
    t2 = time.perf_counter()

    print(f"two ctx, 2 fct with transfer, 2 threads {t2 - t:f}")
# Script entry point: expects exactly two device names on the command line.
if __name__ == "__main__":
    import sys

    if len(sys.argv) != 3:
        raise ValueError("This script require two device names.")
    main(sys.argv[1], sys.argv[2])
""" """
Function to detect memory sharing for ndarray AND sparse type AND GpuArray. Function to detect memory sharing for ndarray AND sparse type.
numpy version support only ndarray. numpy version support only ndarray.
""" """
...@@ -18,48 +18,22 @@ try: ...@@ -18,48 +18,22 @@ try:
return scipy.sparse.issparse(a) return scipy.sparse.issparse(a)
except ImportError: except ImportError:
# scipy not imported, their can be only ndarray and gpuarray
def _is_sparse(a):
return False
from aesara import gpuarray
if gpuarray.pygpu:
def _is_gpua(a): def _is_sparse(a):
return isinstance(a, gpuarray.pygpu.gpuarray.GpuArray)
else:
def _is_gpua(a):
return False return False
__docformat__ = "restructuredtext en"
def may_share_memory(a, b, raise_other_type=True): def may_share_memory(a, b, raise_other_type=True):
a_ndarray = isinstance(a, np.ndarray) a_ndarray = isinstance(a, np.ndarray)
b_ndarray = isinstance(b, np.ndarray) b_ndarray = isinstance(b, np.ndarray)
if a_ndarray and b_ndarray: if a_ndarray and b_ndarray:
return TensorType.may_share_memory(a, b) return TensorType.may_share_memory(a, b)
a_gpua = _is_gpua(a)
b_gpua = _is_gpua(b)
if a_gpua and b_gpua:
return gpuarray.pygpu.gpuarray.may_share_memory(a, b)
a_sparse = _is_sparse(a) a_sparse = _is_sparse(a)
b_sparse = _is_sparse(b) b_sparse = _is_sparse(b)
if not (a_ndarray or a_sparse or a_gpua) or not (b_ndarray or b_sparse or b_gpua): if not (a_ndarray or a_sparse) or not (b_ndarray or b_sparse):
if raise_other_type: if raise_other_type:
raise TypeError( raise TypeError("may_share_memory support only ndarray" " and scipy.sparse")
"may_share_memory support only ndarray"
" and scipy.sparse or GpuArray type"
)
return False return False
if a_gpua or b_gpua:
return False
return SparseTensorType.may_share_memory(a, b) return SparseTensorType.may_share_memory(a, b)
...@@ -9,7 +9,6 @@ import os ...@@ -9,7 +9,6 @@ import os
import pickle import pickle
import sys import sys
import tempfile import tempfile
import warnings
import zipfile import zipfile
from collections import defaultdict from collections import defaultdict
from contextlib import closing from contextlib import closing
...@@ -27,7 +26,6 @@ except ImportError: ...@@ -27,7 +26,6 @@ except ImportError:
DEFAULT_PROTOCOL = HIGHEST_PROTOCOL DEFAULT_PROTOCOL = HIGHEST_PROTOCOL
from aesara.compile.sharedvalue import SharedVariable from aesara.compile.sharedvalue import SharedVariable
from aesara.configdefaults import config
__docformat__ = "restructuredtext en" __docformat__ = "restructuredtext en"
...@@ -121,30 +119,7 @@ class PersistentNdarrayID: ...@@ -121,30 +119,7 @@ class PersistentNdarrayID:
return self.seen[id(obj)] return self.seen[id(obj)]
class PersistentGpuArrayID(PersistentNdarrayID): class PersistentSharedVariableID(PersistentNdarrayID):
def __call__(self, obj):
from aesara.gpuarray.type import _name_for_ctx
try:
import pygpu
except ImportError:
pygpu = None
if pygpu and isinstance(obj, pygpu.gpuarray.GpuArray):
if id(obj) not in self.seen:
def write_array(f):
pickle.dump(_name_for_ctx(obj.context), f, 2)
np.lib.format.write_array(f, np.asarray(obj))
name = self._resolve_name(obj)
zipadd(write_array, self.zip_file, name)
self.seen[id(obj)] = f"gpuarray.{name}"
return self.seen[id(obj)]
return super().__call__(obj)
class PersistentSharedVariableID(PersistentGpuArrayID):
"""Uses shared variable names when persisting to zip file. """Uses shared variable names when persisting to zip file.
If a shared variable has a name, this name is used as the name of the If a shared variable has a name, this name is used as the name of the
...@@ -213,32 +188,16 @@ class PersistentNdarrayLoad: ...@@ -213,32 +188,16 @@ class PersistentNdarrayLoad:
self.cache = {} self.cache = {}
def __call__(self, persid): def __call__(self, persid):
from aesara.gpuarray import pygpu
from aesara.gpuarray.type import get_context
array_type, name = persid.split(".") array_type, name = persid.split(".")
del array_type
# array_type was previously used to switch between GPU and CPU arrays.
# It would be better to handle that in proper subclasses;
# this is more work but cleaner logic.
if name in self.cache: if name in self.cache:
return self.cache[name] return self.cache[name]
ret = None ret = None
if array_type == "gpuarray": with self.zip_file.open(name) as f:
with self.zip_file.open(name) as f: ret = np.lib.format.read_array(f)
ctx_name = pickle.load(f)
array = np.lib.format.read_array(f)
if config.experimental__unpickle_gpu_on_cpu:
# directly return numpy array
warnings.warn(
"config.experimental__unpickle_gpu_on_cpu is set "
"to True. Unpickling GpuArray as numpy.ndarray"
)
ret = array
elif pygpu:
ret = pygpu.array(array, context=get_context(ctx_name))
else:
raise ImportError("pygpu not found. Cannot unpickle GpuArray")
else:
with self.zip_file.open(name) as f:
ret = np.lib.format.read_array(f)
self.cache[name] = ret self.cache[name] = ret
return ret return ret
......
...@@ -12,7 +12,7 @@ from aesara.graph.op import get_test_value ...@@ -12,7 +12,7 @@ from aesara.graph.op import get_test_value
from aesara.graph.utils import MissingInputError, TestValueError from aesara.graph.utils import MissingInputError, TestValueError
from aesara.scan import utils from aesara.scan import utils
from aesara.scan.op import Scan, ScanInfo from aesara.scan.op import Scan, ScanInfo
from aesara.scan.utils import safe_new, traverse from aesara.scan.utils import safe_new
from aesara.tensor.exceptions import NotScalarConstantError from aesara.tensor.exceptions import NotScalarConstantError
from aesara.tensor.math import minimum from aesara.tensor.math import minimum
from aesara.tensor.shape import shape_padleft from aesara.tensor.shape import shape_padleft
...@@ -968,29 +968,8 @@ def scan( ...@@ -968,29 +968,8 @@ def scan(
) )
if condition is not None: if condition is not None:
inner_outs.append(condition) inner_outs.append(condition)
# gpuarray is imported here, instead of being imported on top of # NOTE: legacy code traversed GPU types
# the file because that would force on the user some dependencies that we new_givens = givens
# might do not want to. Currently we are working on removing the
# dependencies on sandbox code completely.
from aesara import gpuarray
if gpuarray.pygpu_activated:
# very often we end up in this situation when we want to
# replace w with w_copy, where w is a GPU variable
# and w_copy is TensorType. This is caused because shared
# variables are put on GPU right away >:| ,
new_givens = OrderedDict()
for w, w_copy in givens.items():
if isinstance(w.type, gpuarray.GpuArrayType) and isinstance(
w_copy.type, TensorType
):
for o in inner_outs:
new_givens = traverse(o, w, w_copy, new_givens)
else:
new_givens[w] = w_copy
else:
new_givens = givens
new_outs = clone_replace(inner_outs, replace=new_givens) new_outs = clone_replace(inner_outs, replace=new_givens)
...@@ -1023,7 +1002,6 @@ def scan( ...@@ -1023,7 +1002,6 @@ def scan(
mode=mode, mode=mode,
truncate_gradient=truncate_gradient, truncate_gradient=truncate_gradient,
name=name, name=name,
gpua=False,
as_while=as_while, as_while=as_while,
profile=profile, profile=profile,
allow_gc=allow_gc, allow_gc=allow_gc,
......
This source diff could not be displayed because it is too large. You can view the blob instead.
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论