提交 cc584d6c authored 作者: Maxim Kochurov's avatar Maxim Kochurov 提交者: Brandon T. Willard

Remove GPU references in docstrings and comments

上级 a4ed0e85
......@@ -183,7 +183,7 @@ def rebuild_collect_shared(
(store_into, update_d[store_into]),
)
# filter_variable ensure smooth conversion of cpu/gpu Types
# filter_variable ensures smooth conversion of cpu Types
try:
update_val = store_into.type.filter_variable(
update_val, allow_convert=False
......
......@@ -1097,7 +1097,7 @@ class Function:
return [i.variable for i in self.maker.inputs if i.implicit]
def sync_shared(self):
# sync was needed on old gpu backend
# NOTE: sync was needed on old gpu backend
pass
......
......@@ -508,8 +508,8 @@ class ProfileStats:
tot += t
ftot = tot * 100 / local_time
# Remove the useless start and end of the class name:
# "<class 'aesara.backend.blas.GpuDot22'>" ->
# "aesara.backend.blas.GpuDot22"
# "<class 'aesara.backend.blas.Dot22'>" ->
# "aesara.backend.blas.Dot22"
class_name = str(a)[8:-2][:maxlen]
print(
format_str
......@@ -887,6 +887,7 @@ class ProfileStats:
new allocation.
"""
# TODO: GPU is not supported for now, needs to be refactored later
# Initial Mem info values [CPU, GPU]
node_memory_size = [0, 0]
running_memory_size = [0, 0]
......@@ -1241,6 +1242,7 @@ class ProfileStats:
max_running_max_memory_size[0], sum(running_memory[2])
)
# NOTE: we do not have GPU right now, this has to be reconsidered later
# Separate CPU and GPU
max_node_memory_size[1] = max(
max_node_memory_size[1], running_memory[0][0]
......@@ -1624,12 +1626,6 @@ class ProfileStats:
"experimental, but seems to work correctly.",
file=file,
)
if config.device.startswith("gpu"):
print(
" - MRG_RandomStream is the only random number"
" generator supported on the GPU.",
file=file,
)
break
# tip 6
......
......@@ -120,28 +120,6 @@ class SharedVariable(Variable):
Changes to this value will be visible to all functions using
this SharedVariable.
Notes
-----
Set_value will work in-place on the GPU, if
the following conditions are met:
* The destination on the GPU must be c_contiguous.
* The source is on the CPU.
* The old value must have the same dtype as the new value
(which is a given for now, since only float32 is
supported).
* The old and new value must have the same shape.
* The old value is being completely replaced by the new
value (not partially modified, e.g. by replacing some
subtensor of it).
It is also worth mentioning that, for efficient transfer to the GPU,
Aesara will make the new data ``c_contiguous``. This can require an
extra copy of the data on the host.
The inplace on gpu memory work when borrow is either True or False.
"""
if borrow:
self.container.value = new_value
......
......@@ -50,9 +50,7 @@ def d3viz(fct, outfile, copy_deps=True, *args, **kwargs):
edited by selecting Edit from the context menu.
Input nodes are colored in green, output nodes in blue. Apply nodes are
ellipses, and colored depending on the type of operation they perform. Red
ellipses are transfers from/to the GPU (ops with names GpuFromHost,
HostFromGpu).
ellipses, and colored depending on the type of operation they perform.
Edges are black by default. If a node returns a view of an
input, the input edge will be blue. If it returns a destroyed input, the
......
......@@ -52,8 +52,6 @@ class PyDotFormatter:
"unused": "lightgrey",
}
self.apply_colors = {
"GpuFromHost": "red",
"HostFromGpu": "red",
"Scan": "yellow",
"Shape": "cyan",
"IfElse": "magenta",
......
......@@ -237,7 +237,7 @@ def Rop(f, wrt, eval_points, disconnected_outputs="raise", return_disconnected="
)
except AttributeError:
# wrt_elem and eval_point don't always have ndim like random type
# Tensor, Sparse and GpuArray have the ndim attribute
# Tensor and Sparse have the ndim attribute
pass
seen_nodes = OrderedDict()
......
......@@ -107,14 +107,13 @@ class Type(MetaObject):
This method allows one to reuse old allocated memory. If this method
is implemented, it will be called instead of `Type.filter`.
As of now, this method is only used when we transfer new data to a
shared variable on a GPU.
As of now, this method is not implemented; it was previously used to transfer memory to and from the GPU.
Parameters
----------
value: array-like
storage: array-like
The old value (e.g. the old NumPy array, CudaNdarray, etc.)
The old value (e.g. the old NumPy array)
strict: bool
allow_downcast: bool (optional)
......
......@@ -1189,7 +1189,7 @@ class ModuleCache:
# 2) If other repo that import Aesara have Aesara ops defined,
# we need to refresh the cache here. Otherwise, there are import
# order problems.
# When device=gpu, we compile during Aesara
# (Outdated) When device=gpu, we compile during Aesara
# import. This triggers the loading of the cache. But
# unpickling the cache asks that the external Ops are
# completely loaded, which isn't always the case!
......
......@@ -67,10 +67,8 @@ def execute(execute=True, verbose=True, M=2000, N=2000, K=2000, iters=10, order=
impl = "CPU (with direct Aesara binding to blas)"
else:
impl = "CPU (without direct Aesara binding to blas but with numpy/scipy binding to blas)"
elif any(x.op.__class__.__name__ == "GpuGemm" for x in f.maker.fgraph.toposort()):
impl = "GPU"
else:
impl = "ERROR, unable to tell if Aesara used the cpu or the gpu:\n"
impl = "ERROR, unable to tell if Aesara used the cpu:\n"
impl += str(f.maker.fgraph.toposort())
t0 = 0
......@@ -78,7 +76,7 @@ def execute(execute=True, verbose=True, M=2000, N=2000, K=2000, iters=10, order=
f() # Ignore first function call to get representative time.
if execute:
# sync was needed for gpu
# NOTE: sync was needed for gpu
sync = False
if sync:
......
......@@ -1014,8 +1014,6 @@ Print to the terminal a math-like expression.
# colors not used: orange, amber#FFBF00, purple, pink,
# used by default: green, blue, grey, red
default_colorCodes = {
"GpuFromHost": "red",
"HostFromGpu": "red",
"Scan": "yellow",
"Shape": "brown",
"IfElse": "magenta",
......
......@@ -19,8 +19,7 @@ from aesara.tensor.type import zmatrix
message = (
"The module aesara.sandbox.fourier will soon be deprecated."
" Please use aesara.tensor.fft, which supports gradients and "
"automatic optimization transfers to the GPU ops."
" Please use aesara.tensor.fft, which supports gradients."
)
warnings.warn(message)
......
......@@ -394,13 +394,13 @@ class mrg_uniform(COp, mrg_uniform_base):
for s in size:
n_elements *= s
if n_elements > M1:
# The limit is on the C and GPU code. This perform don't
# The limit is on the C code. This `perform` method does not
# have this limit. But to have all of them behave the
# same (and have DebugMode don't use too much memory for
# some rng_mrg tests) I also add this limit here.
raise ValueError("rng_mrg does not support more then (2**31 -1) samples")
rstate = np.asarray(rstate) # bring state from GPU if necessary
rstate = np.asarray(rstate) # make sure the state is a NumPy ndarray
if not self.inplace:
rstate = rstate.copy()
......@@ -527,8 +527,7 @@ class mrg_uniform(COp, mrg_uniform_base):
def c_code(self, node, name, inp, out, sub):
# If we try to use the C code here with something else than a
# TensorType, something is wrong (likely one of the GPU ops
# not defining C code correctly).
# TensorType, something is wrong.
assert isinstance(node.inputs[0].type, TensorType)
if self.output_type.dtype == "float16":
# C code is not tested, fall back to Python
......
......@@ -26,8 +26,6 @@ of using ``scan`` over `for` loops in python (among others) are:
* it allows computing gradients through the for loop
* there exist a bunch of optimizations that help re-write your loop
such that less memory is used and that it runs faster
* it ensures that data is not copied from host to gpu and gpu to
host at each step
The Scan Op should typically be used by calling any of the following
functions: ``scan()``, ``map()``, ``reduce()``, ``foldl()``,
......
......@@ -277,10 +277,6 @@ def scan(
allocations are freed at the end of all iterations; this is what the
flag `aesara.config.allow_gc` means.
If you use pre-allocation and this `Scan` is on GPU, the speed up from
`allow_gc` is small. If you are missing memory, disabling `allow_gc`
could help you run graph that request much memory.
strict
If ``True``, all the shared variables used in `fn` must be provided as a
part of `non_sequences` or `sequences`.
......
......@@ -1714,7 +1714,9 @@ class Scan(Op, ScanMethodsMixin, HasInnerGraph):
elif isinstance(self.fn.maker.fgraph.outputs[idx], TensorVariable):
old_inner_output_data[idx] = var.data
else:
raise RuntimeError("old_inner_output_data[idx] = var.gpudata")
raise RuntimeError(
"FIXME: old_inner_output_data[idx] = var.gpudata"
)
# 4.6. Keep a reference to the variables (ndarrays,
# etc) associated with mitmot inputs currently in the
......@@ -1849,7 +1851,7 @@ class Scan(Op, ScanMethodsMixin, HasInnerGraph):
output_reused = new_var.data == old_data
else:
raise RuntimeError(
"output_reused = new_var.gpudata == old_data"
"FIXME: output_reused = new_var.gpudata == old_data"
)
else:
output_reused = False
......@@ -1915,7 +1917,7 @@ class Scan(Op, ScanMethodsMixin, HasInnerGraph):
output_reused = new_var.data == old_data
else:
raise RuntimeError(
"output_reused = new_var.gpudata == old_data"
"FIXME: output_reused = new_var.gpudata == old_data"
)
else:
output_reused = False
......
......@@ -1649,11 +1649,6 @@ class Alloc(COp):
)
):
return False
# If the clients is a transfer to the GPU, we don't want to
# fold. We let the Alloc being moved to the GPU, then we
# let the GPU algo decide if it need to fold it or not.
elif client[0].op.__class__.__name__.lower().startswith("gpu"):
return False
return True
......@@ -2215,8 +2210,7 @@ def addbroadcast(x, *axes):
x broadcastable. When performing the function, if the length of
x along that dimension is not 1, a ValueError will be raised.
We apply the opt here not to pollute the graph especially during
the gpu optimization
We apply the opt here so as not to pollute the graph.
Parameters
----------
......@@ -2252,8 +2246,7 @@ def unbroadcast(x, *axes):
of x broadcastable. When performing the function, if the length
of x along that dimension is not 1, a ValueError will be raised.
We apply the opt here not to pollute the graph especially during
the gpu optimization
We apply the opt here so as not to pollute the graph.
Parameters
----------
......
......@@ -169,7 +169,7 @@ def broadcast_like(value, template, fgraph, dtype=None):
class InplaceElemwiseOptimizer(GlobalOptimizer):
r"""
This is parameterized so that it works for `Elemwise` and `GpuElemwise` `Op`\s.
This is parameterized so that it works for `Elemwise` `Op`\s.
"""
def __init__(self, OP):
......@@ -1343,8 +1343,7 @@ class ShapeFeature(features.Feature):
if repl.owner is shpnode:
# This mean the replacement shape object is
# exactly the same as the current shape object. So
# no need for replacement. This happen for example
# with the InputToGpuOptimizer optimizer.
# no need for replacement.
continue
if (
repl.owner
......@@ -1841,9 +1840,7 @@ def local_alloc_empty_to_zeros(fgraph, node):
This help investigate NaN with NanGuardMode. Not registered by
default. To activate it, use the Aesara flag
optimizer_including=alloc_empty_to_zeros. This also enable
the GPU version of this optimizations.
optimizer_including=alloc_empty_to_zeros.
"""
if isinstance(node.op, AllocEmpty):
return [zeros(node.inputs, dtype=node.outputs[0].dtype)]
......@@ -3000,19 +2997,15 @@ def local_elemwise_fusion_op(op_class, max_input_fct=lambda node: 32, maker=None
and each `Elemwise`'s scalar `Op`, and use the composite scalar `Op` in a
new "fused" `Elemwise`.
It's parameterized in order to work for `Elemwise` and `GpuElemwise` `Op`\s.
It's parameterized in order to work for `Elemwise` `Op`\s.
Parameters
----------
op_class : type
`GpuElemwise` or `Elemwise` class (the one that we want to fuse)
`Elemwise` class (the one that we want to fuse)
max_input_fct : callable
A function that returns the maximum number of inputs that this `Elemwise`
can take (useful for `GpuElemwise`). The GPU kernel currently has a
limit of 256 bytes for the size of all parameters passed to it. As
currently we pass a lot of information only by parameter, we must limit how
many `Op`\s we fuse together to avoid busting that 256 limit.
can take.
On the CPU we limit to 32 input variables since that is the maximum
NumPy support.
......
......@@ -1192,7 +1192,7 @@ def to_one_hot(y, nb_class, dtype=None):
class Unique(Op):
"""
Wraps `numpy.unique`. This `Op` is not implemented on the GPU.
Wraps `numpy.unique`.
Examples
--------
......
......@@ -22,8 +22,7 @@ from aesara.tensor.var import TensorConstant
class Fourier(Op):
"""
WARNING: for officially supported FFTs, use aesara.tensor.fft, which
provides real-input FFTs. Gradients are supported, as well as optimization
transfers to GPU ops.
provides real-input FFTs. Gradients are supported.
An instance of this class returns a finite fourier transform calculated
along one dimension of an input array.
......
......@@ -1550,11 +1550,6 @@ def mean(input, axis=None, dtype=None, op=False, keepdims=False, acc_dtype=None)
necessarily be the dtype of the output (in particular
if it is a discrete (int/uint) dtype, the output will
be in a float type). If None, then we use the same rules as `sum()`.
Notes
-----
For gpu, if you specify dtype=float32, everything will be done on the gpu.
"""
input = as_tensor_variable(input)
if op:
......
......@@ -1673,8 +1673,7 @@ def local_reduce_broadcastable(fgraph, node):
axis = list(node.op.axis)
cuttable = [a for a in axis if reduced.broadcastable[a]]
if cuttable:
# -- we can remove some axes of summation,
# which simplifies the codegen for sum, especially on GPU
# -- we can remove some axes of summation.
new_axis = []
pattern = []
ii = 0
......@@ -1857,10 +1856,6 @@ def local_pow_canonicalize(fgraph, node):
def local_mul_to_sqr(fgraph, node):
"""
x*x -> sqr(x)
This is faster on the GPU when memory fetching is a big part of
the computation time.
"""
if node.op == mul:
if len(node.inputs) == 2:
......
......@@ -193,11 +193,6 @@ def matrix_dot(*args):
def trace(X):
"""
Returns the sum of diagonal elements of matrix X.
Notes
-----
Works on GPU since 0.6rc4.
"""
return extract_diag(X).sum()
......@@ -729,7 +724,6 @@ class TensorInv(Op):
def tensorinv(a, ind=2):
"""
Does not run on GPU;
Aesara utilization of numpy.linalg.tensorinv;
Compute the 'inverse' of an N-dimensional array.
......@@ -791,7 +785,7 @@ class TensorSolve(Op):
def tensorsolve(a, b, axes=None):
"""
Aesara utilization of numpy.linalg.tensorsolve. Does not run on GPU!
Aesara utilization of numpy.linalg.tensorsolve.
Solve the tensor equation ``a x = b`` for x.
It is assumed that all indices of `x` are summed over in the product,
......
......@@ -1048,16 +1048,6 @@ def conv3d(
Set of feature maps generated by convolutional layer. Tensor is
is of shape (batch size, output channels, output depth,
output rows, output columns)
Notes
-----
If cuDNN is available, it will be used on the
GPU. Otherwise, it is the *Corr3dMM* convolution that will be used
"caffe style convolution".
This is only supported in Aesara 0.8 or the development
version until it is released.
"""
input = as_tensor_variable(input)
filters = as_tensor_variable(filters)
......@@ -1184,17 +1174,6 @@ def conv2d_grad_wrt_inputs(
set of feature maps generated by convolutional layer. Tensor
is of shape (batch size, output channels, output rows, output
columns)
Notes
-----
:note: If cuDNN is available, it will be used on the
GPU. Otherwise, it is the *CorrMM* convolution that will be used
"caffe style convolution".
:note: This is only supported in Aesara 0.8 or the development
version until it is released.
"""
filters = as_tensor_variable(filters)
......@@ -1347,17 +1326,6 @@ def conv3d_grad_wrt_inputs(
set of feature maps generated by convolutional layer. Tensor
is of shape (batch size, output channels, output depth,
output rows, output columns)
Notes
-----
:note: If cuDNN is available, it will be used on the
GPU. Otherwise, it is the *Corr3dMM* convolution that will be used
"caffe style convolution".
:note: This is only supported in Aesara 0.8 or the development
version until it is released.
"""
filters = as_tensor_variable(filters)
......@@ -1500,17 +1468,6 @@ def conv2d_grad_wrt_weights(
columns) for normal convolution and
(output channels, output rows, output columns, input channels,
filter rows, filter columns) for unshared convolution
Notes
-----
:note: If cuDNN is available, it will be used on the
GPU. Otherwise, it is the *CorrMM* convolution that will be used
"caffe style convolution".
:note: This is only supported in Aesara 0.8 or the development
version until it is released.
"""
input = as_tensor_variable(input)
......@@ -1644,17 +1601,6 @@ def conv3d_grad_wrt_weights(
set of feature maps generated by convolutional layer. Tensor
is of shape (batch size, output channels, output time, output
rows, output columns)
Notes
-----
:note: If cuDNN is available, it will be used on the
GPU. Otherwise, it is the *Corr3dMM* convolution that will be used
"caffe style convolution".
:note: This is only supported in Aesara 0.8 or the development
version until it is released.
"""
input = as_tensor_variable(input)
......@@ -3685,19 +3631,6 @@ def conv2d(
Symbolic 4D tensor
Set of feature maps generated by convolutional layer. Tensor is
of shape (batch size, output channels, output rows, output columns)
Notes
-----
If cuDNN is available, it will be used on the
GPU. Otherwise, it is the *CorrMM* convolution that will be used
"caffe style convolution".
This is only supported in Aesara 0.8 or the development
version until it is released.
The parameter filter_dilation is an implementation of `dilated
convolution <https://arxiv.org/pdf/1511.07122v3.pdf>`_.
"""
if "imshp_logical" in kwargs or "kshp_logical" in kwargs:
......@@ -3822,18 +3755,6 @@ def conv2d_transpose(
Symbolic 4D tensor
Set of feature maps generated by the transposed convolution. Tensor is
of shape (batch size, output channels, output rows, output columns)
Notes
-----
If cuDNN is available, it will be used on the
GPU. Otherwise, it is the *CorrMM* convolution that will be used
"caffe style convolution".
This operation is also sometimes called "deconvolution".
The parameter filter_dilation is an implementation of `dilated
convolution <https://arxiv.org/pdf/1511.07122v3.pdf>`_.
"""
return conv2d_grad_wrt_inputs(
......
......@@ -4,13 +4,6 @@ Provides neural-network specific Ops.
Notes
-----
TODO: factor this out into a neural-network toolbox.
We register all optimization with the gpu tag as we don't
implement all the intermediate case on the GPU (in particular
AdvancedSubtensor). So to make sure it run well on the gpu with
fast_compile, we register them as needed for the GPU. This can be
revisited later when all the intermediate part are on the GPU.
"""
import warnings
......
......@@ -47,7 +47,6 @@ def batch_normalization(inputs, gamma, beta, mean, std, mode="low_mem"):
"""
This function will build the symbolic graph for applying batch normalization
to a set of activations.
Also works on GPUs, but is not optimized using cuDNN.
.. versionadded:: 0.7.1
......
......@@ -44,9 +44,6 @@ class DiagonalSubtensor(Op):
i1
Axis index in x
Notes
-----
Work on the GPU.
Extended summary
----------------
......@@ -204,8 +201,6 @@ def conv3d(
Another way to define signals: (batch, time, in channel, row, column)
Another way to define filters: (out channel,time,in channel, row, column)
For the GPU, use nnet.conv3d.
See Also
--------
Someone made a script that shows how to swap the axes between
......
......@@ -194,7 +194,7 @@ class Images2Neibs(COp):
def perform(self, node, inp, out_, params):
ten4, neib_shape, neib_step = inp
(z,) = out_
# GpuImages2Neibs should not run this perform in DebugMode
# XXX: GpuImages2Neibs should not run this perform in DebugMode
if not isinstance(self, Images2Neibs):
raise aesara.graph.utils.MethodNotDefined()
......
......@@ -591,9 +591,7 @@ def local_abstractconv_check(fgraph, node):
):
raise LocalMetaOptimizerSkipAssertionError(
f"{node.op.__class__.__name__} Aesara optimization failed: there is no implementation "
"available supporting the requested options. Did you exclude "
'both "conv_dnn" and "conv_gemm" from the optimizer? If on GPU, '
"is cuDNN available and does the GPU support it? If on CPU, "
"available supporting the requested options. If on CPU, "
"do you have a BLAS library installed Aesara can link against? "
"On the CPU we do not support float16."
)
......
......@@ -146,12 +146,7 @@ def pool_2d(
"pool_2d() will have the parameter ignore_border"
" default value changed to True (currently"
" False). To have consistent behavior with all Aesara"
" version, explicitly add the parameter ignore_border=True."
" On the GPU, using ignore_border=True is needed to use cuDNN."
" When using ignore_border=False and not using cuDNN, the only"
" GPU combination supported is when"
" `ws == stride and pad == (0, 0) and mode == 'max'`."
" Otherwise, the convolution will be executed on CPU.",
" version, explicitly add the parameter ignore_border=True.",
category=DeprecationWarning,
stacklevel=2,
)
......@@ -267,12 +262,7 @@ def pool_3d(
"pool_3d() will have the parameter ignore_border"
" default value changed to True (currently"
" False). To have consistent behavior with all Aesara"
" version, explicitly add the parameter ignore_border=True."
" On the GPU, using ignore_border=True is needed to use cuDNN."
" When using ignore_border=False and not using cuDNN, the only"
" GPU combination supported is when"
" `ws == stride and pad == (0, 0, 0) and mode == 'max'`."
" Otherwise, the convolution will be executed on CPU.",
" version, explicitly add the parameter ignore_border=True.",
category=DeprecationWarning,
stacklevel=2,
)
......
......@@ -429,8 +429,6 @@ def solve_triangular(
class Solve(SolveBase):
"""
Solve a system of linear equations.
For on CPU and GPU.
"""
__props__ = (
......
......@@ -328,12 +328,10 @@ class TopKOp(Op):
Notes
-----
- CPU and GPU ops don't produce same output order. This is expected.
- The output order is not guaranteed. On the CPU, we use
``np.partition`` and ``np.argpartition`` that only make sure the
k-th element is the correct one and that the other
elements are on the correct side. On the GPU, they
look sorted, but we do not test the correctness of this behavior.
elements are on the correct side.
- By default, this Op gives two outputs: values and indices. However
optimizers may remove a certain output if not needed.
- Computing the gradient requests the computation of the indices in
......
......@@ -50,15 +50,10 @@ class OrderedUpdates(OrderedDict):
# TODO: consider doing error-checking on value.
# insist that it is an Aesara variable? Have the right type?
# This could have weird consequences - for example a
# GPU SharedVariable is customarily associated with a TensorType
# value. Should it be cast to a GPU value right away? Should
# literals be transformed into constants immediately?
return super().__setitem__(key, value)
else:
raise TypeError(
"OrderedUpdates keys must inherit from " "SharedVariable", key
)
raise TypeError("OrderedUpdates keys must inherit from SharedVariable", key)
def update(self, other=None):
if other is None:
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论