提交 cc584d6c 作者: Maxim Kochurov 提交者: Brandon T. Willard

Remove GPU references in docstrings and comments

上级 a4ed0e85
...@@ -183,7 +183,7 @@ def rebuild_collect_shared( ...@@ -183,7 +183,7 @@ def rebuild_collect_shared(
(store_into, update_d[store_into]), (store_into, update_d[store_into]),
) )
# filter_variable ensure smooth conversion of cpu/gpu Types # filter_variable ensure smooth conversion of cpu Types
try: try:
update_val = store_into.type.filter_variable( update_val = store_into.type.filter_variable(
update_val, allow_convert=False update_val, allow_convert=False
......
...@@ -1097,7 +1097,7 @@ class Function: ...@@ -1097,7 +1097,7 @@ class Function:
return [i.variable for i in self.maker.inputs if i.implicit] return [i.variable for i in self.maker.inputs if i.implicit]
def sync_shared(self): def sync_shared(self):
# sync was needed on old gpu backend # NOTE: sync was needed on old gpu backend
pass pass
......
...@@ -508,8 +508,8 @@ class ProfileStats: ...@@ -508,8 +508,8 @@ class ProfileStats:
tot += t tot += t
ftot = tot * 100 / local_time ftot = tot * 100 / local_time
# Remove the useless start and end of the class name: # Remove the useless start and end of the class name:
# "<class 'aesara.backend.blas.GpuDot22'>" -> # "<class 'aesara.backend.blas.Dot22'>" ->
# "aesara.backend.blas.GpuDot22" # "aesara.backend.blas.Dot22"
class_name = str(a)[8:-2][:maxlen] class_name = str(a)[8:-2][:maxlen]
print( print(
format_str format_str
...@@ -887,6 +887,7 @@ class ProfileStats: ...@@ -887,6 +887,7 @@ class ProfileStats:
new allocation. new allocation.
""" """
# TODO: GPU is not supported for now, needs to be refactored later
# Initial Mem info values [CPU, GPU] # Initial Mem info values [CPU, GPU]
node_memory_size = [0, 0] node_memory_size = [0, 0]
running_memory_size = [0, 0] running_memory_size = [0, 0]
...@@ -1241,6 +1242,7 @@ class ProfileStats: ...@@ -1241,6 +1242,7 @@ class ProfileStats:
max_running_max_memory_size[0], sum(running_memory[2]) max_running_max_memory_size[0], sum(running_memory[2])
) )
# NOTE: we do not have GPU right now, this has to be reconsidered later
# Separate CPU and GPU # Separate CPU and GPU
max_node_memory_size[1] = max( max_node_memory_size[1] = max(
max_node_memory_size[1], running_memory[0][0] max_node_memory_size[1], running_memory[0][0]
...@@ -1624,12 +1626,6 @@ class ProfileStats: ...@@ -1624,12 +1626,6 @@ class ProfileStats:
"experimental, but seems to work correctly.", "experimental, but seems to work correctly.",
file=file, file=file,
) )
if config.device.startswith("gpu"):
print(
" - MRG_RandomStream is the only random number"
" generator supported on the GPU.",
file=file,
)
break break
# tip 6 # tip 6
......
...@@ -120,28 +120,6 @@ class SharedVariable(Variable): ...@@ -120,28 +120,6 @@ class SharedVariable(Variable):
Changes to this value will be visible to all functions using Changes to this value will be visible to all functions using
this SharedVariable. this SharedVariable.
Notes
-----
Set_value will work in-place on the GPU, if
the following conditions are met:
* The destination on the GPU must be c_contiguous.
* The source is on the CPU.
* The old value must have the same dtype as the new value
(which is a given for now, since only float32 is
supported).
* The old and new value must have the same shape.
* The old value is being completely replaced by the new
value (not partially modified, e.g. by replacing some
subtensor of it).
It is also worth mentioning that, for efficient transfer to the GPU,
Aesara will make the new data ``c_contiguous``. This can require an
extra copy of the data on the host.
The inplace on gpu memory work when borrow is either True or False.
""" """
if borrow: if borrow:
self.container.value = new_value self.container.value = new_value
......
...@@ -50,9 +50,7 @@ def d3viz(fct, outfile, copy_deps=True, *args, **kwargs): ...@@ -50,9 +50,7 @@ def d3viz(fct, outfile, copy_deps=True, *args, **kwargs):
edited by selecting Edit from the context menu. edited by selecting Edit from the context menu.
Input nodes are colored in green, output nodes in blue. Apply nodes are Input nodes are colored in green, output nodes in blue. Apply nodes are
ellipses, and colored depending on the type of operation they perform. Red ellipses, and colored depending on the type of operation they perform.
ellipses are transfers from/to the GPU (ops with names GpuFromHost,
HostFromGpu).
Edges are black by default. If a node returns a view of an Edges are black by default. If a node returns a view of an
input, the input edge will be blue. If it returns a destroyed input, the input, the input edge will be blue. If it returns a destroyed input, the
......
...@@ -52,8 +52,6 @@ class PyDotFormatter: ...@@ -52,8 +52,6 @@ class PyDotFormatter:
"unused": "lightgrey", "unused": "lightgrey",
} }
self.apply_colors = { self.apply_colors = {
"GpuFromHost": "red",
"HostFromGpu": "red",
"Scan": "yellow", "Scan": "yellow",
"Shape": "cyan", "Shape": "cyan",
"IfElse": "magenta", "IfElse": "magenta",
......
...@@ -237,7 +237,7 @@ def Rop(f, wrt, eval_points, disconnected_outputs="raise", return_disconnected=" ...@@ -237,7 +237,7 @@ def Rop(f, wrt, eval_points, disconnected_outputs="raise", return_disconnected="
) )
except AttributeError: except AttributeError:
# wrt_elem and eval_point don't always have ndim like random type # wrt_elem and eval_point don't always have ndim like random type
# Tensor, Sparse and GpuArray have the ndim attribute # Tensor, Sparse have the ndim attribute
pass pass
seen_nodes = OrderedDict() seen_nodes = OrderedDict()
......
...@@ -107,14 +107,13 @@ class Type(MetaObject): ...@@ -107,14 +107,13 @@ class Type(MetaObject):
This method allows one to reuse old allocated memory. If this method This method allows one to reuse old allocated memory. If this method
is implemented, it will be called instead of `Type.filter`. is implemented, it will be called instead of `Type.filter`.
As of now, this method is only used when we transfer new data to a As of now, this method is not implemented and was previously used for transferring memory to and from GPU.
shared variable on a GPU.
Parameters Parameters
---------- ----------
value: array-like value: array-like
storage: array-like storage: array-like
The old value (e.g. the old NumPy array, CudaNdarray, etc.) The old value (e.g. the old NumPy array)
strict: bool strict: bool
allow_downcast: bool (optional) allow_downcast: bool (optional)
......
...@@ -1189,7 +1189,7 @@ class ModuleCache: ...@@ -1189,7 +1189,7 @@ class ModuleCache:
# 2) If other repo that import Aesara have Aesara ops defined, # 2) If other repo that import Aesara have Aesara ops defined,
# we need to refresh the cache here. Otherwise, there are import # we need to refresh the cache here. Otherwise, there are import
# order problems. # order problems.
# When device=gpu, we compile during Aesara # (Outdated) When device=gpu, we compile during Aesara
# import. This triggers the loading of the cache. But # import. This triggers the loading of the cache. But
# unpickling the cache asks that the external Ops are # unpickling the cache asks that the external Ops are
# completely loaded, which isn't always the case! # completely loaded, which isn't always the case!
......
...@@ -67,10 +67,8 @@ def execute(execute=True, verbose=True, M=2000, N=2000, K=2000, iters=10, order= ...@@ -67,10 +67,8 @@ def execute(execute=True, verbose=True, M=2000, N=2000, K=2000, iters=10, order=
impl = "CPU (with direct Aesara binding to blas)" impl = "CPU (with direct Aesara binding to blas)"
else: else:
impl = "CPU (without direct Aesara binding to blas but with numpy/scipy binding to blas)" impl = "CPU (without direct Aesara binding to blas but with numpy/scipy binding to blas)"
elif any(x.op.__class__.__name__ == "GpuGemm" for x in f.maker.fgraph.toposort()):
impl = "GPU"
else: else:
impl = "ERROR, unable to tell if Aesara used the cpu or the gpu:\n" impl = "ERROR, unable to tell if Aesara used the cpu:\n"
impl += str(f.maker.fgraph.toposort()) impl += str(f.maker.fgraph.toposort())
t0 = 0 t0 = 0
...@@ -78,7 +76,7 @@ def execute(execute=True, verbose=True, M=2000, N=2000, K=2000, iters=10, order= ...@@ -78,7 +76,7 @@ def execute(execute=True, verbose=True, M=2000, N=2000, K=2000, iters=10, order=
f() # Ignore first function call to get representative time. f() # Ignore first function call to get representative time.
if execute: if execute:
# sync was needed for gpu # NOTE: sync was needed for gpu
sync = False sync = False
if sync: if sync:
......
...@@ -1014,8 +1014,6 @@ Print to the terminal a math-like expression. ...@@ -1014,8 +1014,6 @@ Print to the terminal a math-like expression.
# colors not used: orange, amber#FFBF00, purple, pink, # colors not used: orange, amber#FFBF00, purple, pink,
# used by default: green, blue, grey, red # used by default: green, blue, grey, red
default_colorCodes = { default_colorCodes = {
"GpuFromHost": "red",
"HostFromGpu": "red",
"Scan": "yellow", "Scan": "yellow",
"Shape": "brown", "Shape": "brown",
"IfElse": "magenta", "IfElse": "magenta",
......
...@@ -19,8 +19,7 @@ from aesara.tensor.type import zmatrix ...@@ -19,8 +19,7 @@ from aesara.tensor.type import zmatrix
message = ( message = (
"The module aesara.sandbox.fourier will soon be deprecated." "The module aesara.sandbox.fourier will soon be deprecated."
" Please use aesara.tensor.fft, which supports gradients and " " Please use aesara.tensor.fft, which supports gradients."
"automatic optimization transfers to the GPU ops."
) )
warnings.warn(message) warnings.warn(message)
......
...@@ -394,13 +394,13 @@ class mrg_uniform(COp, mrg_uniform_base): ...@@ -394,13 +394,13 @@ class mrg_uniform(COp, mrg_uniform_base):
for s in size: for s in size:
n_elements *= s n_elements *= s
if n_elements > M1: if n_elements > M1:
# The limit is on the C and GPU code. This perform don't # The limit is on the C code. This perform don't
# have this limit. But to have all of them behave the # have this limit. But to have all of them behave the
# same (and have DebugMode don't use too much memory for # same (and have DebugMode don't use too much memory for
# some rng_mrg tests) I also add this limit here. # some rng_mrg tests) I also add this limit here.
raise ValueError("rng_mrg does not support more then (2**31 -1) samples") raise ValueError("rng_mrg does not support more then (2**31 -1) samples")
rstate = np.asarray(rstate) # bring state from GPU if necessary rstate = np.asarray(rstate) # bring state from XXX if necessary
if not self.inplace: if not self.inplace:
rstate = rstate.copy() rstate = rstate.copy()
...@@ -527,8 +527,7 @@ class mrg_uniform(COp, mrg_uniform_base): ...@@ -527,8 +527,7 @@ class mrg_uniform(COp, mrg_uniform_base):
def c_code(self, node, name, inp, out, sub): def c_code(self, node, name, inp, out, sub):
# If we try to use the C code here with something else than a # If we try to use the C code here with something else than a
# TensorType, something is wrong (likely one of the GPU ops # TensorType, something is wrong.
# not defining C code correctly).
assert isinstance(node.inputs[0].type, TensorType) assert isinstance(node.inputs[0].type, TensorType)
if self.output_type.dtype == "float16": if self.output_type.dtype == "float16":
# C code is not tested, fall back to Python # C code is not tested, fall back to Python
......
...@@ -26,8 +26,6 @@ of using ``scan`` over `for` loops in python (among others) are: ...@@ -26,8 +26,6 @@ of using ``scan`` over `for` loops in python (among others) are:
* it allows computing gradients through the for loop * it allows computing gradients through the for loop
* there exist a bunch of optimizations that help re-write your loop * there exist a bunch of optimizations that help re-write your loop
such that less memory is used and that it runs faster such that less memory is used and that it runs faster
* it ensures that data is not copied from host to gpu and gpu to
host at each step
The Scan Op should typically be used by calling any of the following The Scan Op should typically be used by calling any of the following
functions: ``scan()``, ``map()``, ``reduce()``, ``foldl()``, functions: ``scan()``, ``map()``, ``reduce()``, ``foldl()``,
......
...@@ -277,10 +277,6 @@ def scan( ...@@ -277,10 +277,6 @@ def scan(
allocations are freed at the end of all iterations; this is what the allocations are freed at the end of all iterations; this is what the
flag `aesara.config.allow_gc` means. flag `aesara.config.allow_gc` means.
If you use pre-allocation and this `Scan` is on GPU, the speed up from
`allow_gc` is small. If you are missing memory, disabling `allow_gc`
could help you run graph that request much memory.
strict strict
If ``True``, all the shared variables used in `fn` must be provided as a If ``True``, all the shared variables used in `fn` must be provided as a
part of `non_sequences` or `sequences`. part of `non_sequences` or `sequences`.
......
...@@ -1714,7 +1714,9 @@ class Scan(Op, ScanMethodsMixin, HasInnerGraph): ...@@ -1714,7 +1714,9 @@ class Scan(Op, ScanMethodsMixin, HasInnerGraph):
elif isinstance(self.fn.maker.fgraph.outputs[idx], TensorVariable): elif isinstance(self.fn.maker.fgraph.outputs[idx], TensorVariable):
old_inner_output_data[idx] = var.data old_inner_output_data[idx] = var.data
else: else:
raise RuntimeError("old_inner_output_data[idx] = var.gpudata") raise RuntimeError(
"FIXME: old_inner_output_data[idx] = var.gpudata"
)
# 4.6. Keep a reference to the variables (ndarrays, # 4.6. Keep a reference to the variables (ndarrays,
# etc) associated with mitmot inputs currently in the # etc) associated with mitmot inputs currently in the
...@@ -1849,7 +1851,7 @@ class Scan(Op, ScanMethodsMixin, HasInnerGraph): ...@@ -1849,7 +1851,7 @@ class Scan(Op, ScanMethodsMixin, HasInnerGraph):
output_reused = new_var.data == old_data output_reused = new_var.data == old_data
else: else:
raise RuntimeError( raise RuntimeError(
"output_reused = new_var.gpudata == old_data" "FIXME: output_reused = new_var.gpudata == old_data"
) )
else: else:
output_reused = False output_reused = False
...@@ -1915,7 +1917,7 @@ class Scan(Op, ScanMethodsMixin, HasInnerGraph): ...@@ -1915,7 +1917,7 @@ class Scan(Op, ScanMethodsMixin, HasInnerGraph):
output_reused = new_var.data == old_data output_reused = new_var.data == old_data
else: else:
raise RuntimeError( raise RuntimeError(
"output_reused = new_var.gpudata == old_data" "FIXME: output_reused = new_var.gpudata == old_data"
) )
else: else:
output_reused = False output_reused = False
......
...@@ -1649,11 +1649,6 @@ class Alloc(COp): ...@@ -1649,11 +1649,6 @@ class Alloc(COp):
) )
): ):
return False return False
# If the clients is a transfer to the GPU, we don't want to
# fold. We let the Alloc being moved to the GPU, then we
# let the GPU algo decide if it need to fold it or not.
elif client[0].op.__class__.__name__.lower().startswith("gpu"):
return False
return True return True
...@@ -2215,8 +2210,7 @@ def addbroadcast(x, *axes): ...@@ -2215,8 +2210,7 @@ def addbroadcast(x, *axes):
x broadcastable. When performing the function, if the length of x broadcastable. When performing the function, if the length of
x along that dimension is not 1, a ValueError will be raised. x along that dimension is not 1, a ValueError will be raised.
We apply the opt here not to pollute the graph especially during We apply the opt here not to pollute the graph
the gpu optimization
Parameters Parameters
---------- ----------
...@@ -2252,8 +2246,7 @@ def unbroadcast(x, *axes): ...@@ -2252,8 +2246,7 @@ def unbroadcast(x, *axes):
of x broadcastable. When performing the function, if the length of x broadcastable. When performing the function, if the length
of x along that dimension is not 1, a ValueError will be raised. of x along that dimension is not 1, a ValueError will be raised.
We apply the opt here not to pollute the graph especially during We apply the opt here not to pollute the graph
the gpu optimization
Parameters Parameters
---------- ----------
......
...@@ -169,7 +169,7 @@ def broadcast_like(value, template, fgraph, dtype=None): ...@@ -169,7 +169,7 @@ def broadcast_like(value, template, fgraph, dtype=None):
class InplaceElemwiseOptimizer(GlobalOptimizer): class InplaceElemwiseOptimizer(GlobalOptimizer):
r""" r"""
This is parameterized so that it works for `Elemwise` and `GpuElemwise` `Op`\s. This is parameterized so that it works for `Elemwise` `Op`\s.
""" """
def __init__(self, OP): def __init__(self, OP):
...@@ -1343,8 +1343,7 @@ class ShapeFeature(features.Feature): ...@@ -1343,8 +1343,7 @@ class ShapeFeature(features.Feature):
if repl.owner is shpnode: if repl.owner is shpnode:
# This mean the replacement shape object is # This mean the replacement shape object is
# exactly the same as the current shape object. So # exactly the same as the current shape object. So
# no need for replacement. This happen for example # no need for replacement.
# with the InputToGpuOptimizer optimizer.
continue continue
if ( if (
repl.owner repl.owner
...@@ -1841,9 +1840,7 @@ def local_alloc_empty_to_zeros(fgraph, node): ...@@ -1841,9 +1840,7 @@ def local_alloc_empty_to_zeros(fgraph, node):
This help investigate NaN with NanGuardMode. Not registered by This help investigate NaN with NanGuardMode. Not registered by
default. To activate it, use the Aesara flag default. To activate it, use the Aesara flag
optimizer_including=alloc_empty_to_zeros. This also enable optimizer_including=alloc_empty_to_zeros.
the GPU version of this optimizations.
""" """
if isinstance(node.op, AllocEmpty): if isinstance(node.op, AllocEmpty):
return [zeros(node.inputs, dtype=node.outputs[0].dtype)] return [zeros(node.inputs, dtype=node.outputs[0].dtype)]
...@@ -3000,19 +2997,15 @@ def local_elemwise_fusion_op(op_class, max_input_fct=lambda node: 32, maker=None ...@@ -3000,19 +2997,15 @@ def local_elemwise_fusion_op(op_class, max_input_fct=lambda node: 32, maker=None
and each `Elemwise`'s scalar `Op`, and use the composite scalar `Op` in a and each `Elemwise`'s scalar `Op`, and use the composite scalar `Op` in a
new "fused" `Elemwise`. new "fused" `Elemwise`.
It's parameterized in order to work for `Elemwise` and `GpuElemwise` `Op`\s. It's parameterized in order to work for `Elemwise` `Op`\s.
Parameters Parameters
---------- ----------
op_class : type op_class : type
`GpuElemwise` or `Elemwise` class (the one that we want to fuse) `Elemwise` class (the one that we want to fuse)
max_input_fct : callable max_input_fct : callable
A function that returns the maximum number of inputs that this `Elemwise` A function that returns the maximum number of inputs that this `Elemwise`
can take (useful for `GpuElemwise`). The GPU kernel currently has a can take.
limit of 256 bytes for the size of all parameters passed to it. As
currently we pass a lot of information only by parameter, we must limit how
many `Op`\s we fuse together to avoid busting that 256 limit.
On the CPU we limit to 32 input variables since that is the maximum On the CPU we limit to 32 input variables since that is the maximum
NumPy support. NumPy support.
......
...@@ -1192,7 +1192,7 @@ def to_one_hot(y, nb_class, dtype=None): ...@@ -1192,7 +1192,7 @@ def to_one_hot(y, nb_class, dtype=None):
class Unique(Op): class Unique(Op):
""" """
Wraps `numpy.unique`. This `Op` is not implemented on the GPU. Wraps `numpy.unique`.
Examples Examples
-------- --------
......
...@@ -22,8 +22,7 @@ from aesara.tensor.var import TensorConstant ...@@ -22,8 +22,7 @@ from aesara.tensor.var import TensorConstant
class Fourier(Op): class Fourier(Op):
""" """
WARNING: for officially supported FFTs, use aesara.tensor.fft, which WARNING: for officially supported FFTs, use aesara.tensor.fft, which
provides real-input FFTs. Gradients are supported, as well as optimization provides real-input FFTs. Gradients are supported.
transfers to GPU ops.
An instance of this class returns a finite fourier transform calculated An instance of this class returns a finite fourier transform calculated
along one dimension of an input array. along one dimension of an input array.
......
...@@ -1550,11 +1550,6 @@ def mean(input, axis=None, dtype=None, op=False, keepdims=False, acc_dtype=None) ...@@ -1550,11 +1550,6 @@ def mean(input, axis=None, dtype=None, op=False, keepdims=False, acc_dtype=None)
necessarily be the dtype of the output (in particular necessarily be the dtype of the output (in particular
if it is a discrete (int/uint) dtype, the output will if it is a discrete (int/uint) dtype, the output will
be in a float type). If None, then we use the same rules as `sum()`. be in a float type). If None, then we use the same rules as `sum()`.
Notes
-----
For gpu, if you specify dtype=float32, everything will be done on the gpu.
""" """
input = as_tensor_variable(input) input = as_tensor_variable(input)
if op: if op:
......
...@@ -1673,8 +1673,7 @@ def local_reduce_broadcastable(fgraph, node): ...@@ -1673,8 +1673,7 @@ def local_reduce_broadcastable(fgraph, node):
axis = list(node.op.axis) axis = list(node.op.axis)
cuttable = [a for a in axis if reduced.broadcastable[a]] cuttable = [a for a in axis if reduced.broadcastable[a]]
if cuttable: if cuttable:
# -- we can remove some axes of summation, # -- we can remove some axes of summation.
# which simplifies the codegen for sum, especially on GPU
new_axis = [] new_axis = []
pattern = [] pattern = []
ii = 0 ii = 0
...@@ -1857,10 +1856,6 @@ def local_pow_canonicalize(fgraph, node): ...@@ -1857,10 +1856,6 @@ def local_pow_canonicalize(fgraph, node):
def local_mul_to_sqr(fgraph, node): def local_mul_to_sqr(fgraph, node):
""" """
x*x -> sqr(x) x*x -> sqr(x)
This is faster on the GPU when memory fetching is a big part of
the computation time.
""" """
if node.op == mul: if node.op == mul:
if len(node.inputs) == 2: if len(node.inputs) == 2:
......
...@@ -193,11 +193,6 @@ def matrix_dot(*args): ...@@ -193,11 +193,6 @@ def matrix_dot(*args):
def trace(X): def trace(X):
""" """
Returns the sum of diagonal elements of matrix X. Returns the sum of diagonal elements of matrix X.
Notes
-----
Works on GPU since 0.6rc4.
""" """
return extract_diag(X).sum() return extract_diag(X).sum()
...@@ -729,7 +724,6 @@ class TensorInv(Op): ...@@ -729,7 +724,6 @@ class TensorInv(Op):
def tensorinv(a, ind=2): def tensorinv(a, ind=2):
""" """
Does not run on GPU;
Aesara utilization of numpy.linalg.tensorinv; Aesara utilization of numpy.linalg.tensorinv;
Compute the 'inverse' of an N-dimensional array. Compute the 'inverse' of an N-dimensional array.
...@@ -791,7 +785,7 @@ class TensorSolve(Op): ...@@ -791,7 +785,7 @@ class TensorSolve(Op):
def tensorsolve(a, b, axes=None): def tensorsolve(a, b, axes=None):
""" """
Aesara utilization of numpy.linalg.tensorsolve. Does not run on GPU! Aesara utilization of numpy.linalg.tensorsolve.
Solve the tensor equation ``a x = b`` for x. Solve the tensor equation ``a x = b`` for x.
It is assumed that all indices of `x` are summed over in the product, It is assumed that all indices of `x` are summed over in the product,
......
...@@ -1048,16 +1048,6 @@ def conv3d( ...@@ -1048,16 +1048,6 @@ def conv3d(
Set of feature maps generated by convolutional layer. Tensor is Set of feature maps generated by convolutional layer. Tensor is
is of shape (batch size, output channels, output depth, is of shape (batch size, output channels, output depth,
output rows, output columns) output rows, output columns)
Notes
-----
If cuDNN is available, it will be used on the
GPU. Otherwise, it is the *Corr3dMM* convolution that will be used
"caffe style convolution".
This is only supported in Aesara 0.8 or the development
version until it is released.
""" """
input = as_tensor_variable(input) input = as_tensor_variable(input)
filters = as_tensor_variable(filters) filters = as_tensor_variable(filters)
...@@ -1184,17 +1174,6 @@ def conv2d_grad_wrt_inputs( ...@@ -1184,17 +1174,6 @@ def conv2d_grad_wrt_inputs(
set of feature maps generated by convolutional layer. Tensor set of feature maps generated by convolutional layer. Tensor
is of shape (batch size, output channels, output rows, output is of shape (batch size, output channels, output rows, output
columns) columns)
Notes
-----
:note: If cuDNN is available, it will be used on the
GPU. Otherwise, it is the *CorrMM* convolution that will be used
"caffe style convolution".
:note: This is only supported in Aesara 0.8 or the development
version until it is released.
""" """
filters = as_tensor_variable(filters) filters = as_tensor_variable(filters)
...@@ -1347,17 +1326,6 @@ def conv3d_grad_wrt_inputs( ...@@ -1347,17 +1326,6 @@ def conv3d_grad_wrt_inputs(
set of feature maps generated by convolutional layer. Tensor set of feature maps generated by convolutional layer. Tensor
is of shape (batch size, output channels, output depth, is of shape (batch size, output channels, output depth,
output rows, output columns) output rows, output columns)
Notes
-----
:note: If cuDNN is available, it will be used on the
GPU. Otherwise, it is the *Corr3dMM* convolution that will be used
"caffe style convolution".
:note: This is only supported in Aesara 0.8 or the development
version until it is released.
""" """
filters = as_tensor_variable(filters) filters = as_tensor_variable(filters)
...@@ -1500,17 +1468,6 @@ def conv2d_grad_wrt_weights( ...@@ -1500,17 +1468,6 @@ def conv2d_grad_wrt_weights(
columns) for normal convolution and columns) for normal convolution and
(output channels, output rows, output columns, input channels, (output channels, output rows, output columns, input channels,
filter rows, filter columns) for unshared convolution filter rows, filter columns) for unshared convolution
Notes
-----
:note: If cuDNN is available, it will be used on the
GPU. Otherwise, it is the *CorrMM* convolution that will be used
"caffe style convolution".
:note: This is only supported in Aesara 0.8 or the development
version until it is released.
""" """
input = as_tensor_variable(input) input = as_tensor_variable(input)
...@@ -1644,17 +1601,6 @@ def conv3d_grad_wrt_weights( ...@@ -1644,17 +1601,6 @@ def conv3d_grad_wrt_weights(
set of feature maps generated by convolutional layer. Tensor set of feature maps generated by convolutional layer. Tensor
is of shape (batch size, output channels, output time, output is of shape (batch size, output channels, output time, output
rows, output columns) rows, output columns)
Notes
-----
:note: If cuDNN is available, it will be used on the
GPU. Otherwise, it is the *Corr3dMM* convolution that will be used
"caffe style convolution".
:note: This is only supported in Aesara 0.8 or the development
version until it is released.
""" """
input = as_tensor_variable(input) input = as_tensor_variable(input)
...@@ -3685,19 +3631,6 @@ def conv2d( ...@@ -3685,19 +3631,6 @@ def conv2d(
Symbolic 4D tensor Symbolic 4D tensor
Set of feature maps generated by convolutional layer. Tensor is Set of feature maps generated by convolutional layer. Tensor is
of shape (batch size, output channels, output rows, output columns) of shape (batch size, output channels, output rows, output columns)
Notes
-----
If cuDNN is available, it will be used on the
GPU. Otherwise, it is the *CorrMM* convolution that will be used
"caffe style convolution".
This is only supported in Aesara 0.8 or the development
version until it is released.
The parameter filter_dilation is an implementation of `dilated
convolution <https://arxiv.org/pdf/1511.07122v3.pdf>`_.
""" """
if "imshp_logical" in kwargs or "kshp_logical" in kwargs: if "imshp_logical" in kwargs or "kshp_logical" in kwargs:
...@@ -3822,18 +3755,6 @@ def conv2d_transpose( ...@@ -3822,18 +3755,6 @@ def conv2d_transpose(
Symbolic 4D tensor Symbolic 4D tensor
Set of feature maps generated by the transposed convolution. Tensor is Set of feature maps generated by the transposed convolution. Tensor is
of shape (batch size, output channels, output rows, output columns) of shape (batch size, output channels, output rows, output columns)
Notes
-----
If cuDNN is available, it will be used on the
GPU. Otherwise, it is the *CorrMM* convolution that will be used
"caffe style convolution".
This operation is also sometimes called "deconvolution".
The parameter filter_dilation is an implementation of `dilated
convolution <https://arxiv.org/pdf/1511.07122v3.pdf>`_.
""" """
return conv2d_grad_wrt_inputs( return conv2d_grad_wrt_inputs(
......
...@@ -4,13 +4,6 @@ Provides neural-network specific Ops. ...@@ -4,13 +4,6 @@ Provides neural-network specific Ops.
Notes Notes
----- -----
TODO: factor this out into a neural-network toolbox. TODO: factor this out into a neural-network toolbox.
We register all optimization with the gpu tag as we don't
implement all the intermediate case on the GPU (in particular
AdvancedSubtensor). So to make sure it run well on the gpu with
fast_compile, we register them as needed for the GPU. This can be
revisited later when all the intermediate part are on the GPU.
""" """
import warnings import warnings
......
...@@ -47,7 +47,6 @@ def batch_normalization(inputs, gamma, beta, mean, std, mode="low_mem"): ...@@ -47,7 +47,6 @@ def batch_normalization(inputs, gamma, beta, mean, std, mode="low_mem"):
""" """
This function will build the symbolic graph for applying batch normalization This function will build the symbolic graph for applying batch normalization
to a set of activations. to a set of activations.
Also works on GPUs, but is not optimized using cuDNN.
.. versionadded:: 0.7.1 .. versionadded:: 0.7.1
......
...@@ -44,9 +44,6 @@ class DiagonalSubtensor(Op): ...@@ -44,9 +44,6 @@ class DiagonalSubtensor(Op):
i1 i1
Axis index in x Axis index in x
Notes
-----
Work on the GPU.
Extended summary Extended summary
---------------- ----------------
...@@ -204,8 +201,6 @@ def conv3d( ...@@ -204,8 +201,6 @@ def conv3d(
Another way to define signals: (batch, time, in channel, row, column) Another way to define signals: (batch, time, in channel, row, column)
Another way to define filters: (out channel,time,in channel, row, column) Another way to define filters: (out channel,time,in channel, row, column)
For the GPU, use nnet.conv3d.
See Also See Also
-------- --------
Someone made a script that shows how to swap the axes between Someone made a script that shows how to swap the axes between
......
...@@ -194,7 +194,7 @@ class Images2Neibs(COp): ...@@ -194,7 +194,7 @@ class Images2Neibs(COp):
def perform(self, node, inp, out_, params): def perform(self, node, inp, out_, params):
ten4, neib_shape, neib_step = inp ten4, neib_shape, neib_step = inp
(z,) = out_ (z,) = out_
# GpuImages2Neibs should not run this perform in DebugMode # XXX: GpuImages2Neibs should not run this perform in DebugMode
if not isinstance(self, Images2Neibs): if not isinstance(self, Images2Neibs):
raise aesara.graph.utils.MethodNotDefined() raise aesara.graph.utils.MethodNotDefined()
......
...@@ -591,9 +591,7 @@ def local_abstractconv_check(fgraph, node): ...@@ -591,9 +591,7 @@ def local_abstractconv_check(fgraph, node):
): ):
raise LocalMetaOptimizerSkipAssertionError( raise LocalMetaOptimizerSkipAssertionError(
f"{node.op.__class__.__name__} Aesara optimization failed: there is no implementation " f"{node.op.__class__.__name__} Aesara optimization failed: there is no implementation "
"available supporting the requested options. Did you exclude " "available supporting the requested options. If on CPU, "
'both "conv_dnn" and "conv_gemm" from the optimizer? If on GPU, '
"is cuDNN available and does the GPU support it? If on CPU, "
"do you have a BLAS library installed Aesara can link against? " "do you have a BLAS library installed Aesara can link against? "
"On the CPU we do not support float16." "On the CPU we do not support float16."
) )
......
...@@ -146,12 +146,7 @@ def pool_2d( ...@@ -146,12 +146,7 @@ def pool_2d(
"pool_2d() will have the parameter ignore_border" "pool_2d() will have the parameter ignore_border"
" default value changed to True (currently" " default value changed to True (currently"
" False). To have consistent behavior with all Aesara" " False). To have consistent behavior with all Aesara"
" version, explicitly add the parameter ignore_border=True." " version, explicitly add the parameter ignore_border=True.",
" On the GPU, using ignore_border=True is needed to use cuDNN."
" When using ignore_border=False and not using cuDNN, the only"
" GPU combination supported is when"
" `ws == stride and pad == (0, 0) and mode == 'max'`."
" Otherwise, the convolution will be executed on CPU.",
category=DeprecationWarning, category=DeprecationWarning,
stacklevel=2, stacklevel=2,
) )
...@@ -267,12 +262,7 @@ def pool_3d( ...@@ -267,12 +262,7 @@ def pool_3d(
"pool_3d() will have the parameter ignore_border" "pool_3d() will have the parameter ignore_border"
" default value changed to True (currently" " default value changed to True (currently"
" False). To have consistent behavior with all Aesara" " False). To have consistent behavior with all Aesara"
" version, explicitly add the parameter ignore_border=True." " version, explicitly add the parameter ignore_border=True.",
" On the GPU, using ignore_border=True is needed to use cuDNN."
" When using ignore_border=False and not using cuDNN, the only"
" GPU combination supported is when"
" `ws == stride and pad == (0, 0, 0) and mode == 'max'`."
" Otherwise, the convolution will be executed on CPU.",
category=DeprecationWarning, category=DeprecationWarning,
stacklevel=2, stacklevel=2,
) )
......
...@@ -429,8 +429,6 @@ def solve_triangular( ...@@ -429,8 +429,6 @@ def solve_triangular(
class Solve(SolveBase): class Solve(SolveBase):
""" """
Solve a system of linear equations. Solve a system of linear equations.
For on CPU and GPU.
""" """
__props__ = ( __props__ = (
......
...@@ -328,12 +328,10 @@ class TopKOp(Op): ...@@ -328,12 +328,10 @@ class TopKOp(Op):
Notes Notes
----- -----
- CPU and GPU ops don't produce same output order. This is expected.
- The output order is not guaranteed. On the CPU, we use - The output order is not guaranteed. On the CPU, we use
``np.partition`` and ``np.argpartition`` that only make sure the ``np.partition`` and ``np.argpartition`` that only make sure the
k-th element is the correct one and that the other k-th element is the correct one and that the other
elements are on the correct side. On the GPU, they elements are on the correct side.
look sorted, but we do not test the correctness of this behavior.
- By default, this Op gives two outputs: values and indices. However - By default, this Op gives two outputs: values and indices. However
optimizers may remove a certain output if not needed. optimizers may remove a certain output if not needed.
- Computing the gradient requests the computation of the indices in - Computing the gradient requests the computation of the indices in
......
...@@ -50,15 +50,10 @@ class OrderedUpdates(OrderedDict): ...@@ -50,15 +50,10 @@ class OrderedUpdates(OrderedDict):
# TODO: consider doing error-checking on value. # TODO: consider doing error-checking on value.
# insist that it is an Aesara variable? Have the right type? # insist that it is an Aesara variable? Have the right type?
# This could have weird consequences - for example a # This could have weird consequences - for example a
# GPU SharedVariable is customarily associated with a TensorType
# value. Should it be cast to a GPU value right away? Should
# literals be transformed into constants immediately?
return super().__setitem__(key, value) return super().__setitem__(key, value)
else: else:
raise TypeError( raise TypeError("OrderedUpdates keys must inherit from SharedVariable", key)
"OrderedUpdates keys must inherit from " "SharedVariable", key
)
def update(self, other=None): def update(self, other=None):
if other is None: if other is None:
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论