提交 c058326d authored 作者: Brandon T. Willard's avatar Brandon T. Willard 提交者: Brandon T. Willard

Replace use of T with aet

上级 4a8ccb6d
......@@ -25,9 +25,10 @@ class Assert(COp):
--------
>>> import aesara
>>> import aesara.tensor as aet
>>> x = aet.vector('x')
>>> assert_op = aet.opt.Assert()
>>> func = aesara.function([x], assert_op(x, x.size<2))
>>> from aesara.assert_op import Assert
>>> x = aet.vector("x")
>>> assert_op = Assert("This assert failed")
>>> func = aesara.function([x], assert_op(x, x.size < 2))
"""
......
......@@ -3379,10 +3379,10 @@ def dnn_batch_normalization_train(
axes = 0 if mode == 'per-activation' else (0, 2, 3)
mean = inputs.mean(axes, keepdims=True)
var = inputs.var(axes, keepdims=True)
invstd = T.inv(T.sqrt(var + epsilon))
invstd = aet.inv(aet.sqrt(var + epsilon))
out = (inputs - mean) * gamma * invstd + beta
m = T.cast(T.prod(inputs.shape) / T.prod(mean.shape), 'float32')
m = aet.cast(aet.prod(inputs.shape) / aet.prod(mean.shape), 'float32')
running_mean = running_mean * (1 - running_average_factor) + \\
mean * running_average_factor
running_var = running_var * (1 - running_average_factor) + \\
......@@ -3511,9 +3511,9 @@ def dnn_batch_normalization_test(
.. code-block:: python
axes = (0,) if mode == 'per-activation' else (0, 2, 3)
gamma, beta, mean, var = (T.addbroadcast(t, *axes)
gamma, beta, mean, var = (aet.addbroadcast(t, *axes)
for t in (gamma, beta, mean, var))
out = (inputs - mean) * gamma / T.sqrt(var + epsilon) + beta
out = (inputs - mean) * gamma / aet.sqrt(var + epsilon) + beta
For 5d tensors, the axes would be (0, 2, 3, 4).
"""
......
......@@ -3420,7 +3420,7 @@ class _nd_grid:
Examples
--------
>>> a = T.mgrid[0:5, 0:3]
>>> a = aet.mgrid[0:5, 0:3]
>>> a[0].eval()
array([[0, 0, 0],
[1, 1, 1],
......@@ -3433,7 +3433,7 @@ class _nd_grid:
[0, 1, 2],
[0, 1, 2],
[0, 1, 2]], dtype=int8)
>>> b = T.ogrid[0:5, 0:3]
>>> b = aet.ogrid[0:5, 0:3]
>>> b[0].eval()
array([[0],
[1],
......@@ -3853,45 +3853,28 @@ def diagonal(a, offset=0, axis1=0, axis2=1):
class AllocDiag(Op):
"""
An op that copies a vector to the diagonal of an empty matrix. It does the
inverse of ExtractDiag.
Usage: T.AllocDiag()(x)
`x` should be a tensor vector. The parenthesis in the front should indicate
which main diagonal the vector value goes into. By default it is set to
`0`, which corresponds to setting the values of x to the main diagonal in
the returned matrix.
Parameters
----------
axis1: Axis to be used as the first axis of the 2-D
sub-arrays to which the diagonals will be allocated.
Defaults to first axis (0).
axis2: Axis to be used as the second axis of the 2-D
sub-arrays to which the diagonals will be allocated.
Defaults to second axis (1).
offset: Offset of the diagonal from the main diagonal defined by `axis1`
and `axis2`.
Can be positive or negative.
Defaults to main diagonal (0).
x: symbolic vector
A tensor vector consists of diagonal values.
Returns
-------
tensor : symbolic tenstor
A tensor with passed tensor values at their corresponding diagonals.
"""An `Op` that copies a vector to the diagonal of an empty matrix.
It does the inverse of `ExtractDiag`.
"""
__props__ = ("offset", "axis1", "axis2")
def __init__(self, offset=0, axis1=0, axis2=1):
"""
Parameters
----------
offset: int
Offset of the diagonal from the main diagonal defined by `axis1`
and `axis2`. Can be positive or negative. Defaults to main
diagonal (i.e. 0).
axis1: int
Axis to be used as the first axis of the 2-D sub-arrays to which
the diagonals will be allocated. Defaults to first axis (i.e. 0).
axis2: int
Axis to be used as the second axis of the 2-D sub-arrays to which
the diagonals will be allocated. Defaults to second axis (i.e. 1).
"""
self.offset = offset
self.axis1 = axis1
self.axis2 = axis2
......
......@@ -810,7 +810,7 @@ class ShapeFeature(toolbox.Feature):
2. to infer the shape of every node in the graph in terms of the
input shapes.
3. remove all fills (T.second, T.fill) from the graph
3. remove all fills ``(aet.second, aet.fill)`` from the graph
Lifting shapes as close to the inputs as possible is important for
canonicalization because it is very bad form to have to compute
......@@ -2236,12 +2236,12 @@ def local_alloc_unary(fgraph, node):
x = a.owner.inputs[0]
shp = a.owner.inputs[1:]
v = node.op(x)
# T.alloc does not preserve the stacktrace of v,
# aet.alloc does not preserve the stacktrace of v,
# so we need to copy it over from x.
copy_stack_trace(node.outputs[0], v)
ret = alloc(cast(v, node.outputs[0].dtype), *shp)
# T.cast does not preserve the stacktrace of x,
# aet.cast does not preserve the stacktrace of x,
# so we need to copy it over to the output.
copy_stack_trace([node.outputs[0], a], ret)
return [ret]
......@@ -3132,14 +3132,11 @@ def local_subtensor_of_alloc(fgraph, node):
@register_specialize
@local_optimizer([Subtensor])
def local_subtensor_of_dot(fgraph, node):
"""
This optimization translates T.dot(A, B)[idxs] into T.dot(A[idxs_a], B[idxs_b]),
where idxs_a and idxs_b are defined appropriately.
"""Rewrite ``aet.dot(A, B)[idxs]`` into ``aet.dot(A[idxs_a], B[idxs_b])``.
idxs_a is the first A.ndim-1 entries of idxs,
and idxs_b is the remaining entries of idxs (if any),
modified to skip the second-to-last dimension of B
(because dot sums over this dimension).
``idxs_a`` is the first ``A.ndim-1`` entries of ``idxs``, and ``idxs_b`` is
the remaining entries of ``idxs`` (if any), modified to skip the
second-to-last dimension of ``B`` (because dot sums over this dimension).
"""
if not isinstance(node.op, Subtensor):
......@@ -3535,7 +3532,7 @@ def local_useless_inc_subtensor_alloc(fgraph, node):
i = node.inputs[2:]
if y.owner is not None and isinstance(y.owner.op, Alloc):
# `z` is the input of the Alloc op, i.e. T.alloc(z, <shape>)
# `z` is the input of the Alloc op, i.e. aet.alloc(z, <shape>)
z = y.owner.inputs[0]
try:
......@@ -3803,7 +3800,7 @@ def local_join_empty(fgraph, node):
new_inputs.append(inp)
if len(new_inputs) < len(node.inputs) - 1:
if len(new_inputs) == 0:
# T.join do not work in that case.
            # aet.join does not work in that case.
# constant folding will take care of this case.
return
ret = join(node.inputs[0], *new_inputs)
......@@ -3880,12 +3877,16 @@ def local_join_make_vector(fgraph, node):
def local_useless_switch(fgraph, node):
"""
This optimization makes the following changes in the graph:
T.switch(cond,left,right) -->
if cond is constant and cond == 0: right
if cond is constant and cond != 0: left
if left is right -> left
T.switch(le(shape_i{id}(X), 0), 0, shape_i{id}(X)) -> shape_i{id}(X)
``aet.switch(cond, left, right)`` ->
``if cond is constant and cond == 0``: right
``if cond is constant and cond != 0``: left
``if left is right`` -> ``left``
and
``aet.switch(le(shape_i{id}(X), 0), 0, shape_i{id}(X))`` -> ``shape_i{id}(X)``
"""
if isinstance(node.op, Elemwise) and isinstance(node.op.scalar_op, aes.Switch):
......
......@@ -1111,7 +1111,7 @@ def res_is_a(fgraph, var, op, maxclients=None):
def _as_scalar(res, dtype=None):
"""Return None or a TensorVariable whose type is in T.float_scalar_types"""
"""Return ``None`` or a `TensorVariable` whose type is in `float_scalar_types`"""
if dtype is None:
dtype = config.floatX
if np.all(res.type.broadcastable):
......
......@@ -2490,14 +2490,14 @@ class Prod(CAReduceDtype):
Implementing that case-by-case logic is not as trivial, so a bunch of
hacks are piled down here to do it. Notably, for the "only one zero"
case, there's a special Op that computes the product of the elements
in the group, minus the zero (see ProdWithoutZero). The trick is then
in the group, minus the zero (see `ProdWithoutZeros`). The trick is then
to use the division trick for groups with no zero, to use the
ProdWithoutZeros op where there's only one zero, and to output a
`ProdWithoutZeros` op where there's only one zero, and to output a
derivative of zero for any element part of a group with more than
one zero.
I do this by first counting the number of zeros in each group (see
the "T.eq()" bits), then taking this or that behavior (see T.switch)
I do this by first counting the number of zeros in each group (see the
`aet.eq` bits), then taking this or that behavior (see `aet.switch`)
based on the result of this count.
"""
......@@ -2532,7 +2532,7 @@ class Prod(CAReduceDtype):
gz = gz.dimshuffle(new_dims)
# division trick if we don't have zeros. This will contain
# NaNs to be eliminated in the T.switch if we do have zeros.
# NaNs to be eliminated in the `aet.switch` if we do have zeros.
grad_case_without_zeros = gz * prod_out / prod_in
if self.no_zeros_in_input:
......
......@@ -148,8 +148,7 @@ def local_0_dot_x(fgraph, node):
@register_canonicalize
@local_optimizer([DimShuffle])
def local_lift_transpose_through_dot(fgraph, node):
"""
dot(x,y).T -> dot(y.T, x.T)
"""Perform the rewrite ``dot(x,y).T -> dot(y.T, x.T)``
These optimizations "lift" (propagate towards the inputs) DimShuffle
through dot product. It allows to put the graph in a more standard shape,
......@@ -231,8 +230,9 @@ def local_func_inv(fgraph, node):
@local_optimizer([Sum])
def local_sumsqr2dot(fgraph, node):
"""
This optimization detects T.sqr( W.dimshuffle('x',0,1) * G.dimshuffle(0,'x',1) ).sum(axis=(1,2))
and converts this to T.dot(T.sqr(G), T.sqr(W).sum(axis=0)).
This optimization detects
    ``aet.sqr(W.dimshuffle("x", 0, 1) * G.dimshuffle(0, "x", 1)).sum(axis=(1, 2))``
and converts it to ``aet.dot(aet.sqr(G), aet.sqr(W).sum(axis=0))``.
"""
if (
isinstance(node.op, Sum)
......@@ -305,24 +305,30 @@ def local_expm1(fgraph, node):
def local_mul_switch_sink(fgraph, node):
"""
This optimization makes the following changes in the graph:
T.mul(A,T.switch(cond,0,iff),B) --> T.switch(cond,0,T.mul(A,B,iff))
T.mul(A,T.switch(cond,ift,0),B) --> T.switch(cond,T.mul(A,B,ift),0)
A and B being several (or none) symbolic variables.
This is useful because A and B may not be numerically stable and give
``aet.mul(A, aet.switch(cond, 0, iff), B)`` -> ``aet.switch(cond, 0, aet.mul(A, B, iff))``
``aet.mul(A, aet.switch(cond, ift, 0), B)`` -> ``aet.switch(cond, aet.mul(A, B, ift), 0)``
``A`` and ``B`` being several (or none) symbolic variables.
This is useful because ``A`` and ``B`` may not be numerically stable and give
NaN or inf values for cases where the switch returns 0.
With this optimization T.grad(T.switch(...)) has the right behavior.
With this optimization ``aet.grad(aet.switch(...))`` has the right behavior.
Examples
--------
x -> f(x)
x -> g(x)
y = T.switch(cond,f(x),g(x))
**without the optimization
T.grad(y,x) -> grad(f(x),x) * grad(y,f(x)) + grad(g(x),x) * grad(y,g(x))
**with the optimization
T.grad(y,x) -> switch(cond,grad(f(x),x), 0) + switch(cond,0,grad(g(x),x))
This will be particularly useful for the lazyif because we skip
an entire part of the graph.
x -> f(x)
x -> g(x)
y = aet.switch(cond, f(x), g(x))
without the optimization:
aet.grad(y, x) -> grad(f(x), x) * grad(y, f(x)) + grad(g(x), x) * grad(y, g(x))
with the optimization
aet.grad(y, x) -> switch(cond, grad(f(x), x), 0) + switch(cond, 0, grad(g(x), x))
This will be particularly useful for the lazy ``if`` because we skip an entire
part of the graph.
"""
if node.op != mul:
......@@ -393,13 +399,16 @@ def local_mul_switch_sink(fgraph, node):
def local_div_switch_sink(fgraph, node):
"""
This optimization makes the following changes in the graph:
T.div(T.switch(cond,0,iff),A) --> T.switch(cond,0,T.div(iff,A))
T.div(T.switch(cond,ift,0),A) --> T.switch(cond,T.div(ift,A),0)
A being a symbolic variable.
This is useful because A may not be numerically stable and give
NaN or inf values for cases where the switch returns 0.
See local_mul_switch_sink for more details.
``aet.div(aet.switch(cond, 0, iff), A)`` -> ``aet.switch(cond, 0, aet.div(iff, A))``
``aet.div(aet.switch(cond, ift, 0), A)`` -> ``aet.switch(cond, aet.div(ift, A), 0)``
where ``A`` is a symbolic variable.
This is useful because ``A`` may not be numerically stable and give
``nan`` or ``inf`` values for cases where the switch returns 0.
See `local_mul_switch_sink` for more details.
"""
if node.op != true_div and node.op != int_div:
......@@ -1027,9 +1036,8 @@ def local_sum_prod_mul_by_scalar(fgraph, node):
# for same reason as above.
copy_stack_trace(node.outputs, new_op_output)
# If node.op is a T.elemwise.Prod, then the scalars need to be
# raised to the power of the number of elements in the input
# to the Prod
# If `node.op` is a `Prod`, then the scalars need to be raised to
# the power of the number of elements in the input to the `Prod`
if isinstance(node.op, Prod) and new_op_input_nb_elements != 1:
scalars = [s ** new_op_input_nb_elements for s in scalars]
......
......@@ -17,6 +17,7 @@ import warnings
import numpy as np
import aesara
from aesara import tensor as aet
from aesara.assert_op import Assert
from aesara.configdefaults import config
from aesara.graph.basic import Apply, Variable
......@@ -560,12 +561,12 @@ def assert_conv_shape(shape):
assert_shp = Assert(
f"The convolution would produce an invalid shape (dim[{int(i)}] < 0)."
)
out_shape.append(assert_shp(n, aesara.tensor.ge(n, 0)))
out_shape.append(assert_shp(n, aet.ge(n, 0)))
else:
assert_shp = Assert(
f"The convolution would produce an invalid shape (dim[{int(i)}] < 0)."
)
out_shape.append(assert_shp(n, aesara.tensor.gt(n, 0)))
out_shape.append(assert_shp(n, aet.gt(n, 0)))
return tuple(out_shape)
......@@ -597,7 +598,7 @@ def assert_shape(x, expected_shape, msg="Unexpected shape."):
tests = []
for i in range(x.ndim):
if expected_shape[i] is not None:
tests.append(aesara.tensor.eq(shape[i], expected_shape[i]))
tests.append(aet.eq(shape[i], expected_shape[i]))
if tests:
return Assert(msg)(x, *tests)
else:
......@@ -1862,13 +1863,11 @@ def bilinear_kernel_1D(ratio, normalize=True):
by the indicated ratio using bilinear interpolation in one dimension.
"""
T = aesara.tensor
half_kern = T.arange(1, ratio + 1, dtype=config.floatX)
kern = T.concatenate([half_kern, half_kern[-2::-1]])
half_kern = aet.arange(1, ratio + 1, dtype=config.floatX)
kern = aet.concatenate([half_kern, half_kern[-2::-1]])
if normalize:
kern /= T.cast(ratio, config.floatX)
kern /= aet.cast(ratio, config.floatX)
return kern
......@@ -1903,7 +1902,6 @@ def frac_bilinear_upsampling(input, frac_ratio):
sides. This does not happen when it is odd.
"""
T = aesara.tensor
row, col = input.shape[2:]
up_input = input.reshape((-1, 1, row, col))
......@@ -1928,15 +1926,15 @@ def frac_bilinear_upsampling(input, frac_ratio):
subsample = (frac_ratio[1], frac_ratio[1])
# duplicate borders of the input
concat_mat = T.concatenate(
concat_mat = aet.concatenate(
(up_input[:, :, :1, :], up_input, up_input[:, :, -1:, :]), axis=2
)
concat_mat = T.concatenate(
concat_mat = aet.concatenate(
(concat_mat[:, :, :, :1], concat_mat, concat_mat[:, :, :, -1:]), axis=3
)
# add padding for the pyramidal kernel
double_pad = (2 * T.as_tensor([row, col]) - 1) * np.array(ratio) + 1
double_pad = (2 * aet.as_tensor([row, col]) - 1) * np.array(ratio) + 1
pad = double_pad // 2
# build pyramidal kernel
......@@ -1945,25 +1943,25 @@ def frac_bilinear_upsampling(input, frac_ratio):
)
# add corresponding padding
pad_kern = T.concatenate(
pad_kern = aet.concatenate(
(
T.zeros(
aet.zeros(
tuple(kern.shape[:2]) + (pad[0], kern.shape[-1]),
dtype=config.floatX,
),
kern,
T.zeros(
aet.zeros(
tuple(kern.shape[:2]) + (double_pad[0] - pad[0], kern.shape[-1]),
dtype=config.floatX,
),
),
axis=2,
)
pad_kern = T.concatenate(
pad_kern = aet.concatenate(
(
T.zeros(tuple(pad_kern.shape[:3]) + (pad[1],), dtype=config.floatX),
aet.zeros(tuple(pad_kern.shape[:3]) + (pad[1],), dtype=config.floatX),
pad_kern,
T.zeros(
aet.zeros(
tuple(pad_kern.shape[:3]) + (double_pad[1] - pad[1],),
dtype=config.floatX,
),
......@@ -1972,7 +1970,7 @@ def frac_bilinear_upsampling(input, frac_ratio):
)
# upsample the input by passing it as kernel of conv and using filter_dilation
upsamp = T.nnet.conv2d(
upsamp = conv2d(
pad_kern,
concat_mat,
border_mode="valid",
......@@ -2048,7 +2046,6 @@ def bilinear_upsampling(
return frac_bilinear_upsampling(input, frac_ratio=frac_ratio)
# the remaining case if integer ratio with use_1D_kernel
T = aesara.tensor
try:
up_bs = batch_size * num_input_channels
except TypeError:
......@@ -2058,11 +2055,11 @@ def bilinear_upsampling(
# concatenating the first and last row and column
# first and last row
concat_mat = T.concatenate(
concat_mat = aet.concatenate(
(up_input[:, :, :1, :], up_input, up_input[:, :, -1:, :]), axis=2
)
# first and last col
concat_mat = T.concatenate(
concat_mat = aet.concatenate(
(concat_mat[:, :, :, :1], concat_mat, concat_mat[:, :, :, -1:]), axis=3
)
concat_col = col + 2
......
......@@ -2081,7 +2081,7 @@ def local_useless_crossentropy_softmax_1hot_with_bias_dx_alloc(fgraph, node):
assert dy.ndim == 1
if dy.owner is not None and isinstance(dy.owner.op, aet.Alloc):
# dz is the input of the Alloc op, i.e. T.alloc(dz, <shape>)
# dz is the input of the Alloc op, i.e. aet.alloc(dz, <shape>)
dz = dy.owner.inputs[0]
try:
......
......@@ -185,10 +185,10 @@ def batch_normalization_train(
axes = (0,) + tuple(range(2, inputs.ndim))
mean = inputs.mean(axes, keepdims=True)
var = inputs.var(axes, keepdims=True)
invstd = T.inv(T.sqrt(var + epsilon))
invstd = aet.inv(aet.sqrt(var + epsilon))
out = (inputs - mean) * gamma * invstd + beta
m = T.cast(T.prod(inputs.shape) / T.prod(mean.shape), 'float32')
    m = aet.cast(aet.prod(inputs.shape) / aet.prod(mean.shape), 'float32')
running_mean = running_mean * (1 - running_average_factor) + \\
mean * running_average_factor
running_var = running_var * (1 - running_average_factor) + \\
......@@ -332,9 +332,9 @@ def batch_normalization_test(
axes = (0,)
# for spatial normalization
axes = (0,) + tuple(range(2, inputs.ndim))
gamma, beta, mean, var = (T.addbroadcast(t, *axes)
gamma, beta, mean, var = (aet.addbroadcast(t, *axes)
for t in (gamma, beta, mean, var))
out = (inputs - mean) * gamma / T.sqrt(var + epsilon) + beta
out = (inputs - mean) * gamma / aet.sqrt(var + epsilon) + beta
"""
ndim = inputs.ndim
axes, non_bc_axes = _prepare_batch_normalization_axes(axes, ndim)
......
......@@ -1920,8 +1920,8 @@ class TestConv2dGrads:
def test_conv2d_grad_wrt_inputs(self):
# Compares calculated abstract grads wrt inputs with the fwd grads
# This method checks the outputs of conv2_grad_wrt_inputs against
# the outputs of T.nnet.conv forward grads to make sure the
# This method checks the outputs of `conv2_grad_wrt_inputs` against
# the outputs of `aesara.tensor.nnet.conv` forward grads to make sure the
# results are the same.
for (in_shape, fltr_shape) in zip(self.inputs_shapes, self.filters_shapes):
......@@ -1986,8 +1986,8 @@ class TestConv2dGrads:
def test_conv2d_grad_wrt_weights(self):
# Compares calculated abstract grads wrt weights with the fwd grads
# This method checks the outputs of conv2_grad_wrt_weights against
# the outputs of T.nnet.conv forward grads to make sure the
# This method checks the outputs of `conv2_grad_wrt_weights` against
# the outputs of `aesara.tensor.nnet.conv` forward grads to make sure the
# results are the same.
for (in_shape, fltr_shape) in zip(self.inputs_shapes, self.filters_shapes):
......
......@@ -906,7 +906,7 @@ def test_gemm_nested():
def test_gemm_opt_wishlist():
X, Y, Z, a, b = matrix(), matrix(), matrix(), scalar(), scalar()
# with >2 additions of the same T.dot(X,Y term
# with >2 additions of the same ``aet.dot(X, Y)`` term
just_gemm([X, Y, Z, a, b], [(b * b) * Z * a + (a * a) * dot(X, Y) + b * dot(X, Y)])
just_gemm([X, Y, Z, a, b], [Z + dot(X, Y) + dot(X, Y)])
......
......@@ -842,7 +842,7 @@ class TestAlgebraicCanonize:
# 4 * x / abs(2*x) it get simplifier during canonicalisation.
x = dscalar()
# a = T.abs_(x)
# a = aet.abs_(x)
if config.mode == "FAST_COMPILE":
mode = get_mode("FAST_RUN").excluding("local_elemwise_fusion")
......@@ -2366,7 +2366,6 @@ def test_local_pow_specialize():
assert len(nodes) == 2
assert nodes[0] == sqr
assert isinstance(nodes[1].scalar_op, aes.basic.Inv)
# assert nodes == [T.sqr,T.inv]#Why this don't work?
utt.assert_allclose(f(val_no0), val_no0 ** (-2))
f = function([v], v ** (0.5), mode=mode)
......@@ -2379,7 +2378,6 @@ def test_local_pow_specialize():
assert len(nodes) == 2
assert nodes[0] == sqrt
assert isinstance(nodes[1].scalar_op, aes.basic.Inv)
# assert nodes == [T.sqrt,T.inv]#Why this don't work?
utt.assert_allclose(f(val_no0), val_no0 ** (-0.5))
......
......@@ -127,10 +127,10 @@ class LogisticRegression:
the learning rate is less dependent on the batch size
"""
# y.shape[0] is (symbolically) the number of rows in y, i.e., number of examples (call it n) in the minibatch
# T.arange(y.shape[0]) is a symbolic vector which will contain [0,1,2,... n-1]
# T.log(self.p_y_given_x) is a matrix of Log-Probabilities (call it LP) with one row per example and one column per class
# LP[T.arange(y.shape[0]),y] is a vector v containing [LP[0,y[0]], LP[1,y[1]], LP[2,y[2]], ..., LP[n-1,y[n-1]]]
# and T.mean(LP[T.arange(y.shape[0]),y]) is the mean (across minibatch examples) of the elements in v,
# aet.arange(y.shape[0]) is a symbolic vector which will contain [0,1,2,... n-1]
# aet.log(self.p_y_given_x) is a matrix of Log-Probabilities (call it LP) with one row per example and one column per class
# LP[aet.arange(y.shape[0]),y] is a vector v containing [LP[0,y[0]], LP[1,y[1]], LP[2,y[2]], ..., LP[n-1,y[n-1]]]
# and aet.mean(LP[aet.arange(y.shape[0]),y]) is the mean (across minibatch examples) of the elements in v,
# i.e., the mean log-likelihood across the minibatch.
return log(self.p_y_given_x[aet.arange(y.shape[0]), y])
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论