Commit e2202bc7 authored by Rémi Louf, committed by Brandon T. Willard

Remove use of `aesara.tensor.nnet` in other tests

Parent 4c685afb
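The diff below drops uses of the `aesara.tensor.nnet` namespace from these tests and imports the corresponding functions from `aesara.tensor.math` instead. A minimal sketch of the renaming, assuming an Aesara version where `softmax` and `log_softmax` are importable from `aesara.tensor.math` (as the new imports in this diff indicate):

# Illustrative sketch of the renaming applied in the tests below; not part of the commit itself.
# Before: softmax/log_softmax were reached through the nnet namespace.
#   from aesara.tensor import nnet as at_nnet
#   out = at_nnet.softmax(x, axis=-1)
#   log_out = at_nnet.logsoftmax(x, axis=-1)
# After: import them directly from aesara.tensor.math.
from aesara.tensor.math import log_softmax, softmax
from aesara.tensor.type import matrix

x = matrix("x")
out = softmax(x, axis=-1)
log_out = log_softmax(x, axis=-1)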
@@ -5,10 +5,9 @@ from aesara.configdefaults import config
from aesara.graph.fg import FunctionGraph
from aesara.graph.op import get_test_value
from aesara.tensor import elemwise as at_elemwise
-from aesara.tensor import nnet as at_nnet
from aesara.tensor.math import SoftmaxGrad
from aesara.tensor.math import all as at_all
-from aesara.tensor.math import prod
+from aesara.tensor.math import log_softmax, prod, softmax
from aesara.tensor.math import sum as at_sum
from aesara.tensor.type import matrix, tensor, vector
from tests.link.jax.test_basic import compare_jax_and_py
@@ -76,7 +75,7 @@ def test_jax_CAReduce():
def test_softmax(axis):
    x = matrix("x")
    x.tag.test_value = np.arange(6, dtype=config.floatX).reshape(2, 3)
-    out = at_nnet.softmax(x, axis=axis)
+    out = softmax(x, axis=axis)
    fgraph = FunctionGraph([x], [out])
    compare_jax_and_py(fgraph, [get_test_value(i) for i in fgraph.inputs])
@@ -85,7 +84,7 @@ def test_softmax(axis):
def test_logsoftmax(axis):
    x = matrix("x")
    x.tag.test_value = np.arange(6, dtype=config.floatX).reshape(2, 3)
-    out = at_nnet.logsoftmax(x, axis=axis)
+    out = log_softmax(x, axis=axis)
    fgraph = FunctionGraph([x], [out])
    compare_jax_and_py(fgraph, [get_test_value(i) for i in fgraph.inputs])
......
@@ -7,7 +7,6 @@ from aesara.configdefaults import config
from aesara.graph.fg import FunctionGraph
from aesara.graph.op import get_test_value
from aesara.scalar.basic import Composite
-from aesara.tensor import nnet as at_nnet
from aesara.tensor.elemwise import Elemwise
from aesara.tensor.math import all as at_all
from aesara.tensor.math import (
@@ -128,10 +127,6 @@ def test_nnet():
    fgraph = FunctionGraph([x], [out])
    compare_jax_and_py(fgraph, [get_test_value(i) for i in fgraph.inputs])
-    out = at_nnet.ultra_fast_sigmoid(x)
-    fgraph = FunctionGraph([x], [out])
-    compare_jax_and_py(fgraph, [get_test_value(i) for i in fgraph.inputs])
    out = softplus(x)
    fgraph = FunctionGraph([x], [out])
    compare_jax_and_py(fgraph, [get_test_value(i) for i in fgraph.inputs])
......
@@ -444,7 +444,7 @@ def test_grad_inrange():
def test_grad_abs():
    a = fscalar("a")
-    b = aesara.tensor.nnet.relu(a)
+    b = 0.5 * (a + aesara.tensor.abs(a))
    c = aesara.grad(b, a)
    f = aesara.function([a], c, mode=Mode(optimizer=None))
    # Currently Aesara returns 0.5, but it isn't certain that this won't change
......
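The `test_grad_abs` change above replaces `nnet.relu(a)` with `0.5 * (a + abs(a))`, relying on the identity relu(x) = max(x, 0) = (x + |x|) / 2. A quick NumPy check of that identity (illustrative only, not part of the commit):

# Illustrative check that relu(x) == 0.5 * (x + |x|); not part of the commit.
import numpy as np

x = np.array([-2.0, -0.5, 0.0, 1.5, 3.0])
assert np.allclose(np.maximum(x, 0.0), 0.5 * (x + np.abs(x)))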
@@ -43,7 +43,6 @@ from aesara.tensor.math import all as at_all
from aesara.tensor.math import dot, exp, mean, sigmoid
from aesara.tensor.math import sum as at_sum
from aesara.tensor.math import tanh
-from aesara.tensor.nnet import categorical_crossentropy
from aesara.tensor.random import normal
from aesara.tensor.random.utils import RandomStream
from aesara.tensor.shape import Shape_i, reshape, specify_shape
@@ -58,7 +57,6 @@ from aesara.tensor.type import (
    fscalar,
    ftensor3,
    fvector,
-    imatrix,
    iscalar,
    ivector,
    lscalar,
@@ -3810,36 +3808,6 @@ class TestExamples:
    # TODO FIXME: What is this testing? At least assert something.
-    def test_grad_two_scans(self):
-        # data input & output
-        x = tensor3("x")
-        t = imatrix("t")
-        # forward pass
-        W = shared(
-            np.random.default_rng(utt.fetch_seed()).random((2, 2)).astype("float32"),
-            name="W",
-            borrow=True,
-        )
-        def forward_scanner(x_t):
-            a2_t = dot(x_t, W)
-            y_t = softmax_graph(a2_t)
-            return y_t
-        y, _ = scan(fn=forward_scanner, sequences=x, outputs_info=[None])
-        # loss function
-        def error_scanner(y_t, t_t):
-            return mean(categorical_crossentropy(y_t, t_t))
-        L, _ = scan(fn=error_scanner, sequences=[y, t], outputs_info=[None])
-        L = mean(L)
-        # backward pass
-        grad(L, [W])
    def _grad_mout_helper(self, n_iters, mode):
        rng = np.random.default_rng(utt.fetch_seed())
        n_hid = 3
......
"""
This is a minimized version of the mlp.py in the tutorial. We removed stuff
that make this mlp don't work. But this test a bug that we saw. This bug made
the Shape_i object not being lifted, that caused the CrossentropySoftmax... op
not being inserted.
"""
__docformat__ = "restructedtext en"
from collections import OrderedDict
import numpy as np
import aesara
import aesara.tensor as at
from aesara.gradient import grad
from aesara.tensor.math import argmax, dot, log, tanh
from aesara.tensor.nnet.basic import CrossentropySoftmax1HotWithBiasDx, softmax
from aesara.tensor.type import ivector, lscalar, matrix
def gen_data():
rng = np.random.default_rng(249820)
# generate the dataset
train_set = (
np.asarray(rng.random((10000, 784)), dtype="float32"),
np.asarray(rng.random((10000,)) * 10, dtype="int64"),
)
valid_set = (
np.asarray(rng.random((10000, 784)), dtype="float32"),
np.asarray(rng.random((10000,)) * 10, dtype="int64"),
)
test_set = (
np.asarray(rng.random((10000, 784)), dtype="float32"),
np.asarray(rng.random((10000,)) * 10, dtype="int64"),
)
def shared_dataset(data_xy):
"""Function that loads the dataset into shared variables
The reason we store our dataset in shared variables is to allow
Aesara to copy it into the GPU memory (when code is run on GPU).
Since copying data into the GPU is slow, copying a minibatch every time
it is needed (the default behaviour if the data is not in a shared
variable) would lead to a large decrease in performance.
"""
data_x, data_y = data_xy
shared_x = aesara.shared(np.asarray(data_x, dtype=aesara.config.floatX))
shared_y = aesara.shared(np.asarray(data_y, dtype=aesara.config.floatX))
# When storing data on the GPU it has to be stored as floats
# therefore we will store the labels as ``floatX`` as well
# (``shared_y`` does exactly that). But during our computations
# we need them as ints (we use labels as indices, and if they are
# floats it doesn't make sense), therefore instead of returning
# ``shared_y`` we will have to cast it to int. This little hack
# lets us get around this issue.
return shared_x, at.cast(shared_y, "int32")
test_set_x, test_set_y = shared_dataset(test_set)
valid_set_x, valid_set_y = shared_dataset(valid_set)
train_set_x, train_set_y = shared_dataset(train_set)
rval = [
(train_set_x, train_set_y),
(valid_set_x, valid_set_y),
(test_set_x, test_set_y),
]
return rval
class LogisticRegression:
"""Multi-class Logistic Regression Class
The logistic regression is fully described by a weight matrix :math:`W`
and bias vector :math:`b`. Classification is done by projecting data
points onto a set of hyperplanes, the distance to which is used to
determine a class membership probability.
"""
def __init__(self, input, n_in, n_out, name_prefix=""):
"""Initialize the parameters of the logistic regression
:type input: TensorType
:param input: symbolic variable that describes the input of the
architecture (one minibatch)
:type n_in: int
:param n_in: number of input units, the dimension of the space in
which the datapoints lie
:type n_out: int
:param n_out: number of output units, the dimension of the space in
which the labels lie
"""
# initialize with 0 the weights W as a matrix of shape (n_in, n_out)
self.W = aesara.shared(
value=np.zeros((n_in, n_out), dtype=aesara.config.floatX),
name=name_prefix + "W",
)
# compute vector of class-membership probabilities in symbolic form
self.p_y_given_x = softmax(dot(input, self.W))
# compute prediction as class whose probability is maximal in
# symbolic form
self.y_pred = argmax(self.p_y_given_x, axis=1)
# parameters of the model
self.params = [self.W]
def negative_log_likelihood(self, y):
r"""Return the mean of the negative log-likelihood of the prediction
of this model under a given target distribution.
.. math::
\frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
\frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
\ell (\theta=\{W,b\}, \mathcal{D})
:type y: TensorType
:param y: corresponds to a vector that gives for each example the
correct label
Note: we use the mean instead of the sum so that
the learning rate is less dependent on the batch size
"""
# y.shape[0] is (symbolically) the number of rows in y, i.e., number of examples (call it n) in the minibatch
# at.arange(y.shape[0]) is a symbolic vector which will contain [0,1,2,... n-1]
# at.log(self.p_y_given_x) is a matrix of Log-Probabilities (call it LP) with one row per example and one column per class
# LP[at.arange(y.shape[0]),y] is a vector v containing [LP[0,y[0]], LP[1,y[1]], LP[2,y[2]], ..., LP[n-1,y[n-1]]]
# and at.mean(LP[at.arange(y.shape[0]),y]) is the mean (across minibatch examples) of the elements in v,
# i.e., the mean log-likelihood across the minibatch.
return log(self.p_y_given_x[at.arange(y.shape[0]), y])
class HiddenLayer:
def __init__(self, rng, input, n_in, n_out, activation=tanh, name_prefix=""):
"""
Typical hidden layer of a MLP: units are fully-connected and have
sigmoidal activation function. Weight matrix W is of shape (n_in,n_out)
and the bias vector b is of shape (n_out,).
NOTE : The nonlinearity used here is tanh
Hidden unit activation is given by: tanh(dot(input,W) + b)
:type rng: numpy.random.Generator
:param rng: a random number generator used to initialize weights
:type input: dmatrix
:param input: a symbolic tensor of shape (n_examples, n_in)
:type n_in: int
:param n_in: dimensionality of input
:type n_out: int
:param n_out: number of hidden units
:type activation: aesara.graph.op.Op or function
:param activation: Non linearity to be applied in the hidden
layer
"""
self.input = input
# `W` is initialized with `W_values`, which is uniformly sampled
# between -sqrt(6./(n_in+n_out)) and sqrt(6./(n_in+n_out));
# the output of uniform is converted using asarray to dtype
# aesara.config.floatX so that the code is runnable on GPU
W_values = np.asarray(
rng.uniform(
low=-np.sqrt(6.0 / (n_in + n_out)),
high=np.sqrt(6.0 / (n_in + n_out)),
size=(n_in, n_out),
),
dtype=aesara.config.floatX,
)
self.W = aesara.shared(value=W_values, name=name_prefix + "W")
self.output = dot(input, self.W)
# parameters of the model
self.params = [self.W]
class MLP:
"""Multi-Layer Perceptron Class
A multilayer perceptron is a feedforward artificial neural network model
that has one or more layers of hidden units and nonlinear activations.
Intermediate layers usually have tanh or the sigmoid function as their
activation (defined here by a ``SigmoidalLayer`` class), while the
top layer is a softmax layer (defined here by a ``LogisticRegression``
class).
"""
def __init__(self, rng, input, n_in, n_hidden, n_out):
"""Initialize the parameters for the multilayer perceptron
:type rng: numpy.random.Generator
:param rng: a random number generator used to initialize weights
:type input: TensorType
:param input: symbolic variable that describes the input of the
architecture (one minibatch)
:type n_in: int
:param n_in: number of input units, the dimension of the space in
which the datapoints lie
:type n_hidden: int
:param n_hidden: number of hidden units
:type n_out: int
:param n_out: number of output units, the dimension of the space in
which the labels lie
"""
# Since we are dealing with a one hidden layer MLP, this will
# translate into a TanhLayer connected to the LogisticRegression
# layer; this can be replaced by a SigmoidalLayer, or a layer
# implementing any other nonlinearity
self.hiddenLayer = HiddenLayer(
rng=rng,
input=input,
n_in=n_in,
n_out=n_hidden,
activation=tanh,
name_prefix="hid_",
)
# The logistic regression layer gets as input the hidden units
# of the hidden layer
self.logRegressionLayer = LogisticRegression(
input=self.hiddenLayer.output,
n_in=n_hidden,
n_out=n_out,
name_prefix="log_",
)
# negative log likelihood of the MLP is given by the negative
# log likelihood of the output of the model, computed in the
# logistic regression layer
self.negative_log_likelihood = self.logRegressionLayer.negative_log_likelihood
# the parameters of the model are the parameters of the two layers it is
# made out of
self.params = self.hiddenLayer.params + self.logRegressionLayer.params
def test_mlp():
"""
Demonstrate stochastic gradient descent optimization for a multilayer
perceptron
This is demonstrated on MNIST.
:type learning_rate: float
:param learning_rate: learning rate used (factor for the stochastic
gradient)
:type n_epochs: int
:param n_epochs: maximal number of epochs to run the optimizer
:type dataset: string
:param dataset: the path of the MNIST dataset file from
http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
"""
datasets = gen_data()
train_set_x, train_set_y = datasets[0]
valid_set_x, valid_set_y = datasets[1]
test_set_x, test_set_y = datasets[2]
batch_size = 100 # size of the minibatch
# compute number of minibatches for training, validation and testing
# n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
# n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
# n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size
######################
# BUILD ACTUAL MODEL #
######################
# print '... building the model'
# allocate symbolic variables for the data
index = lscalar() # index to a [mini]batch
x = matrix("x") # the data is presented as rasterized images
y = ivector("y") # the labels are presented as 1D vector of
# [int] labels
rng = np.random.default_rng(1234)
# construct the MLP class
classifier = MLP(rng=rng, input=x, n_in=28 * 28, n_hidden=500, n_out=10)
# the cost we minimize during training is the negative log likelihood of
# the model.
# We take the mean of the cost over each minibatch.
cost = classifier.negative_log_likelihood(y).mean()
# compute the gradient of cost with respect to theta (stored in params)
# the resulting gradients will be stored in a list gparams
gparams = []
for param in classifier.params:
gparam = grad(cost, param)
gparams.append(gparam)
# Some optimizations needed are tagged with 'fast_run'
# TODO: refine that and include only those
mode = aesara.compile.get_default_mode().including("fast_run")
updates2 = OrderedDict()
updates2[classifier.hiddenLayer.params[0]] = grad(
cost, classifier.hiddenLayer.params[0]
)
train_model = aesara.function(
inputs=[index],
updates=updates2,
givens={
x: train_set_x[index * batch_size : (index + 1) * batch_size],
y: train_set_y[index * batch_size : (index + 1) * batch_size],
},
mode=mode,
)
# print 'MODEL 1'
# aesara.printing.debugprint(train_model, print_type=True)
assert any(
isinstance(i.op, CrossentropySoftmax1HotWithBiasDx)
for i in train_model.maker.fgraph.toposort()
)
# Even without FeatureShape
train_model = aesara.function(
inputs=[index],
updates=updates2,
mode=mode.excluding("ShapeOpt"),
givens={
x: train_set_x[index * batch_size : (index + 1) * batch_size],
y: train_set_y[index * batch_size : (index + 1) * batch_size],
},
)
# print
# print 'MODEL 2'
# aesara.printing.debugprint(train_model, print_type=True)
assert any(
isinstance(i.op, CrossentropySoftmax1HotWithBiasDx)
for i in train_model.maker.fgraph.toposort()
)
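One detail worth noting from the MLP test file above: `shared_dataset` stores the labels as `floatX` so the shared variable can live on the GPU, then casts them back to integers for indexing. A minimal sketch of that pattern on its own, using only APIs already imported in the file above:

# Illustrative sketch of the cast-to-int pattern used in shared_dataset; not part of the commit.
import numpy as np
import aesara
import aesara.tensor as at

# Store labels as floatX so the shared variable can be placed on the GPU ...
y_values = np.asarray([3, 1, 4], dtype=aesara.config.floatX)
shared_y = aesara.shared(y_values)
# ... but expose them to the graph as integers, suitable for use as indices.
y_int = at.cast(shared_y, "int32")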
@@ -25,10 +25,9 @@ from aesara.graph.basic import Apply
from aesara.graph.op import Op
from aesara.tensor.math import argmax, dot
from aesara.tensor.math import max as at_max
-from aesara.tensor.nnet import conv, conv2d
from aesara.tensor.shape import unbroadcast
from aesara.tensor.signal.pool import Pool
-from aesara.tensor.type import TensorType, matrix, vector
+from aesara.tensor.type import matrix, vector
from tests import unittest_tools as utt
@@ -302,62 +301,6 @@ class TestRopLop(RopLopChecker):
        v2 = scan_f()
        assert np.allclose(v1, v2), f"Rop mismatch: {v1} {v2}"
-    def test_conv(self):
-        for conv_op in [conv.conv2d, conv2d]:
-            for border_mode in ["valid", "full"]:
-                image_shape = (2, 2, 4, 5)
-                filter_shape = (2, 2, 2, 3)
-                image_dim = len(image_shape)
-                filter_dim = len(filter_shape)
-                input = TensorType(aesara.config.floatX, [False] * image_dim)(
-                    name="input"
-                )
-                filters = TensorType(aesara.config.floatX, [False] * filter_dim)(
-                    name="filter"
-                )
-                ev_input = TensorType(aesara.config.floatX, [False] * image_dim)(
-                    name="ev_input"
-                )
-                ev_filters = TensorType(aesara.config.floatX, [False] * filter_dim)(
-                    name="ev_filters"
-                )
-                def sym_conv2d(input, filters):
-                    return conv_op(input, filters, border_mode=border_mode)
-                output = sym_conv2d(input, filters).flatten()
-                yv = Rop(output, [input, filters], [ev_input, ev_filters])
-                mode = None
-                if aesara.config.mode == "FAST_COMPILE":
-                    mode = "FAST_RUN"
-                rop_f = function(
-                    [input, filters, ev_input, ev_filters],
-                    yv,
-                    on_unused_input="ignore",
-                    mode=mode,
-                )
-                sy, _ = aesara.scan(
-                    lambda i, y, x1, x2, v1, v2: (grad(y[i], x1) * v1).sum()
-                    + (grad(y[i], x2) * v2).sum(),
-                    sequences=at.arange(output.shape[0]),
-                    non_sequences=[output, input, filters, ev_input, ev_filters],
-                    mode=mode,
-                )
-                scan_f = function(
-                    [input, filters, ev_input, ev_filters],
-                    sy,
-                    on_unused_input="ignore",
-                    mode=mode,
-                )
-                dtype = aesara.config.floatX
-                image_data = np.random.random(image_shape).astype(dtype)
-                filter_data = np.random.random(filter_shape).astype(dtype)
-                ev_image_data = np.random.random(image_shape).astype(dtype)
-                ev_filter_data = np.random.random(filter_shape).astype(dtype)
-                v1 = rop_f(image_data, filter_data, ev_image_data, ev_filter_data)
-                v2 = scan_f(image_data, filter_data, ev_image_data, ev_filter_data)
-                assert np.allclose(v1, v2), f"Rop mismatch: {v1} {v2}"
    def test_join(self):
        tv = np.asarray(self.rng.uniform(size=(10,)), aesara.config.floatX)
        t = aesara.shared(tv)
......