Implement RandomVariable Subtensor lift optimization

05051381 · Brandon T. Willard · aad5d35c · 05051381 · 05051381
--- a/tests/tensor/random/test_opt.py
+++ b/tests/tensor/random/test_opt.py
@@ -10,8 +10,20 @@ from theano.gof.graph import Constant
 from theano.gof.opt import EquilibriumOptimizer
 from theano.gof.optdb import Query
 from theano.tensor.elemwise import DimShuffle
-from theano.tensor.random.basic import dirichlet, multivariate_normal, normal, poisson
+from theano.tensor.random.basic import (
-from theano.tensor.random.opt import lift_rv_shapes, local_dimshuffle_rv_lift
+    dirichlet,
+    multivariate_normal,
+    normal,
+    poisson,
+    uniform,
+)
+from theano.tensor.random.op import RandomVariable
+from theano.tensor.random.opt import (
+    lift_rv_shapes,
+    local_dimshuffle_rv_lift,
+    local_subtensor_rv_lift,
+)
+from theano.tensor.subtensor import AdvancedSubtensor, AdvancedSubtensor1, Subtensor
 inplace_mode = Mode("py", Query(include=["random_make_inplace"], exclude=[]))
@@ -284,6 +296,187 @@ def test_DimShuffle_lift(ds_order, lifted, dist_op, dist_params, size, rtol):
    np.testing.assert_allclose(res_base, res_opt, rtol=rtol)
+@pytest.mark.parametrize(
+    "indices, lifted, dist_op, dist_params, size",
+    [
+        (
+            # `size`-less advanced boolean indexing
+            (np.r_[True, False, False, True],),
+            True,
+            uniform,
+            (
+                (0.1 - 1e-5) * np.arange(4).astype(dtype=config.floatX),
+                0.1 * np.arange(4).astype(dtype=config.floatX),
+            ),
+            (),
+        ),
+        (
+            # `size`-only advanced boolean indexing
+            (np.r_[True, False, False, True],),
+            True,
+            uniform,
+            (
+                np.array(0.9 - 1e-5, dtype=config.floatX),
+                np.array(0.9, dtype=config.floatX),
+            ),
+            (4,),
+        ),
+        (
+            # `size`-only slice
+            (slice(4, -6, -1),),
+            True,
+            uniform,
+            (
+                np.array(0.9 - 1e-5, dtype=config.floatX),
+                np.array(0.9, dtype=config.floatX),
+            ),
+            (5, 2),
+        ),
+        (
+            (slice(1, None), [0, 2]),
+            True,
+            normal,
+            (
+                np.array([1, 10, 100], dtype=config.floatX),
+                np.array([1e-5, 2e-5, 3e-5], dtype=config.floatX),
+            ),
+            (4, 3),
+        ),
+        (
+            (np.array([1]), 0),
+            True,
+            normal,
+            (
+                np.array([[-1, 20], [300, -4000]], dtype=config.floatX),
+                np.array([[1e-6, 2e-6]], dtype=config.floatX),
+            ),
+            (3, 2, 2),
+        ),
+        # A multi-dimensional case
+        (
+            (np.array([1]), 0),
+            False,
+            multivariate_normal,
+            (
+                np.array([[-1, 20], [300, -4000]], dtype=config.floatX),
+                np.eye(2).astype(config.floatX) * 1e-6,
+            ),
+            (),
+        ),
+        # Only one distribution parameter
+        (
+            (0,),
+            True,
+            poisson,
+            (np.array([[1, 2], [3, 4]], dtype=config.floatX),),
+            (3, 2, 2),
+        ),
+    ],
+)
+@change_flags(compute_test_value_opt="raise", compute_test_value="raise")
+def test_Subtensor_lift(indices, lifted, dist_op, dist_params, size):
+    rng = shared(np.random.RandomState(1233532), borrow=False)
+    dist_params_tt = []
+    for p in dist_params:
+        p_tt = tt.as_tensor(p).type()
+        p_tt.tag.test_value = p
+        dist_params_tt.append(p_tt)
+    size_tt = []
+    for s in size:
+        s_tt = tt.iscalar()
+        s_tt.tag.test_value = s
+        size_tt.append(s_tt)
+    from theano.tensor.subtensor import as_index_constant
+    indices_tt = ()
+    for i in indices:
+        i_tt = as_index_constant(i)
+        if not isinstance(i_tt, slice):
+            i_tt.tag.test_value = i
+        indices_tt += (i_tt,)
+    dist_st = dist_op(*dist_params_tt, size=size_tt, rng=rng)[indices_tt]
+    f_inputs = [
+        p
+        for p in dist_params_tt + size_tt + list(indices_tt)
+        if not isinstance(p, (slice, Constant))
+    ]
+    mode = Mode(
+        "py", EquilibriumOptimizer([local_subtensor_rv_lift], max_use_ratio=100)
+    )
+    f_opt = function(
+        f_inputs,
+        dist_st,
+        mode=mode,
+    )
+    (new_out,) = f_opt.maker.fgraph.outputs
+    if lifted:
+        assert isinstance(new_out.owner.op, RandomVariable)
+        assert all(
+            isinstance(i.owner.op, (AdvancedSubtensor, AdvancedSubtensor1, Subtensor))
+            for i in new_out.owner.inputs[3:]
+            if i.owner
+        )
+    else:
+        assert isinstance(
+            new_out.owner.op, (AdvancedSubtensor, AdvancedSubtensor1, Subtensor)
+        )
+        return
+    f_base = function(
+        f_inputs,
+        dist_st,
+        mode=no_mode,
+    )
+    arg_values = [p.get_test_value() for p in f_inputs]
+    res_base = f_base(*arg_values)
+    res_opt = f_opt(*arg_values)
+    np.testing.assert_allclose(res_base, res_opt, rtol=1e-3)
+def test_Subtensor_lift_restrictions():
+    rng = shared(np.random.RandomState(1233532), borrow=False)
+    std = tt.vector("std")
+    std.tag.test_value = np.array([1e-5, 2e-5, 3e-5], dtype=config.floatX)
+    x = normal(tt.arange(2), tt.ones(2), rng=rng)
+    y = x[1]
+    # The non-`Subtensor` client depends on the RNG state, so we can't perform
+    # the lift
+    z = x - y
+    fg = FunctionGraph([rng], [z], clone=False)
+    _ = EquilibriumOptimizer([local_subtensor_rv_lift], max_use_ratio=100).apply(fg)
+    subtensor_node = fg.outputs[0].owner.inputs[1].owner.inputs[0].owner
+    assert subtensor_node == y.owner
+    assert isinstance(subtensor_node.op, Subtensor)
+    assert subtensor_node.inputs[0].owner.op == normal
+    # The non-`Subtensor` client doesn't depend on the RNG state, so we can
+    # perform the lift
+    z = tt.ones(x.shape) - x[1]
+    fg = FunctionGraph([rng], [z], clone=False)
+    EquilibriumOptimizer([local_subtensor_rv_lift], max_use_ratio=100).apply(fg)
+    rv_node = fg.outputs[0].owner.inputs[1].owner.inputs[0].owner
+    assert rv_node.op == normal
+    assert isinstance(rv_node.inputs[-1].owner.op, Subtensor)
+    assert isinstance(rv_node.inputs[-2].owner.op, Subtensor)
 def test_Dimshuffle_lift_restrictions():
    rng = shared(np.random.RandomState(1233532), borrow=False)

--- a/theano/tensor/random/opt.py
+++ b/theano/tensor/random/opt.py
@@ -9,6 +9,14 @@ from theano.tensor.extra_ops import broadcast_to
 from theano.tensor.opt import in2out
 from theano.tensor.random.op import RandomVariable
 from theano.tensor.random.utils import broadcast_params
+from theano.tensor.subtensor import (
+    AdvancedSubtensor,
+    AdvancedSubtensor1,
+    Subtensor,
+    as_index_variable,
+    get_idx_list,
+    indexed_result_shape,
+)
 @local_optimizer([RandomVariable])
@@ -197,3 +205,152 @@ def local_dimshuffle_rv_lift(fgraph, node):
        return [new_node.outputs[1]]
    return False
+@local_optimizer([Subtensor, AdvancedSubtensor1, AdvancedSubtensor])
+def local_subtensor_rv_lift(fgraph, node):
+    """Lift ``*Subtensor`` `Op`s up to a `RandomVariable`'s parameters.
+    In a fashion similar to `local_dimshuffle_rv_lift`, the indexed dimensions
+    need to be separated into distinct replication-space and (independent)
+    parameter-space ``*Subtensor``s.
+    The replication-space ``*Subtensor`` can be used to determine a
+    sub/super-set of the replication-space and, thus, a "smaller"/"larger"
+    ``size`` tuple.  The parameter-space ``*Subtensor`` is simply lifted and
+    applied to the `RandomVariable`'s distribution parameters.
+    Consider the following example graph:
+    ``normal(mu, std, size=(d1, d2, d3))[idx1, idx2, idx3]``.  The
+    ``*Subtensor`` `Op` requests indices ``idx1``, ``idx2``, and ``idx3``,
+    which correspond to all three ``size`` dimensions.  Now, depending on the
+    broadcasted dimensions of ``mu`` and ``std``, this ``*Subtensor`` `Op`
+    could be reducing the ``size`` parameter and/or subsetting the independent
+    ``mu`` and ``std`` parameters.  Only once the dimensions are properly
+    separated into the two replication/parameter subspaces can we determine how
+    the ``*Subtensor`` indices are distributed.
+    For instance, ``normal(mu, std, size=(d1, d2, d3))[idx1, idx2, idx3]``
+    could become ``normal(mu[idx1], std[idx2], size=np.shape(idx1) + np.shape(idx2) + np.shape(idx3))``
+    if ``mu.shape == std.shape == ()``
+    ``normal`` is a rather simple case, because it's univariate.  Multivariate
+    cases require a mapping between the parameter space and the image of the
+    random variable.  This may not always be possible, but for many common
+    distributions it is.  For example, the dimensions of the multivariate
+    normal's image can be mapped directly to each dimension of its parameters.
+    We use these mappings to change a graph like ``multivariate_normal(mu, Sigma)[idx1]``
+    into ``multivariate_normal(mu[idx1], Sigma[idx1, idx1])``.  Notice how
+    Also, there's the important matter of "advanced" indexing, which may not
+    only subset an array, but also broadcast it to a larger size.
+    """
+    st_op = node.op
+    if not isinstance(st_op, (AdvancedSubtensor, AdvancedSubtensor1, Subtensor)):
+        return False
+    base_rv = node.inputs[0]
+    rv_node = base_rv.owner
+    if not (rv_node and isinstance(rv_node.op, RandomVariable)):
+        return False
+    # If no one else is using the underlying `RandomVariable`, then we can
+    # do this; otherwise, the graph would be internally inconsistent.
+    if not all(
+        (n == node or isinstance(n.op, Shape)) for n, i in fgraph.clients[base_rv]
+    ):
+        return False
+    rv_op = rv_node.op
+    rng, size, dtype, *dist_params = rv_node.inputs
+    # TODO: Remove this once the multi-dimensional changes described below are
+    # in place.
+    if rv_op.ndim_supp > 0:
+        return False
+    rv_op = base_rv.owner.op
+    rng, size, dtype, *dist_params = base_rv.owner.inputs
+    idx_list = getattr(st_op, "idx_list", None)
+    if idx_list:
+        cdata = get_idx_list(node.inputs, idx_list)
+    else:
+        cdata = node.inputs[1:]
+    st_indices, st_is_bool = zip(
+        *tuple(
+            (as_index_variable(i), getattr(i, "dtype", None) == "bool") for i in cdata
+        )
+    )
+    # We need to separate dimensions into replications and independents
+    num_ind_dims = None
+    if len(dist_params) == 1:
+        num_ind_dims = dist_params[0].ndim
+    else:
+        # When there is more than one distribution parameter, assume that all
+        # of them will broadcast to the maximum number of dimensions
+        num_ind_dims = max(d.ndim for d in dist_params)
+    reps_ind_split_idx = base_rv.ndim - (num_ind_dims + rv_op.ndim_supp)
+    if len(st_indices) > reps_ind_split_idx:
+        # These are the indices that need to be applied to the parameters
+        ind_indices = tuple(st_indices[reps_ind_split_idx:])
+        # We need to broadcast the parameters before applying the `*Subtensor*`
+        # with these indices, because the indices could be referencing broadcast
+        # dimensions that don't exist (yet)
+        bcast_dist_params = broadcast_params(dist_params, rv_op.ndims_params)
+        # TODO: For multidimensional distributions, we need a map that tells us
+        # which dimensions of the parameters need to be indexed.
+        #
+        # For example, `multivariate_normal` would have the following:
+        # `RandomVariable.param_to_image_dims = ((0,), (0, 1))`
+        #
+        # I.e. the first parameter's (i.e. mean's) first dimension maps directly to
+        # the dimension of the RV's image, and its second parameter's
+        # (i.e. covariance's) first and second dimensions map directly to the
+        # dimension of the RV's image.
+        args_lifted = tuple(p[ind_indices] for p in bcast_dist_params)
+    else:
+        # In this case, no indexing is applied to the parameters; only the
+        # `size` parameter is affected.
+        args_lifted = dist_params
+    # TODO: Could use `ShapeFeature` info.  We would need to be sure that
+    # `node` isn't in the results, though.
+    # if hasattr(fgraph, "shape_feature"):
+    #     output_shape = fgraph.shape_feature.shape_of(node.outputs[0])
+    # else:
+    output_shape = indexed_result_shape(base_rv.shape, st_indices)
+    size_lifted = (
+        output_shape if rv_op.ndim_supp == 0 else output_shape[: -rv_op.ndim_supp]
+    )
+    # Boolean indices can actually change the `size` value (compared to just
+    # *which* dimensions of `size` are used).
+    if any(st_is_bool):
+        size_lifted = tuple(
+            tt.sum(idx) if is_bool else s
+            for s, is_bool, idx in zip(
+                size_lifted, st_is_bool, st_indices[: (reps_ind_split_idx + 1)]
+            )
+        )
+    new_node = rv_op.make_node(rng, size_lifted, dtype, *args_lifted)
+    _, new_rv = new_node.outputs
+    # Calling `Op.make_node` directly circumvents test value computations, so
+    # we need to compute the test values manually
+    if config.compute_test_value != "off":
+        compute_test_value(new_node)
+    return [new_rv]