dsi_bitstream/utils/
implied.rs

1/*
2 * SPDX-FileCopyrightText: 2025 Tommaso Fontana
3 *
4 * SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later
5 */
6
7use crate::utils::FindChangePoints;
8
9use rand::distr::weighted::WeightedIndex;
10use rand::prelude::*;
11
12/// Given the len function of a code, generates data that allows to sample
13/// its implied distribution, i.e. a code-word with length l has
14/// probability 2^(-l).
15///
16/// This code works only with monotonic non decreasing len functions.
17///
18/// Returns two vectors, the first one contains the input values where the
19/// function changes value and the code length at that point. The second
20/// vector contains the probability of each code length.
21///
22/// Since we cannot write more than 64 bits at once, the codes are limited to
23/// 128 bits.
24pub fn get_implied_distribution(f: impl Fn(u64) -> usize) -> (Vec<(u64, usize)>, Vec<f64>) {
25    let change_points = FindChangePoints::new(f)
26        .take_while(|(_input, len)| *len <= 128)
27        .collect::<Vec<_>>();
28
29    // convert to len probabilities
30    let probabilities = change_points
31        .windows(2)
32        .map(|window| {
33            let (input, len) = window[0];
34            let (next_input, _next_len) = window[1];
35            let prob = 2.0_f64.powi(-(len as i32));
36            prob * (next_input - input) as f64
37        })
38        .collect::<Vec<_>>();
39    // TODO!: this ignores the last change point
40
41    (change_points, probabilities)
42}
43
44#[derive(Clone, Copy, Debug)]
45/// An infinite iterator that always returns ().
46pub struct InfiniteIterator;
47
48impl Iterator for InfiniteIterator {
49    type Item = ();
50
51    #[inline(always)]
52    fn next(&mut self) -> Option<Self::Item> {
53        Some(())
54    }
55}
56
57/// Returns an **infinite iterator** of samples from the implied distribution of
58/// the given code length function.
59/// The function f should be the len function of the code.
60///
61/// This code works only with monotonic non decreasing len functions and
62/// the codes are limited to 128 bits as we cannot write more than 64 bits at once.
63///
64/// # Example
65///
66/// ```rust
67/// use dsi_bitstream::utils::sample_implied_distribution;
68/// use dsi_bitstream::codes::len_gamma;
69/// use rand::SeedableRng;
70/// use rand::rngs::SmallRng;
71///
72/// let mut rng = SmallRng::seed_from_u64(42);
73/// let vals: Vec<u64> = sample_implied_distribution(len_gamma, &mut rng)
74///     .take(1000).collect::<Vec<_>>();
75///
76/// assert_eq!(vals.len(), 1000);
77/// ```
78pub fn sample_implied_distribution(
79    f: impl Fn(u64) -> usize,
80    rng: &mut impl Rng,
81) -> impl Iterator<Item = u64> + '_ {
82    let (change_points, probabilities) = get_implied_distribution(f);
83    let dist = WeightedIndex::new(probabilities).unwrap();
84
85    InfiniteIterator.map(move |_| {
86        // sample a len with the correct probability
87        let idx = dist.sample(rng);
88        // now we sample a random value with the sampled len
89        let (start_input, _len) = change_points[idx];
90        let (end_input, _len) = change_points[idx + 1];
91        rng.random_range(start_input..end_input)
92    })
93}