dsi_bitstream/utils/
implied.rs

1/*
2 * SPDX-FileCopyrightText: 2025 Tommaso Fontana
3 *
4 * SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later
5 */
6
7#[cfg(feature = "alloc")]
8use crate::utils::FindChangePoints;
9#[cfg(feature = "alloc")]
10use rand::distr::weighted::WeightedIndex;
11#[cfg(feature = "alloc")]
12use rand::prelude::*;
13
14#[cfg(feature = "alloc")]
15use alloc::vec::Vec;
16
17/// Given the len function of a code, generates data that allows to sample
18/// its implied distribution, i.e. a code-word with length l has
19/// probability 2^(-l).
20///
21/// This code works only with monotonic non decreasing len functions.
22///
23/// Returns two vectors, the first one contains the input values where the
24/// function changes value and the code length at that point. The second
25/// vector contains the probability of each code length.
26///
27/// Since we cannot write more than 64 bits at once, the codes are limited to
28/// 128 bits.
29#[cfg(feature = "alloc")]
30pub fn get_implied_distribution(f: impl Fn(u64) -> usize) -> (Vec<(u64, usize)>, Vec<f64>) {
31    let change_points = FindChangePoints::new(f)
32        .take_while(|(_input, len)| *len <= 128)
33        .collect::<Vec<_>>();
34
35    // convert to len probabilities
36    let probabilities = change_points
37        .windows(2)
38        .map(|window| {
39            let (input, len) = window[0];
40            let (next_input, _next_len) = window[1];
41            let prob = 2.0_f64.powi(-(len as i32));
42            prob * (next_input - input) as f64
43        })
44        .collect::<Vec<_>>();
45    // TODO!: this ignores the last change point
46
47    (change_points, probabilities)
48}
49
50#[derive(Clone, Copy, Debug)]
51/// An infinite iterator that always returns ().
52pub struct InfiniteIterator;
53
54impl Iterator for InfiniteIterator {
55    type Item = ();
56
57    #[inline(always)]
58    fn next(&mut self) -> Option<Self::Item> {
59        Some(())
60    }
61}
62
63/// Returns an **infinite iterator** of samples from the implied distribution of
64/// the given code length function.
65/// The function f should be the len function of the code.
66///
67/// This code works only with monotonic non decreasing len functions and
68/// the codes are limited to 128 bits as we cannot write more than 64 bits at once.
69///
70/// # Example
71///
72/// ```rust
73/// use dsi_bitstream::utils::sample_implied_distribution;
74/// use dsi_bitstream::codes::len_gamma;
75/// use rand::SeedableRng;
76/// use rand::rngs::SmallRng;
77///
78/// let mut rng = SmallRng::seed_from_u64(42);
79/// let vals: Vec<u64> = sample_implied_distribution(len_gamma, &mut rng)
80///     .take(1000).collect::<Vec<_>>();
81///
82/// assert_eq!(vals.len(), 1000);
83/// ```
84#[cfg(feature = "alloc")]
85pub fn sample_implied_distribution(
86    f: impl Fn(u64) -> usize,
87    rng: &mut impl Rng,
88) -> impl Iterator<Item = u64> + '_ {
89    let (change_points, probabilities) = get_implied_distribution(f);
90    let dist = WeightedIndex::new(probabilities).unwrap();
91
92    InfiniteIterator.map(move |_| {
93        // sample a len with the correct probability
94        let idx = dist.sample(rng);
95        // now we sample a random value with the sampled len
96        let (start_input, _len) = change_points[idx];
97        let (end_input, _len) = change_points[idx + 1];
98        rng.random_range(start_input..end_input)
99    })
100}