dsi_bitstream/utils/
stats.rs

1/*
2 * SPDX-FileCopyrightText: 2023 Inria
3 * SPDX-FileCopyrightText: 2023 Sebastiano Vigna
4 * SPDX-FileCopyrightText: 2024 Tommaso Fontana
5 *
6 * SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later
7 */
8#[cfg(feature = "mem_dbg")]
9use mem_dbg::{MemDbg, MemSize};
10
11use crate::prelude::{
12    bit_len_vbyte, len_delta, len_exp_golomb, len_gamma, len_golomb, len_omega, len_pi, len_rice,
13    len_zeta, Codes,
14};
15use core::fmt::Debug;
16
17#[cfg(feature = "std")]
18use crate::dispatch::{CodesRead, CodesWrite};
19#[cfg(feature = "std")]
20use crate::prelude::Endianness;
21#[cfg(feature = "std")]
22use crate::prelude::{DynamicCodeRead, DynamicCodeWrite, StaticCodeRead, StaticCodeWrite};
23#[cfg(feature = "std")]
24use anyhow::Result;
25#[cfg(feature = "std")]
26use std::sync::Mutex;
27
28#[cfg(feature = "alloc")]
29use alloc::vec;
30#[cfg(feature = "alloc")]
31use alloc::vec::Vec;
32
33/// Keeps track of the space needed to store a stream of integers using
34/// different codes.
35///
36/// This structure can be used to determine empirically which code provides the
37/// best compression for a given stream. You have to [update the
38/// structure](Self::update) with the integers in the stream; at any time, you
39/// can examine the statistics or call [`best_code`](Self::best_code) to get the
40/// best code.
41#[derive(Debug, Copy, Clone)]
42#[cfg_attr(feature = "mem_dbg", derive(MemDbg, MemSize))]
43pub struct CodesStats<
44    // How many ζ codes to consider.
45    const ZETA: usize = 10,
46    // How many Golomb codes to consider.
47    const GOLOMB: usize = 20,
48    // How many Exponential Golomb codes to consider.
49    const EXP_GOLOMB: usize = 10,
50    // How many Rice codes to consider.
51    const RICE: usize = 10,
52    // How many Pi and Pi web codes to consider.
53    const PI: usize = 10,
54> {
55    /// The total number of elements observed.
56    pub total: u64,
57    /// The total space used to store the elements if
58    /// they were stored using the unary code.
59    pub unary: u64,
60    /// The total space used to store the elements if
61    /// they were stored using the gamma code.
62    pub gamma: u64,
63    /// The total space used to store the elements if
64    /// they were stored using the delta code.
65    pub delta: u64,
66    /// The total space used to store the elements if
67    /// they were stored using the omega code.
68    pub omega: u64,
69    /// The total space used to store the elements if
70    /// they were stored using the variable byte code.
71    pub vbyte: u64,
72    /// The total space used to store the elements if
73    /// they were stored using the zeta code.
74    pub zeta: [u64; ZETA],
75    /// The total space used to store the elements if
76    /// they were stored using the Golomb code.
77    pub golomb: [u64; GOLOMB],
78    /// The total space used to store the elements if
79    /// they were stored using the exponential Golomb code.
80    pub exp_golomb: [u64; EXP_GOLOMB],
81    /// The total space used to store the elements if
82    /// they were stored using the Rice code.
83    pub rice: [u64; RICE],
84    /// The total space used to store the elements if
85    /// they were stored using the Pi code.
86    pub pi: [u64; PI],
87}
88
89impl<
90        const ZETA: usize,
91        const GOLOMB: usize,
92        const EXP_GOLOMB: usize,
93        const RICE: usize,
94        const PI: usize,
95    > core::default::Default for CodesStats<ZETA, GOLOMB, EXP_GOLOMB, RICE, PI>
96{
97    fn default() -> Self {
98        Self {
99            total: 0,
100            unary: 0,
101            gamma: 0,
102            delta: 0,
103            omega: 0,
104            vbyte: 0,
105            zeta: [0; ZETA],
106            golomb: [0; GOLOMB],
107            exp_golomb: [0; EXP_GOLOMB],
108            rice: [0; RICE],
109            pi: [0; PI],
110        }
111    }
112}
113
114impl<
115        const ZETA: usize,
116        const GOLOMB: usize,
117        const EXP_GOLOMB: usize,
118        const RICE: usize,
119        const PI: usize,
120    > CodesStats<ZETA, GOLOMB, EXP_GOLOMB, RICE, PI>
121{
122    /// Update the stats with the lengths of the codes for `n` and return
123    /// `n` for convenience.
124    pub fn update(&mut self, n: u64) -> u64 {
125        self.update_many(n, 1)
126    }
127
128    #[inline]
129    pub fn update_many(&mut self, n: u64, count: u64) -> u64 {
130        self.total += count;
131        self.unary += (n + 1) * count;
132        self.gamma += len_gamma(n) as u64 * count;
133        self.delta += len_delta(n) as u64 * count;
134        self.omega += len_omega(n) as u64 * count;
135        self.vbyte += bit_len_vbyte(n) as u64 * count;
136
137        for (k, val) in self.zeta.iter_mut().enumerate() {
138            *val += (len_zeta(n, (k + 1) as _) as u64) * count;
139        }
140        for (b, val) in self.golomb.iter_mut().enumerate() {
141            *val += (len_golomb(n, (b + 1) as _) as u64) * count;
142        }
143        for (k, val) in self.exp_golomb.iter_mut().enumerate() {
144            *val += (len_exp_golomb(n, k as _) as u64) * count;
145        }
146        for (log2_b, val) in self.rice.iter_mut().enumerate() {
147            *val += (len_rice(n, log2_b as _) as u64) * count;
148        }
149        // +2 because π0 = gamma and π1 = zeta_2
150        for (k, val) in self.pi.iter_mut().enumerate() {
151            *val += (len_pi(n, (k + 2) as _) as u64) * count;
152        }
153        n
154    }
155
156    // Combines additively this stats with another one.
157    pub fn add(&mut self, rhs: &Self) {
158        self.total += rhs.total;
159        self.unary += rhs.unary;
160        self.gamma += rhs.gamma;
161        self.delta += rhs.delta;
162        self.omega += rhs.omega;
163        self.vbyte += rhs.vbyte;
164        for (a, b) in self.zeta.iter_mut().zip(rhs.zeta.iter()) {
165            *a += *b;
166        }
167        for (a, b) in self.golomb.iter_mut().zip(rhs.golomb.iter()) {
168            *a += *b;
169        }
170        for (a, b) in self.exp_golomb.iter_mut().zip(rhs.exp_golomb.iter()) {
171            *a += *b;
172        }
173        for (a, b) in self.rice.iter_mut().zip(rhs.rice.iter()) {
174            *a += *b;
175        }
176        for (a, b) in self.pi.iter_mut().zip(rhs.pi.iter()) {
177            *a += *b;
178        }
179    }
180
181    /// Return the best code for the stream and its space usage.
182    pub fn best_code(&self) -> (Codes, u64) {
183        let mut best = (Codes::Unary, self.unary);
184        if self.gamma < best.1 {
185            best = (Codes::Gamma, self.gamma);
186        }
187        if self.delta < best.1 {
188            best = (Codes::Delta, self.delta);
189        }
190        if self.omega < best.1 {
191            best = (Codes::Omega, self.omega);
192        }
193        if self.vbyte < best.1 {
194            best = (Codes::VByteBe, self.vbyte);
195        }
196        for (k, val) in self.zeta.iter().enumerate() {
197            if *val < best.1 {
198                best = (Codes::Zeta { k: (k + 1) as _ }, *val);
199            }
200        }
201        for (b, val) in self.golomb.iter().enumerate() {
202            if *val < best.1 {
203                best = (Codes::Golomb { b: (b + 1) as _ }, *val);
204            }
205        }
206        for (k, val) in self.exp_golomb.iter().enumerate() {
207            if *val < best.1 {
208                best = (Codes::ExpGolomb { k: k as _ }, *val);
209            }
210        }
211        for (log2_b, val) in self.rice.iter().enumerate() {
212            if *val < best.1 {
213                best = (
214                    Codes::Rice {
215                        log2_b: log2_b as _,
216                    },
217                    *val,
218                );
219            }
220        }
221        for (k, val) in self.pi.iter().enumerate() {
222            if *val < best.1 {
223                best = (Codes::Pi { k: (k + 2) as _ }, *val);
224            }
225        }
226        best
227    }
228
229    /// Returns a vector of all codes and their space usage, in ascending order by space usage.
230    #[cfg(feature = "alloc")]
231    pub fn get_codes(&self) -> Vec<(Codes, u64)> {
232        let mut codes = vec![
233            (Codes::Unary, self.unary),
234            (Codes::Gamma, self.gamma),
235            (Codes::Delta, self.delta),
236            (Codes::Omega, self.omega),
237            (Codes::VByteBe, self.vbyte),
238        ];
239        for (k, val) in self.zeta.iter().enumerate() {
240            codes.push((Codes::Zeta { k: (k + 1) as _ }, *val));
241        }
242        for (b, val) in self.golomb.iter().enumerate() {
243            codes.push((Codes::Golomb { b: (b + 1) as _ }, *val));
244        }
245        for (k, val) in self.exp_golomb.iter().enumerate() {
246            codes.push((Codes::ExpGolomb { k: k as _ }, *val));
247        }
248        for (log2_b, val) in self.rice.iter().enumerate() {
249            codes.push((
250                Codes::Rice {
251                    log2_b: log2_b as _,
252                },
253                *val,
254            ));
255        }
256        for (k, val) in self.pi.iter().enumerate() {
257            codes.push((Codes::Pi { k: (k + 2) as _ }, *val));
258        }
259        // sort them by length
260        codes.sort_by_key(|&(_, len)| len);
261        codes
262    }
263}
264
265/// Combines additively this stats with another one.
266impl<
267        const ZETA: usize,
268        const GOLOMB: usize,
269        const EXP_GOLOMB: usize,
270        const RICE: usize,
271        const PI: usize,
272    > core::ops::AddAssign for CodesStats<ZETA, GOLOMB, EXP_GOLOMB, RICE, PI>
273{
274    fn add_assign(&mut self, rhs: Self) {
275        self.add(&rhs);
276    }
277}
278
279/// Combines additively this stats with another one creating a new one.
280impl<
281        const ZETA: usize,
282        const GOLOMB: usize,
283        const EXP_GOLOMB: usize,
284        const RICE: usize,
285        const PI: usize,
286    > core::ops::Add for CodesStats<ZETA, GOLOMB, EXP_GOLOMB, RICE, PI>
287{
288    type Output = Self;
289
290    fn add(self, rhs: Self) -> Self {
291        let mut res = self;
292        res += rhs;
293        res
294    }
295}
296
297/// Allow to call .sum() on an iterator of CodesStats.
298impl<
299        const ZETA: usize,
300        const GOLOMB: usize,
301        const EXP_GOLOMB: usize,
302        const RICE: usize,
303        const PI: usize,
304    > core::iter::Sum for CodesStats<ZETA, GOLOMB, EXP_GOLOMB, RICE, PI>
305{
306    fn sum<I: Iterator<Item = Self>>(iter: I) -> Self {
307        iter.fold(Self::default(), |a, b| a + b)
308    }
309}
310
311#[derive(Debug)]
312#[cfg_attr(feature = "mem_dbg", derive(MemDbg, MemSize))]
313#[cfg(feature = "std")]
314/// A struct that can wrap `Code` and compute `CodesStats` for a given stream.
315pub struct CodesStatsWrapper<
316    W,
317    // How many ζ codes to consider.
318    const ZETA: usize = 10,
319    // How many Golomb codes to consider.
320    const GOLOMB: usize = 20,
321    // How many Exponential Golomb codes to consider.
322    const EXP_GOLOMB: usize = 10,
323    // How many Rice codes to consider.
324    const RICE: usize = 10,
325    // How many Pi and Pi web codes to consider.
326    const PI: usize = 10,
327> {
328    // TODO!: figure out how we can do this without a lock.
329    // This is needed because the [`DynamicCodeRead`] and [`DynamicCodeWrite`] traits must have
330    // &self and not &mut self.
331    stats: Mutex<CodesStats<ZETA, GOLOMB, EXP_GOLOMB, RICE, PI>>,
332    wrapped: W,
333}
334
335#[cfg(feature = "std")]
336impl<
337        W,
338        const ZETA: usize,
339        const GOLOMB: usize,
340        const EXP_GOLOMB: usize,
341        const RICE: usize,
342        const PI: usize,
343    > CodesStatsWrapper<W, ZETA, GOLOMB, EXP_GOLOMB, RICE, PI>
344{
345    /// Create a new `CodesStatsWrapper` with the given wrapped value.
346    pub fn new(wrapped: W) -> Self {
347        Self {
348            stats: Mutex::new(CodesStats::default()),
349            wrapped,
350        }
351    }
352
353    /// Returns a reference to the stats.
354    pub fn stats(&self) -> &Mutex<CodesStats<ZETA, GOLOMB, EXP_GOLOMB, RICE, PI>> {
355        &self.stats
356    }
357
358    /// Consumes the wrapper and returns the inner wrapped value and the stats.
359    pub fn into_inner(self) -> (W, CodesStats<ZETA, GOLOMB, EXP_GOLOMB, RICE, PI>) {
360        (self.wrapped, self.stats.into_inner().unwrap())
361    }
362}
363
364#[cfg(feature = "std")]
365impl<
366        W: DynamicCodeRead,
367        const ZETA: usize,
368        const GOLOMB: usize,
369        const EXP_GOLOMB: usize,
370        const RICE: usize,
371        const PI: usize,
372    > DynamicCodeRead for CodesStatsWrapper<W, ZETA, GOLOMB, EXP_GOLOMB, RICE, PI>
373{
374    #[inline]
375    fn read<E: Endianness, CR: CodesRead<E> + ?Sized>(
376        &self,
377        reader: &mut CR,
378    ) -> Result<u64, CR::Error> {
379        let res = self.wrapped.read(reader)?;
380        self.stats.lock().unwrap().update(res);
381        Ok(res)
382    }
383}
384
385#[cfg(feature = "std")]
386impl<
387        W: StaticCodeRead<E, CR>,
388        const ZETA: usize,
389        const GOLOMB: usize,
390        const EXP_GOLOMB: usize,
391        const RICE: usize,
392        const PI: usize,
393        E: Endianness,
394        CR: CodesRead<E> + ?Sized,
395    > StaticCodeRead<E, CR> for CodesStatsWrapper<W, ZETA, GOLOMB, EXP_GOLOMB, RICE, PI>
396{
397    #[inline]
398    fn read(&self, reader: &mut CR) -> Result<u64, CR::Error> {
399        let res = self.wrapped.read(reader)?;
400        self.stats.lock().unwrap().update(res);
401        Ok(res)
402    }
403}
404
405#[cfg(feature = "std")]
406impl<
407        W: DynamicCodeWrite,
408        const ZETA: usize,
409        const GOLOMB: usize,
410        const EXP_GOLOMB: usize,
411        const RICE: usize,
412        const PI: usize,
413    > DynamicCodeWrite for CodesStatsWrapper<W, ZETA, GOLOMB, EXP_GOLOMB, RICE, PI>
414{
415    #[inline]
416    fn write<E: Endianness, CW: CodesWrite<E> + ?Sized>(
417        &self,
418        writer: &mut CW,
419        value: u64,
420    ) -> Result<usize, CW::Error> {
421        let res = self.wrapped.write(writer, value)?;
422        self.stats.lock().unwrap().update(value);
423        Ok(res)
424    }
425}
426
427#[cfg(feature = "std")]
428impl<
429        W: StaticCodeWrite<E, CW>,
430        const ZETA: usize,
431        const GOLOMB: usize,
432        const EXP_GOLOMB: usize,
433        const RICE: usize,
434        const PI: usize,
435        E: Endianness,
436        CW: CodesWrite<E> + ?Sized,
437    > StaticCodeWrite<E, CW> for CodesStatsWrapper<W, ZETA, GOLOMB, EXP_GOLOMB, RICE, PI>
438{
439    #[inline]
440    fn write(&self, writer: &mut CW, value: u64) -> Result<usize, CW::Error> {
441        let res = self.wrapped.write(writer, value)?;
442        self.stats.lock().unwrap().update(value);
443        Ok(res)
444    }
445}