dsi_bitstream/utils/
stats.rs

1/*
2 * SPDX-FileCopyrightText: 2023 Inria
3 * SPDX-FileCopyrightText: 2023 Sebastiano Vigna
4 * SPDX-FileCopyrightText: 2024 Tommaso Fontana
5 *
6 * SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later
7 */
8#[cfg(feature = "mem_dbg")]
9use mem_dbg::{MemDbg, MemSize};
10
11use crate::dispatch::{CodesRead, CodesWrite};
12use crate::prelude::Endianness;
13use crate::prelude::{
14    bit_len_vbyte, len_delta, len_exp_golomb, len_gamma, len_golomb, len_omega, len_pi, len_rice,
15    len_zeta, Codes,
16};
17use crate::prelude::{DynamicCodeRead, DynamicCodeWrite, StaticCodeRead, StaticCodeWrite};
18use anyhow::Result;
19use core::fmt::Debug;
20use std::sync::Mutex;
21
22/// Keeps track of the space needed to store a stream of integers using
23/// different codes.
24///
25/// This structure can be used to determine empirically which code provides the
26/// best compression for a given stream. You have to [update the
27/// structure](Self::update) with the integers in the stream; at any time, you
28/// can examine the statistics or call [`best_code`](Self::best_code) to get the
29/// best code.
30#[derive(Debug, Copy, Clone)]
31#[cfg_attr(feature = "mem_dbg", derive(MemDbg, MemSize))]
32pub struct CodesStats<
33    // How many ζ codes to consider.
34    const ZETA: usize = 10,
35    // How many Golomb codes to consider.
36    const GOLOMB: usize = 20,
37    // How many Exponential Golomb codes to consider.
38    const EXP_GOLOMB: usize = 10,
39    // How many Rice codes to consider.
40    const RICE: usize = 10,
41    // How many Pi and Pi web codes to consider.
42    const PI: usize = 10,
43> {
44    /// The total number of elements observed.
45    pub total: u64,
46    /// The total space used to store the elements if
47    /// they were stored using the unary code.
48    pub unary: u64,
49    /// The total space used to store the elements if
50    /// they were stored using the gamma code.
51    pub gamma: u64,
52    /// The total space used to store the elements if
53    /// they were stored using the delta code.
54    pub delta: u64,
55    /// The total space used to store the elements if
56    /// they were stored using the omega code.
57    pub omega: u64,
58    /// The total space used to store the elements if
59    /// they were stored using the variable byte code.
60    pub vbyte: u64,
61    /// The total space used to store the elements if
62    /// they were stored using the zeta code.
63    pub zeta: [u64; ZETA],
64    /// The total space used to store the elements if
65    /// they were stored using the Golomb code.
66    pub golomb: [u64; GOLOMB],
67    /// The total space used to store the elements if
68    /// they were stored using the exponential Golomb code.
69    pub exp_golomb: [u64; EXP_GOLOMB],
70    /// The total space used to store the elements if
71    /// they were stored using the Rice code.
72    pub rice: [u64; RICE],
73    /// The total space used to store the elements if
74    /// they were stored using the Pi code.
75    pub pi: [u64; PI],
76}
77
78impl<
79        const ZETA: usize,
80        const GOLOMB: usize,
81        const EXP_GOLOMB: usize,
82        const RICE: usize,
83        const PI: usize,
84    > core::default::Default for CodesStats<ZETA, GOLOMB, EXP_GOLOMB, RICE, PI>
85{
86    fn default() -> Self {
87        Self {
88            total: 0,
89            unary: 0,
90            gamma: 0,
91            delta: 0,
92            omega: 0,
93            vbyte: 0,
94            zeta: [0; ZETA],
95            golomb: [0; GOLOMB],
96            exp_golomb: [0; EXP_GOLOMB],
97            rice: [0; RICE],
98            pi: [0; PI],
99        }
100    }
101}
102
103impl<
104        const ZETA: usize,
105        const GOLOMB: usize,
106        const EXP_GOLOMB: usize,
107        const RICE: usize,
108        const PI: usize,
109    > CodesStats<ZETA, GOLOMB, EXP_GOLOMB, RICE, PI>
110{
111    /// Update the stats with the lengths of the codes for `n` and return
112    /// `n` for convenience.
113    pub fn update(&mut self, n: u64) -> u64 {
114        self.update_many(n, 1)
115    }
116
117    #[inline]
118    pub fn update_many(&mut self, n: u64, count: u64) -> u64 {
119        self.total += count;
120        self.unary += (n + 1) * count;
121        self.gamma += len_gamma(n) as u64 * count;
122        self.delta += len_delta(n) as u64 * count;
123        self.omega += len_omega(n) as u64 * count;
124        self.vbyte += bit_len_vbyte(n) as u64 * count;
125
126        for (k, val) in self.zeta.iter_mut().enumerate() {
127            *val += (len_zeta(n, (k + 1) as _) as u64) * count;
128        }
129        for (b, val) in self.golomb.iter_mut().enumerate() {
130            *val += (len_golomb(n, (b + 1) as _) as u64) * count;
131        }
132        for (k, val) in self.exp_golomb.iter_mut().enumerate() {
133            *val += (len_exp_golomb(n, k as _) as u64) * count;
134        }
135        for (log2_b, val) in self.rice.iter_mut().enumerate() {
136            *val += (len_rice(n, log2_b as _) as u64) * count;
137        }
138        // +2 because π0 = gamma and π1 = zeta_2
139        for (k, val) in self.pi.iter_mut().enumerate() {
140            *val += (len_pi(n, (k + 2) as _) as u64) * count;
141        }
142        n
143    }
144
145    // Combines additively this stats with another one.
146    pub fn add(&mut self, rhs: &Self) {
147        self.total += rhs.total;
148        self.unary += rhs.unary;
149        self.gamma += rhs.gamma;
150        self.delta += rhs.delta;
151        self.omega += rhs.omega;
152        self.vbyte += rhs.vbyte;
153        for (a, b) in self.zeta.iter_mut().zip(rhs.zeta.iter()) {
154            *a += *b;
155        }
156        for (a, b) in self.golomb.iter_mut().zip(rhs.golomb.iter()) {
157            *a += *b;
158        }
159        for (a, b) in self.exp_golomb.iter_mut().zip(rhs.exp_golomb.iter()) {
160            *a += *b;
161        }
162        for (a, b) in self.rice.iter_mut().zip(rhs.rice.iter()) {
163            *a += *b;
164        }
165        for (a, b) in self.pi.iter_mut().zip(rhs.pi.iter()) {
166            *a += *b;
167        }
168    }
169
170    /// Return the best code for the stream and its space usage.
171    pub fn best_code(&self) -> (Codes, u64) {
172        let mut best = self.unary;
173        let mut best_code = Codes::Unary;
174
175        macro_rules! check {
176            ($code:expr, $len:expr) => {
177                if $len < best {
178                    best = $len;
179                    best_code = $code;
180                }
181            };
182        }
183
184        check!(Codes::Gamma, self.gamma);
185        check!(Codes::Delta, self.delta);
186        check!(Codes::Omega, self.omega);
187        check!(Codes::VByteBe, self.vbyte);
188
189        for (k, val) in self.zeta.iter().enumerate() {
190            check!(Codes::Zeta { k: (k + 1) as _ }, *val);
191        }
192        for (b, val) in self.golomb.iter().enumerate() {
193            check!(Codes::Golomb { b: (b + 1) as _ }, *val);
194        }
195        for (k, val) in self.exp_golomb.iter().enumerate() {
196            check!(Codes::ExpGolomb { k: k as _ }, *val);
197        }
198        for (log2_b, val) in self.rice.iter().enumerate() {
199            check!(
200                Codes::Rice {
201                    log2_b: log2_b as _
202                },
203                *val
204            );
205        }
206        for (k, val) in self.pi.iter().enumerate() {
207            check!(Codes::Pi { k: (k + 2) as _ }, *val);
208        }
209
210        (best_code, best)
211    }
212}
213
214/// Combines additively this stats with another one.
215impl<
216        const ZETA: usize,
217        const GOLOMB: usize,
218        const EXP_GOLOMB: usize,
219        const RICE: usize,
220        const PI: usize,
221    > core::ops::AddAssign for CodesStats<ZETA, GOLOMB, EXP_GOLOMB, RICE, PI>
222{
223    fn add_assign(&mut self, rhs: Self) {
224        self.add(&rhs);
225    }
226}
227
228/// Combines additively this stats with another one creating a new one.
229impl<
230        const ZETA: usize,
231        const GOLOMB: usize,
232        const EXP_GOLOMB: usize,
233        const RICE: usize,
234        const PI: usize,
235    > core::ops::Add for CodesStats<ZETA, GOLOMB, EXP_GOLOMB, RICE, PI>
236{
237    type Output = Self;
238
239    fn add(self, rhs: Self) -> Self {
240        let mut res = self;
241        res += rhs;
242        res
243    }
244}
245
246/// Allow to call .sum() on an iterator of CodesStats.
247impl<
248        const ZETA: usize,
249        const GOLOMB: usize,
250        const EXP_GOLOMB: usize,
251        const RICE: usize,
252        const PI: usize,
253    > core::iter::Sum for CodesStats<ZETA, GOLOMB, EXP_GOLOMB, RICE, PI>
254{
255    fn sum<I: Iterator<Item = Self>>(iter: I) -> Self {
256        iter.fold(Self::default(), |a, b| a + b)
257    }
258}
259
260#[derive(Debug)]
261#[cfg_attr(feature = "mem_dbg", derive(MemDbg, MemSize))]
262/// A struct that can wrap `Code` and compute `CodesStats` for a given stream.
263pub struct CodesStatsWrapper<
264    W,
265    // How many ζ codes to consider.
266    const ZETA: usize = 10,
267    // How many Golomb codes to consider.
268    const GOLOMB: usize = 20,
269    // How many Exponential Golomb codes to consider.
270    const EXP_GOLOMB: usize = 10,
271    // How many Rice codes to consider.
272    const RICE: usize = 10,
273    // How many Pi and Pi web codes to consider.
274    const PI: usize = 10,
275> {
276    // TODO!: figure out how we can do this without a lock.
277    // This is needed because the [`DynamicCodeRead`] and [`DynamicCodeWrite`] traits must have
278    // &self and not &mut self.
279    stats: Mutex<CodesStats<ZETA, GOLOMB, EXP_GOLOMB, RICE, PI>>,
280    wrapped: W,
281}
282
283impl<
284        W,
285        const ZETA: usize,
286        const GOLOMB: usize,
287        const EXP_GOLOMB: usize,
288        const RICE: usize,
289        const PI: usize,
290    > CodesStatsWrapper<W, ZETA, GOLOMB, EXP_GOLOMB, RICE, PI>
291{
292    /// Create a new `CodesStatsWrapper` with the given wrapped value.
293    pub fn new(wrapped: W) -> Self {
294        Self {
295            stats: Mutex::new(CodesStats::default()),
296            wrapped,
297        }
298    }
299
300    /// Returns a reference to the stats.
301    pub fn stats(&self) -> &Mutex<CodesStats<ZETA, GOLOMB, EXP_GOLOMB, RICE, PI>> {
302        &self.stats
303    }
304
305    /// Consumes the wrapper and returns the inner wrapped value and the stats.
306    pub fn into_inner(self) -> (W, CodesStats<ZETA, GOLOMB, EXP_GOLOMB, RICE, PI>) {
307        (self.wrapped, self.stats.into_inner().unwrap())
308    }
309}
310
311impl<
312        W: DynamicCodeRead,
313        const ZETA: usize,
314        const GOLOMB: usize,
315        const EXP_GOLOMB: usize,
316        const RICE: usize,
317        const PI: usize,
318    > DynamicCodeRead for CodesStatsWrapper<W, ZETA, GOLOMB, EXP_GOLOMB, RICE, PI>
319{
320    #[inline]
321    fn read<E: Endianness, CR: CodesRead<E> + ?Sized>(
322        &self,
323        reader: &mut CR,
324    ) -> Result<u64, CR::Error> {
325        let res = self.wrapped.read(reader)?;
326        self.stats.lock().unwrap().update(res);
327        Ok(res)
328    }
329}
330
331impl<
332        W: StaticCodeRead<E, CR>,
333        const ZETA: usize,
334        const GOLOMB: usize,
335        const EXP_GOLOMB: usize,
336        const RICE: usize,
337        const PI: usize,
338        E: Endianness,
339        CR: CodesRead<E> + ?Sized,
340    > StaticCodeRead<E, CR> for CodesStatsWrapper<W, ZETA, GOLOMB, EXP_GOLOMB, RICE, PI>
341{
342    #[inline]
343    fn read(&self, reader: &mut CR) -> Result<u64, CR::Error> {
344        let res = self.wrapped.read(reader)?;
345        self.stats.lock().unwrap().update(res);
346        Ok(res)
347    }
348}
349
350impl<
351        W: DynamicCodeWrite,
352        const ZETA: usize,
353        const GOLOMB: usize,
354        const EXP_GOLOMB: usize,
355        const RICE: usize,
356        const PI: usize,
357    > DynamicCodeWrite for CodesStatsWrapper<W, ZETA, GOLOMB, EXP_GOLOMB, RICE, PI>
358{
359    #[inline]
360    fn write<E: Endianness, CW: CodesWrite<E> + ?Sized>(
361        &self,
362        writer: &mut CW,
363        value: u64,
364    ) -> Result<usize, CW::Error> {
365        let res = self.wrapped.write(writer, value)?;
366        self.stats.lock().unwrap().update(value);
367        Ok(res)
368    }
369}
370
371impl<
372        W: StaticCodeWrite<E, CW>,
373        const ZETA: usize,
374        const GOLOMB: usize,
375        const EXP_GOLOMB: usize,
376        const RICE: usize,
377        const PI: usize,
378        E: Endianness,
379        CW: CodesWrite<E> + ?Sized,
380    > StaticCodeWrite<E, CW> for CodesStatsWrapper<W, ZETA, GOLOMB, EXP_GOLOMB, RICE, PI>
381{
382    #[inline]
383    fn write(&self, writer: &mut CW, value: u64) -> Result<usize, CW::Error> {
384        let res = self.wrapped.write(writer, value)?;
385        self.stats.lock().unwrap().update(value);
386        Ok(res)
387    }
388}