compressed_intvec/variable/
codec.rs

1//! Codec selection for variable-length integer compression.
2//!
3//! This module defines [`VariableCodecSpec`], an enum for controlling the
4//! compression strategy of an [`IntVec`]. The choice of codec is a critical
5//! performance parameter, as its effectiveness depends on the statistical
6//! properties of the data being compressed.
7//!
8//! # Codec Selection Strategy
9//!
10//! Codec selection is performed by a statistical analysis of the entire
11//! input dataset at construction time. 
12//!
13//! The [`VariableCodecSpec`] enum provides several ways to specify the compression method:
14//!
15//! 1.  **Explicit Specification**: A specific codec and all its parameters are
16//!     provided. This is suitable when the data characteristics are known in
17//!     advance.
18//!     - Non-parametric examples: [`Gamma`](VariableCodecSpec::Gamma), [`Delta`](VariableCodecSpec::Delta).
19//!     - Parametric example: `Zeta { k: Some(3) }`.
20//! 
21//! ```
22//! use compressed_intvec::prelude::*;
23//! 
24//! let data: &[u32] = &(0..1000).collect::<Vec<_>>(); 
25//! 
26//! // Explicitly specify a non-parametric codec
27//! let delta_vec: UIntVec<u32> = IntVec::builder()
28//!     .codec(VariableCodecSpec::Delta)
29//!     .k(16)
30//!     .build(&data)
31//!     .unwrap();
32//!  
33//! // Explicitly specify a parametric codec with a fixed parameter
34//! let zeta_vec: UIntVec<u32> = IntVec::builder()
35//!     .codec(VariableCodecSpec::Zeta { k: Some(3) })
36//!     .build(&data)
37//!     .unwrap();
38//! ```
39//!
40//! 2.  **Automatic Parameter Estimation**: A specific codec family is chosen, but
41//!     the optimal parameter is determined by the builder based on a full data
42//!     analysis. This is achieved by providing `None` as the parameter value.
43//!     - Example: `Rice { log2_b: None }` will find the best `log2_b` for the
44//!       given data.
45//! 
46//! ```
47//! use compressed_intvec::prelude::*;
48//!
49//! let data: &[u32] = &(0..1000).collect::<Vec<_>>();
50//!
51//! // Automatically select the best Rice parameter
52//! let rice_vec: UIntVec<u32> = IntVec::builder()
53//!     .codec(VariableCodecSpec::Rice { log2_b: None })
54//!     .build(&data)
55//!     .unwrap();
56//! ```
57//!
58//! 3.  **Fully Automatic Selection**: The builder analyzes the data against all
59//!     available codecs and their standard parameter ranges to find the single
60//!     best configuration. This is activated by using [`VariableCodecSpec::Auto`].
61//! 
62//! ```
63//! use compressed_intvec::prelude::*;
64//! 
65//! let data: &[u32] = &(0..1000).collect::<Vec<_>>();
66//! // Automatically select the best codec and parameters for the data
67//! let auto_vec: UIntVec<u32> = IntVec::builder()
68//!    .codec(VariableCodecSpec::Auto)
69//!    .build(&data)
70//!    .unwrap();
71//! ```
72//! 
73//!
74//! ## Analysis Mechanism
75//!
76//! The selection logic uses the [`CodesStats`] utility from the [`dsi-bitstream`]
77//! crate. For a given sequence of integers, [`CodesStats`] calculates the exact
78//! total bit cost for encoding the sequence with a wide range of instantaneous
79//! codes and their common parameterizations. 
80//!
81//! ## Construction Overhead
82//!
83//! The full-dataset analysis has a one-time computational cost at construction.
84//! The complexity is `O(N * C)`, where `N` is the number of elements in the
85//! input and `C` is the number of codec configurations tested by [`CodesStats`]
86//! (approximately 70).
87//!
88//! This trade-off is suitable for read-heavy workloads where a higher initial
89//! cost is acceptable for better compression and subsequent read performance.
90//!
91//! # Implementation Notes
92//!
93//! - The parameter ranges for codecs like Zeta and Rice are defined by the `const
94//!   generics` of the [`CodesStats`] struct in [`dsi-bitstream`]. The default
95//!   values cover common and effective parameter ranges.
96//! - If a data distribution benefits from a parameter outside of the tested
97//!   range (e.g., Zeta with `k=20`), it must be specified explicitly in the
98//!   builder via `.codec(VariableCodecSpec::Zeta { k: Some(20) })`.
99//!
100//! [`IntVec`]: crate::variable::IntVec
101//! [`IntVecBuilder`]: crate::variable::builder::IntVecBuilder
102//! [`dsi-bitstream`]: https://crates.io/crates/dsi-bitstream
103
104use super::IntVecError;
105use dsi_bitstream::prelude::{Codes, CodesStats};
106
107/// Specifies the compression codec and its parameters for an [`IntVec`](super::IntVec).
108///
109/// This enum allows for either explicitly setting the parameters for codes
110/// like Rice and Zeta, or requesting that the [`IntVecBuilder`](super::builder::IntVecBuilder)
111/// automatically select suitable parameters by performing a full analysis of
112/// the data distribution.
113#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
114pub enum VariableCodecSpec {
115    /// Elias γ-coding. A simple, universal code that is effective for integers
116    /// with a distribution skewed towards small values. It is the default codec
117    /// for the iterator-based builder, which cannot perform data analysis.
118    #[default]
119    Gamma,
120
121    /// Elias δ-coding. A universal code that is generally more efficient than
122    /// Gamma for larger integer values.
123    Delta,
124
125    /// Unary coding. Encodes an integer `n` as `n` zeros followed by a one. It is
126    /// only efficient for extremely small values (e.g., 0, 1, 2).
127    Unary,
128
129    /// Rice-coding with a parameter `log2_b`. This code is optimal for data with
130    /// a geometric distribution.
131    ///
132    /// - If `log2_b` is `Some(val)`, the specified parameter is used.
133    /// - If `log2_b` is `None`, an optimal parameter is estimated by analyzing the
134    ///   entire dataset.
135    Rice { log2_b: Option<u8> },
136
137    /// Boldi-Vigna ζ-coding with a parameter `k`. This code is effective for
138    /// data with a power-law distribution, common in web graphs and social networks.
139    ///
140    /// - If `k` is `Some(val)`, the specified parameter is used (`k > 0`).
141    /// - If `k` is `None`, an optimal parameter is estimated by analyzing the
142    ///   entire dataset.
143    Zeta { k: Option<u64> },
144
145    /// Golomb-coding with a parameter `b`. This is a generalization of Rice coding
146    /// and is also suitable for geometric distributions.
147    ///
148    /// - If `b` is `Some(val)`, the specified parameter is used (`b > 0`).
149    /// - If `b` is `None`, an optimal parameter is estimated by analyzing the
150    ///   entire dataset.
151    Golomb { b: Option<u64> },
152
153    /// Elias-Fano ω-coding. A universal code.
154    Omega,
155
156    /// Streamlined Apostolico–Drovandi π code with a parameter `k`.
157    ///
158    /// - If `k` is `Some(val)`, the specified parameter is used (`k > 0`).
159    /// - If `k` is `None`, an optimal parameter is estimated by analyzing the
160    ///   entire dataset.
161    Pi { k: Option<u64> },
162
163    /// Elias-Fano Exponential-Golomb coding with a parameter `k`.
164    ///
165    /// - If `k` is `Some(val)`, the specified parameter is used.
166    /// - If `k` is `None`, an optimal parameter is estimated by analyzing the
167    ///   entire dataset.
168    ExpGolomb { k: Option<u64> },
169
170    /// VByte encoding with Little-Endian byte order. This is often one of the
171    /// fastest codecs for decoding, though it may not offer the best compression.
172    VByteLe,
173
174    /// VByte encoding with Big-Endian byte order.
175    VByteBe,
176
177    /// Automatically select the best variable-length code based on the data.
178    ///
179    /// When this option is used, the builder performs a statistical
180    /// analysis on the entire input dataset to determine which codec and
181    /// parameterization provides the best compression ratio.
182    ///
183    /// # Note
184    /// 
185    /// This option is **not** supported for the iterator-based builder,
186    /// as it requires pre-analyzing the data.
187    Auto,
188
189    /// Use an explicitly provided [`Codes`] variant from [`dsi-bitstream`](https://crates.io/crates/dsi-bitstream).
190    ///
191    /// This is for advanced use cases where the user has already constructed
192    /// a [`Codes`] enum instance.
193    Explicit(Codes),
194}
195
196/// Resolves a user-provided [`VariableCodecSpec`] into a concrete [`Codes`] variant.
197///
198/// This function translates the user's high-level request into a fully-parameterized,
199/// concrete [`Codes`] variant that can be used for compression.
200///
201/// If the `spec` includes requests for automatic parameter selection (e.g., `Auto`
202/// or `Zeta { k: None }`), this function analyzes the **entire** provided `input`
203/// data slice to determine the optimal settings.
204pub(crate) fn resolve_codec<U>(input: &[U], spec: VariableCodecSpec) -> Result<Codes, IntVecError>
205where
206    U: Into<u64> + Copy,
207{
208    // For an empty input, there's nothing to analyze. Return a safe default.
209    if input.is_empty() {
210        return Ok(Codes::Gamma);
211    }
212
213    match spec {
214        // Parameter-free codecs are a direct mapping.
215        VariableCodecSpec::Gamma => Ok(Codes::Gamma),
216        VariableCodecSpec::Delta => Ok(Codes::Delta),
217        VariableCodecSpec::Unary => Ok(Codes::Unary),
218        VariableCodecSpec::Omega => Ok(Codes::Omega),
219        VariableCodecSpec::VByteLe => Ok(Codes::VByteLe),
220        VariableCodecSpec::VByteBe => Ok(Codes::VByteBe),
221        VariableCodecSpec::Explicit(codes) => Ok(codes),
222
223        // Codecs with optional, user-provided parameters.
224        VariableCodecSpec::Rice { log2_b: Some(p) } => Ok(Codes::Rice { log2_b: p as usize }),
225        VariableCodecSpec::Zeta { k: Some(p) } => Ok(Codes::Zeta { k: p as usize }),
226        VariableCodecSpec::Golomb { b: Some(p) } => Ok(Codes::Golomb { b: p as usize }),
227        VariableCodecSpec::Pi { k: Some(p) } => Ok(Codes::Pi { k: p as usize }),
228        VariableCodecSpec::ExpGolomb { k: Some(p) } => Ok(Codes::ExpGolomb { k: p as usize }),
229
230        // Codecs that require data analysis. We group them to compute stats only once.
231        VariableCodecSpec::Auto
232        | VariableCodecSpec::Rice { log2_b: None }
233        | VariableCodecSpec::Zeta { k: None }
234        | VariableCodecSpec::Golomb { b: None }
235        | VariableCodecSpec::Pi { k: None }
236        | VariableCodecSpec::ExpGolomb { k: None } => {
237            // Define a type alias for the default [`CodesStats`] configuration for clarity.
238            // These const generics define the range of parameters to test.
239            type DefaultCodesStats = CodesStats<10, 20, 10, 10, 10>;
240
241            // Create a stats object and populate it by iterating through the entire dataset.
242            let mut stats = DefaultCodesStats::default();
243            for &value in input {
244                stats.update(value.into());
245            }
246
247            // Use the populated stats to resolve the codec specification.
248            match spec {
249                VariableCodecSpec::Auto => {
250                    let (best_code, _) = stats.best_code();
251                    Ok(best_code)
252                }
253                VariableCodecSpec::Rice { log2_b: None } => {
254                    let (best_param, _) = stats
255                        .rice
256                        .iter()
257                        .enumerate()
258                        .min_by_key(|&(_, cost)| cost)
259                        .unwrap_or((0, &0)); // Fallback to 0 if array is empty.
260                    Ok(Codes::Rice { log2_b: best_param })
261                }
262                VariableCodecSpec::Zeta { k: None } => {
263                    let (best_param, _) = stats
264                        .zeta
265                        .iter()
266                        .enumerate()
267                        .min_by_key(|&(_, cost)| cost)
268                        .unwrap_or((0, &0));
269                    Ok(Codes::Zeta { k: best_param + 1 }) // Zeta params are 1-based.
270                }
271                VariableCodecSpec::Golomb { b: None } => {
272                    let (best_param, _) = stats
273                        .golomb
274                        .iter()
275                        .enumerate()
276                        .min_by_key(|&(_, cost)| cost)
277                        .unwrap_or((0, &0));
278                    Ok(Codes::Golomb { b: best_param + 1 }) // Golomb params are 1-based.
279                }
280                VariableCodecSpec::Pi { k: None } => {
281                    let (best_param, _) = stats
282                        .pi
283                        .iter()
284                        .enumerate()
285                        .min_by_key(|&(_, cost)| cost)
286                        .unwrap_or((0, &0));
287                    Ok(Codes::Pi { k: best_param + 2 }) // Pi params are offset by 2.
288                }
289                VariableCodecSpec::ExpGolomb { k: None } => {
290                    let (best_param, _) = stats
291                        .exp_golomb
292                        .iter()
293                        .enumerate()
294                        .min_by_key(|&(_, cost)| cost)
295                        .unwrap_or((0, &0));
296                    Ok(Codes::ExpGolomb { k: best_param })
297                }
298                // This arm is guaranteed to be unreachable because the outer match
299                // ensures `spec` is one of the variants handled above.
300                _ => unreachable!(),
301            }
302        }
303    }
304}