compressed_intvec/variable/codec.rs
1//! Codec selection for variable-length integer compression.
2//!
3//! This module defines [`VariableCodecSpec`], an enum for controlling the
4//! compression strategy of an [`IntVec`]. The choice of codec is a critical
5//! performance parameter, as its effectiveness depends on the statistical
6//! properties of the data being compressed.
7//!
8//! # Codec Selection Strategy
9//!
10//! Codec selection is performed by a statistical analysis of the entire
11//! input dataset at construction time.
12//!
13//! The [`VariableCodecSpec`] enum provides several ways to specify the compression method:
14//!
15//! 1. **Explicit Specification**: A specific codec and all its parameters are
16//! provided. This is suitable when the data characteristics are known in
17//! advance.
18//! - Non-parametric examples: [`Gamma`](VariableCodecSpec::Gamma), [`Delta`](VariableCodecSpec::Delta).
19//! - Parametric example: `Zeta { k: Some(3) }`.
20//!
21//! ```
22//! use compressed_intvec::prelude::*;
23//!
24//! let data: &[u32] = &(0..1000).collect::<Vec<_>>();
25//!
26//! // Explicitly specify a non-parametric codec
27//! let delta_vec: UIntVec<u32> = IntVec::builder()
28//! .codec(VariableCodecSpec::Delta)
29//! .k(16)
30//! .build(&data)
31//! .unwrap();
32//!
33//! // Explicitly specify a parametric codec with a fixed parameter
34//! let zeta_vec: UIntVec<u32> = IntVec::builder()
35//! .codec(VariableCodecSpec::Zeta { k: Some(3) })
36//! .build(&data)
37//! .unwrap();
38//! ```
39//!
40//! 2. **Automatic Parameter Estimation**: A specific codec family is chosen, but
41//! the optimal parameter is determined by the builder based on a full data
42//! analysis. This is achieved by providing `None` as the parameter value.
43//! - Example: `Rice { log2_b: None }` will find the best `log2_b` for the
44//! given data.
45//!
46//! ```
47//! use compressed_intvec::prelude::*;
48//!
49//! let data: &[u32] = &(0..1000).collect::<Vec<_>>();
50//!
51//! // Automatically select the best Rice parameter
52//! let rice_vec: UIntVec<u32> = IntVec::builder()
53//! .codec(VariableCodecSpec::Rice { log2_b: None })
54//! .build(&data)
55//! .unwrap();
56//! ```
57//!
58//! 3. **Fully Automatic Selection**: The builder analyzes the data against all
59//! available codecs and their standard parameter ranges to find the single
60//! best configuration. This is activated by using [`VariableCodecSpec::Auto`].
61//!
62//! ```
63//! use compressed_intvec::prelude::*;
64//!
65//! let data: &[u32] = &(0..1000).collect::<Vec<_>>();
66//! // Automatically select the best codec and parameters for the data
67//! let auto_vec: UIntVec<u32> = IntVec::builder()
68//! .codec(VariableCodecSpec::Auto)
69//! .build(&data)
70//! .unwrap();
71//! ```
72//!
73//!
74//! ## Analysis Mechanism
75//!
76//! The selection logic uses the [`CodesStats`] utility from the [`dsi-bitstream`]
77//! crate. For a given sequence of integers, [`CodesStats`] calculates the exact
78//! total bit cost for encoding the sequence with a wide range of instantaneous
79//! codes and their common parameterizations.
80//!
81//! ## Construction Overhead
82//!
83//! The full-dataset analysis has a one-time computational cost at construction.
84//! The complexity is `O(N * C)`, where `N` is the number of elements in the
85//! input and `C` is the number of codec configurations tested by [`CodesStats`]
86//! (approximately 70).
87//!
88//! This trade-off is suitable for read-heavy workloads where a higher initial
89//! cost is acceptable for better compression and subsequent read performance.
90//!
91//! # Implementation Notes
92//!
93//! - The parameter ranges for codecs like Zeta and Rice are defined by the `const
94//! generics` of the [`CodesStats`] struct in [`dsi-bitstream`]. The default
95//! values cover common and effective parameter ranges.
96//! - If a data distribution benefits from a parameter outside of the tested
97//! range (e.g., Zeta with `k=20`), it must be specified explicitly in the
98//! builder via `.codec(VariableCodecSpec::Zeta { k: Some(20) })`.
99//!
100//! [`IntVec`]: crate::variable::IntVec
101//! [`IntVecBuilder`]: crate::variable::builder::IntVecBuilder
102//! [`dsi-bitstream`]: https://crates.io/crates/dsi-bitstream
103
104use super::IntVecError;
105use dsi_bitstream::prelude::{Codes, CodesStats};
106
107/// Specifies the compression codec and its parameters for an [`IntVec`](super::IntVec).
108///
109/// This enum allows for either explicitly setting the parameters for codes
110/// like Rice and Zeta, or requesting that the [`IntVecBuilder`](super::builder::IntVecBuilder)
111/// automatically select suitable parameters by performing a full analysis of
112/// the data distribution.
113#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
114pub enum VariableCodecSpec {
115 /// Elias γ-coding. A simple, universal code that is effective for integers
116 /// with a distribution skewed towards small values. It is the default codec
117 /// for the iterator-based builder, which cannot perform data analysis.
118 #[default]
119 Gamma,
120
121 /// Elias δ-coding. A universal code that is generally more efficient than
122 /// Gamma for larger integer values.
123 Delta,
124
125 /// Unary coding. Encodes an integer `n` as `n` zeros followed by a one. It is
126 /// only efficient for extremely small values (e.g., 0, 1, 2).
127 Unary,
128
129 /// Rice-coding with a parameter `log2_b`. This code is optimal for data with
130 /// a geometric distribution.
131 ///
132 /// - If `log2_b` is `Some(val)`, the specified parameter is used.
133 /// - If `log2_b` is `None`, an optimal parameter is estimated by analyzing the
134 /// entire dataset.
135 Rice { log2_b: Option<u8> },
136
137 /// Boldi-Vigna ζ-coding with a parameter `k`. This code is effective for
138 /// data with a power-law distribution, common in web graphs and social networks.
139 ///
140 /// - If `k` is `Some(val)`, the specified parameter is used (`k > 0`).
141 /// - If `k` is `None`, an optimal parameter is estimated by analyzing the
142 /// entire dataset.
143 Zeta { k: Option<u64> },
144
145 /// Golomb-coding with a parameter `b`. This is a generalization of Rice coding
146 /// and is also suitable for geometric distributions.
147 ///
148 /// - If `b` is `Some(val)`, the specified parameter is used (`b > 0`).
149 /// - If `b` is `None`, an optimal parameter is estimated by analyzing the
150 /// entire dataset.
151 Golomb { b: Option<u64> },
152
153 /// Elias-Fano ω-coding. A universal code.
154 Omega,
155
156 /// Streamlined Apostolico–Drovandi π code with a parameter `k`.
157 ///
158 /// - If `k` is `Some(val)`, the specified parameter is used (`k > 0`).
159 /// - If `k` is `None`, an optimal parameter is estimated by analyzing the
160 /// entire dataset.
161 Pi { k: Option<u64> },
162
163 /// Elias-Fano Exponential-Golomb coding with a parameter `k`.
164 ///
165 /// - If `k` is `Some(val)`, the specified parameter is used.
166 /// - If `k` is `None`, an optimal parameter is estimated by analyzing the
167 /// entire dataset.
168 ExpGolomb { k: Option<u64> },
169
170 /// VByte encoding with Little-Endian byte order. This is often one of the
171 /// fastest codecs for decoding, though it may not offer the best compression.
172 VByteLe,
173
174 /// VByte encoding with Big-Endian byte order.
175 VByteBe,
176
177 /// Automatically select the best variable-length code based on the data.
178 ///
179 /// When this option is used, the builder performs a statistical
180 /// analysis on the entire input dataset to determine which codec and
181 /// parameterization provides the best compression ratio.
182 ///
183 /// # Note
184 ///
185 /// This option is **not** supported for the iterator-based builder,
186 /// as it requires pre-analyzing the data.
187 Auto,
188
189 /// Use an explicitly provided [`Codes`] variant from [`dsi-bitstream`](https://crates.io/crates/dsi-bitstream).
190 ///
191 /// This is for advanced use cases where the user has already constructed
192 /// a [`Codes`] enum instance.
193 Explicit(Codes),
194}
195
196/// Resolves a user-provided [`VariableCodecSpec`] into a concrete [`Codes`] variant.
197///
198/// This function translates the user's high-level request into a fully-parameterized,
199/// concrete [`Codes`] variant that can be used for compression.
200///
201/// If the `spec` includes requests for automatic parameter selection (e.g., `Auto`
202/// or `Zeta { k: None }`), this function analyzes the **entire** provided `input`
203/// data slice to determine the optimal settings.
204pub(crate) fn resolve_codec<U>(input: &[U], spec: VariableCodecSpec) -> Result<Codes, IntVecError>
205where
206 U: Into<u64> + Copy,
207{
208 // For an empty input, there's nothing to analyze. Return a safe default.
209 if input.is_empty() {
210 return Ok(Codes::Gamma);
211 }
212
213 match spec {
214 // Parameter-free codecs are a direct mapping.
215 VariableCodecSpec::Gamma => Ok(Codes::Gamma),
216 VariableCodecSpec::Delta => Ok(Codes::Delta),
217 VariableCodecSpec::Unary => Ok(Codes::Unary),
218 VariableCodecSpec::Omega => Ok(Codes::Omega),
219 VariableCodecSpec::VByteLe => Ok(Codes::VByteLe),
220 VariableCodecSpec::VByteBe => Ok(Codes::VByteBe),
221 VariableCodecSpec::Explicit(codes) => Ok(codes),
222
223 // Codecs with optional, user-provided parameters.
224 VariableCodecSpec::Rice { log2_b: Some(p) } => Ok(Codes::Rice { log2_b: p as usize }),
225 VariableCodecSpec::Zeta { k: Some(p) } => Ok(Codes::Zeta { k: p as usize }),
226 VariableCodecSpec::Golomb { b: Some(p) } => Ok(Codes::Golomb { b: p as usize }),
227 VariableCodecSpec::Pi { k: Some(p) } => Ok(Codes::Pi { k: p as usize }),
228 VariableCodecSpec::ExpGolomb { k: Some(p) } => Ok(Codes::ExpGolomb { k: p as usize }),
229
230 // Codecs that require data analysis. We group them to compute stats only once.
231 VariableCodecSpec::Auto
232 | VariableCodecSpec::Rice { log2_b: None }
233 | VariableCodecSpec::Zeta { k: None }
234 | VariableCodecSpec::Golomb { b: None }
235 | VariableCodecSpec::Pi { k: None }
236 | VariableCodecSpec::ExpGolomb { k: None } => {
237 // Define a type alias for the default [`CodesStats`] configuration for clarity.
238 // These const generics define the range of parameters to test.
239 type DefaultCodesStats = CodesStats<10, 20, 10, 10, 10>;
240
241 // Create a stats object and populate it by iterating through the entire dataset.
242 let mut stats = DefaultCodesStats::default();
243 for &value in input {
244 stats.update(value.into());
245 }
246
247 // Use the populated stats to resolve the codec specification.
248 match spec {
249 VariableCodecSpec::Auto => {
250 let (best_code, _) = stats.best_code();
251 Ok(best_code)
252 }
253 VariableCodecSpec::Rice { log2_b: None } => {
254 let (best_param, _) = stats
255 .rice
256 .iter()
257 .enumerate()
258 .min_by_key(|&(_, cost)| cost)
259 .unwrap_or((0, &0)); // Fallback to 0 if array is empty.
260 Ok(Codes::Rice { log2_b: best_param })
261 }
262 VariableCodecSpec::Zeta { k: None } => {
263 let (best_param, _) = stats
264 .zeta
265 .iter()
266 .enumerate()
267 .min_by_key(|&(_, cost)| cost)
268 .unwrap_or((0, &0));
269 Ok(Codes::Zeta { k: best_param + 1 }) // Zeta params are 1-based.
270 }
271 VariableCodecSpec::Golomb { b: None } => {
272 let (best_param, _) = stats
273 .golomb
274 .iter()
275 .enumerate()
276 .min_by_key(|&(_, cost)| cost)
277 .unwrap_or((0, &0));
278 Ok(Codes::Golomb { b: best_param + 1 }) // Golomb params are 1-based.
279 }
280 VariableCodecSpec::Pi { k: None } => {
281 let (best_param, _) = stats
282 .pi
283 .iter()
284 .enumerate()
285 .min_by_key(|&(_, cost)| cost)
286 .unwrap_or((0, &0));
287 Ok(Codes::Pi { k: best_param + 2 }) // Pi params are offset by 2.
288 }
289 VariableCodecSpec::ExpGolomb { k: None } => {
290 let (best_param, _) = stats
291 .exp_golomb
292 .iter()
293 .enumerate()
294 .min_by_key(|&(_, cost)| cost)
295 .unwrap_or((0, &0));
296 Ok(Codes::ExpGolomb { k: best_param })
297 }
298 // This arm is guaranteed to be unreachable because the outer match
299 // ensures `spec` is one of the variants handled above.
300 _ => unreachable!(),
301 }
302 }
303 }
304}