compressed_intvec/variable/codec.rs
1//! Codec selection for variable-length integer compression.
2//!
3//! This module defines [`Codec`], an enum for controlling the
4//! compression strategy of an [`VarVec`]. The choice of codec is a critical
5//! performance parameter, as its effectiveness depends on the statistical
6//! properties of the data being compressed.
7//!
8//! # Codec Selection Strategy
9//!
10//! Codec selection is performed by a statistical analysis of the entire
11//! input dataset at construction time.
12//!
13//! The [`Codec`] enum provides several ways to specify the compression method:
14//!
15//! 1. **Explicit Specification**: A specific codec and all its parameters are
16//! provided. This is suitable when the data characteristics are known in
17//! advance.
18//! - Non-parametric examples: [`Gamma`](Codec::Gamma), [`Delta`](Codec::Delta).
19//! - Parametric example: `Zeta { k: Some(3) }`.
20//!
21//! ```
22//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
23//! use compressed_intvec::prelude::*;
24//!
25//! let data: &[u32] = &(0..1000).collect::<Vec<_>>();
26//!
27//! // Explicitly specify a non-parametric codec
28//! let delta_vec: UVarVec<u32> = VarVec::builder()
29//! .codec(Codec::Delta)
30//! .k(16)
31//! .build(&data)?;
32//!
33//! // Explicitly specify a parametric codec with a fixed parameter
34//! let zeta_vec: UVarVec<u32> = VarVec::builder()
35//! .codec(Codec::Zeta { k: Some(3) })
36//! .build(&data)?;
37//! # Ok(())
38//! # }
39//! ```
40//!
41//! 2. **Automatic Parameter Estimation**: A specific codec family is chosen, but
42//! the optimal parameter is determined by the builder based on a full data
43//! analysis. This is achieved by providing `None` as the parameter value.
44//! - Example: `Rice { log2_b: None }` will find the best `log2_b` for the
45//! given data.
46//!
47//! ```
48//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
49//! use compressed_intvec::prelude::*;
50//!
51//! let data: &[u32] = &(0..1000).collect::<Vec<_>>();
52//!
53//! // Automatically select the best Rice parameter
54//! let rice_vec: UVarVec<u32> = VarVec::builder()
55//! .codec(Codec::Rice { log2_b: None })
56//! .build(&data)?;
57//! # Ok(())
58//! # }
59//! ```
60//!
61//! 3. **Fully Automatic Selection**: The builder analyzes the data against all
62//! available codecs and their standard parameter ranges to find the single
63//! best configuration. This is activated by using [`Codec::Auto`].
64//!
65//! ```
66//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
67//! use compressed_intvec::prelude::*;
68//!
69//! let data: &[u32] = &(0..1000).collect::<Vec<_>>();
70//! // Automatically select the best codec and parameters for the data
71//! let auto_vec: UVarVec<u32> = VarVec::builder()
72//! .codec(Codec::Auto)
73//! .build(&data)?;
74//! # Ok(())
75//! # }
76//! ```
77//!
78//!
79//! ## Analysis Mechanism
80//!
81//! The selection logic uses the [`CodesStats`] utility from the [`dsi-bitstream`]
82//! crate. For a given sequence of integers, [`CodesStats`] calculates the exact
83//! total bit cost for encoding the sequence with a wide range of instantaneous
84//! codes and their common parameterizations.
85//!
86//! ## Construction Overhead
87//!
88//! The full-dataset analysis has a one-time computational cost at construction.
89//! The complexity is `O(N * C)`, where `N` is the number of elements in the
90//! input and `C` is the number of codec configurations tested by [`CodesStats`]
91//! (approximately 70).
92//!
93//! This trade-off is suitable for read-heavy workloads where a higher initial
94//! cost is acceptable for better compression and subsequent read performance.
95//!
96//! # Implementation Notes
97//!
98//! - The parameter ranges for codecs like Zeta and Rice are defined by the `const
99//! generics` of the [`CodesStats`] struct in [`dsi-bitstream`]. The default
100//! values cover common and effective parameter ranges.
101//! - If a data distribution benefits from a parameter outside of the tested
102//! range (e.g., Zeta with `k=20`), it must be specified explicitly in the
103//! builder via `.codec(Codec::Zeta { k: Some(20) })`.
104//!
105//! [`VarVec`]: crate::variable::VarVec
106//! [`VarVecBuilder`]: crate::variable::builder::VarVecBuilder
107//! [`dsi-bitstream`]: https://crates.io/crates/dsi-bitstream
108
109use super::VarVecError;
110use dsi_bitstream::prelude::{Codes, CodesStats};
111
112/// Specifies the compression codec and its parameters for an [`VarVec`](super::VarVec).
113///
114/// This enum allows for either explicitly setting the parameters for codes
115/// like Rice and Zeta, or requesting that the [`VarVecBuilder`](super::builder::VarVecBuilder)
116/// automatically select suitable parameters by performing a full analysis of
117/// the data distribution.
118#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
119pub enum Codec {
120 /// Elias γ-coding. A simple, universal code that is effective for integers
121 /// with a distribution skewed towards small values. It is the default codec
122 /// for the iterator-based builder, which cannot perform data analysis.
123 #[default]
124 Gamma,
125
126 /// Elias δ-coding. A universal code that is generally more efficient than
127 /// Gamma for larger integer values.
128 Delta,
129
130 /// Unary coding. Encodes an integer `n` as `n` zeros followed by a one. It is
131 /// only efficient for extremely small values (e.g., 0, 1, 2).
132 Unary,
133
134 /// Rice-coding with a parameter `log2_b`. This code is optimal for data with
135 /// a geometric distribution.
136 ///
137 /// - If `log2_b` is `Some(val)`, the specified parameter is used.
138 /// - If `log2_b` is `None`, an optimal parameter is estimated by analyzing the
139 /// entire dataset.
140 Rice { log2_b: Option<u8> },
141
142 /// Boldi-Vigna ζ-coding with a parameter `k`. This code is effective for
143 /// data with a power-law distribution, common in web graphs and social networks.
144 ///
145 /// - If `k` is `Some(val)`, the specified parameter is used (`k > 0`).
146 /// - If `k` is `None`, an optimal parameter is estimated by analyzing the
147 /// entire dataset.
148 Zeta { k: Option<u64> },
149
150 /// Golomb-coding with a parameter `b`. This is a generalization of Rice coding
151 /// and is also suitable for geometric distributions.
152 ///
153 /// - If `b` is `Some(val)`, the specified parameter is used (`b > 0`).
154 /// - If `b` is `None`, an optimal parameter is estimated by analyzing the
155 /// entire dataset.
156 Golomb { b: Option<u64> },
157
158 /// Elias-Fano ω-coding. A universal code.
159 Omega,
160
161 /// Streamlined Apostolico–Drovandi π code with a parameter `k`.
162 ///
163 /// - If `k` is `Some(val)`, the specified parameter is used (`k > 0`).
164 /// - If `k` is `None`, an optimal parameter is estimated by analyzing the
165 /// entire dataset.
166 Pi { k: Option<u64> },
167
168 /// Elias-Fano Exponential-Golomb coding with a parameter `k`.
169 ///
170 /// - If `k` is `Some(val)`, the specified parameter is used.
171 /// - If `k` is `None`, an optimal parameter is estimated by analyzing the
172 /// entire dataset.
173 ExpGolomb { k: Option<u64> },
174
175 /// VByte encoding with Little-Endian byte order. This is often one of the
176 /// fastest codecs for decoding, though it may not offer the best compression.
177 VByteLe,
178
179 /// VByte encoding with Big-Endian byte order.
180 VByteBe,
181
182 /// Automatically select the best variable-length code based on the data.
183 ///
184 /// When this option is used, the builder performs a statistical
185 /// analysis on the entire input dataset to determine which codec and
186 /// parameterization provides the best compression ratio.
187 ///
188 /// # Note
189 ///
190 /// This option is **not** supported for the iterator-based builder,
191 /// as it requires pre-analyzing the data.
192 Auto,
193
194 /// Use an explicitly provided [`Codes`] variant from [`dsi-bitstream`](https://crates.io/crates/dsi-bitstream).
195 ///
196 /// This is for advanced use cases where the user has already constructed
197 /// a [`Codes`] enum instance.
198 Explicit(Codes),
199}
200
201/// Deprecated alias for [`Codec`]. Use [`Codec`] instead.
202///
203/// # Deprecation Notice
204///
205/// This type has been renamed to [`Codec`] for brevity and consistency.
206/// This alias will be removed in version 1.0.0.
207#[deprecated(since = "0.6.0", note = "renamed to `Codec`; use `Codec` instead")]
208pub type VariableCodecSpec = Codec;
209
210impl Codec {
211 /// Returns `true` if this codec specification requires data analysis to resolve.
212 ///
213 /// Analysis is needed when parameters are not explicitly specified and must
214 /// be determined by analyzing the data distribution.
215 #[inline]
216 pub(crate) fn requires_analysis(&self) -> bool {
217 matches!(
218 self,
219 Codec::Auto
220 | Codec::Rice { log2_b: None }
221 | Codec::Zeta { k: None }
222 | Codec::Golomb { b: None }
223 | Codec::Pi { k: None }
224 | Codec::ExpGolomb { k: None }
225 )
226 }
227}
228
229/// Resolves a user-provided [`Codec`] into a concrete [`Codes`] variant.
230///
231/// This function translates the user's high-level request into a fully-parameterized,
232/// concrete [`Codes`] variant that can be used for compression.
233///
234/// If the `spec` includes requests for automatic parameter selection (e.g., `Auto`
235/// or `Zeta { k: None }`), this function analyzes the **entire** provided `input`
236/// data slice to determine the optimal settings.
237pub(crate) fn resolve_codec<U>(input: &[U], spec: Codec) -> Result<Codes, VarVecError>
238where
239 U: Into<u64> + Copy,
240{
241 match spec {
242 // Parameter-free codecs: direct mapping, no data needed.
243 Codec::Gamma => Ok(Codes::Gamma),
244 Codec::Delta => Ok(Codes::Delta),
245 Codec::Unary => Ok(Codes::Unary),
246 Codec::Omega => Ok(Codes::Omega),
247 Codec::VByteLe => Ok(Codes::VByteLe),
248 Codec::VByteBe => Ok(Codes::VByteBe),
249 Codec::Explicit(codes) => Ok(codes),
250
251 // Codecs with explicit parameters: direct mapping.
252 Codec::Rice { log2_b: Some(p) } => Ok(Codes::Rice(p as usize)),
253 Codec::Zeta { k: Some(p) } => Ok(Codes::Zeta(p as usize)),
254 Codec::Golomb { b: Some(p) } => Ok(Codes::Golomb(p)),
255 Codec::Pi { k: Some(p) } => Ok(Codes::Pi(p as usize)),
256 Codec::ExpGolomb { k: Some(p) } => Ok(Codes::ExpGolomb(p as usize)),
257
258 // Codecs requiring analysis: return error if no data provided.
259 Codec::Auto
260 | Codec::Rice { log2_b: None }
261 | Codec::Zeta { k: None }
262 | Codec::Golomb { b: None }
263 | Codec::Pi { k: None }
264 | Codec::ExpGolomb { k: None } => {
265 if input.is_empty() {
266 return Ok(Codes::Gamma); // Safe default only for analysis codecs
267 }
268
269 // Define a type alias for the default [`CodesStats`] configuration for clarity.
270 // These const generics define the range of parameters to test.
271 type DefaultCodesStats = CodesStats<10, 20, 10, 10, 10>;
272
273 // Create a stats object and populate it by iterating through the entire dataset.
274 let mut stats = DefaultCodesStats::default();
275 for &value in input {
276 stats.update(value.into());
277 }
278
279 // Use the populated stats to resolve the codec specification.
280 match spec {
281 Codec::Auto => {
282 let (best_code, _) = stats.best_code();
283 Ok(best_code.canonicalize())
284 }
285 Codec::Rice { log2_b: None } => {
286 let (best_param, _) = stats
287 .rice
288 .iter()
289 .enumerate()
290 .min_by_key(|&(_, cost)| cost)
291 .unwrap_or((0, &0)); // Fallback to 0 if array is empty.
292 Ok(Codes::Rice(best_param))
293 }
294 Codec::Zeta { k: None } => {
295 let (best_param, _) = stats
296 .zeta
297 .iter()
298 .enumerate()
299 .min_by_key(|&(_, cost)| cost)
300 .unwrap_or((0, &0));
301 Ok(Codes::Zeta(best_param + 1)) // Zeta params are 1-based.
302 }
303 Codec::Golomb { b: None } => {
304 let (best_param, _) = stats
305 .golomb
306 .iter()
307 .enumerate()
308 .min_by_key(|&(_, cost)| cost)
309 .unwrap_or((0, &0));
310 Ok(Codes::Golomb((best_param + 1) as u64)) // Golomb params are 1-based.
311 }
312 Codec::Pi { k: None } => {
313 let (best_param, _) = stats
314 .pi
315 .iter()
316 .enumerate()
317 .min_by_key(|&(_, cost)| cost)
318 .unwrap_or((0, &0));
319 Ok(Codes::Pi(best_param + 2)) // Pi params are offset by 2.
320 }
321 Codec::ExpGolomb { k: None } => {
322 let (best_param, _) = stats
323 .exp_golomb
324 .iter()
325 .enumerate()
326 .min_by_key(|&(_, cost)| cost)
327 .unwrap_or((0, &0));
328 Ok(Codes::ExpGolomb(best_param))
329 }
330 // This arm is guaranteed to be unreachable because the outer match
331 // ensures `spec` is one of the variants handled above.
332 _ => unreachable!(),
333 }
334 }
335 }
336}
337
338/// Resolves a codec specification by analyzing data from an iterator.
339///
340/// This function avoids intermediate allocations by consuming the iterator
341/// directly. It should only be called when `spec.requires_analysis()` returns
342/// `true`.
343///
344/// # Arguments
345///
346/// * `iter` - An iterator of u64 values to analyze for codec parameter selection.
347///
348/// * `spec` - The codec specification requesting analysis (Auto or a codec with
349/// None parameters).
350pub(crate) fn resolve_codec_from_iter<I>(iter: I, spec: Codec) -> Result<Codes, VarVecError>
351where
352 I: Iterator<Item = u64>,
353{
354 // Define the default CodesStats configuration for parameter analysis.
355 type DefaultCodesStats = CodesStats<10, 20, 10, 10, 10>;
356
357 let mut stats = DefaultCodesStats::default();
358 let mut count = 0usize;
359
360 // Analyze the data stream without materializing it to a vector.
361 for value in iter {
362 stats.update(value);
363 count += 1;
364 }
365
366 // If the stream is empty, return a safe default.
367 if count == 0 {
368 return Ok(Codes::Gamma);
369 }
370
371 // Use the accumulated statistics to determine the best codec.
372 match spec {
373 Codec::Auto => {
374 let (best_code, _) = stats.best_code();
375 Ok(best_code.canonicalize())
376 }
377 Codec::Rice { log2_b: None } => {
378 let (best_param, _) = stats
379 .rice
380 .iter()
381 .enumerate()
382 .min_by_key(|&(_, cost)| cost)
383 .unwrap_or((0, &0));
384 Ok(Codes::Rice(best_param))
385 }
386 Codec::Zeta { k: None } => {
387 let (best_param, _) = stats
388 .zeta
389 .iter()
390 .enumerate()
391 .min_by_key(|&(_, cost)| cost)
392 .unwrap_or((0, &0));
393 Ok(Codes::Zeta(best_param + 1))
394 }
395 Codec::Golomb { b: None } => {
396 let (best_param, _) = stats
397 .golomb
398 .iter()
399 .enumerate()
400 .min_by_key(|&(_, cost)| cost)
401 .unwrap_or((0, &0));
402 Ok(Codes::Golomb((best_param + 1) as u64))
403 }
404 Codec::Pi { k: None } => {
405 let (best_param, _) = stats
406 .pi
407 .iter()
408 .enumerate()
409 .min_by_key(|&(_, cost)| cost)
410 .unwrap_or((0, &0));
411 Ok(Codes::Pi(best_param + 2))
412 }
413 Codec::ExpGolomb { k: None } => {
414 let (best_param, _) = stats
415 .exp_golomb
416 .iter()
417 .enumerate()
418 .min_by_key(|&(_, cost)| cost)
419 .unwrap_or((0, &0));
420 Ok(Codes::ExpGolomb(best_param))
421 }
422 _ => unreachable!("resolve_codec_from_iter called with non-analysis codec"),
423 }
424}