compressed_intvec/codec_spec.rs
1//! # Codec Specification and Strategy Selection
2//!
3//! This module defines the mechanisms for selecting and configuring the
4//! compression strategy for an [`IntVec`]. The choice of codec is critical, as
5//! its effectiveness is highly dependent on the statistical properties of the
6//! data being compressed.
7//!
8//! ## Encoding Strategies
9//!
10//! The library supports two fundamental encoding families:
11//!
12//! 1. **Variable-Length Instantaneous Codes**: Sourced from the [`dsi-bitstream`]
13//! crate, these codes (e.g., Gamma, Delta, Zeta) are designed to compress
14//! integers by using shorter bit sequences for more frequent values, making
15//! them ideal for skewed data distributions.
16//!
17//! 2. **Fixed-Width Integer Encoding**: This strategy uses the same number of
18//! bits for every integer. It is optimal for data that is uniformly
19//! distributed within a known range, providing the fastest possible random
20//! access.
21//!
22//! ## The [`CodecSpec`] Enum
23//!
24//! The primary user-facing API for this module is the [`CodecSpec`] enum. It
25//! provides a high-level interface for specifying the desired compression
26//! strategy, allowing for:
27//! - Direct selection of a parameter-free codec (e.g., `Gamma`).
28//! - Explicit parameterization of tunable codecs (e.g., `Zeta { k: Some(3) }`).
29//! - Automatic parameter selection, where the library analyzes the data to find
30//! the optimal configuration (e.g., `Auto` or `FixedLength { num_bits: None }`).
31//!
32//! The [`resolve_codec`] function translates a user's [`CodecSpec`] into a concrete
33//! [`Encoding`] variant that the [`IntVec`] can use for its internal operations.
34//!
35//! [`IntVec`]: crate::intvec::IntVec
36//! [`dsi-bitstream`]: https://docs.rs/dsi-bitstream/latest/dsi_bitstream/
37
38use crate::intvec::IntVecError;
39use dsi_bitstream::dispatch::FuncCodeWriter;
40use dsi_bitstream::impls::{BufBitWriter, MemWordWriterVec};
41use dsi_bitstream::prelude::{Codes, CodesStats, BE};
42use mem_dbg::{CopyType, MemDbgImpl, MemSize, SizeFlags, True};
43
44/// Specifies the compression codec and its parameters for an [`IntVec`].
45///
46/// This enum allows for either explicitly setting the parameters for codes
47/// like Rice and Zeta, or requesting that [`IntVec`] automatically selects
48/// suitable parameters based on the data distribution during construction.
49///
50/// [`IntVec`]: crate::intvec::IntVec
51#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
52pub enum CodecSpec {
53 /// Use Elias γ-coding.
54 ///
55 /// The implied probability distribution is approximately `1 / (2*x^2)`.
56 /// This code is parameter-free and is generally effective for data distributions
57 /// skewed towards small values.
58 Gamma,
59 /// Use Elias δ-coding. This is the default codec spec.
60 ///
61 /// The implied probability distribution is approximately `1 / (2*x*log(x)^2)`.
62 /// Delta coding is also parameter-free and tends to be more efficient than
63 /// Gamma for larger integer values.
64 #[default]
65 Delta,
66 /// Use Unary coding.
67 ///
68 /// Represents the number `n` with `n` zeros followed by a one. It is only
69 /// efficient for very small integers, particularly `0` and `1`.
70 Unary,
71 /// Use Rice-coding.
72 ///
73 /// A special case of Golomb coding suitable for geometrically distributed data.
74 /// - If `log2_b` is `Some(val)`, uses the specified parameter.
75 /// - If `log2_b` is `None` (on slice-based builder), an optimal parameter
76 /// is estimated from the data's average value.
77 Rice { log2_b: Option<u8> },
78 /// Use Boldi-Vigna ζ-coding.
79 ///
80 /// The implied probability distribution is approximately `1 / x^(1 + 1/k)`.
81 /// This code is effective for power-law distributions.
82 /// - If `k` is `Some(val)`, uses the specified parameter (`k > 0`).
83 /// - If `k` is `None` (on slice-based builder), a default of `k=3` is used.
84 Zeta { k: Option<u64> },
85 /// Use Golomb-coding.
86 ///
87 /// Suitable for geometrically distributed data.
88 /// - If `b` is `Some(val)`, uses the specified parameter (`b > 0`).
89 /// - If `b` is `None` (on slice-based builder), an optimal parameter
90 /// is estimated from the data's average value.
91 Golomb { b: Option<u64> },
92 /// Use Elias-Fano ω-coding, a universal code for positive integers.
93 Omega,
94 /// Use an alternative universal code for positive integers.
95 /// - If `k` is `Some(val)`, uses the specified parameter (`k > 0`).
96 /// - If `k` is `None` (on slice-based builder), a default of `k=3` is used.
97 Pi { k: Option<u64> },
98 /// Use Elias-Fano Exponential-Golomb coding.
99 /// - If `k` is `Some(val)`, uses the specified parameter.
100 /// - If `k` is `None` (on slice-based builder), a default of `k=2` is used.
101 ExpGolomb { k: Option<u64> },
102 /// Use VByte encoding with Little-Endian byte order. Efficient for integers
103 /// that fit within a few bytes.
104 VByteLe,
105 /// Use VByte encoding with Big-Endian byte order.
106 VByteBe,
107 /// Use fixed-width integer encoding.
108 ///
109 /// Optimal for uniformly distributed data within a known range.
110 /// - If `num_bits` is `Some(val)`, uses the specified number of bits.
111 /// - If `num_bits` is `None` (on slice-based builder), the minimum bits
112 /// required for the largest value in the data is used.
113 FixedLength { num_bits: Option<u8> },
114 /// Automatically select the best variable-length code based on the data.
115 ///
116 /// This is the recommended default for the slice-based builder.
117 /// This option is **not** supported for the iterator-based builder.
118 Auto,
119 /// Use an explicitly provided code from the [dsi-bitstream](https://docs.rs/dsi-bitstream/latest/dsi_bitstream/) library enum.
120 ///
121 /// This is an escape hatch for advanced use cases or for codes not yet
122 /// directly enumerated in [`CodecSpec`].
123 Explicit(Codes),
124}
125
126/// Represents the chosen encoding strategy for an `IntVec`.
127///
128/// This is the concrete, resolved encoding that an `IntVec` instance stores and
129/// uses for its internal compression and decompression logic. It is the result
130/// of resolving a [`CodecSpec`].
131#[derive(Debug, Clone, Copy, PartialEq, Eq)]
132pub enum Encoding {
133 /// A variable-length, bit-level instantaneous code from the [dsi-bitstream](https://docs.rs/dsi-bitstream/latest/dsi_bitstream/) library.
134 Dsi(Codes),
135 /// Fixed-width encoding, where each integer is stored with `num_bits`.
136 Fixed { num_bits: usize },
137}
138
139// Manual implementation because the derive macro fails on the `Codes` type
140// from the external `dsi-bitstream` crate.
141impl CopyType for Encoding {
142 type Copy = True;
143}
144
145// Manual implementation for `MemSize`. `Encoding` is a `Copy` enum and does not
146// own any heap memory, so its size is just its stack size.
147impl MemSize for Encoding {
148 fn mem_size(&self, _flags: SizeFlags) -> usize {
149 core::mem::size_of::<Self>()
150 }
151}
152
153// Manual implementation for `MemDbgImpl`. We use the default implementation for
154// `_mem_dbg_rec_on` which does not recurse, correctly treating this enum as a
155// leaf in the memory layout tree.
156impl MemDbgImpl for Encoding {}
157
158/// Resolves a user-provided [`CodecSpec`] into a concrete [`Encoding`] variant.
159///
160/// This function is the core of the codec selection mechanism. It translates the
161/// user's high-level request (the `spec`) into a fully-parameterized, concrete
162/// [`Encoding`] that can be used for compression.
163///
164/// If the `spec` includes requests for automatic parameter selection (e.g.,
165/// `CodecSpec::Auto` or variants with `None` parameters), this function analyzes
166/// the provided `input` data slice to determine the optimal settings.
167///
168/// # Arguments
169/// * `input`: The data slice used to determine optimal parameters for automatic
170/// selection. This is ignored for specs with fully-fixed parameters.
171/// * `spec`: The [`CodecSpec`] indicating the desired codec and parameter settings.
172///
173/// # Returns
174/// A `Result` containing the concrete [`Encoding`] variant or an
175/// [`IntVecError::InvalidParameters`] if the configuration is invalid.
176///
177/// # Heuristics and Justification for Automatic Selection
178///
179/// When a [`CodecSpec`] variant with `None` parameters or [`CodecSpec::Auto`] is
180/// provided, this function uses data-driven heuristics.
181///
182/// - **`CodecSpec::FixedLength { num_bits: None }`**: The function scans the
183/// entire `input` slice to find the maximum value. It then calculates the
184/// minimum number of bits required to represent this value. A full scan is
185/// necessary here to guarantee correctness.
186///
187/// - **`CodecSpec::Rice { log2_b: None }` / `CodecSpec::Golomb { b: None }`**:
188/// These codes are optimal for geometrically distributed data. This function
189/// computes the average of the `input` data to estimate the optimal parameter.
190///
191/// - **`CodecSpec::Zeta { k: None }`, `Pi { k: None }`, `ExpGolomb { k: None }`**:
192/// These fall back to reasonable default parameters (`k=3`, `k=3`, `k=2` respectively).
193///
194/// - **`CodecSpec::Auto`**: This triggers the most sophisticated heuristic. It uses
195/// a dynamic sampling strategy to balance analysis speed and accuracy:
196/// 1. **For small inputs (<= 10,000 elements)**, it analyzes the *entire* dataset.
197/// This gives a perfect statistical profile, ensuring the best possible codec
198/// choice without a noticeable performance penalty.
199/// 2. **For larger inputs**, it takes a **uniform sample** of ~10,000 elements
200/// by selecting values at regular intervals across the entire input slice.
201/// This provides a high-quality, representative sample while ensuring the
202/// analysis step remains extremely fast, regardless of input size.
203/// 3. Based on this analysis, it uses the [`CodesStats`] utility
204/// to select the variable-length code predicted to be the most space-efficient.
205pub fn resolve_codec(input: &[u64], spec: CodecSpec) -> Result<Encoding, IntVecError> {
206 match spec {
207 // Parameter-free codecs
208 CodecSpec::Gamma => Ok(Encoding::Dsi(Codes::Gamma)),
209 CodecSpec::Delta => Ok(Encoding::Dsi(Codes::Delta)),
210 CodecSpec::Unary => Ok(Encoding::Dsi(Codes::Unary)),
211 CodecSpec::Omega => Ok(Encoding::Dsi(Codes::Omega)),
212 CodecSpec::VByteLe => Ok(Encoding::Dsi(Codes::VByteLe)),
213 CodecSpec::VByteBe => Ok(Encoding::Dsi(Codes::VByteBe)),
214
215 // Passthrough for advanced usage
216 CodecSpec::Explicit(codes) => Ok(Encoding::Dsi(codes)),
217
218 // Codecs with optional parameters
219 CodecSpec::Rice { log2_b } => {
220 let final_log2_b = log2_b.unwrap_or_else(|| {
221 if input.is_empty() {
222 return 0;
223 }
224 let sum: u128 = input.iter().map(|&x| x as u128).sum();
225 let avg = sum as f64 / input.len() as f64;
226 if avg < 1.0 {
227 return 0;
228 }
229 let ideal_log2_b = (avg / std::f64::consts::LN_2).log2().round();
230 ideal_log2_b.clamp(0.0, 10.0) as u8
231 });
232 Ok(Encoding::Dsi(Codes::Rice {
233 log2_b: final_log2_b as usize,
234 }))
235 }
236
237 CodecSpec::Zeta { k } => {
238 let final_k = k.unwrap_or(3);
239 if final_k == 0 {
240 return Err(IntVecError::InvalidParameters(
241 "Zeta parameter k cannot be zero".to_string(),
242 ));
243 }
244 Ok(Encoding::Dsi(Codes::Zeta {
245 k: final_k as usize,
246 }))
247 }
248
249 CodecSpec::Golomb { b } => {
250 let final_b = b.unwrap_or_else(|| {
251 if input.is_empty() {
252 return 1;
253 }
254 let sum: u128 = input.iter().map(|&x| x as u128).sum();
255 let avg = sum as f64 / input.len() as f64;
256 // Heuristic for optimal Golomb parameter 'b'
257 (avg * 0.69).round().max(1.0) as u64
258 });
259 if final_b == 0 {
260 return Err(IntVecError::InvalidParameters(
261 "Golomb parameter b cannot be zero".to_string(),
262 ));
263 }
264 Ok(Encoding::Dsi(Codes::Golomb {
265 b: final_b as usize,
266 }))
267 }
268
269 CodecSpec::Pi { k } => {
270 let final_k = k.unwrap_or(3);
271 if final_k == 0 {
272 return Err(IntVecError::InvalidParameters(
273 "Pi parameter k cannot be zero".to_string(),
274 ));
275 }
276 Ok(Encoding::Dsi(Codes::Pi {
277 k: final_k as usize,
278 }))
279 }
280
281 CodecSpec::ExpGolomb { k } => {
282 let final_k = k.unwrap_or(2);
283 Ok(Encoding::Dsi(Codes::ExpGolomb {
284 k: final_k as usize,
285 }))
286 }
287
288 CodecSpec::FixedLength { num_bits } => {
289 let final_num_bits = num_bits.map(|n| n as usize).unwrap_or_else(|| {
290 let max_val = input.iter().max().copied().unwrap_or(0);
291 if max_val == 0 {
292 1
293 } else {
294 (u64::BITS - max_val.leading_zeros()) as usize
295 }
296 });
297
298 if final_num_bits > 64 {
299 return Err(IntVecError::InvalidParameters(
300 "FixedLength num_bits cannot be greater than 64".to_string(),
301 ));
302 }
303 Ok(Encoding::Fixed {
304 num_bits: final_num_bits,
305 })
306 }
307
308 CodecSpec::Auto => {
309 if input.is_empty() {
310 return Ok(Encoding::Dsi(Codes::Gamma));
311 }
312
313 const TARGET_SAMPLE_SIZE: usize = 10_000;
314 let mut stats = CodesStats::<10, 20, 10, 10, 10>::default();
315
316 if input.len() <= TARGET_SAMPLE_SIZE {
317 // For small inputs, analyze the entire dataset for perfect accuracy.
318 for &value in input {
319 stats.update(value);
320 }
321 } else {
322 // For large inputs, take a uniform sample to keep analysis fast.
323 let step = input.len() as f64 / TARGET_SAMPLE_SIZE as f64;
324 let sample_iter =
325 (0..TARGET_SAMPLE_SIZE).map(|i| input[((i as f64) * step) as usize]);
326 for value in sample_iter {
327 stats.update(value);
328 }
329 }
330
331 let (best_code, _) = stats.best_code();
332
333 // This check ensures that the selected code is supported by the writer.
334 // It acts as a safeguard against future changes in dsi-bitstream.
335 if FuncCodeWriter::<BE, BufBitWriter<BE, MemWordWriterVec<u64, Vec<u64>>>>::new(
336 best_code,
337 )
338 .is_ok()
339 {
340 Ok(Encoding::Dsi(best_code))
341 } else {
342 // Fallback to a safe, universally supported code if the best
343 // one is not available in the writer dispatch.
344 Ok(Encoding::Dsi(Codes::Delta))
345 }
346 }
347 }
348}