Skip to main content

grafeo_core/storage/
codec.rs

1//! Automatic codec selection and compression.
2//!
3//! Don't want to think about which codec to use? [`CodecSelector`] analyzes
4//! your data and picks the best one. Or use [`TypeSpecificCompressor`] to
5//! compress and decompress with a single call.
6//!
7//! | Codec | Best for | Typical savings |
8//! | ----- | -------- | --------------- |
9//! | None | Small data, random access | 1x (no compression) |
10//! | Delta | Sorted integers | 2-10x |
11//! | DeltaBitPacked | Sequential IDs, timestamps | 5-20x |
12//! | BitPacked | Small integers (ages, counts) | 2-16x |
13//! | Dictionary | Repeated strings (labels) | 2-50x |
14//! | BitVector | Booleans | 8x |
15//! | RunLength | Highly repetitive data | 2-100x |
16
17use std::io;
18
19use super::bitpack::{BitPackedInts, DeltaBitPacked};
20use super::bitvec::BitVector;
21use super::runlength::{RunLengthAnalyzer, RunLengthEncoding};
22
23/// Identifies which compression algorithm was used on a chunk of data.
24#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
25pub enum CompressionCodec {
26    /// No compression (raw values).
27    None,
28
29    /// Delta encoding for integers.
30    Delta,
31
32    /// Bit packing for small integers.
33    BitPacked {
34        /// Number of bits per value.
35        bits: u8,
36    },
37
38    /// Delta + bit packing for sorted integers.
39    DeltaBitPacked {
40        /// Number of bits per delta.
41        bits: u8,
42    },
43
44    /// Dictionary encoding for strings.
45    Dictionary,
46
47    /// Bit vector for booleans.
48    BitVector,
49
50    /// Run-length encoding for repeated values.
51    RunLength,
52}
53
54impl CompressionCodec {
55    /// Returns a human-readable name for the codec.
56    #[must_use]
57    pub fn name(&self) -> &'static str {
58        match self {
59            Self::None => "None",
60            Self::Delta => "Delta",
61            Self::BitPacked { .. } => "BitPacked",
62            Self::DeltaBitPacked { .. } => "DeltaBitPacked",
63            Self::Dictionary => "Dictionary",
64            Self::BitVector => "BitVector",
65            Self::RunLength => "RunLength",
66        }
67    }
68
69    /// Returns whether this codec is lossless.
70    #[must_use]
71    pub fn is_lossless(&self) -> bool {
72        // All our codecs are lossless
73        true
74    }
75}
76
77/// A compressed chunk of data with everything needed to decompress it.
78///
79/// Check [`compression_ratio()`](Self::compression_ratio) to see how much
80/// space you're saving.
81#[derive(Debug, Clone)]
82pub struct CompressedData {
83    /// The codec used for compression.
84    pub codec: CompressionCodec,
85    /// Size of the original uncompressed data in bytes.
86    pub uncompressed_size: usize,
87    /// Compressed data bytes.
88    pub data: Vec<u8>,
89    /// Additional metadata for decompression.
90    pub metadata: CompressionMetadata,
91}
92
93/// Metadata needed for decompression.
94#[derive(Debug, Clone)]
95pub enum CompressionMetadata {
96    /// No metadata needed.
97    None,
98    /// Delta encoding metadata.
99    Delta {
100        /// Base value (first value in sequence).
101        base: i64,
102    },
103    /// Bit-packing metadata.
104    BitPacked {
105        /// Number of values.
106        count: usize,
107    },
108    /// Delta + bit-packing metadata.
109    DeltaBitPacked {
110        /// Base value.
111        base: i64,
112        /// Number of values.
113        count: usize,
114    },
115    /// Dictionary metadata.
116    Dictionary {
117        /// Dictionary identifier (for shared dictionaries).
118        dict_id: u32,
119    },
120    /// Run-length metadata.
121    RunLength {
122        /// Number of runs.
123        run_count: usize,
124    },
125}
126
127impl CompressedData {
128    /// Creates uncompressed data (no compression).
129    pub fn uncompressed(data: Vec<u8>) -> Self {
130        let size = data.len();
131        Self {
132            codec: CompressionCodec::None,
133            uncompressed_size: size,
134            data,
135            metadata: CompressionMetadata::None,
136        }
137    }
138
139    /// Returns the compression ratio (original / compressed).
140    #[must_use]
141    pub fn compression_ratio(&self) -> f64 {
142        if self.data.is_empty() {
143            return 1.0;
144        }
145        self.uncompressed_size as f64 / self.data.len() as f64
146    }
147
148    /// Returns whether the data is compressed.
149    #[must_use]
150    pub fn is_compressed(&self) -> bool {
151        !matches!(self.codec, CompressionCodec::None)
152    }
153}
154
155/// Analyzes your data and picks the best compression codec.
156///
157/// Don't want to think about compression? Call [`select_for_integers()`](Self::select_for_integers)
158/// or [`select_for_strings()`](Self::select_for_strings) and we'll examine your data
159/// to pick the codec with the best compression ratio.
160pub struct CodecSelector;
161
162impl CodecSelector {
163    /// Selects the best codec for a slice of u64 values.
164    ///
165    /// Considers multiple codecs and picks the one with the best estimated
166    /// compression ratio:
167    /// - RunLength: Best for highly repetitive data (avg run length > 2)
168    /// - DeltaBitPacked: Best for sorted/sequential integers
169    /// - BitPacked: Best for small integers with limited range
170    #[must_use]
171    pub fn select_for_integers(values: &[u64]) -> CompressionCodec {
172        if values.is_empty() {
173            return CompressionCodec::None;
174        }
175
176        if values.len() < 8 {
177            // Not worth compressing very small arrays
178            return CompressionCodec::None;
179        }
180
181        // Check run-length encoding first (best for repetitive data)
182        let rle_ratio = RunLengthAnalyzer::estimate_ratio(values);
183        let avg_run_length = RunLengthAnalyzer::average_run_length(values);
184
185        // RLE is beneficial if avg run length > 2 (breaks even at 2)
186        if avg_run_length > 2.0 && rle_ratio > 1.5 {
187            return CompressionCodec::RunLength;
188        }
189
190        // Check if sorted (ascending)
191        let is_sorted = values.windows(2).all(|w| w[0] <= w[1]);
192
193        if is_sorted {
194            // Calculate deltas
195            let deltas: Vec<u64> = values.windows(2).map(|w| w[1] - w[0]).collect();
196            let max_delta = deltas.iter().copied().max().unwrap_or(0);
197            let bits_needed = BitPackedInts::bits_needed(max_delta);
198
199            // Estimate DeltaBitPacked ratio
200            let delta_ratio = 64.0 / bits_needed as f64;
201
202            // If RLE is still better, prefer it
203            if rle_ratio > delta_ratio && rle_ratio > 1.0 {
204                return CompressionCodec::RunLength;
205            }
206
207            return CompressionCodec::DeltaBitPacked { bits: bits_needed };
208        }
209
210        // Not sorted - try simple bit-packing
211        let max_value = values.iter().copied().max().unwrap_or(0);
212        let bits_needed = BitPackedInts::bits_needed(max_value);
213
214        // Estimate BitPacked ratio
215        let bitpack_ratio = if bits_needed > 0 {
216            64.0 / bits_needed as f64
217        } else {
218            1.0
219        };
220
221        // If RLE is better, prefer it
222        if rle_ratio > bitpack_ratio && rle_ratio > 1.0 {
223            return CompressionCodec::RunLength;
224        }
225
226        if bits_needed < 32 {
227            CompressionCodec::BitPacked { bits: bits_needed }
228        } else {
229            CompressionCodec::None
230        }
231    }
232
233    /// Selects the best codec for a slice of strings.
234    #[must_use]
235    pub fn select_for_strings(values: &[&str]) -> CompressionCodec {
236        if values.is_empty() || values.len() < 4 {
237            return CompressionCodec::None;
238        }
239
240        // Count unique values
241        let unique: std::collections::HashSet<_> = values.iter().collect();
242        let cardinality_ratio = unique.len() as f64 / values.len() as f64;
243
244        // Dictionary is effective when cardinality is low
245        if cardinality_ratio < 0.5 {
246            CompressionCodec::Dictionary
247        } else {
248            CompressionCodec::None
249        }
250    }
251
252    /// Selects the best codec for boolean values.
253    #[must_use]
254    pub fn select_for_booleans(_values: &[bool]) -> CompressionCodec {
255        // BitVector is always the best choice for booleans
256        CompressionCodec::BitVector
257    }
258}
259
260/// One-stop compression - picks the codec and compresses in a single call.
261///
262/// Use this when you just want to compress your data without worrying about
263/// which codec to use. It picks the best codec automatically.
264pub struct TypeSpecificCompressor;
265
266impl TypeSpecificCompressor {
267    /// Compresses u64 values using the optimal codec.
268    pub fn compress_integers(values: &[u64]) -> CompressedData {
269        let codec = CodecSelector::select_for_integers(values);
270
271        match codec {
272            CompressionCodec::None => {
273                let mut data = Vec::with_capacity(values.len() * 8);
274                for &v in values {
275                    data.extend_from_slice(&v.to_le_bytes());
276                }
277                CompressedData {
278                    codec,
279                    uncompressed_size: values.len() * 8,
280                    data,
281                    metadata: CompressionMetadata::None,
282                }
283            }
284            CompressionCodec::DeltaBitPacked { bits } => {
285                let encoded = DeltaBitPacked::encode(values);
286                CompressedData {
287                    codec: CompressionCodec::DeltaBitPacked { bits },
288                    uncompressed_size: values.len() * 8,
289                    data: encoded.to_bytes(),
290                    metadata: CompressionMetadata::DeltaBitPacked {
291                        base: encoded.base() as i64,
292                        count: values.len(),
293                    },
294                }
295            }
296            CompressionCodec::BitPacked { bits } => {
297                let packed = BitPackedInts::pack(values);
298                CompressedData {
299                    codec: CompressionCodec::BitPacked { bits },
300                    uncompressed_size: values.len() * 8,
301                    data: packed.to_bytes(),
302                    metadata: CompressionMetadata::BitPacked {
303                        count: values.len(),
304                    },
305                }
306            }
307            CompressionCodec::RunLength => {
308                let encoded = RunLengthEncoding::encode(values);
309                CompressedData {
310                    codec: CompressionCodec::RunLength,
311                    uncompressed_size: values.len() * 8,
312                    data: encoded.to_bytes(),
313                    metadata: CompressionMetadata::RunLength {
314                        run_count: encoded.run_count(),
315                    },
316                }
317            }
318            _ => unreachable!("Unexpected codec for integers"),
319        }
320    }
321
322    /// Compresses i64 values using the optimal codec.
323    pub fn compress_signed_integers(values: &[i64]) -> CompressedData {
324        // Convert to u64 using zig-zag encoding
325        let zigzag: Vec<u64> = values
326            .iter()
327            .map(|&v| super::delta::zigzag_encode(v))
328            .collect();
329        Self::compress_integers(&zigzag)
330    }
331
332    /// Compresses boolean values.
333    pub fn compress_booleans(values: &[bool]) -> CompressedData {
334        let bitvec = BitVector::from_bools(values);
335        CompressedData {
336            codec: CompressionCodec::BitVector,
337            uncompressed_size: values.len(),
338            data: bitvec.to_bytes(),
339            metadata: CompressionMetadata::BitPacked {
340                count: values.len(),
341            },
342        }
343    }
344
345    /// Decompresses u64 values.
346    pub fn decompress_integers(data: &CompressedData) -> io::Result<Vec<u64>> {
347        match data.codec {
348            CompressionCodec::None => {
349                let mut values = Vec::with_capacity(data.data.len() / 8);
350                for chunk in data.data.chunks_exact(8) {
351                    values.push(u64::from_le_bytes(chunk.try_into().unwrap()));
352                }
353                Ok(values)
354            }
355            CompressionCodec::DeltaBitPacked { .. } => {
356                let encoded = DeltaBitPacked::from_bytes(&data.data)?;
357                Ok(encoded.decode())
358            }
359            CompressionCodec::BitPacked { .. } => {
360                let packed = BitPackedInts::from_bytes(&data.data)?;
361                Ok(packed.unpack())
362            }
363            CompressionCodec::RunLength => {
364                let encoded = RunLengthEncoding::from_bytes(&data.data)?;
365                Ok(encoded.decode())
366            }
367            _ => Err(io::Error::new(
368                io::ErrorKind::InvalidData,
369                "Invalid codec for integer decompression",
370            )),
371        }
372    }
373
374    /// Decompresses boolean values.
375    pub fn decompress_booleans(data: &CompressedData) -> io::Result<Vec<bool>> {
376        match data.codec {
377            CompressionCodec::BitVector => {
378                let bitvec = BitVector::from_bytes(&data.data)?;
379                Ok(bitvec.to_bools())
380            }
381            _ => Err(io::Error::new(
382                io::ErrorKind::InvalidData,
383                "Invalid codec for boolean decompression",
384            )),
385        }
386    }
387}
388
389#[cfg(test)]
390mod tests {
391    use super::*;
392
393    #[test]
394    fn test_codec_selection_sorted_integers() {
395        let sorted: Vec<u64> = (0..100).collect();
396        let codec = CodecSelector::select_for_integers(&sorted);
397        assert!(matches!(codec, CompressionCodec::DeltaBitPacked { .. }));
398    }
399
400    #[test]
401    fn test_codec_selection_small_integers() {
402        let small: Vec<u64> = vec![1, 5, 3, 7, 2, 4, 6, 8];
403        let codec = CodecSelector::select_for_integers(&small);
404        assert!(matches!(codec, CompressionCodec::BitPacked { .. }));
405    }
406
407    #[test]
408    fn test_codec_selection_strings() {
409        let repeated = vec!["a", "b", "a", "a", "b", "a", "c", "a"];
410        let codec = CodecSelector::select_for_strings(&repeated);
411        assert_eq!(codec, CompressionCodec::Dictionary);
412
413        let unique = vec!["a", "b", "c", "d", "e", "f", "g", "h"];
414        let codec = CodecSelector::select_for_strings(&unique);
415        assert_eq!(codec, CompressionCodec::None);
416    }
417
418    #[test]
419    fn test_codec_selection_booleans() {
420        let bools = vec![true, false, true];
421        let codec = CodecSelector::select_for_booleans(&bools);
422        assert_eq!(codec, CompressionCodec::BitVector);
423    }
424
425    #[test]
426    fn test_compress_decompress_sorted_integers() {
427        let values: Vec<u64> = (100..200).collect();
428        let compressed = TypeSpecificCompressor::compress_integers(&values);
429
430        assert!(matches!(
431            compressed.codec,
432            CompressionCodec::DeltaBitPacked { .. }
433        ));
434        assert!(compressed.compression_ratio() > 1.0);
435
436        let decompressed = TypeSpecificCompressor::decompress_integers(&compressed).unwrap();
437        assert_eq!(values, decompressed);
438    }
439
440    #[test]
441    fn test_compress_decompress_small_integers() {
442        let values: Vec<u64> = vec![5, 2, 7, 1, 9, 3, 8, 4, 6, 0];
443        let compressed = TypeSpecificCompressor::compress_integers(&values);
444
445        let decompressed = TypeSpecificCompressor::decompress_integers(&compressed).unwrap();
446        assert_eq!(values, decompressed);
447    }
448
449    #[test]
450    fn test_compress_decompress_booleans() {
451        let values = vec![true, false, true, true, false, false, true, false];
452        let compressed = TypeSpecificCompressor::compress_booleans(&values);
453
454        assert_eq!(compressed.codec, CompressionCodec::BitVector);
455
456        let decompressed = TypeSpecificCompressor::decompress_booleans(&compressed).unwrap();
457        assert_eq!(values, decompressed);
458    }
459
460    #[test]
461    fn test_compression_ratio() {
462        // 100 sequential values should compress well
463        let values: Vec<u64> = (1000..1100).collect();
464        let compressed = TypeSpecificCompressor::compress_integers(&values);
465
466        let ratio = compressed.compression_ratio();
467        assert!(ratio > 5.0, "Expected ratio > 5, got {}", ratio);
468    }
469
470    #[test]
471    fn test_codec_names() {
472        assert_eq!(CompressionCodec::None.name(), "None");
473        assert_eq!(CompressionCodec::Delta.name(), "Delta");
474        assert_eq!(CompressionCodec::BitPacked { bits: 4 }.name(), "BitPacked");
475        assert_eq!(
476            CompressionCodec::DeltaBitPacked { bits: 4 }.name(),
477            "DeltaBitPacked"
478        );
479        assert_eq!(CompressionCodec::Dictionary.name(), "Dictionary");
480        assert_eq!(CompressionCodec::BitVector.name(), "BitVector");
481        assert_eq!(CompressionCodec::RunLength.name(), "RunLength");
482    }
483
484    #[test]
485    fn test_codec_selection_repetitive_integers() {
486        // Highly repetitive data should select RunLength
487        let repetitive: Vec<u64> = vec![1; 100];
488        let codec = CodecSelector::select_for_integers(&repetitive);
489        assert_eq!(codec, CompressionCodec::RunLength);
490
491        // Mix of repeated values
492        let mut mixed = vec![1u64; 30];
493        mixed.extend(vec![2u64; 30]);
494        mixed.extend(vec![3u64; 30]);
495        let codec = CodecSelector::select_for_integers(&mixed);
496        assert_eq!(codec, CompressionCodec::RunLength);
497    }
498
499    #[test]
500    fn test_compress_decompress_runlength() {
501        // Highly repetitive data
502        let values: Vec<u64> = vec![42; 1000];
503        let compressed = TypeSpecificCompressor::compress_integers(&values);
504
505        assert_eq!(compressed.codec, CompressionCodec::RunLength);
506        assert!(
507            compressed.compression_ratio() > 50.0,
508            "Expected ratio > 50, got {}",
509            compressed.compression_ratio()
510        );
511
512        let decompressed = TypeSpecificCompressor::decompress_integers(&compressed).unwrap();
513        assert_eq!(values, decompressed);
514    }
515
516    #[test]
517    fn test_compress_decompress_mixed_runs() {
518        // Multiple runs
519        let mut values = vec![1u64; 100];
520        values.extend(vec![2u64; 100]);
521        values.extend(vec![3u64; 100]);
522
523        let compressed = TypeSpecificCompressor::compress_integers(&values);
524
525        assert_eq!(compressed.codec, CompressionCodec::RunLength);
526        assert!(compressed.compression_ratio() > 10.0);
527
528        let decompressed = TypeSpecificCompressor::decompress_integers(&compressed).unwrap();
529        assert_eq!(values, decompressed);
530    }
531
532    #[test]
533    fn test_runlength_vs_delta_selection() {
534        // Sequential values should still prefer DeltaBitPacked over RunLength
535        let sequential: Vec<u64> = (0..100).collect();
536        let codec = CodecSelector::select_for_integers(&sequential);
537        assert!(matches!(codec, CompressionCodec::DeltaBitPacked { .. }));
538
539        // But constant values should prefer RunLength
540        let constant: Vec<u64> = vec![100; 100];
541        let codec = CodecSelector::select_for_integers(&constant);
542        assert_eq!(codec, CompressionCodec::RunLength);
543    }
544}