Skip to main content

grafeo_core/storage/
codec.rs

1//! Unified compression codec enumeration.
2//!
3//! Provides a unified interface for selecting and using compression codecs
4//! based on data type and characteristics.
5//!
6//! # Supported Codecs
7//!
8//! | Codec | Best For | Compression |
9//! |-------|----------|-------------|
10//! | None | Small data, random access | 1x |
11//! | Delta | Sorted integers | 2-10x |
12//! | DeltaBitPacked | Sorted integers with small deltas | 5-20x |
13//! | BitPacked | Small integers | 2-16x |
14//! | Dictionary | Strings with low cardinality | 2-50x |
15//! | BitVector | Booleans | 8x |
16//! | RunLength | Highly repetitive data | 2-100x |
17
18use std::io;
19
20use super::bitpack::{BitPackedInts, DeltaBitPacked};
21use super::bitvec::BitVector;
22
23/// Compression codec identifier.
24#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
25pub enum CompressionCodec {
26    /// No compression (raw values).
27    None,
28
29    /// Delta encoding for integers.
30    Delta,
31
32    /// Bit packing for small integers.
33    BitPacked {
34        /// Number of bits per value.
35        bits: u8,
36    },
37
38    /// Delta + bit packing for sorted integers.
39    DeltaBitPacked {
40        /// Number of bits per delta.
41        bits: u8,
42    },
43
44    /// Dictionary encoding for strings.
45    Dictionary,
46
47    /// Bit vector for booleans.
48    BitVector,
49
50    /// Run-length encoding for repeated values.
51    RunLength,
52}
53
54impl CompressionCodec {
55    /// Returns a human-readable name for the codec.
56    #[must_use]
57    pub fn name(&self) -> &'static str {
58        match self {
59            Self::None => "None",
60            Self::Delta => "Delta",
61            Self::BitPacked { .. } => "BitPacked",
62            Self::DeltaBitPacked { .. } => "DeltaBitPacked",
63            Self::Dictionary => "Dictionary",
64            Self::BitVector => "BitVector",
65            Self::RunLength => "RunLength",
66        }
67    }
68
69    /// Returns whether this codec is lossless.
70    #[must_use]
71    pub fn is_lossless(&self) -> bool {
72        // All our codecs are lossless
73        true
74    }
75}
76
77/// Compressed data container.
78///
79/// Holds compressed data along with metadata needed for decompression.
80#[derive(Debug, Clone)]
81pub struct CompressedData {
82    /// The codec used for compression.
83    pub codec: CompressionCodec,
84    /// Size of the original uncompressed data in bytes.
85    pub uncompressed_size: usize,
86    /// Compressed data bytes.
87    pub data: Vec<u8>,
88    /// Additional metadata for decompression.
89    pub metadata: CompressionMetadata,
90}
91
92/// Metadata needed for decompression.
93#[derive(Debug, Clone)]
94pub enum CompressionMetadata {
95    /// No metadata needed.
96    None,
97    /// Delta encoding metadata.
98    Delta {
99        /// Base value (first value in sequence).
100        base: i64,
101    },
102    /// Bit-packing metadata.
103    BitPacked {
104        /// Number of values.
105        count: usize,
106    },
107    /// Delta + bit-packing metadata.
108    DeltaBitPacked {
109        /// Base value.
110        base: i64,
111        /// Number of values.
112        count: usize,
113    },
114    /// Dictionary metadata.
115    Dictionary {
116        /// Dictionary identifier (for shared dictionaries).
117        dict_id: u32,
118    },
119    /// Run-length metadata.
120    RunLength {
121        /// Number of runs.
122        run_count: usize,
123    },
124}
125
126impl CompressedData {
127    /// Creates uncompressed data (no compression).
128    pub fn uncompressed(data: Vec<u8>) -> Self {
129        let size = data.len();
130        Self {
131            codec: CompressionCodec::None,
132            uncompressed_size: size,
133            data,
134            metadata: CompressionMetadata::None,
135        }
136    }
137
138    /// Returns the compression ratio (original / compressed).
139    #[must_use]
140    pub fn compression_ratio(&self) -> f64 {
141        if self.data.is_empty() {
142            return 1.0;
143        }
144        self.uncompressed_size as f64 / self.data.len() as f64
145    }
146
147    /// Returns whether the data is compressed.
148    #[must_use]
149    pub fn is_compressed(&self) -> bool {
150        !matches!(self.codec, CompressionCodec::None)
151    }
152}
153
154/// Automatic codec selection based on data characteristics.
155pub struct CodecSelector;
156
157impl CodecSelector {
158    /// Selects the best codec for a slice of u64 values.
159    #[must_use]
160    pub fn select_for_integers(values: &[u64]) -> CompressionCodec {
161        if values.is_empty() {
162            return CompressionCodec::None;
163        }
164
165        if values.len() < 8 {
166            // Not worth compressing very small arrays
167            return CompressionCodec::None;
168        }
169
170        // Check if sorted (ascending)
171        let is_sorted = values.windows(2).all(|w| w[0] <= w[1]);
172
173        if is_sorted {
174            // Calculate deltas
175            let deltas: Vec<u64> = values.windows(2).map(|w| w[1] - w[0]).collect();
176            let max_delta = deltas.iter().copied().max().unwrap_or(0);
177            let bits_needed = BitPackedInts::bits_needed(max_delta);
178
179            return CompressionCodec::DeltaBitPacked { bits: bits_needed };
180        }
181
182        // Not sorted - try simple bit-packing
183        let max_value = values.iter().copied().max().unwrap_or(0);
184        let bits_needed = BitPackedInts::bits_needed(max_value);
185
186        if bits_needed < 32 {
187            CompressionCodec::BitPacked { bits: bits_needed }
188        } else {
189            CompressionCodec::None
190        }
191    }
192
193    /// Selects the best codec for a slice of strings.
194    #[must_use]
195    pub fn select_for_strings(values: &[&str]) -> CompressionCodec {
196        if values.is_empty() || values.len() < 4 {
197            return CompressionCodec::None;
198        }
199
200        // Count unique values
201        let unique: std::collections::HashSet<_> = values.iter().collect();
202        let cardinality_ratio = unique.len() as f64 / values.len() as f64;
203
204        // Dictionary is effective when cardinality is low
205        if cardinality_ratio < 0.5 {
206            CompressionCodec::Dictionary
207        } else {
208            CompressionCodec::None
209        }
210    }
211
212    /// Selects the best codec for boolean values.
213    #[must_use]
214    pub fn select_for_booleans(_values: &[bool]) -> CompressionCodec {
215        // BitVector is always the best choice for booleans
216        CompressionCodec::BitVector
217    }
218}
219
220/// Compressor that handles all supported data types.
221pub struct TypeSpecificCompressor;
222
223impl TypeSpecificCompressor {
224    /// Compresses u64 values using the optimal codec.
225    pub fn compress_integers(values: &[u64]) -> CompressedData {
226        let codec = CodecSelector::select_for_integers(values);
227
228        match codec {
229            CompressionCodec::None => {
230                let mut data = Vec::with_capacity(values.len() * 8);
231                for &v in values {
232                    data.extend_from_slice(&v.to_le_bytes());
233                }
234                CompressedData {
235                    codec,
236                    uncompressed_size: values.len() * 8,
237                    data,
238                    metadata: CompressionMetadata::None,
239                }
240            }
241            CompressionCodec::DeltaBitPacked { bits } => {
242                let encoded = DeltaBitPacked::encode(values);
243                CompressedData {
244                    codec: CompressionCodec::DeltaBitPacked { bits },
245                    uncompressed_size: values.len() * 8,
246                    data: encoded.to_bytes(),
247                    metadata: CompressionMetadata::DeltaBitPacked {
248                        base: encoded.base() as i64,
249                        count: values.len(),
250                    },
251                }
252            }
253            CompressionCodec::BitPacked { bits } => {
254                let packed = BitPackedInts::pack(values);
255                CompressedData {
256                    codec: CompressionCodec::BitPacked { bits },
257                    uncompressed_size: values.len() * 8,
258                    data: packed.to_bytes(),
259                    metadata: CompressionMetadata::BitPacked {
260                        count: values.len(),
261                    },
262                }
263            }
264            _ => unreachable!("Unexpected codec for integers"),
265        }
266    }
267
268    /// Compresses i64 values using the optimal codec.
269    pub fn compress_signed_integers(values: &[i64]) -> CompressedData {
270        // Convert to u64 using zig-zag encoding
271        let zigzag: Vec<u64> = values
272            .iter()
273            .map(|&v| super::delta::zigzag_encode(v))
274            .collect();
275        Self::compress_integers(&zigzag)
276    }
277
278    /// Compresses boolean values.
279    pub fn compress_booleans(values: &[bool]) -> CompressedData {
280        let bitvec = BitVector::from_bools(values);
281        CompressedData {
282            codec: CompressionCodec::BitVector,
283            uncompressed_size: values.len(),
284            data: bitvec.to_bytes(),
285            metadata: CompressionMetadata::BitPacked {
286                count: values.len(),
287            },
288        }
289    }
290
291    /// Decompresses u64 values.
292    pub fn decompress_integers(data: &CompressedData) -> io::Result<Vec<u64>> {
293        match data.codec {
294            CompressionCodec::None => {
295                let mut values = Vec::with_capacity(data.data.len() / 8);
296                for chunk in data.data.chunks_exact(8) {
297                    values.push(u64::from_le_bytes(chunk.try_into().unwrap()));
298                }
299                Ok(values)
300            }
301            CompressionCodec::DeltaBitPacked { .. } => {
302                let encoded = DeltaBitPacked::from_bytes(&data.data)?;
303                Ok(encoded.decode())
304            }
305            CompressionCodec::BitPacked { .. } => {
306                let packed = BitPackedInts::from_bytes(&data.data)?;
307                Ok(packed.unpack())
308            }
309            _ => Err(io::Error::new(
310                io::ErrorKind::InvalidData,
311                "Invalid codec for integer decompression",
312            )),
313        }
314    }
315
316    /// Decompresses boolean values.
317    pub fn decompress_booleans(data: &CompressedData) -> io::Result<Vec<bool>> {
318        match data.codec {
319            CompressionCodec::BitVector => {
320                let bitvec = BitVector::from_bytes(&data.data)?;
321                Ok(bitvec.to_bools())
322            }
323            _ => Err(io::Error::new(
324                io::ErrorKind::InvalidData,
325                "Invalid codec for boolean decompression",
326            )),
327        }
328    }
329}
330
331#[cfg(test)]
332mod tests {
333    use super::*;
334
335    #[test]
336    fn test_codec_selection_sorted_integers() {
337        let sorted: Vec<u64> = (0..100).collect();
338        let codec = CodecSelector::select_for_integers(&sorted);
339        assert!(matches!(codec, CompressionCodec::DeltaBitPacked { .. }));
340    }
341
342    #[test]
343    fn test_codec_selection_small_integers() {
344        let small: Vec<u64> = vec![1, 5, 3, 7, 2, 4, 6, 8];
345        let codec = CodecSelector::select_for_integers(&small);
346        assert!(matches!(codec, CompressionCodec::BitPacked { .. }));
347    }
348
349    #[test]
350    fn test_codec_selection_strings() {
351        let repeated = vec!["a", "b", "a", "a", "b", "a", "c", "a"];
352        let codec = CodecSelector::select_for_strings(&repeated);
353        assert_eq!(codec, CompressionCodec::Dictionary);
354
355        let unique = vec!["a", "b", "c", "d", "e", "f", "g", "h"];
356        let codec = CodecSelector::select_for_strings(&unique);
357        assert_eq!(codec, CompressionCodec::None);
358    }
359
360    #[test]
361    fn test_codec_selection_booleans() {
362        let bools = vec![true, false, true];
363        let codec = CodecSelector::select_for_booleans(&bools);
364        assert_eq!(codec, CompressionCodec::BitVector);
365    }
366
367    #[test]
368    fn test_compress_decompress_sorted_integers() {
369        let values: Vec<u64> = (100..200).collect();
370        let compressed = TypeSpecificCompressor::compress_integers(&values);
371
372        assert!(matches!(
373            compressed.codec,
374            CompressionCodec::DeltaBitPacked { .. }
375        ));
376        assert!(compressed.compression_ratio() > 1.0);
377
378        let decompressed = TypeSpecificCompressor::decompress_integers(&compressed).unwrap();
379        assert_eq!(values, decompressed);
380    }
381
382    #[test]
383    fn test_compress_decompress_small_integers() {
384        let values: Vec<u64> = vec![5, 2, 7, 1, 9, 3, 8, 4, 6, 0];
385        let compressed = TypeSpecificCompressor::compress_integers(&values);
386
387        let decompressed = TypeSpecificCompressor::decompress_integers(&compressed).unwrap();
388        assert_eq!(values, decompressed);
389    }
390
391    #[test]
392    fn test_compress_decompress_booleans() {
393        let values = vec![true, false, true, true, false, false, true, false];
394        let compressed = TypeSpecificCompressor::compress_booleans(&values);
395
396        assert_eq!(compressed.codec, CompressionCodec::BitVector);
397
398        let decompressed = TypeSpecificCompressor::decompress_booleans(&compressed).unwrap();
399        assert_eq!(values, decompressed);
400    }
401
402    #[test]
403    fn test_compression_ratio() {
404        // 100 sequential values should compress well
405        let values: Vec<u64> = (1000..1100).collect();
406        let compressed = TypeSpecificCompressor::compress_integers(&values);
407
408        let ratio = compressed.compression_ratio();
409        assert!(ratio > 5.0, "Expected ratio > 5, got {}", ratio);
410    }
411
412    #[test]
413    fn test_codec_names() {
414        assert_eq!(CompressionCodec::None.name(), "None");
415        assert_eq!(CompressionCodec::Delta.name(), "Delta");
416        assert_eq!(CompressionCodec::BitPacked { bits: 4 }.name(), "BitPacked");
417        assert_eq!(
418            CompressionCodec::DeltaBitPacked { bits: 4 }.name(),
419            "DeltaBitPacked"
420        );
421        assert_eq!(CompressionCodec::Dictionary.name(), "Dictionary");
422        assert_eq!(CompressionCodec::BitVector.name(), "BitVector");
423        assert_eq!(CompressionCodec::RunLength.name(), "RunLength");
424    }
425}