Skip to main content

oximedia_cache/
tier_compressor.rs

1//! Cache-entry compression for tiered L2+ cache layers.
2//!
3//! Large cache tiers (L2 memory or disk) can benefit from compressing stored
4//! values to reduce footprint.  This module provides a pure-Rust, zero-
5//! dependency LZ77-style byte-pair run-length codec suitable for media
6//! metadata and moderate-size frame buffers.
7//!
8//! # Design
9//!
10//! [`TierCompressor`] wraps a configurable compression level and exposes a
11//! symmetric [`compress`] / [`decompress`] pair.  The codec is a simplified
12//! LZ77 variant:
13//!
14//! - The input stream is divided into 256-byte look-ahead windows.
15//! - Literal bytes are emitted as `[0x00, byte]`.
16//! - Back-references `(offset, length)` with `length >= 3` are emitted as
17//!   `[0x01, offset_lo, offset_hi, length]` (offset = distance back into
18//!   the output buffer, length = match length).
19//! - A header `[0xCA, 0xCE, 0x00]` + 4-byte LE original length is prepended
20//!   so decompression can pre-allocate and validate.
21//!
22//! The codec is designed for correctness and minimal overhead, not maximum
23//! compression ratio.
24//!
25//! # Compression levels
26//!
27//! | Level | Look-ahead window | Search depth |
28//! |-------|-------------------|--------------|
29//! | 0     | 64                | 16           |
30//! | 1     | 128               | 32           |
31//! | 2     | 256               | 64           |
32//! | 3     | 512               | 128          |
33//!
34//! Higher levels produce smaller output but take more CPU time.
35//!
36//! # Example
37//!
38//! ```rust
39//! use oximedia_cache::tier_compressor::TierCompressor;
40//!
41//! let c = TierCompressor::new(1);
42//! let original = b"hello hello hello world".to_vec();
43//! let compressed = c.compress(&original).expect("compress");
44//! let restored = c.decompress(&compressed).expect("decompress");
45//! assert_eq!(restored, original);
46//! ```
47
48use thiserror::Error;
49
50// ── Magic / header ────────────────────────────────────────────────────────────
51
52const MAGIC: [u8; 3] = [0xCA, 0xCE, 0x00];
53const HEADER_LEN: usize = 3 + 4; // magic (3) + original_len LE-u32 (4)
54
55// ── Tags ──────────────────────────────────────────────────────────────────────
56
57const TAG_LITERAL: u8 = 0x00;
58const TAG_BACKREF: u8 = 0x01;
59
60// ── Errors ────────────────────────────────────────────────────────────────────
61
62/// Errors produced by [`TierCompressor`].
63#[derive(Debug, Error)]
64pub enum CompressorError {
65    /// The compressed data is truncated or has an invalid header.
66    #[error("invalid compressed data: {0}")]
67    InvalidData(String),
68    /// The decompressed output does not match the expected length in the header.
69    #[error("length mismatch: expected {expected}, got {actual}")]
70    LengthMismatch {
71        /// Expected decompressed length.
72        expected: usize,
73        /// Actual decompressed length.
74        actual: usize,
75    },
76    /// The input is too large to be encoded (> 2^32 bytes).
77    #[error("input too large: {0} bytes")]
78    InputTooLarge(usize),
79}
80
81// ── Level parameters ──────────────────────────────────────────────────────────
82
83/// Per-level codec parameters.
84struct LevelParams {
85    /// Number of bytes inspected in the look-back search window.
86    window: usize,
87    /// Maximum match length searched.
88    max_len: usize,
89}
90
91fn params_for_level(level: u8) -> LevelParams {
92    match level {
93        0 => LevelParams {
94            window: 64,
95            max_len: 16,
96        },
97        1 => LevelParams {
98            window: 128,
99            max_len: 32,
100        },
101        2 => LevelParams {
102            window: 256,
103            max_len: 64,
104        },
105        _ => LevelParams {
106            window: 512,
107            max_len: 128,
108        },
109    }
110}
111
112// ── TierCompressor ────────────────────────────────────────────────────────────
113
114/// Symmetric compressor/decompressor for cache tier entries.
115///
116/// Instantiate once and reuse for many compress/decompress calls.
117#[derive(Debug, Clone)]
118pub struct TierCompressor {
119    level: u8,
120}
121
122impl TierCompressor {
123    /// Create a new `TierCompressor` at the given level (0–3; values above 3
124    /// are treated as level 3).
125    #[must_use]
126    pub fn new(level: u8) -> Self {
127        Self {
128            level: level.min(3),
129        }
130    }
131
132    /// Compress `input` and return the compressed bytes.
133    ///
134    /// The compressed bytes include a header that encodes the original length;
135    /// pass them directly to [`decompress`] to recover the original.
136    pub fn compress(&self, input: &[u8]) -> Result<Vec<u8>, CompressorError> {
137        if input.len() > u32::MAX as usize {
138            return Err(CompressorError::InputTooLarge(input.len()));
139        }
140        let params = params_for_level(self.level);
141        let orig_len = input.len() as u32;
142
143        // Worst case: every byte is a literal (2 bytes each) + header.
144        let mut out = Vec::with_capacity(HEADER_LEN + input.len() * 2);
145
146        // Header: magic + original length (LE u32).
147        out.extend_from_slice(&MAGIC);
148        out.extend_from_slice(&orig_len.to_le_bytes());
149
150        let mut pos = 0usize;
151        while pos < input.len() {
152            // Search for the longest match in the look-back window.
153            let window_start = pos.saturating_sub(params.window);
154            let search_window = &input[window_start..pos];
155
156            // Maximum match length: min(params.max_len, remaining bytes).
157            let max_match = params.max_len.min(input.len() - pos);
158
159            let best = find_longest_match(search_window, &input[pos..], max_match);
160
161            match best {
162                Some((offset_in_window, length)) if length >= 3 => {
163                    // Back-reference.
164                    // `offset_in_window` is the index in `search_window` where the
165                    // match starts.  Convert to distance from current `pos`:
166                    // distance = (pos - window_start) - offset_in_window.
167                    let distance = (pos - window_start) - offset_in_window;
168                    debug_assert!(distance > 0, "back-ref distance must be positive");
169                    let dist_u16 = distance as u16;
170                    let len_u8 = length as u8;
171                    out.push(TAG_BACKREF);
172                    out.push((dist_u16 & 0xFF) as u8);
173                    out.push((dist_u16 >> 8) as u8);
174                    out.push(len_u8);
175                    pos += length;
176                }
177                _ => {
178                    // Literal byte.
179                    out.push(TAG_LITERAL);
180                    out.push(input[pos]);
181                    pos += 1;
182                }
183            }
184        }
185
186        Ok(out)
187    }
188
189    /// Decompress bytes previously produced by [`compress`].
190    pub fn decompress(&self, input: &[u8]) -> Result<Vec<u8>, CompressorError> {
191        if input.len() < HEADER_LEN {
192            return Err(CompressorError::InvalidData(format!(
193                "too short: {} bytes (need at least {})",
194                input.len(),
195                HEADER_LEN
196            )));
197        }
198        // Validate magic.
199        if input[..3] != MAGIC {
200            return Err(CompressorError::InvalidData(
201                "magic bytes do not match".to_string(),
202            ));
203        }
204        // Read original length.
205        let orig_len = u32::from_le_bytes([input[3], input[4], input[5], input[6]]) as usize;
206        let mut out = Vec::with_capacity(orig_len);
207
208        let payload = &input[HEADER_LEN..];
209        let mut pos = 0usize;
210
211        while pos < payload.len() {
212            let tag = payload[pos];
213            pos += 1;
214
215            match tag {
216                TAG_LITERAL => {
217                    if pos >= payload.len() {
218                        return Err(CompressorError::InvalidData(
219                            "truncated literal token".to_string(),
220                        ));
221                    }
222                    out.push(payload[pos]);
223                    pos += 1;
224                }
225                TAG_BACKREF => {
226                    if pos + 2 >= payload.len() {
227                        return Err(CompressorError::InvalidData(
228                            "truncated back-reference token".to_string(),
229                        ));
230                    }
231                    let dist_lo = payload[pos] as u16;
232                    let dist_hi = payload[pos + 1] as u16;
233                    let length = payload[pos + 2] as usize;
234                    pos += 3;
235
236                    let distance = (dist_hi << 8 | dist_lo) as usize;
237                    if distance == 0 || distance > out.len() {
238                        return Err(CompressorError::InvalidData(format!(
239                            "invalid back-ref distance {distance} at output position {}",
240                            out.len()
241                        )));
242                    }
243                    let start = out.len() - distance;
244                    // Copy byte-by-byte to allow overlapping references.
245                    for i in 0..length {
246                        let byte = out[start + (i % distance)];
247                        out.push(byte);
248                    }
249                }
250                unknown => {
251                    return Err(CompressorError::InvalidData(format!(
252                        "unknown tag byte 0x{unknown:02X} at offset {pos}"
253                    )));
254                }
255            }
256        }
257
258        if out.len() != orig_len {
259            return Err(CompressorError::LengthMismatch {
260                expected: orig_len,
261                actual: out.len(),
262            });
263        }
264
265        Ok(out)
266    }
267
268    /// Return the compression level (0–3).
269    #[must_use]
270    pub fn level(&self) -> u8 {
271        self.level
272    }
273
274    /// Compress `input` and report the compression ratio (compressed / original).
275    ///
276    /// Returns `1.0` for empty inputs.
277    pub fn compression_ratio(&self, input: &[u8]) -> Result<f64, CompressorError> {
278        if input.is_empty() {
279            return Ok(1.0);
280        }
281        let compressed = self.compress(input)?;
282        Ok(compressed.len() as f64 / input.len() as f64)
283    }
284}
285
286// ── find_longest_match ────────────────────────────────────────────────────────
287
288/// Search `window` for the longest prefix match against `lookahead`.
289///
290/// Returns `Some((window_start_index, match_length))` when a match of at
291/// least 3 bytes is found, `None` otherwise.
292fn find_longest_match(window: &[u8], lookahead: &[u8], max_len: usize) -> Option<(usize, usize)> {
293    if window.is_empty() || lookahead.is_empty() || max_len == 0 {
294        return None;
295    }
296
297    let mut best_offset = 0usize;
298    let mut best_len = 0usize;
299
300    for start in 0..window.len() {
301        let mut len = 0usize;
302        while len < max_len && len < lookahead.len() && len < window.len() - start + lookahead.len()
303        {
304            // Use modular indexing for overlapping matches (like LZ77 copy).
305            let window_idx = start + (len % (window.len() - start));
306            if window[window_idx] != lookahead[len] {
307                break;
308            }
309            len += 1;
310        }
311        if len > best_len {
312            best_len = len;
313            best_offset = start;
314        }
315    }
316
317    if best_len >= 3 {
318        Some((best_offset, best_len))
319    } else {
320        None
321    }
322}
323
324// ── Tests ─────────────────────────────────────────────────────────────────────
325
326#[cfg(test)]
327mod tests {
328    use super::*;
329
330    // 1. Round-trip: simple ASCII
331    #[test]
332    fn test_round_trip_ascii() {
333        let c = TierCompressor::new(1);
334        let orig = b"hello world hello world".to_vec();
335        let compressed = c.compress(&orig).expect("compress");
336        let restored = c.decompress(&compressed).expect("decompress");
337        assert_eq!(restored, orig);
338    }
339
340    // 2. Round-trip: empty input
341    #[test]
342    fn test_round_trip_empty() {
343        let c = TierCompressor::new(0);
344        let orig: Vec<u8> = Vec::new();
345        let compressed = c.compress(&orig).expect("compress empty");
346        let restored = c.decompress(&compressed).expect("decompress empty");
347        assert_eq!(restored, orig);
348    }
349
350    // 3. Round-trip: single byte
351    #[test]
352    fn test_round_trip_single_byte() {
353        let c = TierCompressor::new(0);
354        let orig = vec![0xABu8];
355        let compressed = c.compress(&orig).expect("compress");
356        let restored = c.decompress(&compressed).expect("decompress");
357        assert_eq!(restored, orig);
358    }
359
360    // 4. Round-trip: all-zero buffer (highly compressible)
361    #[test]
362    fn test_round_trip_zeros() {
363        let c = TierCompressor::new(2);
364        let orig = vec![0u8; 512];
365        let compressed = c.compress(&orig).expect("compress");
366        let restored = c.decompress(&compressed).expect("decompress");
367        assert_eq!(restored, orig);
368    }
369
370    // 5. Round-trip: random-ish binary data
371    #[test]
372    fn test_round_trip_binary() {
373        let c = TierCompressor::new(1);
374        let orig: Vec<u8> = (0u8..=255).cycle().take(300).collect();
375        let compressed = c.compress(&orig).expect("compress");
376        let restored = c.decompress(&compressed).expect("decompress");
377        assert_eq!(restored, orig);
378    }
379
380    // 6. Compressed zeros are smaller than the original
381    #[test]
382    fn test_zeros_compress_smaller() {
383        let c = TierCompressor::new(2);
384        let orig = vec![0u8; 256];
385        let compressed = c.compress(&orig).expect("compress");
386        // After the header, the body should be much smaller than 256 bytes
387        assert!(
388            compressed.len() < orig.len(),
389            "compressed {} should be < original {}",
390            compressed.len(),
391            orig.len()
392        );
393    }
394
395    // 7. Invalid magic bytes are rejected
396    #[test]
397    fn test_decompress_invalid_magic() {
398        let c = TierCompressor::new(0);
399        let bad = vec![0xDE, 0xAD, 0xBE, 0xEF, 0, 0, 0, 0];
400        assert!(c.decompress(&bad).is_err());
401    }
402
403    // 8. Truncated input is rejected
404    #[test]
405    fn test_decompress_truncated() {
406        let c = TierCompressor::new(0);
407        // Header requires 7 bytes; 3 is too short.
408        assert!(c.decompress(&[0xCA, 0xCE, 0x00]).is_err());
409    }
410
411    // 9. level() getter
412    #[test]
413    fn test_level_getter() {
414        assert_eq!(TierCompressor::new(2).level(), 2);
415        assert_eq!(TierCompressor::new(99).level(), 3); // clamped
416    }
417
418    // 10. compression_ratio() for empty input returns 1.0
419    #[test]
420    fn test_ratio_empty() {
421        let c = TierCompressor::new(1);
422        let ratio = c.compression_ratio(&[]).expect("ratio");
423        assert!((ratio - 1.0).abs() < 1e-9);
424    }
425
426    // 11. compression_ratio for repeated data < 1.0
427    #[test]
428    fn test_ratio_repeated_lt_1() {
429        let c = TierCompressor::new(3);
430        let orig = b"abcabcabcabcabcabcabcabcabc".to_vec();
431        let ratio = c.compression_ratio(&orig).expect("ratio");
432        // Repeated pattern should compress reasonably
433        assert!(ratio < 2.0, "ratio {ratio} should be reasonable");
434    }
435
436    // 12. Round-trip at all levels
437    #[test]
438    fn test_round_trip_all_levels() {
439        let orig = b"the quick brown fox jumps over the lazy dog".repeat(5);
440        for level in 0..=3 {
441            let c = TierCompressor::new(level);
442            let compressed = c.compress(&orig).expect("compress");
443            let restored = c.decompress(&compressed).expect("decompress");
444            assert_eq!(restored, orig, "level {level} failed round-trip");
445        }
446    }
447
448    // 13. Header encodes original length correctly
449    #[test]
450    fn test_header_length_field() {
451        let c = TierCompressor::new(0);
452        let orig = b"abcde".to_vec();
453        let compressed = c.compress(&orig).expect("compress");
454        // Bytes 3..7 are the LE u32 original length.
455        let stored_len =
456            u32::from_le_bytes([compressed[3], compressed[4], compressed[5], compressed[6]])
457                as usize;
458        assert_eq!(stored_len, orig.len());
459    }
460
461    // 14. Overlapping back-reference (run-length fill)
462    #[test]
463    fn test_overlapping_backref() {
464        let c = TierCompressor::new(2);
465        // Pattern that creates overlapping matches: "aaaaaaaaa..."
466        let orig: Vec<u8> = vec![b'a'; 200];
467        let compressed = c.compress(&orig).expect("compress");
468        let restored = c.decompress(&compressed).expect("decompress");
469        assert_eq!(restored, orig);
470    }
471
472    // 15. Large input round-trip
473    #[test]
474    fn test_large_input_round_trip() {
475        let c = TierCompressor::new(1);
476        // 10 KB of repeating pattern
477        let orig: Vec<u8> = b"media frame data segment"
478            .iter()
479            .copied()
480            .cycle()
481            .take(10_240)
482            .collect();
483        let compressed = c.compress(&orig).expect("compress");
484        let restored = c.decompress(&compressed).expect("decompress");
485        assert_eq!(restored, orig);
486    }
487}