Skip to main content

rvf_runtime/
compress.rs

1//! Zero-dependency LZ77 compression for QR seed microkernels.
2//!
3//! Simple but effective: 4 KB sliding window, match lengths 3-10,
4//! literal runs up to 128 bytes. Typical WASM compression ratio: 1.4-2.5x.
5//!
6//! Wire format (SCF-1 — Seed Compression Format):
7//! - Header: 4 bytes (original size as LE u32)
8//! - Token stream:
9//!   - `0x00..=0x7F` (bit 7 clear): Literal run, count = byte + 1 (1-128)
10//!   - `0x80..=0xFF` (bit 7 set): Back-reference
11//!     - length = ((byte >> 4) & 0x07) + 3 (3-10)
12//!     - offset = ((byte & 0x0F) << 8) | next_byte + 1 (1-4096)
13
14/// Compression errors.
15#[derive(Debug, PartialEq)]
16pub enum CompressError {
17    /// Compressed data too short to contain header.
18    TooShort,
19    /// Compressed stream is truncated.
20    Truncated,
21    /// Back-reference offset exceeds output size.
22    InvalidOffset,
23    /// Decompressed size doesn't match header.
24    SizeMismatch { expected: usize, got: usize },
25}
26
27impl core::fmt::Display for CompressError {
28    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
29        match self {
30            CompressError::TooShort => write!(f, "compressed data too short"),
31            CompressError::Truncated => write!(f, "compressed stream truncated"),
32            CompressError::InvalidOffset => write!(f, "invalid back-reference offset"),
33            CompressError::SizeMismatch { expected, got } => {
34                write!(f, "size mismatch: expected {expected}, got {got}")
35            }
36        }
37    }
38}
39
40/// Hash a 3-byte trigram for the LZ77 hash table.
41#[inline]
42fn trigram_hash(a: u8, b: u8, c: u8) -> usize {
43    (((a as usize) << 4) ^ ((b as usize) << 2) ^ (c as usize)) & 0xFFF
44}
45
46/// Flush accumulated literals to the output.
47fn flush_literals(output: &mut Vec<u8>, literals: &[u8]) {
48    let mut offset = 0;
49    while offset < literals.len() {
50        let chunk = core::cmp::min(128, literals.len() - offset);
51        output.push((chunk - 1) as u8); // 0x00..=0x7F
52        output.extend_from_slice(&literals[offset..offset + chunk]);
53        offset += chunk;
54    }
55}
56
57/// Compress data using LZ77 with a 4 KB sliding window.
58///
59/// Returns the compressed payload prefixed with a 4-byte original-size header.
60pub fn compress(input: &[u8]) -> Vec<u8> {
61    let mut output = Vec::with_capacity(input.len());
62
63    // Header: original size (LE u32).
64    output.extend_from_slice(&(input.len() as u32).to_le_bytes());
65
66    if input.is_empty() {
67        return output;
68    }
69
70    // Hash table: maps trigram hash → most recent position.
71    let mut table = [0u32; 4096];
72    let mut literals: Vec<u8> = Vec::new();
73    let mut pos = 0;
74
75    while pos < input.len() {
76        let mut best_len = 0usize;
77        let mut best_offset = 0usize;
78
79        if pos + 3 <= input.len() {
80            let hash = trigram_hash(input[pos], input[pos + 1], input[pos + 2]);
81            let candidate = table[hash] as usize;
82            table[hash] = pos as u32;
83
84            if candidate < pos && pos - candidate <= 4096 {
85                let max_len = core::cmp::min(10, input.len() - pos);
86                let mut match_len = 0;
87                while match_len < max_len
88                    && input[candidate + match_len] == input[pos + match_len]
89                {
90                    match_len += 1;
91                }
92                if match_len >= 3 {
93                    best_len = match_len;
94                    best_offset = pos - candidate;
95                }
96            }
97        }
98
99        if best_len >= 3 {
100            // Flush any pending literals first.
101            flush_literals(&mut output, &literals);
102            literals.clear();
103
104            // Emit match token: 1LLL_OOOO OOOOOOOO
105            let len_code = (best_len - 3) as u8; // 0-7
106            let offset_val = (best_offset - 1) as u16; // 0-4095
107            let offset_hi = ((offset_val >> 8) & 0x0F) as u8;
108            let offset_lo = (offset_val & 0xFF) as u8;
109
110            output.push(0x80 | (len_code << 4) | offset_hi);
111            output.push(offset_lo);
112
113            // Update hash table for positions within the match.
114            for i in 1..best_len {
115                if pos + i + 3 <= input.len() {
116                    let h = trigram_hash(input[pos + i], input[pos + i + 1], input[pos + i + 2]);
117                    table[h] = (pos + i) as u32;
118                }
119            }
120
121            pos += best_len;
122        } else {
123            literals.push(input[pos]);
124            pos += 1;
125        }
126    }
127
128    // Flush remaining literals.
129    flush_literals(&mut output, &literals);
130
131    output
132}
133
134/// Decompress SCF-1 data back to original bytes.
135pub fn decompress(compressed: &[u8]) -> Result<Vec<u8>, CompressError> {
136    if compressed.len() < 4 {
137        return Err(CompressError::TooShort);
138    }
139
140    let original_size = u32::from_le_bytes([
141        compressed[0],
142        compressed[1],
143        compressed[2],
144        compressed[3],
145    ]) as usize;
146
147    let mut output = Vec::with_capacity(original_size);
148    let mut pos = 4;
149
150    while output.len() < original_size && pos < compressed.len() {
151        let control = compressed[pos];
152        pos += 1;
153
154        if control & 0x80 == 0 {
155            // Literal run.
156            let count = (control as usize) + 1;
157            if pos + count > compressed.len() {
158                return Err(CompressError::Truncated);
159            }
160            output.extend_from_slice(&compressed[pos..pos + count]);
161            pos += count;
162        } else {
163            // Back-reference.
164            if pos >= compressed.len() {
165                return Err(CompressError::Truncated);
166            }
167            let length = (((control >> 4) & 0x07) as usize) + 3;
168            let offset_hi = (control & 0x0F) as usize;
169            let offset_lo = compressed[pos] as usize;
170            pos += 1;
171            let offset = (offset_hi << 8 | offset_lo) + 1;
172
173            if offset > output.len() {
174                return Err(CompressError::InvalidOffset);
175            }
176
177            let start = output.len() - offset;
178            for i in 0..length {
179                let byte = output[start + i];
180                output.push(byte);
181            }
182        }
183    }
184
185    if output.len() != original_size {
186        return Err(CompressError::SizeMismatch {
187            expected: original_size,
188            got: output.len(),
189        });
190    }
191
192    Ok(output)
193}
194
195#[cfg(test)]
196mod tests {
197    use super::*;
198
199    #[test]
200    fn empty_round_trip() {
201        let compressed = compress(b"");
202        assert_eq!(compressed, [0, 0, 0, 0]); // Just the size header.
203        let decompressed = decompress(&compressed).unwrap();
204        assert!(decompressed.is_empty());
205    }
206
207    #[test]
208    fn short_literal_round_trip() {
209        let input = b"Hello, World!";
210        let compressed = compress(input);
211        let decompressed = decompress(&compressed).unwrap();
212        assert_eq!(&decompressed, input);
213    }
214
215    #[test]
216    fn repeated_data_compresses() {
217        // Highly repetitive data should compress well.
218        let input: Vec<u8> = (0..1000).map(|i| (i % 7) as u8).collect();
219        let compressed = compress(&input);
220        assert!(
221            compressed.len() < input.len(),
222            "compressed {} >= original {}",
223            compressed.len(),
224            input.len()
225        );
226        let decompressed = decompress(&compressed).unwrap();
227        assert_eq!(decompressed, input);
228    }
229
230    #[test]
231    fn wasm_like_data_compresses() {
232        // Simulate WASM module: lots of zero runs and repeated patterns.
233        let mut wasm = Vec::new();
234        // Magic + version.
235        wasm.extend_from_slice(&[0x00, 0x61, 0x73, 0x6D, 0x01, 0x00, 0x00, 0x00]);
236        // Repeated section patterns.
237        for _ in 0..100 {
238            wasm.extend_from_slice(&[0x01, 0x06, 0x01, 0x60, 0x01, 0x7F, 0x01, 0x7F]);
239        }
240        // Zero fill.
241        wasm.resize(wasm.len() + 500, 0x00);
242
243        let compressed = compress(&wasm);
244        assert!(
245            compressed.len() < wasm.len(),
246            "compressed {} >= original {}",
247            compressed.len(),
248            wasm.len()
249        );
250        let decompressed = decompress(&compressed).unwrap();
251        assert_eq!(decompressed, wasm);
252    }
253
254    #[test]
255    fn random_like_data_round_trips() {
256        // Incompressible data should still round-trip correctly.
257        let input: Vec<u8> = (0..500).map(|i| ((i * 131 + 17) % 256) as u8).collect();
258        let compressed = compress(&input);
259        let decompressed = decompress(&compressed).unwrap();
260        assert_eq!(decompressed, input);
261    }
262
263    #[test]
264    fn large_data_round_trip() {
265        let input: Vec<u8> = (0..8000).map(|i| ((i * 37 + i / 100) % 256) as u8).collect();
266        let compressed = compress(&input);
267        let decompressed = decompress(&compressed).unwrap();
268        assert_eq!(decompressed, input);
269    }
270
271    #[test]
272    fn all_zeros_compress_well() {
273        let input = vec![0u8; 4096];
274        let compressed = compress(&input);
275        // 4096 zeros with 4KB window and match length 10 should compress very well.
276        assert!(compressed.len() < input.len() / 2);
277        let decompressed = decompress(&compressed).unwrap();
278        assert_eq!(decompressed, input);
279    }
280
281    #[test]
282    fn decompress_truncated_fails() {
283        let compressed = compress(b"test data for truncation");
284        // Truncate the compressed data.
285        let truncated = &compressed[..compressed.len() / 2];
286        assert!(decompress(truncated).is_err());
287    }
288
289    #[test]
290    fn decompress_too_short_fails() {
291        assert_eq!(decompress(&[0, 0]), Err(CompressError::TooShort));
292    }
293
294    #[test]
295    fn compress_error_display() {
296        let e = CompressError::SizeMismatch {
297            expected: 100,
298            got: 50,
299        };
300        assert!(format!("{e}").contains("100"));
301    }
302
303    #[test]
304    fn exactly_128_byte_literal_run() {
305        // 128 unique bytes forces exactly one max-length literal run.
306        let input: Vec<u8> = (0..128).map(|i| (i * 2 + 1) as u8).collect();
307        let compressed = compress(&input);
308        let decompressed = decompress(&compressed).unwrap();
309        assert_eq!(decompressed, input);
310    }
311}