compcol 0.6.5

A no_std collection of compression algorithms behind a uniform streaming trait, gated per-algorithm by Cargo features.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
#![allow(dead_code)] // rle2_inverse is unused but kept symmetric for tests

//! bzip2's two run-length passes.
//!
//! ## RLE-1 (pre-BWT)
//!
//! Run-length encode the **raw input** before BWT: any run of 4..=255
//! identical bytes is replaced with 4 copies of the byte followed by a
//! single "extra count" byte holding `(run_length - 4) in 0..=251`.
//! Runs of 1..=3 bytes are emitted as-is. The decoder must run the
//! inverse pass after BWT⁻¹/MTF⁻¹/RLE-2⁻¹.
//!
//! Why: bzip2's BWT is O(n log n) on the average sort cost; runs of
//! identical bytes blow that up to O(n²) on the comparison side
//! (suffixes starting inside a long run all share an arbitrarily long
//! prefix). Capping runs at 4-then-count keeps the suffix sort
//! manageable.
//!
//! ## RLE-2 (post-MTF, zero-only)
//!
//! After MTF, the symbol stream is dominated by zeros (because BWT
//! concentrates runs of the same byte and MTF turns each such run into
//! a long stretch of zeros). RLE-2 encodes runs of zeros as a pair of
//! synthetic symbols `RUNA = 0` and `RUNB = 1`:
//!
//! ```text
//! run_length = 1·(RUNA?1:2) + 2·(RUNA?1:2) + 4·(RUNA?1:2) + ... + 1
//! ```
//!
//! That is, each subsequent RUNA/RUNB in the run doubles the
//! contribution; RUNA contributes `1×2^k`, RUNB contributes `2×2^k`.
//! The final `+1` makes the encoding self-delimiting (no run of zero
//! length).
//!
//! All non-zero MTF indices `i` are emitted as the symbol `i + 1` (so
//! symbol 2 = MTF index 1, etc.); the post-RLE-2 alphabet is
//! `{ RUNA, RUNB, mtf+1 ... }` followed by an explicit end-of-block
//! marker `EOB = alphabet_size_after_rle2 - 1`.

extern crate alloc;
use alloc::vec::Vec;

// ─── RLE-1 (raw bytes, encoder side) ─────────────────────────────────────

/// Apply bzip2's RLE-1 pre-pass to `input`.
///
/// Replace any run of `R` (3 < R ≤ 255) identical bytes with 4 copies of
/// the byte followed by a single byte holding `R - 4` (0..=251). Runs of
/// 1, 2, 3, or 4 bytes are emitted verbatim — but a run of exactly 4
/// must still be followed by a zero "extra-count" byte, because the
/// decoder always reads the count byte after seeing the 4th identical
/// byte in a row.
pub(crate) fn rle1_forward(input: &[u8]) -> Vec<u8> {
    let mut out = Vec::with_capacity(input.len());
    let mut i = 0;
    while i < input.len() {
        let b = input[i];
        // Find run length, capped at 255 (the max we can encode in one
        // 4+count token).
        let mut run = 1usize;
        while i + run < input.len() && input[i + run] == b && run < 255 {
            run += 1;
        }
        if run < 4 {
            // Emit raw.
            for _ in 0..run {
                out.push(b);
            }
        } else {
            // 4 copies + 1 byte holding (run - 4).
            out.push(b);
            out.push(b);
            out.push(b);
            out.push(b);
            out.push((run - 4) as u8);
        }
        i += run;
    }
    out
}

/// Streaming RLE-1 encoder used by the block-builder.
///
/// Reference bzip2 sizes each block by its **post-RLE-1** length
/// (`nblock`), not by the count of raw input bytes — so on compressible
/// data it packs more raw bytes into a 900 KB block than a raw-input cap
/// would. To match that (and the reference's block count and ratio) we
/// feed raw bytes through this incremental encoder and cut a block once
/// the emitted length reaches the per-block cap.
///
/// The encoder tracks the in-progress run across `push` calls so a run
/// straddling a call boundary is encoded identically to the one-shot
/// [`rle1_forward`].
pub(crate) struct Rle1Encoder {
    out: Vec<u8>,
    /// Byte value of the current run (valid iff `run > 0`).
    run_byte: u8,
    /// Length of the current pending run (1..=255 while building).
    run: usize,
}

impl Rle1Encoder {
    pub(crate) fn new() -> Self {
        Self {
            out: Vec::new(),
            run_byte: 0,
            run: 0,
        }
    }

    /// Current emitted (post-RLE-1) length, **including** the bytes the
    /// pending run will contribute once flushed. Used by the caller to
    /// decide when a block is full.
    pub(crate) fn encoded_len(&self) -> usize {
        self.out.len() + Self::run_cost(self.run)
    }

    /// How many output bytes a finished run of `run` identical bytes
    /// costs: runs <4 are verbatim, runs >=4 cost 4 literals + 1 count.
    fn run_cost(run: usize) -> usize {
        if run == 0 {
            0
        } else if run < 4 {
            run
        } else {
            5
        }
    }

    /// Flush the pending run into `out`.
    fn flush_run(&mut self) {
        let b = self.run_byte;
        if self.run == 0 {
            return;
        }
        if self.run < 4 {
            for _ in 0..self.run {
                self.out.push(b);
            }
        } else {
            self.out.push(b);
            self.out.push(b);
            self.out.push(b);
            self.out.push(b);
            self.out.push((self.run - 4) as u8);
        }
        self.run = 0;
    }

    /// Feed one raw byte.
    pub(crate) fn push(&mut self, b: u8) {
        if self.run > 0 && b == self.run_byte && self.run < 255 {
            self.run += 1;
            return;
        }
        // Different byte, or the 255-cap reached: flush and start anew.
        self.flush_run();
        self.run_byte = b;
        self.run = 1;
    }

    /// Finish: flush the pending run and return the encoded block.
    pub(crate) fn finish(mut self) -> Vec<u8> {
        self.flush_run();
        self.out
    }
}

/// Invert bzip2's RLE-1 pre-pass. Streaming-friendly: consumes the full
/// `input` and returns the reconstituted raw bytes.
pub(crate) fn rle1_inverse(input: &[u8]) -> Vec<u8> {
    let mut out = Vec::with_capacity(input.len());
    let mut i = 0;
    while i < input.len() {
        let b = input[i];
        out.push(b);
        i += 1;
        if i >= input.len() || input[i] != b {
            continue;
        }
        out.push(b);
        i += 1;
        if i >= input.len() || input[i] != b {
            continue;
        }
        out.push(b);
        i += 1;
        if i >= input.len() || input[i] != b {
            continue;
        }
        // Fourth copy.
        out.push(b);
        i += 1;
        // Next byte is the extra-count.
        if i < input.len() {
            let extra = input[i] as usize;
            i += 1;
            for _ in 0..extra {
                out.push(b);
            }
        }
        // If we ran off the end before seeing the count byte, the
        // stream is malformed; let the caller's framing layer notice.
    }
    out
}

// ─── RLE-2 (post-MTF, zero-only) ─────────────────────────────────────────

/// Symbols emitted by RLE-2. Layout (encoder-side view):
/// - 0: RUNA  (contribution 1 × 2^k)
/// - 1: RUNB  (contribution 2 × 2^k)
/// - 2..: original MTF index `i ≥ 1` becomes the symbol `i + 1`
///
/// The caller adds the end-of-block marker (alphabet size - 1) after
/// the last symbol — that's outside RLE-2's purview.
///
/// Returns the symbol stream and the alphabet size used so far,
/// **without** the EOB marker counted.
pub(crate) fn rle2_forward(mtf_indices: &[u8], alphabet_size: usize) -> Vec<u16> {
    // alphabet_size = N = number of distinct bytes in the block. MTF
    // indices live in 0..N. RLE-2 maps:
    //   MTF index 0..=0 → RUNA/RUNB run
    //   MTF index 1..N → symbol (i + 1), i.e. 2..=N
    // Total symbols (excluding EOB) = N + 1; the EOB sits at index N+1,
    // so the full alphabet size on the Huffman side is N + 2.
    let _ = alphabet_size; // used only for documentation; we trust the input.

    let mut out: Vec<u16> = Vec::with_capacity(mtf_indices.len());
    let mut run: u32 = 0;
    for &i in mtf_indices {
        if i == 0 {
            run += 1;
        } else {
            // Flush any pending zero run.
            emit_run(&mut out, run);
            run = 0;
            // Non-zero MTF index `i` (1..=alphabet_size-1) becomes
            // symbol `i + 1` (2..=alphabet_size).
            out.push((i as u16) + 1);
        }
    }
    emit_run(&mut out, run);
    out
}

fn emit_run(out: &mut Vec<u16>, mut run: u32) {
    // bzip2 encodes (run + 1) in a sort of bijective base-2 using
    // {RUNA, RUNB}: bit value 1 → RUNA, bit value 2 → RUNB, low order
    // first, with the implicit final +1 lost on encode (handled by
    // the encoder writing (run + 1) and stripping the high "1" bit).
    //
    // The classical recipe (see bzip2's encoder pseudo-code):
    //   while run > 0:
    //     if run odd:
    //       emit RUNA; run = (run - 1) / 2
    //     else:
    //       emit RUNB; run = (run - 2) / 2
    if run == 0 {
        return;
    }
    loop {
        if run % 2 == 1 {
            out.push(0); // RUNA
            run = (run - 1) / 2;
        } else {
            out.push(1); // RUNB
            run = (run - 2) / 2;
        }
        if run == 0 {
            break;
        }
    }
}

/// Inverse of `rle2_forward`. Given a stream of decoded symbols
/// (RUNA=0, RUNB=1, MTF+1=2..=alphabet_size, plus EOB at
/// alphabet_size+1), return the MTF index stream that produced it.
///
/// The EOB is **not** expected to be present in `symbols` — the decoder
/// strips it before handing us the rest.
pub(crate) fn rle2_inverse(symbols: &[u16]) -> Vec<u8> {
    let mut out = Vec::with_capacity(symbols.len());
    let mut i = 0;
    while i < symbols.len() {
        let s = symbols[i];
        if s <= 1 {
            // Decode a run of zeros. Read consecutive RUNA/RUNB symbols
            // and accumulate the value. The run length is:
            //   sum over k of contribution(symbol_k) × 2^k
            // where contribution(RUNA) = 1, contribution(RUNB) = 2.
            let mut run: u32 = 0;
            let mut weight: u32 = 1;
            while i < symbols.len() && symbols[i] <= 1 {
                let contrib = if symbols[i] == 0 { 1 } else { 2 };
                run = run.saturating_add(contrib * weight);
                weight = weight.saturating_mul(2);
                i += 1;
            }
            out.extend(core::iter::repeat_n(0u8, run as usize));
        } else {
            // s >= 2 means MTF index (s - 1) >= 1.
            out.push((s - 1) as u8);
            i += 1;
        }
    }
    out
}

#[cfg(test)]
mod tests {
    use super::*;
    use alloc::vec;

    #[test]
    fn rle1_short_runs_passthrough() {
        let v = b"abcabc";
        assert_eq!(rle1_forward(v), v);
        assert_eq!(rle1_inverse(&rle1_forward(v)), v);
    }

    #[test]
    fn rle1_run_of_four_emits_extra() {
        // Run of 4 → 4 copies + 0.
        let v = b"aaaa";
        let r = rle1_forward(v);
        assert_eq!(r, vec![b'a', b'a', b'a', b'a', 0]);
        assert_eq!(rle1_inverse(&r), v);
    }

    #[test]
    fn rle1_run_of_seven() {
        let v = b"aaaaaaa";
        let r = rle1_forward(v);
        assert_eq!(r, vec![b'a', b'a', b'a', b'a', 3]);
        assert_eq!(rle1_inverse(&r), v);
    }

    #[test]
    fn rle1_long_run_capped() {
        // 256 'a's → 4 'a's + 251 (run-4) = 255, then a run of 1 'a'.
        // Wait: in our impl we cap at 255 so the output is one 4+251
        // token (covering 255 bytes) followed by a single 'a'.
        let v = vec![b'a'; 256];
        let r = rle1_forward(&v);
        // First five bytes are 'a','a','a','a',251 (covers 255 bytes),
        // then 'a' (the leftover one).
        assert_eq!(r.len(), 6);
        assert_eq!(r[..5], [b'a', b'a', b'a', b'a', 251]);
        assert_eq!(r[5], b'a');
        assert_eq!(rle1_inverse(&r), v);
    }

    /// Feed `input` through the streaming encoder one byte at a time
    /// and return the finished output.
    fn rle1_stream(input: &[u8]) -> Vec<u8> {
        let mut e = Rle1Encoder::new();
        for &b in input {
            e.push(b);
        }
        e.finish()
    }

    #[test]
    fn rle1_stream_matches_oneshot() {
        // The streaming encoder must produce byte-for-byte the same
        // output as the one-shot `rle1_forward` on a range of inputs,
        // including run-straddling and the 255-cap boundary.
        let cases: Vec<Vec<u8>> = vec![
            b"".to_vec(),
            b"a".to_vec(),
            b"abcabc".to_vec(),
            b"aaaa".to_vec(),
            b"aaaaaaa".to_vec(),
            vec![b'a'; 255],
            vec![b'a'; 256],
            vec![b'a'; 600],
            {
                let mut v = vec![b'x'; 300];
                v.extend(vec![b'y'; 5]);
                v.extend(b"zzz");
                v.extend(vec![b'q'; 1000]);
                v
            },
        ];
        for c in &cases {
            assert_eq!(rle1_stream(c), rle1_forward(c), "mismatch len {}", c.len());
        }
    }

    #[test]
    fn rle1_stream_encoded_len_tracks_output() {
        // `encoded_len()` must equal the final output length at finish,
        // including the pending run's contribution.
        for n in [0usize, 1, 3, 4, 5, 254, 255, 256, 700] {
            let input = vec![b'a'; n];
            let mut e = Rle1Encoder::new();
            for &b in &input {
                e.push(b);
            }
            let predicted = e.encoded_len();
            let out = e.finish();
            assert_eq!(predicted, out.len(), "encoded_len mismatch for n={n}");
            assert_eq!(out, rle1_forward(&input));
        }
    }

    #[test]
    fn rle2_round_trip() {
        // Use a synthetic MTF stream with zeros mixed in.
        let mtf = vec![0u8, 0, 0, 1, 2, 0, 0, 0, 0, 5, 0];
        let alphabet_size = 6;
        let sym = rle2_forward(&mtf, alphabet_size);
        let inv = rle2_inverse(&sym);
        assert_eq!(inv, mtf);
    }

    #[test]
    fn rle2_empty() {
        let sym = rle2_forward(&[], 4);
        assert!(sym.is_empty());
        assert!(rle2_inverse(&sym).is_empty());
    }
}