compcol 0.6.5

A no_std collection of compression algorithms behind a uniform streaming trait, gated per-algorithm by Cargo features.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
//! bzip2 streaming encoder.
//!
//! The encoder buffers input up to the configured block size, then
//! produces an encoded block on `raw_finish` (or whenever the buffer
//! fills). The pipeline per block is:
//!
//! ```text
//! raw bytes → RLE-1 → BWT → MTF → RLE-2 → multi-table Huffman → bitstream
//! ```
//!
//! The output is staged into a `Vec<u8>` that we drain into the
//! caller's output buffer. We emit:
//! - The 4-byte stream header `"BZh<level>"` exactly once, on the first
//!   `raw_encode`/`raw_finish` call.
//! - One block payload per filled or finished block.
//! - The stream footer (end-of-stream magic + combined CRC + byte-
//!   align padding) once `raw_finish` runs.

extern crate alloc;
use alloc::vec;
use alloc::vec::Vec;

use crate::error::Error;
use crate::traits::{RawEncoder, RawProgress};

use super::bits::BitWriter;
use super::bwt::bwt_forward;
use super::crc::Crc32;
use super::huffman::{MAX_CODE_LEN, build_canonical_codes, build_canonical_lengths};
use super::mtf::mtf_forward_reduced;
use super::rle::{Rle1Encoder, rle1_forward, rle2_forward};

/// Tunables for the bzip2 encoder.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct EncoderConfig {
    /// Block size in 100 KB units, 1..=9. Default 6 (matches reference
    /// bzip2). The post-RLE-1 buffer is capped at
    /// `level * 100_000 - 19` bytes.
    pub level: u8,
}

impl Default for EncoderConfig {
    fn default() -> Self {
        Self { level: 6 }
    }
}

/// Bzip2 stream/block magic numbers as 48-bit fields (high 48 bits of
/// the long-magic constants).
const BLOCK_MAGIC: u64 = 0x3141_5926_5359; // "1AY&SY"
const STREAM_END_MAGIC: u64 = 0x1772_4538_5090; // "sqrt(pi)"

/// Maximum number of Huffman selector groups per block. bzip2 splits
/// the symbol stream into 50-symbol groups and assigns each group one
/// of the (2..=6) Huffman tables. We cap the per-block selector count
/// at this value to bound encoder memory.
const MAX_SELECTORS: usize = 18002; // matches bzip2 reference

#[derive(Clone, Copy, PartialEq, Eq)]
enum Phase {
    /// Accepting raw input into `pending`. Header may or may not be
    /// flushed yet (header bytes live in `out` if not).
    Accepting,
    /// All pending input has been consumed, encoded blocks (if any)
    /// drained, and the stream footer written. `out` may still hold
    /// bytes to drain to the caller.
    Flushed,
    /// Stream footer fully drained.
    Done,
}

pub struct Encoder {
    config: EncoderConfig,
    /// Per-block raw input buffer. Retained verbatim because the block
    /// CRC is computed over the raw bytes; `encode_block` re-runs RLE-1
    /// over it. Block boundaries, however, are governed by the *post*-
    /// RLE-1 size tracked in `rle1` (matching reference bzip2's
    /// `nblock`-based blocking), not by `pending.len()`.
    pending: Vec<u8>,
    /// Streaming RLE-1 size tracker mirroring `pending`. Used only to
    /// decide when the current block has reached the post-RLE-1 cap.
    rle1: Rle1Encoder,
    /// Encoded bytes waiting to be returned to the caller.
    out: Vec<u8>,
    /// Index into `out` of the next byte to deliver.
    out_idx: usize,
    /// Whether the stream header has been queued into `out`.
    header_written: bool,
    /// Running combined CRC across all blocks so far.
    combined_crc: u32,
    /// In-flight bit accumulator used to splice the per-block bit
    /// streams into one contiguous bitstream. `None` before the
    /// header is written.
    bw: Option<BitWriter>,
    phase: Phase,
}

impl Encoder {
    pub fn new() -> Self {
        Self::with_config(EncoderConfig::default())
    }

    pub fn with_config(mut config: EncoderConfig) -> Self {
        // Clamp the level to 1..=9 (reference bzip2 behaviour).
        config.level = config.level.clamp(1, 9);
        Self {
            config,
            pending: Vec::new(),
            rle1: Rle1Encoder::new(),
            out: Vec::new(),
            out_idx: 0,
            header_written: false,
            combined_crc: 0,
            bw: None,
            phase: Phase::Accepting,
        }
    }

    /// Maximum number of post-RLE-1 bytes per block. We follow the
    /// reference upper bound `level * 100_000 - 19`; the
    /// `-19` cushions the worst-case expansion of pathological inputs
    /// passing through the post-MTF / Huffman layers.
    fn block_input_cap(&self) -> usize {
        (self.config.level as usize) * 100_000 - 19
    }

    /// Lazy-init the stream header bit accumulator.
    fn ensure_header(&mut self) {
        if self.header_written {
            return;
        }
        // "BZh" + ASCII level digit.
        self.out.push(b'B');
        self.out.push(b'Z');
        self.out.push(b'h');
        self.out.push(b'0' + self.config.level);
        self.bw = Some(BitWriter::new());
        self.header_written = true;
    }

    /// Encode the bytes currently buffered in `pending` into a single
    /// bzip2 block, appending its bit-stream to `self.bw`. Resets
    /// `pending` to empty afterwards.
    fn encode_block(&mut self) {
        let block: Vec<u8> = core::mem::take(&mut self.pending);
        // Reset the RLE-1 size tracker for the next block.
        self.rle1 = Rle1Encoder::new();
        // Sanity: a "no input" block is not produced — we only call
        // encode_block when pending is non-empty.
        debug_assert!(!block.is_empty());

        // Step 1: per-block CRC of the **raw** bytes (CRC-32/MPEG-2).
        let mut crc = Crc32::new();
        crc.update(&block);
        let block_crc = crc.value();

        // Update the running combined CRC: rotate-left then XOR.
        self.combined_crc = self.combined_crc.rotate_left(1) ^ block_crc;

        // Step 2: RLE-1.
        let rle1 = rle1_forward(&block);

        // Step 3: BWT.
        let (l_column, origin) = bwt_forward(&rle1);

        // Step 4: reduced alphabet of the bytes that appear in L. The
        // 16-byte stripe map encoding (in the header) needs this.
        let mut present = [false; 256];
        for &b in &l_column {
            present[b as usize] = true;
        }
        let alphabet: Vec<u8> = (0..=255u8).filter(|&b| present[b as usize]).collect();
        let num_used = alphabet.len();
        // bzip2 forbids the degenerate empty alphabet case; rle1 of a
        // non-empty input is non-empty, so num_used ≥ 1.

        // Step 5: MTF over the reduced alphabet.
        let mtf = mtf_forward_reduced(&l_column, &alphabet);
        // Free the BWT output before RLE-2 — it can be large at the
        // 900 KB block size.
        drop(l_column);

        // Step 6: RLE-2. The output uses symbols 0..=num_used + 1
        // (0/1 = RUNA/RUNB, 2..=num_used = MTF idx 1..=num_used-1,
        // num_used + 1 = EOB). We append the EOB after the last RLE-2
        // symbol so the decoder has an unambiguous stop signal.
        let mut symbols = rle2_forward(&mtf, num_used);
        let eob_symbol: u16 = (num_used as u16) + 1;
        symbols.push(eob_symbol);
        drop(mtf);

        let alpha_size = num_used + 2; // includes EOB
        // Step 7: choose number of Huffman tables. bzip2 chooses
        // 2..=6 based on the per-block symbol-count buckets.
        let num_tables = pick_num_tables(symbols.len());

        // Step 8: build the multi-table Huffman assignment exactly the
        // way reference bzip2's `sendMTFValues` does: initialise the
        // tables by partitioning the cumulative-frequency space, then
        // run a fixed number of refinement passes that (a) assign each
        // 50-symbol group to whichever table codes it cheapest and
        // (b) rebuild each table from the symbols of the groups
        // assigned to it. This is where the compression edge over a
        // single shared table comes from.
        let num_selectors_total = symbols.len().div_ceil(50);
        debug_assert!(num_selectors_total >= 1);
        debug_assert!(num_selectors_total <= MAX_SELECTORS);

        let (tables, selectors) =
            optimize_tables(&symbols, alpha_size, num_tables, num_selectors_total);

        // Build canonical codes for each table.
        let codes: Vec<Vec<u32>> = tables
            .iter()
            .map(|lens| build_canonical_codes(lens))
            .collect();

        // ─── Now write the block to self.bw ──────────────────────
        let bw = self.bw.as_mut().expect("header must be written first");

        // Block magic (48 bits).
        bw.write_bits_48(BLOCK_MAGIC);
        // Per-block CRC (32 bits, MSB-first).
        bw.write_bits(32, block_crc);
        // Randomized flag (1 bit) — always 0 for modern bzip2.
        bw.write_bit(0);
        // BWT origin (24 bits).
        bw.write_bits(24, origin);

        // Symbol map: 16 bits saying which 16-byte stripes have any
        // present symbol, then for each used stripe a 16-bit mask.
        let mut stripe_used = [false; 16];
        for &b in &alphabet {
            stripe_used[(b >> 4) as usize] = true;
        }
        let mut stripe_top: u16 = 0;
        for (i, &u) in stripe_used.iter().enumerate() {
            if u {
                stripe_top |= 1 << (15 - i);
            }
        }
        bw.write_bits(16, stripe_top as u32);
        for (stripe, &used) in stripe_used.iter().enumerate() {
            if !used {
                continue;
            }
            let mut mask: u16 = 0;
            for byte in 0..16 {
                let candidate = (stripe << 4) | byte;
                if present[candidate] {
                    mask |= 1 << (15 - byte);
                }
            }
            bw.write_bits(16, mask as u32);
        }

        // Number of Huffman tables (3 bits, 2..=6).
        bw.write_bits(3, num_tables as u32);
        // Number of selectors (15 bits, ≥1).
        bw.write_bits(15, num_selectors_total as u32);

        // MTF-encoded selector list. Each selector is encoded under a
        // local MTF over 0..num_tables: unary prefix of N zeros to
        // pick the value at MTF position N, followed by a 0 stop bit.
        // For num_tables=2 and all selectors=0, the MTF position is
        // always 0 → each selector is just a single "0" bit.
        // We implement the general MTF-then-unary scheme anyway.
        let mut mtf_list: Vec<u8> = (0..num_tables as u8).collect();
        for &sel in &selectors {
            // Find sel's position in mtf_list.
            let mut pos = 0;
            while mtf_list[pos] != sel {
                pos += 1;
            }
            // Emit `pos` 1-bits then a 0-bit (unary stop-coded).
            for _ in 0..pos {
                bw.write_bit(1);
            }
            bw.write_bit(0);
            // Move sel to the front of mtf_list.
            if pos > 0 {
                let v = mtf_list.remove(pos);
                mtf_list.insert(0, v);
            }
        }

        // Per-table code lengths: 5-bit start length + delta-coded
        // changes (10 = +1, 11 = -1, stop on 0).
        for table in &tables {
            let mut cur = table[0] as i32;
            bw.write_bits(5, cur as u32);
            for &l in table.iter() {
                let target = l as i32;
                while cur != target {
                    if target > cur {
                        // 10 = +1
                        bw.write_bit(1);
                        bw.write_bit(0);
                        cur += 1;
                    } else {
                        // 11 = -1
                        bw.write_bit(1);
                        bw.write_bit(1);
                        cur -= 1;
                    }
                }
                // 0 = "stop, this symbol's length is cur".
                bw.write_bit(0);
            }
        }

        // The post-MTF symbol stream, group by group, using the
        // selector for each group's table.
        let mut group_idx = 0usize;
        let mut i = 0usize;
        while i < symbols.len() {
            let end = (i + 50).min(symbols.len());
            let sel = selectors[group_idx] as usize;
            let lens = &tables[sel];
            let cds = &codes[sel];
            for &s in &symbols[i..end] {
                let len = lens[s as usize] as u32;
                let code = cds[s as usize];
                bw.write_bits(len, code);
            }
            group_idx += 1;
            i = end;
        }
        // tables, codes, selectors no longer needed; let the borrow
        // end before we move on to the next block.
        drop(tables);
        drop(codes);
        drop(selectors);
    }

    /// Drain currently-pending bytes from `self.out` into `output`.
    fn drain_out(&mut self, output: &mut [u8], written: &mut usize) {
        let avail = self.out.len() - self.out_idx;
        let space = output.len() - *written;
        let n = avail.min(space);
        if n > 0 {
            output[*written..*written + n]
                .copy_from_slice(&self.out[self.out_idx..self.out_idx + n]);
            self.out_idx += n;
            *written += n;
        }
        if self.out_idx == self.out.len() {
            self.out.clear();
            self.out_idx = 0;
        }
    }

    /// Flush the in-progress bit writer's whole-byte content into
    /// `self.out` (the writer keeps any partial byte still buffered).
    fn flush_full_bytes(&mut self) {
        if let Some(ref mut bw) = self.bw {
            // The BitWriter's API exposes `into_bytes` (consuming) and
            // `align_to_byte` — neither is "give me the complete bytes
            // and keep the partial byte". We work around this by
            // taking the writer out, splitting the assembled bytes,
            // and reinstalling a fresh writer with the partial bits
            // restored.
            //
            // Practical detail: between blocks bzip2 streams may be at
            // an unaligned boundary, so we MUST preserve the partial
            // byte across calls. We achieve this by always emitting
            // bits eagerly into `bw` (BitWriter pushes a whole byte to
            // its internal Vec as soon as 8 bits accumulate) and
            // periodically extracting those whole bytes into `out`.
            let taken = core::mem::replace(bw, BitWriter::new());
            // Split the internal vector from the trailing partial byte
            // by going through the writer's internals — which we
            // expose via a dedicated helper.
            let (bytes, cur, nbits) = bw_internals(taken);
            self.out.extend_from_slice(&bytes);
            *bw = bw_rehydrate(cur, nbits);
        }
    }

    /// Write the stream footer (end-of-stream magic + combined CRC + byte align).
    fn write_footer(&mut self) {
        let bw = self.bw.as_mut().expect("header must be written first");
        bw.write_bits_48(STREAM_END_MAGIC);
        bw.write_bits(32, self.combined_crc);
        bw.align_to_byte();
        let taken = core::mem::replace(bw, BitWriter::new());
        let (bytes, _, _) = bw_internals(taken);
        self.out.extend_from_slice(&bytes);
        self.bw = None;
    }
}

// ─── BitWriter internals access helpers ──────────────────────────────────
//
// These let us peek into the writer to extract its complete-byte buffer
// without losing the partial-byte state. We keep them tightly scoped to
// this module so the BitWriter remains a regular type elsewhere.

fn bw_internals(bw: BitWriter) -> (Vec<u8>, u8, u8) {
    bw.internals_for_encoder()
}

fn bw_rehydrate(cur: u8, nbits: u8) -> BitWriter {
    BitWriter::rehydrate(cur, nbits)
}

// ─── Huffman table count chooser ────────────────────────────────────────

/// Pick the number of Huffman tables (2..=6) for a block with
/// `n_symbols` post-RLE-2 symbols (including the EOB marker).
///
/// Reference bzip2 uses: 2 if n<200, 3 if n<600, 4 if n<1200, 5 if
/// n<2400, else 6. We mirror that.
fn pick_num_tables(n_symbols: usize) -> usize {
    if n_symbols < 200 {
        2
    } else if n_symbols < 600 {
        3
    } else if n_symbols < 1200 {
        4
    } else if n_symbols < 2400 {
        5
    } else {
        6
    }
}

/// Number of refinement passes over the table/selector assignment.
/// Reference bzip2 uses `BZ_N_ITERS = 4`.
const HUFF_N_ITERS: usize = 4;

/// Symbol-group size used when assigning selectors. Reference bzip2
/// uses `BZ_G_SIZE = 50`.
const HUFF_GROUP_SIZE: usize = 50;

/// A code length placeholder used during table initialisation/cost
/// scoring. Mirrors reference bzip2's `BZ_LESSER_ICOST` (0) and
/// `BZ_GREATER_ICOST` (15).
const ICOST_LESSER: u8 = 0;
const ICOST_GREATER: u8 = 15;

/// Build `num_tables` Huffman code-length tables and a per-group
/// selector list using reference bzip2's `sendMTFValues` strategy.
///
/// Returns `(tables, selectors)` where `tables[t]` is the per-symbol
/// code-length array (length `alpha_size`) for table `t`, and
/// `selectors[g]` is the table id (0..num_tables) chosen for the g-th
/// 50-symbol group.
fn optimize_tables(
    symbols: &[u16],
    alpha_size: usize,
    num_tables: usize,
    num_groups: usize,
) -> (Vec<Vec<u8>>, Vec<u8>) {
    // Global symbol frequencies across the whole block.
    let mut global_freq = vec![0u32; alpha_size];
    for &s in symbols {
        global_freq[s as usize] += 1;
    }

    // ── Initial table construction ────────────────────────────────
    //
    // Faithful port of reference bzip2's `sendMTFValues` initialiser:
    // partition the alphabet into `num_tables` contiguous bands of
    // (roughly) equal total frequency, walking symbols low→high. Band
    // `nPart-1` (i.e. tables fill from the highest id down to 0) covers
    // the next slice of low symbols; in-band symbols get the cheap
    // placeholder length, the rest the expensive one. There is an
    // odd-iteration back-off that nudges the band boundary, matching
    // the reference exactly so our initial assignment — and therefore
    // the refinement that follows — tracks bzip2's.
    let mut tables: Vec<Vec<u8>> = (0..num_tables)
        .map(|_| vec![ICOST_GREATER; alpha_size])
        .collect();
    {
        let n_mtf = symbols.len() as i64;
        let mut n_part = num_tables as i64;
        let mut rem_f = n_mtf;
        let mut gs = 0i64;
        while n_part > 0 {
            let t_freq = rem_f / n_part;
            let mut ge = gs - 1;
            let mut a_freq = 0i64;
            while a_freq < t_freq && ge < alpha_size as i64 - 1 {
                ge += 1;
                a_freq += global_freq[ge as usize] as i64;
            }
            // Odd-iteration back-off: if this isn't the first part, the
            // boundary lands above `gs`, the part index parity is odd,
            // and backing off keeps `a_freq` closer to the target.
            if ge > gs
                && n_part != num_tables as i64
                && n_part != 1
                && ((num_tables as i64 - n_part) % 2 == 1)
            {
                a_freq -= global_freq[ge as usize] as i64;
                ge -= 1;
            }

            let lens = &mut tables[(n_part - 1) as usize];
            for (v, slot) in lens.iter_mut().enumerate() {
                let vi = v as i64;
                if vi >= gs && vi <= ge {
                    *slot = ICOST_LESSER;
                } else {
                    *slot = ICOST_GREATER;
                }
            }

            n_part -= 1;
            gs = ge + 1;
            rem_f -= a_freq;
        }
    }

    let mut selectors = vec![0u8; num_groups];

    // ── Refinement passes ─────────────────────────────────────────
    for _iter in 0..HUFF_N_ITERS {
        // Per-table accumulated frequencies for this pass.
        let mut table_freq: Vec<Vec<u32>> = vec![vec![0u32; alpha_size]; num_tables];

        // For each group: score it under every table, pick the cheapest,
        // record the selector, and fold the group's symbols into the
        // winning table's frequency accumulator.
        let mut g = 0usize;
        let mut group_idx = 0usize;
        while g < symbols.len() {
            let end = (g + HUFF_GROUP_SIZE).min(symbols.len());
            let group = &symbols[g..end];

            // Cost of coding this group under each table.
            let mut best_table = 0usize;
            let mut best_cost = u64::MAX;
            for (t, lens) in tables.iter().enumerate() {
                let mut cost = 0u64;
                for &s in group {
                    cost += lens[s as usize] as u64;
                }
                if cost < best_cost {
                    best_cost = cost;
                    best_table = t;
                }
            }

            selectors[group_idx] = best_table as u8;
            let acc = &mut table_freq[best_table];
            for &s in group {
                acc[s as usize] += 1;
            }

            group_idx += 1;
            g = end;
        }

        // Rebuild each table from the frequencies of the groups assigned
        // to it. A table with no assigned groups keeps coverage via the
        // `+1` floor inside `build_canonical_lengths`.
        for (t, freq) in table_freq.iter().enumerate() {
            tables[t] = build_canonical_lengths(freq, MAX_CODE_LEN);
        }
    }

    (tables, selectors)
}

impl Default for Encoder {
    fn default() -> Self {
        Self::new()
    }
}

impl RawEncoder for Encoder {
    fn raw_encode(&mut self, input: &[u8], output: &mut [u8]) -> Result<RawProgress, Error> {
        if matches!(self.phase, Phase::Done | Phase::Flushed) {
            return Ok(RawProgress {
                consumed: 0,
                written: 0,
                done: false,
            });
        }
        self.ensure_header();

        let cap = self.block_input_cap();
        let mut consumed = 0usize;
        let mut written = 0usize;

        // First drain any already-encoded bytes that didn't fit on the
        // previous call.
        self.drain_out(output, &mut written);
        if written == output.len() && !self.out.is_empty() {
            return Ok(RawProgress {
                consumed,
                written,
                done: false,
            });
        }

        // Now accept input, filling each block up to the per-block
        // *post-RLE-1* cap (reference bzip2 sizes blocks by `nblock`,
        // the RLE-1 output length). We feed bytes through both the raw
        // buffer (kept for the CRC and the in-block RLE-1 re-run) and
        // the streaming size tracker, cutting a block the moment the
        // tracked RLE-1 length reaches the cap. When a block fills we
        // encode it, drain to output, and continue.
        while consumed < input.len() {
            let b = input[consumed];
            self.pending.push(b);
            self.rle1.push(b);
            consumed += 1;

            if self.rle1.encoded_len() >= cap {
                self.encode_block();
                self.flush_full_bytes();
                self.drain_out(output, &mut written);
                if written == output.len() && !self.out.is_empty() {
                    // Pending output didn't fit — caller must drain.
                    return Ok(RawProgress {
                        consumed,
                        written,
                        done: false,
                    });
                }
            }
        }

        Ok(RawProgress {
            consumed,
            written,
            done: false,
        })
    }

    fn raw_finish(&mut self, output: &mut [u8]) -> Result<RawProgress, Error> {
        let mut written = 0usize;
        self.ensure_header();

        // First: drain any already-encoded bytes from prior calls.
        self.drain_out(output, &mut written);
        if !self.out.is_empty() {
            return Ok(RawProgress {
                consumed: 0,
                written,
                done: false,
            });
        }

        // Encode any pending bytes into a final block.
        if matches!(self.phase, Phase::Accepting) {
            if !self.pending.is_empty() {
                self.encode_block();
                self.flush_full_bytes();
            }
            self.write_footer();
            self.phase = Phase::Flushed;
            self.drain_out(output, &mut written);
            if !self.out.is_empty() {
                return Ok(RawProgress {
                    consumed: 0,
                    written,
                    done: false,
                });
            }
        }

        // Anything left to drain?
        if matches!(self.phase, Phase::Flushed) {
            self.drain_out(output, &mut written);
            if self.out.is_empty() {
                self.phase = Phase::Done;
            }
        }

        Ok(RawProgress {
            consumed: 0,
            written,
            done: matches!(self.phase, Phase::Done),
        })
    }

    fn raw_reset(&mut self) {
        self.pending.clear();
        self.rle1 = Rle1Encoder::new();
        self.out.clear();
        self.out_idx = 0;
        self.header_written = false;
        self.combined_crc = 0;
        self.bw = None;
        self.phase = Phase::Accepting;
    }
}