structured-zstd 0.0.26

Pure Rust zstd implementation — managed fork of ruzstd. Dictionary decompression, no FFI.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
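//! This module contains the literals-section decoders: given a parsed
//! literals header and the section payload, they produce the decoded
//! literal bytes. `decode_literals_zerocopy` is the production entry
//! point; the private `decompress_literals` handles Huffman-coded sections.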

use super::super::blocks::literals_section::{LiteralsSection, LiteralsSectionType};
use super::scratch::HuffmanScratch;
use crate::bit_io::BitReaderReversed;
#[cfg(target_arch = "x86_64")]
use crate::cpu_kernel::{Avx2Kernel, Bmi2Kernel, CpuKernelTag, Vbmi2Kernel};
use crate::cpu_kernel::{CpuKernel, ScalarKernel, detect_cpu_kernel};
use crate::decoding::errors::DecompressLiteralsError;
use crate::huff0::HuffmanDecoder;
use alloc::vec::Vec;

/// Decode the provided literals section into `target`, returning the number
/// of bytes consumed from `source`.
///
/// Test-only Vec-output wrapper retained for the existing roundtrip
/// test suite, which asserts the literal byte stream lands fully
/// in a Vec. Production callers use [`decode_literals_zerocopy`].
///
/// Per section type:
/// * `Raw`: appends the first `regenerated_size` bytes of `source`
///   and reports `regenerated_size` bytes consumed.
/// * `RLE`: appends `regenerated_size` copies of the first source byte
///   and reports 1 byte consumed.
/// * `Compressed` / `Treeless`: delegates to `decompress_literals`.
///
/// # Errors
///
/// Returns `MissingBytesForLiterals` when `source` is too short for a
/// `Raw` or `RLE` section (same contract as [`decode_literals_zerocopy`]),
/// and forwards any error from `decompress_literals` for Huffman-coded
/// sections.
#[cfg(test)]
pub fn decode_literals(
    section: &LiteralsSection,
    scratch: &mut HuffmanScratch,
    source: &[u8],
    target: &mut Vec<u8>,
) -> Result<u32, DecompressLiteralsError> {
    match section.ls_type {
        LiteralsSectionType::Raw => {
            let n = section.regenerated_size as usize;
            // A truncated frame can claim more raw literals than `source`
            // carries; report that instead of panicking on the slice.
            if source.len() < n {
                return Err(DecompressLiteralsError::MissingBytesForLiterals {
                    got: source.len(),
                    needed: n,
                });
            }
            target.extend_from_slice(&source[..n]);
            Ok(section.regenerated_size)
        }
        LiteralsSectionType::RLE => {
            // RLE needs exactly one source byte: the fill value.
            if source.is_empty() {
                return Err(DecompressLiteralsError::MissingBytesForLiterals { got: 0, needed: 1 });
            }
            target.resize(target.len() + section.regenerated_size as usize, source[0]);
            Ok(1)
        }
        LiteralsSectionType::Compressed | LiteralsSectionType::Treeless => {
            decompress_literals(section, scratch, source, target)
        }
    }
}

/// Result of [`decode_literals_zerocopy`]: a borrowed view of the decoded
/// literal bytes plus the number of input bytes the section consumed.
///
/// For Raw sections `data` is a borrow straight into the input — no
/// memcpy. For RLE / HUF sections it is a borrow of the caller-provided
/// `target` buffer where the data was materialised (only the bytes
/// appended by that call, not any earlier contents of the buffer).
pub struct LiteralsView<'a> {
    /// Decoded literal bytes available for the sequence executor.
    pub data: &'a [u8],
    /// Bytes consumed from the input literals section payload
    /// (Raw: regenerated_size; RLE: 1, the fill byte;
    /// HUF: whatever `decompress_literals` reports as read).
    pub bytes_used: u32,
}

/// Decode a literals section and hand back a borrowed view of the result.
///
/// * `Raw`: the view borrows the first `regenerated_size` bytes of
///   `source` directly; `target` is not touched and nothing is copied.
/// * `RLE`: `regenerated_size` copies of the first source byte are
///   appended to `target`; the view covers those new bytes.
/// * `Compressed` / `Treeless`: the section is Huffman-decoded into
///   `target` by `decompress_literals`; the view covers the new bytes.
///
/// The view never includes bytes that were already in `target` before the
/// call: its start is anchored at the length `target` had on entry, so a
/// caller that forgot to `clear()` still gets only this section's literals.
///
/// Donor parity: `dctx->litPtr` is set to either `src` (Raw) or
/// `dctx->litBuffer` (HUF); the seq executor reads from
/// `dctx->litPtr` uniformly.
///
/// # Errors
///
/// Returns `MissingBytesForLiterals` when `source` is shorter than a Raw
/// section's `regenerated_size` or is empty for an RLE section, and
/// forwards any error from `decompress_literals`.
pub fn decode_literals_zerocopy<'a>(
    section: &LiteralsSection,
    scratch: &mut HuffmanScratch,
    source: &'a [u8],
    target: &'a mut Vec<u8>,
) -> Result<LiteralsView<'a>, DecompressLiteralsError> {
    // Length of `target` on entry; everything from here on is "ours".
    let base = target.len();

    let bytes_used = match section.ls_type {
        LiteralsSectionType::Raw => {
            let n = section.regenerated_size as usize;
            // A truncated frame may declare more raw bytes than are
            // present; surface that as an error rather than a slice panic.
            let Some(payload) = source.get(..n) else {
                return Err(DecompressLiteralsError::MissingBytesForLiterals {
                    got: source.len(),
                    needed: n,
                });
            };
            // Raw is the only arm whose view borrows the input, so it
            // returns on its own instead of falling through to `target`.
            return Ok(LiteralsView {
                data: payload,
                bytes_used: section.regenerated_size,
            });
        }
        LiteralsSectionType::RLE => {
            // The single payload byte is the fill value.
            let Some(&fill) = source.first() else {
                return Err(DecompressLiteralsError::MissingBytesForLiterals { got: 0, needed: 1 });
            };
            target.resize(base + section.regenerated_size as usize, fill);
            1
        }
        LiteralsSectionType::Compressed | LiteralsSectionType::Treeless => {
            decompress_literals(section, scratch, source, target)?
        }
    };

    Ok(LiteralsView {
        data: &target[base..],
        bytes_used,
    })
}

/// Huffman-decode a `Compressed` or `Treeless` literals section from
/// `source`, appending the decoded bytes to `target`.
///
/// This is only a dispatcher: it picks the CPU kernel reported by
/// `detect_cpu_kernel()` and forwards to the matching monomorphisation of
/// `decompress_literals_impl`.
///
/// Returns the number of bytes read from `source`.
fn decompress_literals(
    section: &LiteralsSection,
    scratch: &mut HuffmanScratch,
    source: &[u8],
    target: &mut Vec<u8>,
) -> Result<u32, DecompressLiteralsError> {
    // Kernel selection happens once per block. `detect_cpu_kernel()` is
    // expected to resolve its tag at most once per process (an `OnceLock`
    // around `is_x86_feature_detected!` with `feature = "std"`, a
    // compile-time `cfg(target_feature = ...)` constant under `no_std`),
    // so this match is a cheap compare-and-jump afterwards.
    //
    // Every accelerated arm goes through a `#[target_feature]` wrapper
    // function rather than calling `decompress_literals_impl::<K>`
    // directly. The whole generic pipeline then compiles inside the
    // matching feature context, which is what lets LLVM inline
    // feature-gated intrinsics (e.g. `_bzhi_u64` inside
    // `K::mask_lower_bits`) instead of emitting a call per mask operation.
    let kernel = detect_cpu_kernel();
    match kernel {
        // SAFETY (all three accelerated arms): the wrapper's target
        // features are only sound on CPUs that support them; this relies
        // on `detect_cpu_kernel()` reporting a tag only when the
        // corresponding features are available.
        #[cfg(target_arch = "x86_64")]
        CpuKernelTag::Bmi2 => unsafe { decompress_literals_bmi2(section, scratch, source, target) },
        #[cfg(target_arch = "x86_64")]
        CpuKernelTag::Avx2 => unsafe { decompress_literals_avx2(section, scratch, source, target) },
        #[cfg(target_arch = "x86_64")]
        CpuKernelTag::Vbmi2 => unsafe {
            decompress_literals_vbmi2(section, scratch, source, target)
        },
        // Any other tag, and every non-x86_64 target, uses the scalar path.
        _ => decompress_literals_impl::<ScalarKernel>(section, scratch, source, target),
    }
}

/// AVX2 entry point for Huffman literal decoding: runs
/// `decompress_literals_impl::<Avx2Kernel>` inside a function compiled
/// with the `bmi2` and `avx2` target features enabled, so the generic
/// body can be inlined into that feature context.
///
/// Arguments and return value are forwarded unchanged.
///
/// # Safety
///
/// The caller must ensure the running CPU supports `bmi2` and `avx2`.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "bmi2,avx2")]
unsafe fn decompress_literals_avx2(
    section: &LiteralsSection,
    scratch: &mut HuffmanScratch,
    source: &[u8],
    target: &mut Vec<u8>,
) -> Result<u32, DecompressLiteralsError> {
    decompress_literals_impl::<Avx2Kernel>(section, scratch, source, target)
}

/// BMI2 entry point for Huffman literal decoding: runs
/// `decompress_literals_impl::<Bmi2Kernel>` inside a function compiled
/// with the `bmi2` target feature enabled, so the generic body can be
/// inlined into that feature context.
///
/// Arguments and return value are forwarded unchanged.
///
/// # Safety
///
/// The caller must ensure the running CPU supports `bmi2`.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "bmi2")]
unsafe fn decompress_literals_bmi2(
    section: &LiteralsSection,
    scratch: &mut HuffmanScratch,
    source: &[u8],
    target: &mut Vec<u8>,
) -> Result<u32, DecompressLiteralsError> {
    decompress_literals_impl::<Bmi2Kernel>(section, scratch, source, target)
}

/// AVX-512 VBMI2 entry point for Huffman literal decoding: runs
/// `decompress_literals_impl::<Vbmi2Kernel>` inside a function compiled
/// with the `avx512vbmi2`, `avx512f`, `avx512vl`, `avx512bw`, `bmi2` and
/// `avx2` target features enabled, so the generic body can be inlined
/// into that feature context.
///
/// Arguments and return value are forwarded unchanged.
///
/// # Safety
///
/// The caller must ensure the running CPU supports every target feature
/// listed in the `#[target_feature]` attribute below.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512vbmi2,avx512f,avx512vl,avx512bw,bmi2,avx2")]
unsafe fn decompress_literals_vbmi2(
    section: &LiteralsSection,
    scratch: &mut HuffmanScratch,
    source: &[u8],
    target: &mut Vec<u8>,
) -> Result<u32, DecompressLiteralsError> {
    decompress_literals_impl::<Vbmi2Kernel>(section, scratch, source, target)
}

/// Kernel-generic body of `decompress_literals`: Huffman-decodes a
/// `Compressed` or `Treeless` literals section and appends exactly
/// `section.regenerated_size` bytes to `target`.
///
/// `K` selects the [`CpuKernel`] used by the reversed bit readers.
///
/// Steps:
/// 1. For `Compressed` sections, parse the Huffman tree description at the
///    start of the payload into `scratch.table`. `Treeless` sections reuse
///    the table left by an earlier block and fail if none was built.
/// 2. For 4-stream sections, read the 6-byte jump table, split the payload
///    into four streams and decode them into four consecutive segments of
///    `target` (burst loop first, then a per-stream drain).
///    For 1-stream sections, decode the single stream symbol by symbol.
/// 3. Verify that every stream was consumed exactly and that the decoded
///    byte count equals `regenerated_size`.
///
/// Returns the number of bytes consumed from `source`: tree description
/// (if any) + jump table (if any) + stream payload.
///
/// # Errors
///
/// * `MissingCompressedSize` / `MissingNumStreams` when the header lacks
///   those fields.
/// * `MissingBytesForLiterals` when `source` is shorter than the declared
///   compressed size, or the jump table points past the payload.
/// * `UninitializedHuffmanTable` for a `Treeless` section without a
///   previously built table.
/// * `MissingBytesForJumpHeader`, `ExtraPadding`, `BitstreamReadMismatch`,
///   `DecodedLiteralCountMismatch` for malformed streams.
/// * Any error returned by the table's `build_decoder`.
///
/// On every error return visible in this function `target` ends up with
/// the length it had on entry: errors are raised either before anything
/// is written or after truncating back to that length.
///
/// # Panics
///
/// Panics if `section.num_streams` is neither 4 nor 1.
fn decompress_literals_impl<K: CpuKernel>(
    section: &LiteralsSection,
    scratch: &mut HuffmanScratch,
    source: &[u8],
    target: &mut Vec<u8>,
) -> Result<u32, DecompressLiteralsError> {
    use DecompressLiteralsError as err;

    let compressed_size = section.compressed_size.ok_or(err::MissingCompressedSize)? as usize;
    let num_streams = section.num_streams.ok_or(err::MissingNumStreams)?;
    let base = target.len();
    let regen = section.regenerated_size as usize;

    // Bounds check: a truncated frame can declare a compressed size larger
    // than the payload actually present. Return a structured error instead
    // of panicking on the slice below, and do it before reserving memory
    // for a section that cannot be decoded anyway.
    if source.len() < compressed_size {
        return Err(err::MissingBytesForLiterals {
            got: source.len(),
            needed: compressed_size,
        });
    }

    target.reserve(regen);
    let source = &source[0..compressed_size];
    let mut bytes_read = 0;

    match section.ls_type {
        LiteralsSectionType::Compressed => {
            //read Huffman tree description
            bytes_read += scratch.table.build_decoder(source)?;
            vprintln!("Built huffman table using {} bytes", bytes_read);
        }
        LiteralsSectionType::Treeless if scratch.table.max_num_bits == 0 => {
            return Err(err::UninitializedHuffmanTable);
        }

        _ => { /* nothing to do, huffman tree has been provided by previous block */ }
    }

    // Everything after the (optional) tree description is stream data.
    let source = &source[bytes_read as usize..];

    if num_streams == 4 {
        //build jumptable
        if source.len() < 6 {
            return Err(err::MissingBytesForJumpHeader { got: source.len() });
        }
        // Three little-endian u16 stream sizes; jumpN is the cumulative
        // end offset of stream N within the payload after the jump table.
        let jump1 = source[0] as usize + ((source[1] as usize) << 8);
        let jump2 = jump1 + source[2] as usize + ((source[3] as usize) << 8);
        let jump3 = jump2 + source[4] as usize + ((source[5] as usize) << 8);
        bytes_read += 6;
        let source = &source[6..];

        if source.len() < jump3 {
            return Err(err::MissingBytesForLiterals {
                got: source.len(),
                needed: jump3,
            });
        }

        //decode 4 streams with interleaved operations to hide memory latency
        let streams: [&[u8]; 4] = [
            &source[..jump1],
            &source[jump1..jump2],
            &source[jump2..jump3],
            &source[jump3..],
        ];

        let mut decoders: [HuffmanDecoder<'_>; 4] = [
            HuffmanDecoder::new(&scratch.table),
            HuffmanDecoder::new(&scratch.table),
            HuffmanDecoder::new(&scratch.table),
            HuffmanDecoder::new(&scratch.table),
        ];
        let mut brs: [BitReaderReversed<'_, K>; 4] = [
            BitReaderReversed::<K>::new(streams[0]),
            BitReaderReversed::<K>::new(streams[1]),
            BitReaderReversed::<K>::new(streams[2]),
            BitReaderReversed::<K>::new(streams[3]),
        ];

        // Initialize all 4 streams: skip padding and set initial state.
        // The padding is up to 7 zero bits followed by a single 1 bit;
        // `skipped_bits` counts the 1 bit as well, hence the `> 8` limit.
        for i in 0..4 {
            let mut skipped_bits = 0;
            loop {
                let val = brs[i].get_bits(1);
                skipped_bits += 1;
                if val == 1 || skipped_bits > 8 {
                    break;
                }
            }
            if skipped_bits > 8 {
                return Err(DecompressLiteralsError::ExtraPadding { skipped_bits });
            }
            decoders[i].init_state(&mut brs[i]);
        }

        let max_bits = scratch.table.max_num_bits as isize;

        // RFC 8878 §3.1.1.3.2: first 3 streams produce ceil(regen_size/4)
        // symbols each, 4th produces the remainder. Pre-allocate target and
        // decode directly into slices — no temporary Vec allocations.
        let seg = regen.div_ceil(4);

        target.resize(base + regen, 0);
        // Clamp every start/end into [base, base+regen] so cursors can
        // never index past the pre-allocated region, even with corrupted
        // frame headers that produce small regen (where N*seg > regen).
        let limit = base + regen;
        let starts: [usize; 4] = [
            base,
            (base + seg).min(limit),
            (base + 2 * seg).min(limit),
            (base + 3 * seg).min(limit),
        ];
        let ends: [usize; 4] = [starts[1], starts[2], starts[3], limit];
        let mut cursors = starts;

        // Donor-parity 4-stream HUF decode. `bits[s]` is the fused
        // state+stream+sentinel u64 register (see `run_4stream_burst_loop`).
        // Each iter decodes `symbols_per_burst` symbols × 4 streams,
        // then reloads all 4 stream registers via `ip[s] -= nb_bytes;
        // MEM_read64(ip[s]) | 1`.
        let max_num_bits = scratch.table.max_num_bits;
        // Safety constraint per donor `HUF_decompress4X1_usingDTable_internal_fast_c_loop`:
        // before each `bits[s] >> table_shift` read, the sentinel-bit position
        // must be strictly below bit `64 - max_num_bits` (i.e. outside the top
        // `max_num_bits` read region). After `s` shifts the sentinel is at bit
        // `padding_skip + s*max_num_bits`. The N-th read happens after (N-1)
        // shifts, so the inclusive bound is
        //   padding_skip + (N-1)*max_num_bits < 64 - max_num_bits
        // i.e.
        //   padding_skip + N*max_num_bits <= 63
        // Solving for N with padding_skip ≤ 8:
        //   N <= (63 - 8) / max_num_bits = 55 / max_num_bits
        // (Letter `s` is used here for shift-count to avoid colliding with
        // the surrounding generic parameter `K: CpuKernel`.)
        // For max=11: 5 symbols (donor parity — was 4 with the old off-by-one
        // formula). For max=8: 6 symbols. For max=4: 13.
        //
        // `max_num_bits` is non-zero here for `Treeless` (checked above);
        // for `Compressed` this relies on `build_decoder` never leaving a
        // zero-width table behind.
        let symbols_per_burst: usize = (63 - 8) / max_num_bits as usize;
        let burst_bits = (symbols_per_burst * max_num_bits as usize) as u8;
        let table_shift = (64 - max_num_bits) as u32;
        let packed = scratch.table.packed_decode.as_slice();

        // Lockstep cursor invariant: every burst iter advances all 4
        // cursors by `symbols_per_burst` in step, so `cursors[0]`
        // tracks progress for all four streams. `cursor_exit_olimit
        // = starts[0] + min(seg_len[i])` is the cursor value at which
        // the lagging segment runs out — donor parity with
        // `huf_decompress.c` `olimit`-style single-pointer bound.
        let min_seg_len = (ends[0] - starts[0])
            .min(ends[1] - starts[1])
            .min(ends[2] - starts[2])
            .min(ends[3] - starts[3]);
        // `burst_eligible` is a load-bearing safety gate against
        // adversarial frame headers. If `min_seg_len < symbols_per_burst`
        // (small `regenerated_size` paired with large compressed
        // streams, forging a 4-stream HUF block where
        // `seg = div_ceil(regen, 4) < symbols_per_burst`) then
        // `cursor_burst_ceil` saturates to 0 and `cursors[0] <= 0`
        // is trivially true on entry, admitting a burst whose inner
        // loop would advance `cursors[i]` past `ends[i]` and panic
        // on the `target[cursors[i]]` write. Requiring
        // `min_seg_len >= symbols_per_burst` up front means the
        // burst only runs when a full burst fits inside EVERY
        // segment; the drain phase outside `run_4stream_burst_loop`
        // handles the small-`min_seg_len` case via single-symbol
        // per-stream decode.
        let burst_eligible = symbols_per_burst >= 1 && min_seg_len >= symbols_per_burst;
        let cursor_burst_ceil = (starts[0] + min_seg_len).saturating_sub(symbols_per_burst);

        let bounds = LoopBounds {
            symbols_per_burst,
            burst_bits,
            table_shift,
            cursor_burst_ceil,
            burst_eligible,
        };

        // Burst is identical across all kernels (donor parity: reads
        // `packed[idx]` u16 directly + `MEM_read64` reload pattern,
        // no SIMD intrinsics needed). One call site serves every kernel;
        // `K` only flows through the bit-reader type.
        //
        // SAFETY: each `brs[s]` was constructed above from `streams[s]`
        // and each decoder was initialised against that same reader; the
        // upfront `target.resize(base + regen, 0)` covers all cursor
        // writes; `packed` length matches `1 << max_num_bits` by
        // `HuffmanTable::build_decoder`'s `resize`.
        unsafe {
            run_4stream_burst_loop(
                &mut decoders,
                &mut brs,
                target,
                packed,
                &mut cursors,
                &bounds,
            );
        }

        // Drain remaining symbols from each stream, bounded by segment end.
        // A fully consumed stream ends at exactly `-max_bits` remaining bits;
        // anything else means the stream and the segment size disagree.
        for i in 0..4 {
            while brs[i].bits_remaining() > -max_bits && cursors[i] < ends[i] {
                target[cursors[i]] = decoders[i].decode_symbol_and_advance(&mut brs[i]);
                cursors[i] += 1;
            }
            if brs[i].bits_remaining() != -max_bits {
                target.truncate(base);
                return Err(DecompressLiteralsError::BitstreamReadMismatch {
                    read_til: brs[i].bits_remaining(),
                    expected: -max_bits,
                });
            }
        }

        // Verify total decoded count matches expected regenerated size.
        // Return error immediately rather than deferring to the downstream check.
        let decoded: usize = cursors.iter().zip(starts.iter()).map(|(c, s)| c - s).sum();
        if decoded != regen {
            // Truncate to base: segmented layout means partial decode left
            // bytes scattered across segments, so only base is a clean boundary.
            target.truncate(base);
            return Err(DecompressLiteralsError::DecodedLiteralCountMismatch {
                decoded,
                expected: regen,
            });
        }

        bytes_read += source.len() as u32;
    } else {
        //just decode the one stream
        assert!(num_streams == 1);
        let mut decoder = HuffmanDecoder::new(&scratch.table);
        let mut br = BitReaderReversed::<K>::new(source);
        let mut skipped_bits = 0;
        loop {
            let val = br.get_bits(1);
            skipped_bits += 1;
            if val == 1 || skipped_bits > 8 {
                break;
            }
        }
        if skipped_bits > 8 {
            //if more than 7 bits are 0, this is not the correct end of the bitstream. Either a bug or corrupted data
            return Err(DecompressLiteralsError::ExtraPadding { skipped_bits });
        }
        decoder.init_state(&mut br);
        // Decode until the reader has run `max_num_bits` past the start of
        // the stream, which is where a well-formed stream ends.
        while br.bits_remaining() > -(scratch.table.max_num_bits as isize) {
            target.push(decoder.decode_symbol_and_advance(&mut br));
        }
        let expected = -(scratch.table.max_num_bits as isize);
        if br.bits_remaining() != expected {
            target.truncate(base);
            return Err(DecompressLiteralsError::BitstreamReadMismatch {
                read_til: br.bits_remaining(),
                expected,
            });
        }
        bytes_read += source.len() as u32;
    }

    // Final size check. The 4-stream path has already verified this; the
    // single-stream path pushes symbol by symbol and is checked only here.
    if target.len() != base + regen {
        let decoded = target.len() - base;
        target.truncate(base);
        return Err(DecompressLiteralsError::DecodedLiteralCountMismatch {
            decoded,
            expected: regen,
        });
    }

    Ok(bytes_read)
}

/// Loop-invariant constants for [`run_4stream_burst_loop`]. Derived
/// once per `decompress_literals` call; `Copy` so the burst can
/// destructure `*bounds` for register-resident reads.
///
/// All values are computed at the 4-stream setup site in
/// `decompress_literals_impl`; the formulas quoted on the fields below
/// are the ones used there.
#[derive(Copy, Clone)]
struct LoopBounds {
    /// Symbols decoded per stream in one burst iteration:
    /// `(63 - 8) / max_num_bits`.
    symbols_per_burst: usize,
    /// `symbols_per_burst * max_num_bits`, narrowed to `u8` (at most 55
    /// by the formula above).
    burst_bits: u8,
    /// `64 - max_num_bits`: the right shift that leaves the top
    /// `max_num_bits` bits of a 64-bit stream register as a table index.
    table_shift: u32,
    /// `(starts[0] + min_seg_len).saturating_sub(symbols_per_burst)`.
    /// NOTE(review): presumably the largest `cursors[0]` value from which
    /// one more full burst still fits in the shortest segment — confirm
    /// against the loop condition in `run_4stream_burst_loop`.
    cursor_burst_ceil: usize,
    /// Set iff a full burst (`symbols_per_burst` symbols per stream)
    /// can fit in the lagging segment. When false the burst is
    /// hard-disabled and the drain phase outside the burst loop
    /// decodes ALL symbols via the single-symbol path. Setup-site
    /// safety rationale: adversarial / small-regen DoS guard.
    burst_eligible: bool,
}

/// Donor-parity 4-stream HUF decode burst loop. Single code path —
/// no kernel dispatch, no SIMD-fallback hybrid. Mirrors
/// `huf_decompress.c:HUF_decompress4X1_usingDTable_internal_fast_c_loop`:
/// each outer iter decodes `symbols_per_burst` symbols × 4 streams,
/// then reloads all 4 stream registers from raw source bytes via the
/// `ctz(bits[s])` → `ip[s] -= nb_bytes` → `MEM_read64(ip[s])` pattern.
///
/// State + unconsumed stream + sentinel are fused into one u64
/// per stream (`bits[s]`). The decoder's separate `state` field is
/// reconstructed once at burst exit for the single-symbol drain phase
/// that runs outside this function.
///
/// # Parameters
///
/// * `decoders` — per-stream decoders. Only `state` is written, and
///   only if at least one burst iteration ran.
/// * `brs` — per-stream reversed bit readers. `bit_container`,
///   `bits_consumed`, `index` and `source` are read to seed the burst;
///   `index`, `bit_container` and `bits_consumed` are written back at
///   exit if at least one burst iteration ran.
/// * `target` — output buffer; decoded bytes are stored at
///   `target[cursors[s]]` without bounds checks.
/// * `packed` — decode table indexed by the top `max_num_bits` bits of
///   a stream register. Each `u16` entry carries the decoded byte in
///   its low 8 bits and the number of bits to consume in bits 8..16.
/// * `cursors` — per-stream write positions into `target`; each is
///   advanced by one per decoded symbol.
/// * `bounds` — loop-invariant limits, see [`LoopBounds`].
///
/// If `bounds.burst_eligible` is false, or no iteration passes the
/// entry gates, the function returns without modifying any argument.
///
/// # Safety
///
/// All four decoders must share the same table (holds by construction —
/// built from `&scratch.table`). `target.len() >= base + regen`. Each
/// `brs[s].source` must be the slice the corresponding decoder was
/// initialised against.
///
/// The unchecked accesses additionally rely on:
///
/// * `packed.len() >= 1 << (64 - bounds.table_shift)`;
/// * `cursors[s] + bounds.symbols_per_burst <= target.len()` for every
///   stream whenever `cursors[0] <= bounds.cursor_burst_ceil`;
/// * `brs[s].index + 8 <= brs[s].source.len()` whenever
///   `brs[s].index >= (8 + bounds.burst_bits) / 8`.
#[inline(always)]
unsafe fn run_4stream_burst_loop<K: CpuKernel>(
    decoders: &mut [HuffmanDecoder<'_>; 4],
    brs: &mut [BitReaderReversed<'_, K>; 4],
    target: &mut [u8],
    packed: &[u16],
    cursors: &mut [usize; 4],
    bounds: &LoopBounds,
) {
    let LoopBounds {
        symbols_per_burst,
        burst_bits,
        table_shift,
        cursor_burst_ceil,
        burst_eligible,
    } = *bounds;
    let max_num_bits = (64 - table_shift) as u8;

    // Skip burst entirely if min_seg_len < symbols_per_burst — drain
    // (the single-symbol tail outside this function) handles ALL
    // symbols. See the `burst_eligible` doc on `LoopBounds`.
    if !burst_eligible {
        return;
    }

    // Donor-parity burst loop. `bits[s]` is the unified u64 register
    // that fuses state + unconsumed stream + sentinel:
    //   bits 63..(64-max_num_bits): current state (next index into `packed`)
    //   below:                       upcoming stream bits, top-aligned
    //   bottom:                      sentinel `1`, position grows upward
    //                                with each consumed bit
    //
    // The encoder side of HUF writes the bitstream backward such that
    // at every byte boundary the top `max_num_bits` of unconsumed
    // stream = current state. So state is implicit in `bits[s]`; we
    // do NOT carry a separate `decoder.state` inside the burst — it
    // is reconstructed via `bits[s] >> table_shift` at the burst exit
    // and written back to `decoders[s].state` for the drain phase.
    //
    // Composition matches donor `HUF_DecompressFastArgs_init` and
    // `HUF_4X1_RELOAD_STREAM` (huf_decompress.c:795-804): each iter
    // reloads `bits[s] = MEM_read64(ip[s]) | 1; bits[s] <<= nb_bits`
    // after advancing `ip[s] -= nb_bytes` (where nb_bytes/nb_bits
    // come from `ctz(bits[s])` at the end of the previous iter).
    // Initial composition exactly mirrors donor `HUF_DecompressFastArgs_init`:
    // `bits[s] = (MEM_read64(ip) | 1) << padding_skip`. Top `max_num_bits`
    // of the result is the state value implicitly (HUF stream encoding
    // ensures the top max bits of unconsumed stream at any consumption
    // point = current state machine state), so we don't inject
    // `decoders[s].state` explicitly here — the bit pattern already
    // carries it.
    //
    // `padding_skip = brs[s].bits_consumed - max_num_bits`: `init_state`
    // pre-consumed `max_num_bits` for `decoders[s].state`, so
    // `brs[s].bits_consumed = padding_skip + max_num_bits`. Donor leaves
    // state implicit; we reverse our pre-consumption by shifting only
    // by `padding_skip` (not by `bits_consumed`) so the top max bits
    // come from the unshifted stream-position-of-state.
    //
    // Sentinel ends up at bit `padding_skip` after the shift, so
    // `ctz(initial bits[s]) = padding_skip` and the first reload's
    // `nb_bytes = (padding_skip + K) / 8` matches donor's byte-cursor
    // advance from absolute stream position 0.
    let mut bits: [u64; 4] = [
        (brs[0].bit_container | 1) << (brs[0].bits_consumed - max_num_bits),
        (brs[1].bit_container | 1) << (brs[1].bits_consumed - max_num_bits),
        (brs[2].bit_container | 1) << (brs[2].bits_consumed - max_num_bits),
        (brs[3].bit_container | 1) << (brs[3].bits_consumed - max_num_bits),
    ];
    // Byte offset into `brs[s].source` of the 8-byte window each stream
    // register was last loaded from; only ever moves toward 0.
    let mut ip: [usize; 4] = [brs[0].index, brs[1].index, brs[2].index, brs[3].index];
    // Sub-byte phase of the consumption point in the current 8-byte
    // window of `brs[s]`. Initial value mirrors the post-init reader
    // state: drain compatibility wants `bits_consumed = nb_bits + max_num_bits`,
    // so `nb_bits_last[s] = brs[s].bits_consumed - max_num_bits` for the
    // pre-reload writeback path (no burst iter ran). After the first
    // reload `nb_bits_last[s] = ctz & 7` (sub-byte phase of donor's
    // `MEM_read64 + shift`).
    let mut nb_bits_last: [u8; 4] = [
        brs[0].bits_consumed - max_num_bits,
        brs[1].bits_consumed - max_num_bits,
        brs[2].bits_consumed - max_num_bits,
        brs[3].bits_consumed - max_num_bits,
    ];

    // Donor `iiters` safety budget. Worst-case `nb_bytes` per iter is
    // `floor(ctz_max / 8)` where `ctz_max = pad_max + burst_bits`,
    // since at the first iter the sentinel starts at `padding_skip
    // ∈ [1, 8]` and on subsequent iters at `nb_bits ∈ [0, 7]` set by
    // the previous reload's `(MEM_read64 | 1) << nb_bits`. Taking
    // `pad_max = 8` covers both regimes — without the `+8` slack,
    // burst configurations where `burst_bits` is a multiple of 8
    // (e.g. max=8 -> burst_bits=48) accept a `min_ip` that
    // `nb_bytes` then overruns, underflowing `ip[s] -= nb_bytes`.
    // The check below ensures `ip[s] >= bytes_per_iter_upper` for
    // every stream before entering an iter, so per-iter `ip[s] -=
    // nb_bytes` plus the subsequent `source[ip[s]..ip[s]+8]` read
    // both stay in-bounds without per-stream conditionals.
    let bytes_per_iter_upper = (8 + burst_bits as usize) / 8;
    // Records whether at least one burst iteration ran; if none did,
    // the reader/decoder state is left exactly as it was on entry.
    let mut any_iter = false;

    while cursors[0] <= cursor_burst_ceil {
        // Input-side gate: the stream with the least remaining input
        // decides whether all four can afford another burst.
        let min_ip = ip[0].min(ip[1]).min(ip[2]).min(ip[3]);
        if min_ip < bytes_per_iter_upper {
            break;
        }
        any_iter = true;

        // Inner: decode `symbols_per_burst` symbols × 4 streams.
        //
        // SAFETY for `packed.get_unchecked(idx)`:
        //   `idx = (bits[s] >> table_shift) as usize` with
        //   `table_shift = 64 - max_num_bits` lands in
        //   `[0, 1 << max_num_bits)`. `packed.len() == 1 << max_num_bits`
        //   by `HuffmanTable::build_decoder`'s upfront `resize`.
        //
        // SAFETY for `target.get_unchecked_mut(cursors[s])`:
        //   The outer-loop gate `cursors[0] <= cursor_burst_ceil`
        //   gives `cursors[0] + symbols_per_burst <= cursor_burst_ceil
        //   + symbols_per_burst = starts[0] + min_seg_len`. By lockstep
        //   advance, `cursors[s] - starts[s] == cursors[0] - starts[0]`
        //   for all `s`, so `cursors[s] + symbols_per_burst - 1 <
        //   starts[s] + min_seg_len <= ends[s] <= target.len()` —
        //   every write in this iter (max index `cursors[s] +
        //   symbols_per_burst - 1`) is strictly in-bounds.
        debug_assert!(cursors[0] + symbols_per_burst <= cursor_burst_ceil + symbols_per_burst);
        for _ in 0..symbols_per_burst {
            // Per stream: look up the entry for the top `max_num_bits`
            // bits, emit its low byte, then shift the register left by
            // the bit count stored in the entry's high byte.
            let idx0 = (bits[0] >> table_shift) as usize;
            let entry0 = unsafe { *packed.get_unchecked(idx0) };
            unsafe { *target.get_unchecked_mut(cursors[0]) = (entry0 & 0xFF) as u8 };
            cursors[0] += 1;
            bits[0] <<= (entry0 >> 8) & 0xFF;

            let idx1 = (bits[1] >> table_shift) as usize;
            let entry1 = unsafe { *packed.get_unchecked(idx1) };
            unsafe { *target.get_unchecked_mut(cursors[1]) = (entry1 & 0xFF) as u8 };
            cursors[1] += 1;
            bits[1] <<= (entry1 >> 8) & 0xFF;

            let idx2 = (bits[2] >> table_shift) as usize;
            let entry2 = unsafe { *packed.get_unchecked(idx2) };
            unsafe { *target.get_unchecked_mut(cursors[2]) = (entry2 & 0xFF) as u8 };
            cursors[2] += 1;
            bits[2] <<= (entry2 >> 8) & 0xFF;

            let idx3 = (bits[3] >> table_shift) as usize;
            let entry3 = unsafe { *packed.get_unchecked(idx3) };
            unsafe { *target.get_unchecked_mut(cursors[3]) = (entry3 & 0xFF) as u8 };
            cursors[3] += 1;
            bits[3] <<= (entry3 >> 8) & 0xFF;
        }

        // Reload all 4 streams (donor `HUF_4X1_RELOAD_STREAM`).
        //
        // SAFETY:
        //   * `ip[s] - nb_bytes >= 0`: the `min_ip >= bytes_per_iter_upper`
        //     gate at outer-loop entry guarantees `nb_bytes <= bytes_per_iter_upper`
        //     (where `nb_bytes = ctz(bits[s]) >> 3` and `ctz <= padding_skip
        //     + burst_bits <= 8 + burst_bits`, the bound `bytes_per_iter_upper`
        //     pre-computes).
        //   * `ip[s] + 8 <= source.len()`: `BitReaderReversed::new()`
        //     starts with `bits_consumed = 64`, so the very first
        //     `get_bits(1)` in the per-stream padding-skip loop
        //     (run before this function is called) triggers
        //     `refill()`. For `source.len() >= 8` that
        //     fast-path establishes `brs[s].index = source.len() - 8`;
        //     `init_state`'s subsequent `get_bits(max_num_bits)`
        //     stays inside the same 8-byte window without another
        //     refill (only `bits_consumed` advances). The
        //     `refill_slow` path used for shorter streams leaves
        //     `index = 0` (with the partial bytes left-shifted into
        //     `bit_container`), making `min_ip = 0 <
        //     bytes_per_iter_upper` so the burst loop exits via
        //     `any_iter = false` BEFORE reaching this reload (the
        //     writeback below is unreachable on `source.len() < 8`).
        //     Within the loop, `ip[s]` only decreases (via
        //     `ip[s] -= nb_bytes` in the reload loop below),
        //     preserving the upper bound.
        //
        // NOTE(review): `ctz` is only meaningful while the sentinel is
        // still inside the register. If the start phase plus the bits
        // consumed by one burst ever reached 64, `bits[s]` would be 0,
        // `trailing_zeros()` would return 64 and `nb_bytes` would be 8
        // regardless of `bytes_per_iter_upper`. Presumably the setup
        // site sizes `symbols_per_burst` / `burst_bits` to rule this
        // out — confirm there.
        for s in 0..4 {
            let ctz = bits[s].trailing_zeros();
            let nb_bytes = (ctz >> 3) as usize;
            let nb_bits = (ctz & 7) as u8;
            ip[s] -= nb_bytes;
            let new_window = u64::from_le_bytes(unsafe {
                brs[s]
                    .source
                    .get_unchecked(ip[s]..ip[s] + 8)
                    .try_into()
                    .unwrap_unchecked()
            });
            // Donor `HUF_4X1_RELOAD_STREAM` order: `(MEM_read64 | 1) << nb_bits`,
            // NOT `(MEM_read64 << nb_bits) | 1`. The two are NOT equivalent —
            // the former puts the sentinel at bit `nb_bits` (so `ctz` of the
            // post-reload register accumulates the sub-byte phase into the
            // NEXT reload's `ctz`), the latter resets the sentinel to bit 0
            // and loses the phase between reloads.
            bits[s] = (new_window | 1) << nb_bits;
            nb_bits_last[s] = nb_bits;
        }
    }

    // No iter ran → nothing changed in `brs[s]` / `decoders[s]`; the
    // drain phase (outside this function) picks up from the
    // post-`init_state` reader.
    if !any_iter {
        return;
    }

    // Write back to `brs[s]` + `decoders[s].state` so the drain phase
    // (single-symbol `decode_symbol_and_advance`) picks up where the
    // burst stopped. The burst's final `bits[s]` is post-reload
    // (`= (new_window | 1) << nb_bits`, sentinel at bit `nb_bits`), and
    // `nb_bits_last[s]` holds the sub-byte phase used in that reload.
    // Drain's read frontier sits at `nb_bits_last + max_num_bits` bits
    // into the topmost window byte: `nb_bits_last` bits already skipped
    // by the burst's reload shift, plus `max_num_bits` for the state we
    // just extracted to `decoders[s].state`.
    //
    // `bit_container` is re-read from `source` rather than derived from
    // `bits[s]`, so the reader gets the raw window without the
    // sentinel bit or the phase shift.
    for s in 0..4 {
        brs[s].index = ip[s];
        brs[s].bit_container = u64::from_le_bytes(unsafe {
            brs[s]
                .source
                .get_unchecked(ip[s]..ip[s] + 8)
                .try_into()
                .unwrap_unchecked()
        });
        brs[s].bits_consumed = nb_bits_last[s] + max_num_bits;
        decoders[s].state = bits[s] >> table_shift;
    }
}

#[cfg(test)]
mod zerocopy_robustness_tests {
    //! Robustness checks for `decode_literals_zerocopy` when the
    //! payload is truncated or corrupt. Each test hands the decoder a
    //! source that is shorter than the header promises and expects a
    //! structured error rather than an out-of-bounds slice panic
    //! (`*[..n]` / `*[0]` style indexing).
    //
    // Kept apart from `burst_gate_tests` so that module's helpers do
    // not need to know about truncated-input builders.
    use super::{LiteralsView, decode_literals_zerocopy};
    use crate::blocks::literals_section::{LiteralsSection, LiteralsSectionType};
    use crate::decoding::scratch::HuffmanScratch;
    use crate::huff0::HuffmanTable;
    use alloc::vec::Vec;

    /// Header for a section type that carries neither a compressed
    /// size nor a stream count.
    fn plain_section(ls_type: LiteralsSectionType, regen: u32) -> LiteralsSection {
        LiteralsSection {
            ls_type,
            regenerated_size: regen,
            compressed_size: None,
            num_streams: None,
        }
    }

    /// Raw literals header announcing `regen` bytes.
    fn raw_section(regen: u32) -> LiteralsSection {
        plain_section(LiteralsSectionType::Raw, regen)
    }

    /// RLE literals header announcing `regen` bytes.
    fn rle_section(regen: u32) -> LiteralsSection {
        plain_section(LiteralsSectionType::RLE, regen)
    }

    /// Scratch space holding a freshly constructed Huffman table.
    fn fresh_scratch() -> HuffmanScratch {
        let table = HuffmanTable::new();
        HuffmanScratch { table }
    }

    #[test]
    fn raw_truncated_source_returns_error_no_panic() {
        // The header promises 10 raw bytes but only 3 are supplied.
        // A naive `source[0..10]` would panic; the decoder has to
        // report a DecompressLiteralsError instead.
        let header = raw_section(10);
        let short_input: [u8; 3] = [1, 2, 3];
        let mut scratch = fresh_scratch();
        let mut out: Vec<u8> = Vec::new();
        let outcome = decode_literals_zerocopy(&header, &mut scratch, &short_input, &mut out);
        assert!(
            outcome.is_err(),
            "truncated raw source must error, not panic; got {:?}",
            outcome.map(|_| ())
        );
    }

    #[test]
    fn rle_empty_source_returns_error_no_panic() {
        // An RLE section needs one source byte (the fill value).
        // With an empty source, `source[0]` would panic.
        let header = rle_section(10);
        let no_input: [u8; 0] = [];
        let mut scratch = fresh_scratch();
        let mut out: Vec<u8> = Vec::new();
        let outcome = decode_literals_zerocopy(&header, &mut scratch, &no_input, &mut out);
        assert!(
            outcome.is_err(),
            "empty RLE source must error, not panic; got {:?}",
            outcome.map(|_| ())
        );
    }

    #[test]
    fn rle_view_excludes_pre_existing_target_bytes() {
        // `target` deliberately starts non-empty: the returned
        // LiteralsView::data must cover only what this call produced
        // (the `&target[base..]` hardening), never the stale prefix.
        let mut out: Vec<u8> = Vec::new();
        out.extend_from_slice(&[0xAA, 0xBB, 0xCC]);
        let header = rle_section(4);
        let fill: [u8; 1] = [0x42];
        let mut scratch = fresh_scratch();
        let view = decode_literals_zerocopy(&header, &mut scratch, &fill, &mut out)
            .expect("RLE with valid source must succeed");
        assert_eq!(view.data.len(), 4, "view length must match regen_size");
        assert!(
            view.data.iter().all(|&b| b == 0x42),
            "view must contain only the newly-RLE-expanded bytes, got {:?}",
            view.data
        );
        // Rebuild the view from its parts so `bytes_used` is read as
        // well and no field can trigger an unused warning.
        let _ = LiteralsView {
            data: view.data,
            bytes_used: view.bytes_used,
        };
    }
}

#[cfg(test)]
mod burst_gate_tests {
    //! End-to-end regression tests aimed at the boundary states of the
    //! HUF 4-stream burst gate in `decompress_literals`:
    //!
    //!   1. `bits_consumed == max_num_bits` — the lower edge, where the
    //!      gate opens with no slack at all.
    //!   2. `bits_consumed + burst_bits == 64` — the upper edge, where
    //!      a burst uses up the whole 64-bit window without overflow.
    //!   3. SIMD-fallback → refill → burst re-entry — the outer loop
    //!      drops to the SIMD 4-symbol path, a `BitReaderReversed`
    //!      refill happens, and a later iteration returns to the burst
    //!      path once `bits_consumed` is back in burst range.
    //!
    //! NOTE(review): `run_4stream_burst_loop` is documented as having
    //! "no SIMD-fallback hybrid", so the SIMD wording in item 3 and in
    //! the individual test docs may predate the current decoder —
    //! confirm against `decompress_literals`.
    //!
    //! The fixtures are shaped to steer the decoder toward each regime:
    //! a short skewed input for the lower edge, a long mid-cardinality
    //! input for repeated upper-edge contact, long streams for repeated
    //! refills. A sweep over many `(size, alphabet)` pairs covers the
    //! gate in aggregate.
    //!
    //! Nothing here proves that one particular
    //! `(bits_consumed, burst_bits)` pair occurs on a given iteration;
    //! that would need white-box instrumentation the decoder does not
    //! expose. What is asserted is roundtrip correctness through
    //! encoder → 4-stream HUF block → `decode_literals`. A gate
    //! regression that emits a wrong symbol or desynchronises a stream
    //! shows up as a `DecompressLiteralsError`
    //! (`BitstreamReadMismatch` / `DecodedLiteralCountMismatch`) or as
    //! a decoded buffer that differs from the input. The
    //! `max_num_bits` range checks catch fixtures drifting out of
    //! their intended regime when the encoder's table choice changes.
    use super::*;
    use crate::bit_io::BitWriter;
    use crate::blocks::literals_section::{LiteralsSection, LiteralsSectionType};
    use crate::decoding::scratch::HuffmanScratch;
    use crate::huff0::huff0_encoder::{HuffmanEncoder, HuffmanTable as EncTable};
    use alloc::vec::Vec;

    /// Encodes `data` as a 4-stream HUF Compressed literals block
    /// (table description, jump table, four padded streams). Returns
    /// the matching `LiteralsSection` header together with the wire
    /// bytes.
    fn build_huf4x_block(data: &[u8]) -> (LiteralsSection, Vec<u8>) {
        assert!(data.len() >= 4, "encode4x requires at least 4 bytes");
        let enc_table = EncTable::build_from_data(data);
        let mut wire: Vec<u8> = Vec::new();
        {
            // Scoped so the writer's borrow of `wire` ends before the
            // length is read below.
            let mut bit_writer = BitWriter::from(&mut wire);
            let mut huf = HuffmanEncoder::new(&enc_table, &mut bit_writer);
            huf.encode4x(data, true);
            bit_writer.flush();
        }
        let regenerated_size = data.len() as u32;
        let compressed_size = Some(wire.len() as u32);
        (
            LiteralsSection {
                ls_type: LiteralsSectionType::Compressed,
                regenerated_size,
                compressed_size,
                num_streams: Some(4),
            },
            wire,
        )
    }

    /// Encodes `data`, decodes it again via `decode_literals`, and
    /// checks that every wire byte was consumed and the output equals
    /// the input. Returns the decoder table's `max_num_bits` so callers
    /// can verify which gate regime the fixture landed in.
    fn roundtrip_assert(data: &[u8]) -> u8 {
        let (header, wire) = build_huf4x_block(data);
        let mut scratch = HuffmanScratch::new();
        let mut decoded = Vec::new();
        let consumed = decode_literals(&header, &mut scratch, &wire, &mut decoded)
            .expect("decode_literals must succeed on a well-formed roundtrip");
        assert_eq!(
            consumed as usize,
            wire.len(),
            "decoder must consume every byte of the literals block"
        );
        assert_eq!(
            decoded, data,
            "decoded literals must match the encoder input"
        );
        scratch.table.max_num_bits
    }

    /// Roundtrips `data` and additionally requires the table's
    /// `max_num_bits` to lie in `expected`; that value selects the
    /// burst regime (`symbols_per_burst = (63 - max) / max`).
    fn roundtrip_with_max_bits_range(data: &[u8], expected: core::ops::RangeInclusive<u8>) {
        let max_bits = roundtrip_assert(data);
        assert!(
            expected.contains(&max_bits),
            "max_num_bits {} outside expected range {:?} for this fixture — \
             test no longer exercises the intended gate regime",
            max_bits,
            expected
        );
    }

    /// Lower edge: aims at `bits_consumed == max_num_bits` on early
    /// burst entries.
    ///
    /// A short input over a skewed 23-symbol alphabet keeps
    /// `max_num_bits` within 5..=11 and allows only a few burst
    /// iterations, so the early ones run with `bits_consumed` close to
    /// the gate threshold. Low stream bits must survive the shift
    /// formula near that threshold; a correct roundtrip on short input
    /// is the signal.
    #[test]
    fn burst_gate_lower_boundary_short_skewed_alphabet() {
        // 36 bytes total: 8×0, 4×1, 3×2, 2×3, then 4..=22 once each —
        // 23 distinct symbols with a skewed distribution.
        let mut data: Vec<u8> = Vec::with_capacity(36);
        data.extend_from_slice(&[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 3, 3]);
        data.extend(4u8..=22);
        roundtrip_with_max_bits_range(&data, 5..=11);
    }

    /// Upper edge: `bits_consumed + burst_bits == 64`.
    ///
    /// A long mid-cardinality input produces many full burst windows,
    /// so the burst-fits-in-64 guard
    /// (`bits_consumed + burst_bits <= 64`) is approached and met
    /// exactly along the way. Getting that bound wrong would read past
    /// the loaded 8-byte window and either trip debug bounds checks or
    /// desynchronise the stream; both break the roundtrip.
    #[test]
    fn burst_gate_upper_boundary_long_mid_alphabet() {
        // 4 KiB over a 97-symbol pseudo-random alphabet (below the
        // encoder's 128-weight raw-table limit). The broad distribution
        // gives max_num_bits ≈ 7..9 and thousands of burst iterations
        // on all four streams.
        let data: Vec<u8> = (0..4096u32)
            .map(|i| (i.wrapping_mul(0x9E37_79B1) % 97) as u8)
            .collect();
        roundtrip_with_max_bits_range(&data, 6..=11);
    }

    /// SIMD-fallback → refill → burst re-entry.
    ///
    /// Per the original description: after a
    /// `BitReaderReversed::refill` (triggered in
    /// `advance_state_by_bits` on the SIMD path) `bits_consumed`
    /// rebases into `[0, 7]`; the gate stays closed and the 4-symbol
    /// SIMD fallback runs until `bits_consumed` exceeds `max_num_bits`
    /// again, at which point the burst path must resume cleanly.
    ///
    /// 16 KiB of input is roughly 4 KiB per stream, so each
    /// `BitReaderReversed` crosses many refill boundaries and the
    /// transition is exercised repeatedly.
    #[test]
    fn burst_simd_fallback_refill_reentry_long_streams() {
        // `i % 67` spreads a 67-symbol alphabet evenly (prime modulus);
        // max_num_bits is typically 7..8, i.e.
        // `symbols_per_burst = (63 - max) / max ≈ 6..8`.
        let data: Vec<u8> = (0..16 * 1024u32).map(|i| (i % 67) as u8).collect();
        roundtrip_with_max_bits_range(&data, 5..=8);
    }

    /// Sweep over stream lengths and alphabet shapes.
    ///
    /// The matrix reaches the three gate states above under many
    /// different `(bits_consumed, max_num_bits, symbols_per_burst)`
    /// configurations, so a gate change that breaks one particular
    /// `(max_num_bits, post-refill bits_consumed)` combination should
    /// fail at least one cell.
    #[test]
    fn burst_gate_sweep_sizes_and_alphabets() {
        let lengths: &[usize] = &[
            16, 17, 31, 32, 33, 63, 64, 65, 127, 128, 129, 255, 256, 257, 511, 512, 513, 1023,
            1024, 1025, 4096,
        ];
        for &len in lengths {
            // Two-symbol alphabet → max_num_bits == 1, large
            // symbols_per_burst.
            let binary: Vec<u8> = (0..len).map(|i| (i & 1) as u8).collect();
            roundtrip_assert(&binary);

            // Uniform 16-symbol alphabet → max_num_bits ≈ 4.
            let small: Vec<u8> = (0..len).map(|i| (i % 16) as u8).collect();
            roundtrip_assert(&small);

            // 97-symbol pseudo-random alphabet, only for inputs long
            // enough to carry it → max_num_bits ≈ 7..9. Stays under the
            // encoder's 128-weight raw-table cap so encoding reliably
            // succeeds.
            if len < 128 {
                continue;
            }
            let wide: Vec<u8> = (0..len)
                .map(|i| (i.wrapping_mul(2_654_435_761) % 97) as u8)
                .collect();
            roundtrip_assert(&wide);
        }
    }

    /// Adversarial case for the `burst_eligible` safety gate.
    ///
    /// A valid 4-stream HUF block is built, then its `LiteralsSection`
    /// header is forged to claim `regenerated_size = 1` although the
    /// streams still encode the full input. That pushes `min_seg_len`
    /// below `symbols_per_burst`, which is exactly what
    /// `burst_eligible` protects against: without the gate the burst
    /// inner loop would move `cursors[i]` beyond `ends[i]` and fault
    /// on the `target[cursors[i]]` store — a DoS vector on malformed
    /// input.
    ///
    /// With the gate in place the decoder is expected to fail cleanly,
    /// either through the non-burst path (described originally as the
    /// SIMD fallback hitting the top-of-loop `cursor_exit_olimit` exit
    /// with a count-mismatch / bitstream-mismatch error) or before the
    /// loop starts at all.
    ///
    /// The contract checked is simply "no panic, `Err(_)` returned".
    #[test]
    fn burst_gate_malformed_small_regen_returns_error() {
        // 256 bytes is comfortably above MIN_LITERALS_FOR_4_STREAMS, so
        // a 4-stream block is emitted. The modulo alphabet keeps
        // `max_num_bits` small (≤ 8) and therefore `symbols_per_burst`
        // large, well above the forged `regenerated_size`.
        let data: Vec<u8> = (0..256u32).map(|i| (i % 67) as u8).collect();
        let (mut header, wire) = build_huf4x_block(&data);

        // The forgery: one regenerated byte claimed, while `wire`
        // still holds streams for all 256 input bytes.
        header.regenerated_size = 1;

        let mut scratch = HuffmanScratch::new();
        let mut decoded = Vec::new();
        let outcome = decode_literals(&header, &mut scratch, &wire, &mut decoded);

        assert!(
            outcome.is_err(),
            "decoder must reject the malformed header instead of panicking; \
             got Ok({})",
            outcome.unwrap_or(0)
        );
    }
}