structured-zstd 0.0.26

//! User-slice-backed output buffer for the "decode straight into the
//! caller's output slice" fast path.
//!
//! Selected automatically by
//! [`crate::decoding::FrameDecoder::decode_all`] (and
//! [`crate::decoding::FrameDecoder::decode_all_to_vec`], which
//! reserves the extra slack internally) when ALL of the following
//! hold:
//! - `frame_content_size > 0` — the header-derived content size
//!   is non-zero. This is the actual eligibility condition (NOT
//!   "FCS present"): an empty frame with an explicit FCS=0
//!   declaration on the wire stays on the fallback path because
//!   there is no payload to write into the user slice. To
//!   distinguish "FCS absent" from "FCS=0 explicit" elsewhere in
//!   the decoder, use `FrameHeader::fcs_declared()` (e.g. the
//!   fallback path's post-decode size check does).
//! - `output.len() >= frame_content_size + WILDCOPY_OVERLENGTH`
//!   (room for the SIMD wildcopy overshoot slack).
//! - No active dictionary (the persistent dict_content is not
//!   carried into the stack-local DecodeBuffer this backend
//!   builds; dict frames stay on the regular path).
//!
//! `content_checksum_flag` is NOT a precondition: when set, the
//! direct-decode caller walks `output[..content_size]` once at end
//! of decode (single sequential xxhash pass over cache-hot data)
//! and stores the digest into the persistent scratch's hasher so
//! `get_calculated_checksum()` reads the right value.
//!
//! Multi-segment frames work via the caller's per-block
//! `DecodeBuffer::drop_to_window_size` invocation — bytes drop
//! out of `len()`'s visible range once decoded output exceeds
//! `window_size`, but physically stay in the user's slice (this
//! backend's `drop_first_n` only advances `head`).
//!
//! The `drop_to_window_size` call runs only BETWEEN blocks, so
//! within a single block `len()` can temporarily exceed
//! `window_size`. `DecodeBuffer::repeat` validates match offsets
//! against `len()` (not `window_size`), so this is not a strict
//! enforcement of the spec's `offset <= window_size` rule — only
//! a coarse end-of-block cap. The fallback path
//! (`FlatBuf`/`RingBuffer`) shares the same limitation. Strict
//! in-block offset bounds would require additional validation
//! that neither path currently performs.
//!
//! When eligible, literal pushes and match-history copies write
//! directly into the user's slice. Compared to
//! `DecodeBuffer<FlatBuf>`, this elides one full `memmove` of the
//! live region (the `read` drain that copies the flat Vec into the
//! user slice) and one anonymous-page allocation cycle per frame.
//! On `level_-7_fast/decodecorpus-z000033/rust_stream` the
//! direct-write path measured -20.33% vs the FlatBuf+drain path on
//! i9-9900K — see #244 for the flamegraph.
//!
//! Selected at compile time via `DecodeBuffer<UserSliceBackend<'a>>`
//! (generic [`BufferBackend`](super::buffer_backend::BufferBackend)
//! parameter). The lifetime parameter binds the backend to the
//! user-provided slice — the backing
//! `DecodeBuffer<UserSliceBackend<'a>>` is stack-local in
//! `decode_all` and does not survive across calls. Persistent
//! decoder state (HUF/FSE tables, offset_hist, sequence cache)
//! lives in `FrameDecoder` and is borrowed in by reference for the
//! call's duration via [`super::scratch::DirectScratch`].

use crate::io::{Error, Read};

use super::buffer_backend::{BufferBackend, WILDCOPY_OVERLENGTH};

/// Backend that writes directly into a caller-provided `&mut [u8]`
/// output slice. No internal allocation, no drain copy.
///
/// Invariants enforced by the [`BufferBackend`] surface:
/// - `head <= tail <= slice.len()`.
/// - All bytes in `slice[head..tail]` are initialised (written by
///   [`Self::extend`] / [`Self::extend_and_fill`] /
///   [`Self::extend_from_within_unchecked`] /
///   [`Self::extend_from_reader`]).
/// - Bytes in `slice[tail..]` are initialised memory (the slice was
///   passed in as safe `&mut [u8]` so every element is a valid `u8`)
///   but hold contents the decoder has not yet written — they carry
///   whatever the caller put in the buffer before passing it in.
///   The FlatBuf precedent skips zero-pre-fill on `extend` for the
///   same reason: writes happen at the band `[tail, tail + n)` and
///   the rest stays unread by the decoder. Callers must not read
///   past `tail` and expect meaningful decode output.
///
/// The caller MUST size the output slice with at least
/// `frame_content_size + WILDCOPY_OVERLENGTH` bytes so SIMD wildcopy
/// overshoots from `extend_from_within_unchecked` stay inside the
/// allocation. The dispatch site in [`crate::decoding::FrameDecoder`]
/// validates this precondition.
///
/// # Safety contract on malformed Compressed blocks
///
/// The safe public decode APIs ([`crate::decoding::FrameDecoder::decode_all`]
/// and [`crate::decoding::FrameDecoder::decode_all_to_vec`]) route
/// through the FALLIBLE write surface:
/// [`Self::try_extend`] / [`Self::try_extend_and_fill`] /
/// [`Self::try_extend_from_within`] for direct writes,
/// [`super::buffer_backend::BufferBackend::try_reserve`] for the
/// match-repeat pre-check inside `DecodeBuffer::repeat_inner`, and
/// `exec_sequence_inline` (which returns
/// `Result<(), ExecuteSequencesError>`). A malformed Compressed
/// block whose payload expands past the declared
/// `frame_content_size` surfaces as
/// `ExecuteSequencesError::OutputBufferOverflow` (literal-push /
/// donor-inline path) or `DecodeBufferError::OutputBufferOverflow`
/// (match-repeat path), both of which propagate up the call stack
/// as a structured `FrameDecoderError` instead of panicking.
///
/// The INFALLIBLE entry points (`extend`, `extend_and_fill`,
/// `extend_from_within_unchecked`) remain on the type as defense in
/// depth and as the call shape for inner unsafe blocks where
/// capacity has already been validated by the wrapping `try_*` call.
/// Each retains a release-mode `assert!` so a future caller that
/// invokes the infallible entry point directly with an OOB length
/// fails with a clear diagnostic rather than letting the subsequent
/// unsafe pointer math reach past `slice.len()`. The safe public
/// APIs never reach these `assert!`s on malformed input — the
/// fallible dispatch catches the overshoot one layer up.
pub(crate) struct UserSliceBackend<'a> {
    /// Caller-provided output slice. All writes land in this slice at
    /// or after `tail`; its length is the fixed capacity reported by
    /// `cap()`.
    slice: &'a mut [u8],
    /// Bytes in `slice[..head]` have been drained to the output
    /// stream and are no longer visible through the [`BufferBackend`]
    /// surface. Same semantics as `FlatBuf.head` — see that field's
    /// doc for the "drained prefix remains physically present, used
    /// by future match copies" justification. For the
    /// single-segment direct-decode path `head` stays at 0 until the
    /// frame finishes (no streaming-drain), but the field is kept
    /// for API parity with `FlatBuf` and `RingBuffer`.
    head: usize,
    /// Physical index one past the last byte written. The live
    /// region exposed by `as_slices()` is `slice[head..tail]`, and
    /// `len()` is `tail - head`.
    tail: usize,
}

impl<'a> UserSliceBackend<'a> {
    /// Wrap the caller's output `slice` in a backend whose live
    /// region starts out empty (`head == tail == 0`).
    ///
    /// The slice is expected to be at least
    /// `frame_content_size + WILDCOPY_OVERLENGTH` bytes long so SIMD
    /// wildcopy overshoots land inside the allocation. That length
    /// requirement is NOT checked here; the dispatcher in
    /// `FrameDecoder` is responsible for enforcing it.
    pub(crate) fn from_slice(slice: &'a mut [u8]) -> Self {
        let (head, tail) = (0, 0);
        Self { slice, head, tail }
    }
}

impl<'a> BufferBackend for UserSliceBackend<'a> {
    /// Donor-shape inline `ZSTD_execSequence` is available on
    /// `x86_64`, where SSE2 is the architectural baseline and the
    /// helpers in [`super::exec_sequence_inline::x86`] emit
    /// `_mm_loadu_si128` / `_mm_storeu_si128` without needing a
    /// `#[target_feature]` gate. 32-bit `x86` is excluded — its
    /// pre-SSE2 baseline (i386 / i486 / i586) would SIGILL on the
    /// SSE2 intrinsics. aarch64 NEON, riscv etc. fall back to the
    /// existing `extend` + `repeat` chain — those paths already
    /// emit the architecture's best-available SIMD via
    /// `copy_bytes_overshooting`'s runtime detect.
    ///
    /// The `cfg!` predicate here must stay identical to the
    /// `#[cfg(target_arch = "x86_64")]` gate on this impl's
    /// `exec_sequence_inline` override, so the flag is `true` exactly
    /// when that override is compiled in.
    const SUPPORTS_INLINE_SEQUENCE_EXEC: bool = cfg!(target_arch = "x86_64");

    /// Donor `ZSTD_execSequence` body — see trait doc for
    /// preconditions / contract.
    ///
    /// Copies `lit_length` literal bytes from `lit_src` to the
    /// current tail, then `match_length` bytes starting `offset`
    /// bytes behind the end of those literals, and advances `tail`
    /// by `lit_length + match_length`.
    ///
    /// # Errors
    ///
    /// Returns `ExecuteSequencesError::OutputBufferOverflow` before
    /// writing anything when `tail + lit_length + match_length + 15`
    /// overflows `usize` or exceeds the slice length.
    ///
    /// # Safety
    ///
    /// Only the output capacity is checked in release builds. The
    /// remaining preconditions are `debug_assert!`s and must be
    /// upheld by the caller:
    /// - `offset >= 1` and `match_length >= 1`. The capacity check
    ///   covers the unconditional 16-byte literal store only because
    ///   `lit_length + match_length >= 1`.
    /// - `offset <= (tail - head) + lit_length`. A larger offset makes
    ///   the match source `tail + lit_length - offset` reach into
    ///   dropped history or underflow.
    /// - `lit_src` must be valid for a 16-byte read even when
    ///   `lit_length < 16`, and for the literal wildcopy's reads when
    ///   `lit_length > 16`.
    #[cfg(target_arch = "x86_64")]
    #[inline(always)]
    unsafe fn exec_sequence_inline(
        &mut self,
        lit_src: *const u8,
        lit_length: usize,
        offset: usize,
        match_length: usize,
    ) -> Result<(), super::errors::ExecuteSequencesError> {
        use super::errors::ExecuteSequencesError;
        use super::exec_sequence_inline::x86::{
            copy16, overlap_copy8, wildcopy_no_overlap, wildcopy_overlap_8byte_stride,
        };
        // Fallible capacity check — covers the literal+match copies
        // INCLUDING the wildcopy overshoot tail (up to 15 bytes past
        // `tail + total`). On a well-formed frame the caller-side
        // `WILDCOPY_OVERLENGTH = 32` slack on the user slice absorbs
        // it; on a malformed frame whose sequences overproduce past
        // `frame_content_size`, the check returns
        // `ExecuteSequencesError::OutputBufferOverflow` so the
        // safe public decode APIs (`decode_all`, `decode_all_to_vec`)
        // surface a structured `FrameDecoderError` rather than
        // panic on the unsafe write surface.
        //
        // All sums use `checked_*` so adversarial input that would
        // wrap `usize` produces the same error variant instead of
        // wrapping past the slice length and letting the subsequent
        // unsafe pointer math go out of bounds.
        //
        // NOTE(review): 15 assumes no helper below stores more than
        // one 16-byte vector past the requested end. The helper
        // bodies are not visible from this file — confirm against
        // `wildcopy_no_overlap` / `wildcopy_overlap_8byte_stride`.
        const MAX_WILDCOPY_OVERSHOOT: usize = 15;
        let cap = self.slice.len();
        // `requested` reports the LOGICAL write length
        // (`lit_length + match_length`) to stay consistent with
        // `BackendOverflow.requested` on the `try_*` paths. The
        // capacity check itself uses `tail + total + overshoot`
        // because the unconditional 16-byte `copy16` over-reaches
        // `tail + total` by up to 15 bytes — but that overshoot is
        // an artefact of the SIMD copy shape, NOT a value the
        // caller can act on, so it doesn't belong in the diagnostic.
        // When the logical length itself wraps there is no meaningful
        // value to report, so `usize::MAX` is used as the sentinel.
        let total = match lit_length.checked_add(match_length) {
            Some(v) => v,
            None => {
                return Err(ExecuteSequencesError::OutputBufferOverflow {
                    tail: self.tail,
                    requested: usize::MAX,
                    capacity: cap,
                });
            }
        };
        let cap_required = self
            .tail
            .checked_add(total)
            .and_then(|new_tail| new_tail.checked_add(MAX_WILDCOPY_OVERSHOOT));
        let cap_required = match cap_required {
            Some(v) if v <= cap => v,
            _ => {
                return Err(ExecuteSequencesError::OutputBufferOverflow {
                    tail: self.tail,
                    requested: total,
                    capacity: cap,
                });
            }
        };
        // Only the comparison above matters; the value is not used
        // again. The plain `+` on the next line cannot wrap because
        // `tail + total + 15` was just shown to fit in `usize`.
        let _ = cap_required;
        let new_tail = self.tail + total;
        debug_assert!(offset >= 1);
        debug_assert!(match_length >= 1);
        // Match against the LIVE window (tail - head) per the trait
        // contract, not `tail`. On single-segment frames head stays
        // at 0 so the two are equivalent; on multi-segment frames
        // `drop_to_window_size` advances `head` and asserting
        // against raw `tail` would mask offsets that reach past the
        // window boundary into dropped history.
        //
        // This is a debug-only check: release builds rely on the
        // caller having validated `offset` already.
        let live_len = self.tail - self.head;
        debug_assert!(
            live_len
                .checked_add(lit_length)
                .is_some_and(|end| offset <= end),
            "exec_sequence_inline: offset ({}) exceeds live window (len={} + lit={}, head={}, tail={})",
            offset,
            live_len,
            lit_length,
            self.head,
            self.tail,
        );

        // SAFETY: capacity checked above (early `Err` return, not a
        // panic); pointer arithmetic stays within `self.slice` for
        // the writes (tail + total + overshoot <= slice.len()) and
        // reads (match src = tail + lit_length - offset, bounded by
        // the caller-guaranteed `offset <= tail + lit_length`).
        // Literal reads use the caller-provided `lit_src` whose
        // provenance covers the parent literals buffer (NOT a
        // sub-slice), so the unconditional 16-byte load stays
        // in-bounds even when `lit_length < 16`.
        unsafe {
            let base_mut = self.slice.as_mut_ptr();

            // ── Literal copy ──
            // Donor: ZSTD_copy16(op, *litPtr); if (litLength > 16)
            //         wildcopy(op+16, *litPtr+16, litLength-16, no_overlap)
            //
            // The unconditional first copy16 may overshoot up to 15
            // bytes past `op + lit_length` into the match-destination
            // region — those bytes are about to be overwritten by the
            // match copy below, so the overshoot is harmless.
            let op_lit = base_mut.add(self.tail);
            copy16(op_lit, lit_src);
            if lit_length > 16 {
                wildcopy_no_overlap(op_lit.add(16), lit_src.add(16), lit_length - 16);
            }

            // ── Match copy ──
            // Donor uses `oLitEnd = op + litLength` as the match
            // destination; the match source is `oLitEnd - offset`.
            // Both pointers are derived from the same `base_mut` and
            // use PHYSICAL slice indices, so `head` does not appear.
            let op_match = base_mut.add(self.tail + lit_length);
            let match_src = base_mut.cast_const().add(self.tail + lit_length - offset);

            if offset >= 16 {
                // Source and destination are at least one 16-byte
                // vector apart, so the non-overlapping wildcopy is
                // usable directly.
                wildcopy_no_overlap(op_match, match_src, match_length);
            } else {
                // overlap_copy8 + wildcopy(overlap) tail.
                let (op2, ip2) = overlap_copy8(op_match, match_src, offset);
                if match_length > 8 {
                    wildcopy_overlap_8byte_stride(op2, ip2, match_length - 8);
                }
            }
        }
        // Commit only after every copy has been issued.
        self.tail = new_tail;
        Ok(())
    }

    /// `new()` exists for trait conformance but is not used on the
    /// direct-decode path — the slice is always provided up-front via
    /// [`Self::from_slice`]. Returns an empty backend wrapping an
    /// empty static slice; any subsequent `extend` call will panic
    /// via the capacity check.
    ///
    /// `&mut []` is a zero-length placeholder slice the compiler
    /// emits with `'static` lifetime; assigning it into the
    /// `slice: &'a mut [u8]` field compiles for any `'a` because
    /// `'static` outlives every other lifetime. No aliasing concern
    /// because the length is 0 (no addressable bytes the field
    /// could alias against). No raw-pointer + PhantomData workaround
    /// needed — verified by `cargo check` + the full nextest suite.
    /// This placeholder shape is fine ONLY because `new()` is never
    /// the entry point on the direct-decode path; non-empty
    /// constructions go through `from_slice` with a real `&'a mut [u8]`.
    fn new() -> Self {
        Self {
            slice: &mut [],
            head: 0,
            tail: 0,
        }
    }

    /// Reset the live region to empty by rewinding both cursors to
    /// the start of the slice. The slice's bytes are left untouched.
    #[inline]
    fn clear(&mut self) {
        (self.head, self.tail) = (0, 0);
    }

    /// Check that `n` more bytes fit behind `tail` without writing
    /// anything.
    ///
    /// The backend has a fixed capacity, so "reserving" reduces to the
    /// linear test `tail + n <= slice.len()`. This lets the safe
    /// public decode APIs catch a malformed-frame overshoot here
    /// rather than through the `assert!` inside
    /// `extend_from_within_unchecked` further down the call chain.
    /// A `usize` wrap in `tail + n` is reported as an overflow too.
    #[inline(always)]
    fn try_reserve(&mut self, n: usize) -> Result<(), super::buffer_backend::BackendOverflow> {
        let capacity = self.slice.len();
        let fits = self
            .tail
            .checked_add(n)
            .is_some_and(|end| end <= capacity);
        if fits {
            Ok(())
        } else {
            Err(super::buffer_backend::BackendOverflow {
                tail: self.tail,
                requested: n,
                capacity,
            })
        }
    }

    /// Infallible reserve: intentionally does nothing for this
    /// fixed-capacity backend. Use `try_reserve` to actually test
    /// whether `n` more bytes fit.
    #[inline]
    fn reserve(&mut self, _n: usize) {
        // No-op: capacity is fixed at construction (slice length).
        // The decoder's sequence-execution path issues
        // `buffer.reserve(MAX_BLOCK_SIZE)` upfront as a precaution
        // for FlatBuf's growable Vec; for UserSliceBackend we can
        // never satisfy that precaution because the slice can't
        // grow. The real (much smaller) capacity bound is enforced
        // at the write sites instead: release-mode `assert!`s in
        // `extend` / `extend_and_fill` /
        // `extend_from_within_unchecked`, an `Err` return in
        // `extend_from_reader`, and `BackendOverflow` from the
        // `try_*` methods. On a well-formed frame `match_length` and
        // per-block writes satisfy
        // `tail + write_size <= frame_content_size`, so none of those
        // checks fire.
    }

    /// Number of live (not yet dropped) bytes: the distance between
    /// the `head` and `tail` cursors.
    #[inline]
    fn len(&self) -> usize {
        let (start, end) = (self.head, self.tail);
        end - start
    }

    /// Total capacity: the full length of the wrapped slice,
    /// independent of `head` / `tail`.
    #[inline]
    fn cap(&self) -> usize {
        self.slice.len()
    }

    /// Physical index into the slice one past the last written byte
    /// (NOT relative to `head`).
    #[inline]
    fn tail(&self) -> usize {
        self.tail
    }

    /// Overwrite the tail cursor without touching any bytes.
    ///
    /// # Safety
    ///
    /// The caller must guarantee `head <= new_tail <= slice.len()`.
    /// Both bounds are only `debug_assert!`ed here, so a release
    /// build stores an out-of-range value unchecked. Other methods
    /// on this backend (e.g. `extend`) derive raw write pointers and
    /// the remaining writable length from `tail`.
    #[inline]
    unsafe fn set_tail(&mut self, new_tail: usize) {
        debug_assert!(new_tail >= self.head);
        debug_assert!(new_tail <= self.slice.len());
        self.tail = new_tail;
    }

    // Forced inlining is deliberate: per the perf-annotate
    // measurement recorded for the primary bench, ~10% of decode time
    // was spent in this method's own prologue / epilogue (pure call
    // boundary cost), because it is invoked from the tight
    // literal-emit loops in `decode_and_execute_sequences`. The body
    // is one assert plus one copy, so inlining it does not bloat
    // callers meaningfully. This is NOT the dispatch + match pattern
    // (the documented Tier-10 negative): there is no runtime branch
    // on kernel features here — `copy_bytes_overshooting` owns that
    // dispatch internally.
    //
    /// Append `data` at the tail (INFALLIBLE entry point).
    ///
    /// The safe public APIs (`decode_all`, `decode_all_to_vec`) are
    /// not expected to reach this on malformed input; they route
    /// through [`Self::try_extend`] (and `DecodeBuffer::try_push` /
    /// `try_reserve` on the match-repeat path), which report
    /// `BackendOverflow` instead.
    ///
    /// # Panics
    ///
    /// Panics when `tail + data.len()` exceeds the slice length.
    #[inline(always)]
    fn extend(&mut self, data: &[u8]) {
        let cap = self.slice.len();
        let write_at = self.tail;
        let len = data.len();
        let new_tail = write_at + len;
        // Release-mode guard, mirroring
        // `extend_from_within_unchecked`. The copy below is unsafe
        // and trusts `total_writable` as its upper bound; with only
        // a `debug_assert!`, a malformed Compressed block whose
        // literals expand past the declared frame_content_size would
        // turn it into UB in release builds. The cost is a single
        // compare on the literal-push path. Should a future caller
        // wire this entry point straight into a hot path, the panic
        // message names the corrupt-frame root cause.
        assert!(
            new_tail <= cap,
            "UserSliceBackend::extend overflows slice (tail+={}, cap={}) — corrupt frame",
            len,
            cap
        );
        // The sequence executor pushes the literal part of every
        // sequence through here, and on highly-compressed corpora
        // those pushes are frequently 1..=16 bytes — the range where
        // libc memmove dispatch overhead dominates the copy itself.
        // `copy_bytes_overshooting` lets such short pushes inline to
        // a single SIMD / overlapping-u64 store.
        let total_writable = cap - write_at;
        // SAFETY: `data` does not alias the backend's slice (it is
        // the literals buffer or input view). The `assert!` above
        // established `new_tail <= cap`, so the source has `len`
        // readable bytes and the destination has at least `len`
        // writable bytes (`total_writable >= len`).
        unsafe {
            let dst = self.slice.as_mut_ptr().add(write_at);
            super::simd_copy::copy_bytes_overshooting(
                (data.as_ptr(), len),
                (dst, total_writable),
                len,
            );
        }
        self.tail = new_tail;
    }

    /// Append `fill_length` copies of `fill_with` at the tail
    /// (INFALLIBLE entry point; the fallible counterpart is
    /// `try_extend_and_fill`).
    ///
    /// # Panics
    ///
    /// Panics with a corrupt-frame diagnostic when
    /// `tail + fill_length` exceeds the slice length or wraps
    /// `usize`. Nothing is written in that case.
    #[inline]
    fn extend_and_fill(&mut self, fill_with: u8, fill_length: usize) {
        let cap = self.slice.len();
        // Release-mode check (not `debug_assert!`) for symmetry with
        // `extend` / `extend_from_within_unchecked`: a malformed
        // Compressed block whose RLE fill expands past the declared
        // `frame_content_size` must fail fast with a clear
        // corruption / capacity diagnostic.
        //
        // The sum goes through `checked_add` because a plain `+`
        // wraps in release builds for a `fill_length` near
        // `usize::MAX`. The wrapped value would pass the capacity
        // comparison and the failure would surface later as an
        // uninformative slice-index panic.
        let new_tail = match self.tail.checked_add(fill_length) {
            Some(end) if end <= cap => end,
            _ => panic!(
                "UserSliceBackend::extend_and_fill overflows slice (tail+={}, cap={}) — corrupt frame",
                fill_length, cap
            ),
        };
        // `slice::fill` on a byte slice is the idiomatic way to
        // express a memset; an explicit per-byte assignment loop is
        // not always promoted back to one by the optimiser.
        self.slice[self.tail..new_tail].fill(fill_with);
        self.tail = new_tail;
    }

    /// Read exactly `fill_length` bytes from `read` straight into the
    /// slice at the tail.
    ///
    /// # Errors
    ///
    /// - Returns an `Error::other` when `tail + fill_length` exceeds
    ///   the slice length or wraps `usize`; nothing is read.
    /// - Propagates any error from `read_exact`. `tail` is NOT
    ///   advanced in that case, although bytes in
    ///   `slice[tail..tail + fill_length]` may have been partially
    ///   overwritten.
    fn extend_from_reader<R: Read>(
        &mut self,
        mut read: R,
        fill_length: usize,
    ) -> Result<(), Error> {
        let old = self.tail;
        // `checked_add` rather than `+`: in release builds a wrapped
        // sum would slip under the capacity comparison and the
        // subsequent `slice[old..new_tail]` index would panic
        // (start > end) inside what is meant to be a fallible method.
        let new_tail = match old.checked_add(fill_length) {
            Some(end) if end <= self.slice.len() => end,
            _ => {
                return Err(Error::other(
                    "UserSliceBackend: raw block exceeds caller-provided output capacity",
                ));
            }
        };
        // Don't advance `tail` on failure — the upper bound from the
        // slice borrow guarantees the `read_exact` attempt cannot
        // write past `new_tail`, but `tail` MUST keep pointing at the
        // last fully-decoded byte so checkpoint rollback / drain
        // semantics line up with FlatBuf's truncate-on-error shape.
        read.read_exact(&mut self.slice[old..new_tail])?;
        self.tail = new_tail;
        Ok(())
    }

    // Keep `#[inline]` (hint, not force). An earlier experiment with
    // `#[inline(always)]` regressed primary bench by +2.96% — body
    // is materially larger than `extend` (assert + readable/writable
    // derivation + simd_copy::copy_bytes_overshooting call), and
    // forced inlining bloats each pipeline-slot caller past icache
    // budget. The per-call boundary save is dwarfed by the
    // duplicated body weight; LLVM's heuristic was right to decline.
    //
    /// Copy `len` bytes from the live region, starting `start` bytes
    /// after `head`, to the tail, and advance `tail` by `len`.
    ///
    /// # Safety
    ///
    /// The caller must guarantee that the source range lies entirely
    /// before the current tail: `head + start + len <= tail`, with
    /// no `usize` wrap. This is only `debug_assert!`ed; release
    /// builds derive the readable length and source pointer from it
    /// unchecked.
    ///
    /// # Panics
    ///
    /// Panics in all build profiles when fewer than `len` bytes
    /// remain between `tail` and the end of the slice.
    #[inline]
    unsafe fn extend_from_within_unchecked(&mut self, start: usize, len: usize) {
        let dst_off = self.tail;
        let src_off = self.head + start;
        debug_assert!(src_off + len <= dst_off);
        // Release-mode capacity check: the trait contract says
        // capacity for `len` bytes past the tail was reserved by
        // the caller, but UserSliceBackend's reserve is a no-op
        // (slice can't grow), so a malformed frame with an out-of-
        // bounds match could otherwise turn the unchecked
        // SIMD copy into UB in release builds. FlatBuf relies on
        // `Vec::reserve`; we have no allocator to call so the
        // check has to be inline. Cost: one compare on a path
        // that's already memory-bound.
        //
        // The check compares `len` against the remaining room
        // (`cap - tail`, via `checked_sub`) instead of testing
        // `tail + len <= cap`: the additive form wraps in release
        // builds for a `len` near `usize::MAX` and would let exactly
        // the out-of-bounds copy through that this guard exists to
        // stop.
        let total_writable = match self.slice.len().checked_sub(dst_off) {
            Some(room) if len <= room => room,
            _ => panic!("UserSliceBackend: match write past slice capacity (corrupt frame)"),
        };
        // Route the match copy through `simd_copy::copy_bytes_overshooting`
        // rather than `ptr::copy_nonoverlapping`. The latter goes to
        // libc `__memmove_avx_unaligned_erms` on x86_64, which costs
        // ~10ns of dispatch overhead per call — measured at 40% of
        // decode CPU on the L-1 fast c_stream flamegraph because
        // L-1 produces thousands of short matches per frame.
        // `copy_bytes_overshooting` inlines a single SIMD
        // load+store for `len <= 16` with WILDCOPY_OVERLENGTH slack
        // (typical match case), and a byte / overlapping-u64
        // sequence for shorter copies without slack — both bypass
        // the libc memmove dispatch entirely.
        let total_readable = self.tail - src_off;
        // SAFETY: caller's non-overlap precondition gives
        // `src_off + len <= dst_off` (so src/dst regions don't
        // overlap), `total_readable >= len` follows from
        // `src_off + len <= dst_off <= self.tail`, and
        // `total_writable >= len` by the check above. The
        // helper writes ≤ `total_writable` bytes, so it stays
        // inside the slice even when it overshoots `len`.
        unsafe {
            let base = self.slice.as_mut_ptr();
            super::simd_copy::copy_bytes_overshooting(
                (base.add(src_off), total_readable),
                (base.add(dst_off), total_writable),
                len,
            );
        }
        // Cannot wrap: `len <= cap - dst_off` was established above.
        self.tail = dst_off + len;
    }

    /// Branchless flavour of the within-buffer copy. For this backend
    /// it is a plain forward to
    /// [`Self::extend_from_within_unchecked`].
    ///
    /// # Safety
    ///
    /// Same contract as [`Self::extend_from_within_unchecked`]: the
    /// arguments are passed through unchanged.
    #[inline]
    unsafe fn extend_from_within_unchecked_branchless(&mut self, start: usize, len: usize) {
        // Direct-slice layout never wraps — same forward to the
        // single non-overlapping copy as FlatBuf.
        unsafe { self.extend_from_within_unchecked(start, len) }
    }

    /// Expose the live region as a pair of slices. The storage is one
    /// contiguous slice, so the live bytes always come back in the
    /// first element and the second is always empty.
    #[inline]
    fn as_slices(&self) -> (&[u8], &[u8]) {
        let live = &self.slice[self.head..self.tail];
        (live, &[])
    }

    /// Hide the first `n` live bytes by advancing `head`. No bytes
    /// are moved or overwritten; the dropped prefix stays physically
    /// in the slice.
    ///
    /// `n` must not exceed `len()`. This is only `debug_assert!`ed,
    /// so a release build leaves `head > tail` unchecked, and `len()`
    /// computes `tail - head`.
    #[inline]
    fn drop_first_n(&mut self, n: usize) {
        self.head += n;
        debug_assert!(self.head <= self.tail);
    }

    // ── Fallible write surface ──
    //
    // Override the default trait impls (which call panic-on-overflow
    // variants) with explicit capacity checks that return
    // `BackendOverflow` instead. This is the entire point of the
    // backend: a fixed-capacity output slice that cannot grow on
    // demand, so any overshoot must be reported instead of aborting.

    /// Fallible counterpart of `extend`: append `data` at the tail,
    /// or report `BackendOverflow` (leaving the backend untouched)
    /// when it does not fit.
    #[inline(always)]
    fn try_extend(&mut self, data: &[u8]) -> Result<(), super::buffer_backend::BackendOverflow> {
        let len = data.len();
        let capacity = self.slice.len();
        let write_at = self.tail;
        // One guarded match covers both failure modes: a `usize` wrap
        // in `tail + len` (which would otherwise sneak a tiny
        // `new_tail` past the capacity comparison) and a plain
        // overshoot of the slice. Either way the caller gets the same
        // `BackendOverflow` value.
        let new_tail = match write_at.checked_add(len) {
            Some(end) if end <= capacity => end,
            _ => {
                return Err(super::buffer_backend::BackendOverflow {
                    tail: write_at,
                    requested: len,
                    capacity,
                });
            }
        };
        let total_writable = capacity - write_at;
        // SAFETY: `new_tail <= capacity` was established by the match
        // above, so the destination has at least `len` writable
        // bytes. `data` does not alias the backend's slice (caller
        // contract — literals buffer / input view).
        unsafe {
            let dst = self.slice.as_mut_ptr().add(write_at);
            super::simd_copy::copy_bytes_overshooting(
                (data.as_ptr(), len),
                (dst, total_writable),
                len,
            );
        }
        self.tail = new_tail;
        Ok(())
    }

    /// Fallible counterpart of `extend_and_fill`: append
    /// `fill_length` copies of `fill_with`, or report
    /// `BackendOverflow` (writing nothing) when they do not fit.
    #[inline(always)]
    fn try_extend_and_fill(
        &mut self,
        fill_with: u8,
        fill_length: usize,
    ) -> Result<(), super::buffer_backend::BackendOverflow> {
        let capacity = self.slice.len();
        let start = self.tail;
        // `checked_add` guards against a `fill_length` near
        // `usize::MAX` wrapping the end index; the filter then applies
        // the capacity bound. Without the wrap check the range below
        // would panic with start > end.
        let Some(end) = start
            .checked_add(fill_length)
            .filter(|&end| end <= capacity)
        else {
            return Err(super::buffer_backend::BackendOverflow {
                tail: start,
                requested: fill_length,
                capacity,
            });
        };
        self.slice[start..end].fill(fill_with);
        self.tail = end;
        Ok(())
    }

    /// Fallible within-buffer copy: duplicate `len` bytes that start
    /// `start` bytes after `head` onto the tail.
    ///
    /// Returns `BackendOverflow` (writing nothing) when either the
    /// source range does not lie inside the live region or the
    /// destination lacks room for `len` bytes. Every failure reports
    /// the same `{ tail, requested: len, capacity }` triple.
    #[inline(always)]
    fn try_extend_from_within(
        &mut self,
        start: usize,
        len: usize,
    ) -> Result<(), super::buffer_backend::BackendOverflow> {
        let capacity = self.slice.len();
        let tail = self.tail;

        // Bound 1: the source range fits in the live region. `start`
        // is relative to `head`, so translate it to a physical index
        // first. Every addition is checked so an adversarial `start`
        // or `len` near `usize::MAX` cannot wrap past the comparison.
        let source_ok = self
            .head
            .checked_add(start)
            .and_then(|abs_start| abs_start.checked_add(len))
            .is_some_and(|abs_end| abs_end <= tail);

        // Bound 2: the destination has room for `len` bytes, with the
        // same wrap protection.
        let dest_ok = tail
            .checked_add(len)
            .is_some_and(|new_tail| new_tail <= capacity);

        if !(source_ok && dest_ok) {
            return Err(super::buffer_backend::BackendOverflow {
                tail,
                requested: len,
                capacity,
            });
        }

        // SAFETY: both bounds hold, which is exactly what the unsafe
        // variant requires — source range ends at or before `tail`,
        // and `tail + len` stays within the slice.
        unsafe { self.extend_from_within_unchecked(start, len) };
        Ok(())
    }
}

// Compile-time anchor for `WILDCOPY_OVERLENGTH`. The constant is only
// used implicitly (through the dispatcher's capacity sizing), so this
// unnamed constant keeps the import referenced for the doc links and
// makes the build fail if the constant moves or stops being a `usize`.
const _: usize = WILDCOPY_OVERLENGTH;

#[cfg(test)]
mod tests {
    extern crate alloc;
    use super::*;
    use alloc::vec;
    #[cfg(target_arch = "x86_64")]
    use alloc::vec::Vec;

    #[test]
    fn extend_writes_at_tail() {
        // Two back-to-back `extend` calls must land contiguously: the
        // second write starts where the first one stopped.
        let mut storage = vec![0u8; 32];
        let mut backend = UserSliceBackend::from_slice(&mut storage);

        backend.extend(&[1, 2, 3, 4]);
        assert_eq!(backend.len(), 4);
        assert_eq!(backend.tail(), 4);

        backend.extend(&[5, 6]);
        let (front, rest) = backend.as_slices();
        assert_eq!(front, &[1, 2, 3, 4, 5, 6]);
        // Everything is reported through the first slice; the second
        // one must come back empty.
        assert!(rest.is_empty());
    }

    #[test]
    fn extend_and_fill_repeats_byte() {
        let mut storage = vec![0u8; 16];
        let mut backend = UserSliceBackend::from_slice(&mut storage);

        // A marker byte goes in first so the fill is shown to start at
        // the current tail rather than at offset 0.
        backend.extend(&[0xAA]);
        backend.extend_and_fill(0xBB, 4);

        let expected: [u8; 5] = [0xAA, 0xBB, 0xBB, 0xBB, 0xBB];
        let (filled, _) = backend.as_slices();
        assert_eq!(filled, &expected);
    }

    #[test]
    fn extend_from_within_unchecked_copies_non_overlapping() {
        let mut storage = vec![0u8; 32];
        let mut backend = UserSliceBackend::from_slice(&mut storage);
        backend.extend(&[10, 20, 30, 40, 50]);

        // SAFETY: the source range 0..3 lies inside the 5 bytes written
        // so far, and the 32-byte slice has room for the 3 appended
        // bytes (5 + 3).
        unsafe {
            backend.extend_from_within_unchecked(0, 3);
        }

        // The first three bytes are repeated right after the original
        // five.
        let (visible, _) = backend.as_slices();
        assert_eq!(visible, &[10, 20, 30, 40, 50, 10, 20, 30]);
    }

    #[test]
    fn drop_first_n_advances_head_keeps_history() {
        let mut storage = vec![0u8; 32];
        let mut backend = UserSliceBackend::from_slice(&mut storage);
        backend.extend(&[1, 2, 3, 4, 5]);

        // Dropping two bytes shrinks the visible window to the last
        // three.
        backend.drop_first_n(2);
        assert_eq!(backend.len(), 3);
        assert_eq!(backend.as_slices().0, &[3, 4, 5]);

        // A match copy issued after the drop takes `start` relative to
        // the post-drop head, so start 0 now names the byte `3`.
        //
        // SAFETY: the source range 0..3 is exactly the 3 visible bytes,
        // and the 32-byte slice has room for 3 more bytes past the
        // 5 already written.
        unsafe {
            backend.extend_from_within_unchecked(0, 3);
        }
        let (visible, _) = backend.as_slices();
        assert_eq!(visible, &[3, 4, 5, 3, 4, 5]);
    }

    #[test]
    fn set_tail_rollback() {
        let mut storage = vec![0u8; 32];
        let mut backend = UserSliceBackend::from_slice(&mut storage);
        backend.extend(&[1, 2, 3]);
        // Remember the tail before the write that will be rolled back.
        let checkpoint = backend.tail();

        backend.extend(&[4, 5, 6, 7]);
        assert_eq!(backend.len(), 7);

        // SAFETY: `checkpoint` was read from this backend before the
        // second `extend`, so it does not exceed the current tail.
        // NOTE(review): presumably that is what `set_tail` requires;
        // confirm against its safety contract.
        unsafe {
            backend.set_tail(checkpoint);
        }

        // Only the bytes written before the checkpoint stay visible.
        assert_eq!(backend.len(), 3);
        let (kept, _) = backend.as_slices();
        assert_eq!(kept, &[1, 2, 3]);
    }

    #[test]
    fn clear_resets_cursors() {
        let mut storage = vec![0u8; 32];
        let mut backend = UserSliceBackend::from_slice(&mut storage);

        // Write and drop first so `clear` has non-trivial state to
        // reset.
        backend.extend(&[1, 2, 3]);
        backend.drop_first_n(1);

        backend.clear();
        assert_eq!((backend.len(), backend.tail()), (0, 0));
    }

    #[test]
    fn extend_from_reader_into_slice() {
        let mut storage = vec![0u8; 16];
        let mut backend = UserSliceBackend::from_slice(&mut storage);
        let payload = [9u8, 8, 7, 6, 5];

        // A byte slice serves as the reader; all 5 bytes fit in the
        // 16-byte backing slice.
        backend.extend_from_reader(&payload[..], 5).unwrap();

        let (filled, _) = backend.as_slices();
        assert_eq!(filled, &payload[..]);
    }

    #[test]
    fn extend_from_reader_over_capacity_errors() {
        let mut storage = vec![0u8; 4];
        let mut backend = UserSliceBackend::from_slice(&mut storage);
        let payload = [9u8, 8, 7, 6, 5];

        // Requesting 5 bytes against a capacity of 4 must be rejected...
        let outcome = backend.extend_from_reader(&payload[..], 5);
        assert!(outcome.is_err());
        // ...and the failed call must leave the tail where it was.
        assert_eq!(backend.tail(), 0);
    }

    // Coverage for the fallible try_* surface. Exercises:
    //   - happy paths (exact-fit + room to spare),
    //   - capacity-overflow paths (returns Err with diagnostic fields),
    //   - integer-overflow wrap-guards (checked_add ok_or branch).

    use super::super::buffer_backend::BufferBackend;

    #[test]
    fn try_extend_exact_fit_succeeds_and_advances_tail() {
        // Payload length equals capacity: the boundary case that must
        // still be accepted.
        let mut storage = vec![0u8; 4];
        let mut backend = UserSliceBackend::from_slice(&mut storage);

        let outcome = backend.try_extend(&[1, 2, 3, 4]);
        assert!(outcome.is_ok());
        assert_eq!(backend.tail(), 4);
        assert_eq!(backend.as_slices().0, &[1, 2, 3, 4]);
    }

    #[test]
    fn try_extend_over_capacity_returns_overflow_and_keeps_tail() {
        let mut storage = vec![0u8; 4];
        let mut backend = UserSliceBackend::from_slice(&mut storage);

        // 5 bytes into an empty 4-byte slice: one byte too many.
        let overflow = backend.try_extend(&[1, 2, 3, 4, 5]).unwrap_err();

        // The diagnostic carries the tail at the time of the call, the
        // requested length, and the slice capacity.
        assert_eq!(
            (overflow.tail, overflow.requested, overflow.capacity),
            (0, 5, 4)
        );
        // A rejected write must not move the tail.
        assert_eq!(backend.tail(), 0);
    }

    #[test]
    fn try_extend_partially_full_overshoot_reports_current_tail() {
        let mut storage = vec![0u8; 4];
        let mut backend = UserSliceBackend::from_slice(&mut storage);
        backend.extend(&[1, 2]);

        // Two bytes are already in; three more would need 5 in total
        // against a capacity of 4.
        let overflow = backend.try_extend(&[3, 4, 5]).unwrap_err();

        // The reported tail is the current one (2), not zero.
        assert_eq!(
            (overflow.tail, overflow.requested, overflow.capacity),
            (2, 3, 4)
        );
        assert_eq!(backend.tail(), 2);
    }

    #[test]
    fn try_extend_zero_length_succeeds_and_leaves_tail_unchanged() {
        // Why this test covers the zero-length case and not the wrap
        // guard itself: the `checked_add(tail, len)` branch in
        // `try_extend` is defense in depth against corrupted input that
        // declares a `regenerated_size` close to `usize::MAX`. Safe Rust
        // cannot build a `&[u8]` that long (forging the length through
        // `from_raw_parts` is UB), so that branch is reached only from
        // the fuzz harness behind `feature = "fuzz_exports"`, which
        // feeds a controlled `len` through `try_*`, and from the real
        // malformed-frame decode path the harness emulates.
        //
        // What is pinned here is the neighbouring ordinary case: an
        // empty `try_extend` MUST succeed whatever the current `tail`
        // is, because `tail + 0 == tail` passes both `checked_add` and
        // the upper-bound comparison. Without it, a regression in the
        // `len == 0` early return would go unnoticed.
        let mut storage = vec![0u8; 8];
        let mut backend = UserSliceBackend::from_slice(&mut storage);

        // Empty write at tail == 0.
        assert!(backend.try_extend(&[]).is_ok());
        assert_eq!(backend.tail(), 0);

        // Empty write again at a non-zero tail.
        backend.extend(&[1, 2, 3]);
        assert!(backend.try_extend(&[]).is_ok());
        assert_eq!(backend.tail(), 3);
    }

    #[test]
    fn try_extend_and_fill_exact_fit_writes_pattern() {
        // Filling exactly up to capacity is the accepted boundary case.
        let mut storage = vec![0u8; 4];
        let mut backend = UserSliceBackend::from_slice(&mut storage);

        let outcome = backend.try_extend_and_fill(0xAB, 4);
        assert!(outcome.is_ok());
        assert_eq!(backend.tail(), 4);

        let expected: [u8; 4] = [0xAB; 4];
        let (filled, _) = backend.as_slices();
        assert_eq!(filled, &expected);
    }

    #[test]
    fn try_extend_and_fill_over_capacity_returns_overflow() {
        let mut storage = vec![0u8; 4];
        let mut backend = UserSliceBackend::from_slice(&mut storage);
        backend.extend(&[1, 2]);

        // With 2 of 4 bytes used, a 5-byte fill cannot fit.
        let overflow = backend.try_extend_and_fill(0xCD, 5).unwrap_err();

        assert_eq!(
            (overflow.tail, overflow.requested, overflow.capacity),
            (2, 5, 4)
        );
        // The rejected fill leaves the tail untouched.
        assert_eq!(backend.tail(), 2);
    }

    #[test]
    fn try_extend_from_within_within_bounds_repeats_history() {
        let mut storage = vec![0u8; 8];
        let mut backend = UserSliceBackend::from_slice(&mut storage);
        backend.extend(&[1, 2, 3]);

        // Source 0..3 is fully written and 3 + 3 = 6 fits in 8, so both
        // bounds hold and the three history bytes are appended again.
        let outcome = backend.try_extend_from_within(0, 3);
        assert!(outcome.is_ok());

        assert_eq!(backend.as_slices().0, &[1, 2, 3, 1, 2, 3]);
        assert_eq!(backend.tail(), 6);
    }

    #[test]
    fn try_extend_from_within_source_past_tail_returns_overflow() {
        let mut buf = vec![0u8; 8];
        let mut b = UserSliceBackend::from_slice(&mut buf);
        b.extend(&[1, 2]);
        // start=0, len=5 — source range needs bytes 0..5 but tail=2.
        // The destination alone would fit (2 + 5 = 7 <= 8), so the
        // rejection can only come from the source-range bound.
        let err = b.try_extend_from_within(0, 5).unwrap_err();
        assert_eq!(err.tail, 2);
        assert_eq!(err.requested, 5);
        // The source-bound error reports the full slice capacity, like
        // every other overflow diagnostic checked in this module.
        assert_eq!(err.capacity, 8);
        // A rejected copy must not move the tail.
        assert_eq!(b.tail(), 2);
    }

    #[test]
    fn try_extend_from_within_destination_overflow_returns_err() {
        let mut storage = vec![0u8; 4];
        let mut backend = UserSliceBackend::from_slice(&mut storage);
        backend.extend(&[1, 2, 3]);

        // The source range 0..2 is valid history, but appending 2 bytes
        // would move the tail from 3 to 5, beyond the capacity of 4, so
        // the destination bound must reject the copy.
        let overflow = backend.try_extend_from_within(0, 2).unwrap_err();

        assert_eq!(
            (overflow.tail, overflow.requested, overflow.capacity),
            (3, 2, 4)
        );
        assert_eq!(backend.tail(), 3);
    }

    /// Direct tests for `exec_sequence_inline`. They call the x86_64
    /// inline body straight from the test so coverage is attributed
    /// here, not through the deep `decode_all` pipeline where
    /// `cargo llvm-cov` sometimes loses the inlined callee. The cases
    /// cover: a short literal run followed by a match, a long literal
    /// run (wildcopy tail), a short-offset match (overlapCopy8 +
    /// 8-byte stride) and a long-offset match (wildcopy_no_overlap).
    #[cfg(target_arch = "x86_64")]
    #[test]
    fn exec_sequence_inline_short_literal_plus_long_offset_match() {
        // The output slice carries WILDCOPY_OVERLENGTH bytes of slack
        // at the end.
        const WILDCOPY: usize = super::super::buffer_backend::WILDCOPY_OVERLENGTH;
        let mut out = vec![0u8; 256 + WILDCOPY];
        // History: bytes 0..32 hold ascending values, so the match at
        // offset 16 below reads a recognisable run.
        for (idx, byte) in out[..32].iter_mut().enumerate() {
            *byte = idx as u8;
        }
        let mut backend = UserSliceBackend::from_slice(&mut out);
        backend.tail = 32; // Treat the 32 seeded bytes as already decoded.

        // Only the first 8 bytes are the literal payload; the array is
        // 16 bytes long so a 16-byte literal load stays in bounds.
        let literals: [u8; 16] = [
            0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0xFF, 0x11, 0x22, 0xA1, 0xB1, 0xC1, 0xD1, 0xE1, 0xF1,
            0x10, 0x20,
        ];
        // Literal length 8, match offset 16, match length 8.
        //
        // SAFETY (NOTE(review): as far as this test can establish —
        // confirm against the function's own contract): `literals` has
        // 16 readable bytes, the offset of 16 stays within the 32 + 8
        // bytes preceding the match destination, and the slice extends
        // well past tail + 16 including the wildcopy slack.
        let outcome = unsafe { backend.exec_sequence_inline(literals.as_ptr(), 8, 16, 8) };
        outcome.unwrap();

        // 8 literal bytes + 8 match bytes move the tail from 32 to 48.
        assert_eq!(backend.tail, 48);
        // The literal payload sits at the old tail, 32..40.
        assert_eq!(&out[32..40], &literals[..8]);
        // The match destination 40..48 reads from 32 + 8 - 16 = 24,
        // i.e. history bytes 24..32.
        assert_eq!(&out[40..48], &[24u8, 25, 26, 27, 28, 29, 30, 31]);
    }

    #[cfg(target_arch = "x86_64")]
    #[test]
    fn exec_sequence_inline_long_literal_uses_wildcopy_tail() {
        // A literal length above 16 takes the path described as an
        // unconditional copy16 followed by a wildcopy for the rest.
        const WILDCOPY: usize = super::super::buffer_backend::WILDCOPY_OVERLENGTH;
        let mut out = vec![0u8; 256 + WILDCOPY];
        for (idx, byte) in out[..32].iter_mut().enumerate() {
            *byte = idx as u8;
        }
        let mut backend = UserSliceBackend::from_slice(&mut out);
        backend.tail = 32;

        // 40 payload bytes plus 16 bytes of read slack for the final
        // 16-byte load: values 0x80..=0xB7.
        let literals: Vec<u8> = (0u8..40 + 16).map(|i| 0x80 + i).collect();
        // Literal length 40, match offset 16, match length 8.
        //
        // SAFETY (NOTE(review): as far as this test can establish —
        // confirm against the function's own contract): `literals`
        // holds 40 + 16 readable bytes, the offset of 16 stays within
        // already-written output, and the slice extends well past
        // tail + 48 including the wildcopy slack.
        let outcome = unsafe { backend.exec_sequence_inline(literals.as_ptr(), 40, 16, 8) };
        outcome.unwrap();

        // 40 literal bytes + 8 match bytes move the tail from 32 to 80.
        assert_eq!(backend.tail, 80);
        assert_eq!(&out[32..72], &literals[..40]);
        // The match destination 72..80 reads from 32 + 40 - 16 = 56,
        // i.e. out[56..64], which the literal copy just filled with
        // literals[24..32].
        assert_eq!(&out[72..80], &literals[24..32]);
    }

    #[cfg(target_arch = "x86_64")]
    #[test]
    fn exec_sequence_inline_short_offset_match_uses_overlap_copy() {
        // An offset below 16 is meant to take the overlapCopy8 + 8-byte
        // stride path, as opposed to the wildcopy_no_overlap path used
        // for offsets of 16 and above.
        //
        // Assertions made here:
        //   1. `tail` advances by `lit_length + match_length`.
        //   2. The literal payload lands at the old tail.
        //   3. The first 4 match-output bytes equal seed[4..8]: they
        //      are read from positions 28..32, which lie BEFORE the old
        //      tail and are therefore untouched by the literal copy.
        //
        // There is deliberately no cross-check against the legacy
        // `extend` + `repeat_in_chunks` chain: the stated reason is that
        // those paths skip the 16-byte literal overshoot and so do not
        // produce byte-identical buffers for the same logical sequence.
        // End-to-end parity is covered by the `roundtrip_integrity::*`
        // tests in lib.rs, which decode whole frames and compare them
        // with the encoder input.
        const WILDCOPY: usize = super::super::buffer_backend::WILDCOPY_OVERLENGTH;
        let mut out = vec![0u8; 256 + WILDCOPY];
        // Recognisable pattern in the last 8 history bytes (24..32).
        let seed = [0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7];
        out[24..32].copy_from_slice(&seed);
        let mut backend = UserSliceBackend::from_slice(&mut out);
        backend.tail = 32;

        let literals: [u8; 16] = [0xFF; 16];
        // Literal length 4, match offset 8, match length 12.
        //
        // SAFETY (NOTE(review): as far as this test can establish —
        // confirm against the function's own contract): `literals` has
        // 16 readable bytes, the offset of 8 stays within the 32 + 4
        // bytes preceding the match destination, and the slice extends
        // well past tail + 16 including the wildcopy slack.
        let outcome = unsafe { backend.exec_sequence_inline(literals.as_ptr(), 4, 8, 12) };
        outcome.unwrap();

        // 4 literal bytes + 12 match bytes move the tail from 32 to 48.
        assert_eq!(backend.tail, 48);
        // Literal payload at 32..36.
        assert_eq!(&out[32..36], &literals[..4]);
        // The match destination starts at 36 and its source at
        // 36 - 8 = 28, so output 36..40 mirrors 28..32 = seed[4..8].
        assert_eq!(&out[36..40], &seed[4..8]);
        // Output bytes 40..48 are intentionally left unpinned.
        // NOTE(review): under plain LZ77 semantics (out[i] = out[i - 8])
        // 40..44 would be the four 0xFF literals and 44..48 a second
        // copy of seed[4..8]; confirm against the implementation before
        // tightening this test.
    }
}